In [58]:
import os
import json
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import pandas as pd
from flair.embeddings import WordEmbeddings, FlairEmbeddings, DocumentPoolEmbeddings, Sentence, DocumentRNNEmbeddings
from flair.models import TextClassifier
from flair.trainers import ModelTrainer
from flair.datasets import CSVClassificationCorpus
import os
from sklearn.metrics import f1_score, accuracy_score, recall_score, precision_score

In [2]:
# Where the dataset lives relative to this notebook, and which sub-dataset to use.
DATA_PATH = '../fakenewsnet_dataset'
DATASET_NAME = 'politifact'

# Derived locations: one directory per class underneath the chosen dataset.
DATASET_PATH = '/'.join((DATA_PATH, DATASET_NAME))
REAL_DATA_PATH = DATASET_PATH + '/real'
FAKE_DATA_PATH = DATASET_PATH + '/fake'

In [3]:
def load_json_from_file(path):
    """Parse the JSON document stored at ``path`` and return the resulting object.

    The file is opened explicitly as UTF-8 so that parsing does not depend on
    the platform's default locale encoding.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if the file does not contain valid JSON.
    """
    with open(path, encoding='utf-8') as json_file:
        return json.load(json_file)

In [4]:
class Article():
    """One article directory of the dataset: its news content plus its tweets.

    The constructor only records where the article lives; nothing is read
    from disk until ``load_content`` / ``load_tweets`` are called.
    """

    def __init__(self, name, path):
        self.path = path      # directory that holds this article's files
        self.name = name      # identifier given by the caller
        self.content = None   # parsed "news content.json"; stays None if the file is missing
        self.tweets = []      # parsed JSON of each file found in "<path>/tweets"

    def load_content(self):
        """Load ``<path>/news content.json`` into ``self.content`` if that file exists."""
        content_path = "{}/news content.json".format(self.path)
        if os.path.isfile(content_path):
            self.content = load_json_from_file(content_path)

    def load_tweets(self):
        """Load every regular file in ``<path>/tweets`` into ``self.tweets``.

        ``os.listdir`` returns entries in arbitrary order, so they are sorted
        to make the resulting list reproducible between runs. Entries that are
        not regular files (e.g. nested directories) are skipped rather than
        passed to ``open()``. If the tweets directory does not exist,
        ``self.tweets`` is left unchanged.
        """
        tweets_path = "{}/tweets".format(self.path)
        if not os.path.isdir(tweets_path):
            return
        candidates = [
            "{}/{}".format(tweets_path, entry)
            for entry in sorted(os.listdir(tweets_path))
        ]
        self.tweets = [
            load_json_from_file(candidate)
            for candidate in candidates
            if os.path.isfile(candidate)
        ]

In [5]:
def load_single_article(name, path):
    """Create the Article for ``name``/``path`` and load its content and tweets."""
    article = Article(name, path)
    # Content first, then tweets.
    for loader in (article.load_content, article.load_tweets):
        loader()
    return article

def load_all_articles(path):
    """Load one Article per entry of the directory ``path``.

    Entries are visited in sorted name order: ``os.listdir`` makes no ordering
    guarantee, and a stable input order is needed for any seeded shuffle of
    the result to be reproducible.

    Returns:
        A list of Article objects (entry name used as the article name), or an
        empty list if ``path`` is not a directory.
    """
    if not os.path.isdir(path):
        return []
    return [
        load_single_article(entry, "{}/{}".format(path, entry))
        for entry in sorted(os.listdir(path))
    ]

In [6]:
# Load every article (content + tweets) found under the fake and the real directory.
fake_arts = load_all_articles(FAKE_DATA_PATH)
real_arts = load_all_articles(REAL_DATA_PATH)

In [7]:
# Keep only the articles whose content was actually loaded (content is None otherwise).
fake_arts_with_content = [article for article in fake_arts if article.content is not None]
real_arts_with_content = [article for article in real_arts if article.content is not None]

In [14]:
# Attach the class label to every article: a list of (article, label) tuples per class.
fake_data = list(zip(fake_arts_with_content, ['fake'] * len(fake_arts_with_content)))
real_data = list(zip(real_arts_with_content, ['real'] * len(real_arts_with_content)))

In [15]:
# Seed NumPy's global RNG so the shuffles in this notebook are deterministic
# for a given input order (Restart & Run All then yields the same split).
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# Shuffle each class in place, independently of the other.
np.random.shuffle(fake_data)
np.random.shuffle(real_data)

In [88]:
# Share of each class that goes into the training split; the remainder is held out.
TRAIN_FRACTION = 0.8

# Cut each class separately so both splits keep the same fake/real proportions.
fake_cut = int(len(fake_data) * TRAIN_FRACTION)
real_cut = int(len(real_data) * TRAIN_FRACTION)
train_data = fake_data[:fake_cut] + real_data[:real_cut]
test_data = fake_data[fake_cut:] + real_data[real_cut:]

# The concatenation above puts all fake items before all real ones; mix them.
np.random.shuffle(train_data)
np.random.shuffle(test_data)

In [89]:
# Split sizes, their sum, and the total number of labelled articles (last two should agree).
print(
    len(train_data),
    len(test_data),
    len(train_data) + len(test_data),
    len(fake_data) + len(real_data),
    sep='\n',
)

643
162
805
805


In [91]:
def clear_text(text):
    """Collapse every run of whitespace in ``text`` into a single space.

    Leading and trailing whitespace is removed as a consequence of splitting
    on whitespace and re-joining the pieces.
    """
    tokens = text.split()
    return ' '.join(tokens)

class Classifier():
    """Wrapper that lets a sentence-level classifier be called with a raw string."""

    def __init__(self, classifier):
        # Wrapped model; only its ``predict(sentence)`` method is used by this class.
        self.classifier = classifier

    def predict(self, text):
        """Return the first label the wrapped model attaches to ``text``.

        The text is whitespace-normalised with ``clear_text`` and wrapped in a
        ``Sentence`` before being handed to the model, which annotates the
        sentence in place.
        """
        sentence = Sentence(clear_text(text))
        self.classifier.predict(sentence)
        first_label = sentence.labels[0]
        return first_label

def transform_data(data):
    """Turn ``(text, label)`` pairs into ``{'label', 'text'}`` row dicts.

    Each text is whitespace-normalised with ``clear_text``; labels are passed
    through unchanged.
    """
    rows = []
    for raw_text, label in data:
        rows.append({'label': label, 'text': clear_text(raw_text)})
    return rows

def save_data(data, data_folder = '.'):
    """Write ``data`` as tab-separated train/test/dev CSV files into ``data_folder``.

    ``data`` is a sequence of ``(text, label)`` pairs. It is split by position
    (no shuffling here): the first 80% goes to ``train.csv``, the next 10% to
    ``test.csv`` and the final 10% to ``dev.csv``. Files have no header and no
    index column; each row is ``label<TAB>text``.

    The folder is created if it does not exist yet.
    """
    # exist_ok avoids the check-then-create race of a separate os.path.exists test.
    os.makedirs(data_folder, exist_ok=True)

    rows = transform_data(data)
    # Pin the column order: load_corpus() reads column 0 as the label and
    # column 1 as the text, so this must not depend on dict/pandas ordering.
    frame = pd.DataFrame(rows, columns=['label', 'text'])

    train_end = int(len(rows) * 0.8)
    test_end = int(len(rows) * 0.9)
    splits = (
        ('train.csv', frame.iloc[:train_end]),
        ('test.csv', frame.iloc[train_end:test_end]),
        ('dev.csv', frame.iloc[test_end:]),
    )
    for file_name, part in splits:
        part.to_csv('{}/{}'.format(data_folder, file_name),
                    sep='\t', index=False, header=False)

def load_corpus(data_folder = '.'):
    """Build a CSVClassificationCorpus from the three CSV files in ``data_folder``.

    Expects tab-delimited ``train.csv``, ``dev.csv`` and ``test.csv`` whose
    first column is the label and second column is the text.
    """
    column_name_map = {1: "text", 0: "label"}
    corpus = CSVClassificationCorpus(
        data_folder,
        column_name_map,
        train_file='train.csv',
        dev_file='dev.csv',
        test_file='test.csv',
        delimiter='\t',
    )
    return corpus
    
def train_classifier(corpus, model_folder = '.', max_epochs = 1):
    """Train a TextClassifier on ``corpus`` and return the reloaded best checkpoint.

    The document representation is a DocumentRNNEmbeddings over three stacked
    token embeddings (GloVe plus the forward and backward "news-*-fast" Flair
    embeddings). Training output goes to ``model_folder``; afterwards the model
    stored as ``<model_folder>/best-model.pt`` is loaded and returned.
    """
    labels = corpus.make_label_dictionary()

    # Token-level embeddings, in the same order as they are stacked.
    token_embeddings = [WordEmbeddings('glove')]
    token_embeddings.append(FlairEmbeddings('news-forward-fast'))
    token_embeddings.append(FlairEmbeddings('news-backward-fast'))

    rnn_embeddings = DocumentRNNEmbeddings(
        token_embeddings,
        hidden_size=512,
        reproject_words=True,
        reproject_words_dimension=256,
    )
    model = TextClassifier(rnn_embeddings, label_dictionary=labels)

    ModelTrainer(model, corpus).train(model_folder, max_epochs=max_epochs)

    # Return the checkpoint read back from disk rather than the in-memory model.
    best_model_path = '{}/best-model.pt'.format(model_folder)
    return TextClassifier.load(best_model_path)
    
def train_model(train_data,
               data_folder = '.',
               model_folder = '.',
               max_epochs=1
               ):
    """Write ``train_data`` to CSV, train on it, and return a ready-to-use Classifier.

    ``train_data`` is a sequence of ``(text, label)`` pairs. The CSV files go
    to ``data_folder`` and the training artefacts to ``model_folder``.
    """
    save_data(train_data, data_folder)
    trained = train_classifier(load_corpus(data_folder), model_folder, max_epochs)
    return Classifier(trained)

def calculate_metrics(y_true, y_pred, pos_label = 'fake'):
    """Return ``(accuracy, precision, recall, f1)`` for the given predictions.

    Precision, recall and F1 are computed with ``pos_label`` as the positive
    class; accuracy does not use it.
    """
    accuracy = accuracy_score(y_true, y_pred)
    positive_class_scores = [
        scorer(y_true, y_pred, pos_label=pos_label)
        for scorer in (precision_score, recall_score, f1_score)
    ]
    return (accuracy, *positive_class_scores)

def validate_model(test_data, classifier):
    """Score ``classifier`` on ``(text, label)`` pairs, print and return the metrics.

    Each text is classified individually via ``classifier.predict``; the
    ``value`` of the returned label is compared against the true label.

    Returns:
        The tuple ``(acc, precision, recall, f1)``.
    """
    y_true = []
    y_pred = []
    for text, label in test_data:
        y_true.append(label)
        y_pred.append(classifier.predict(text).value)

    acc, precision, recall, f1 = calculate_metrics(y_true, y_pred)

    report = (
        ("acc: ", acc),
        ("precision: ", precision),
        ("recall: ", recall),
        ("f1: ", f1),
    )
    for caption, score in report:
        print(caption, score)
    return acc, precision, recall, f1

def make_test(train_data, test_data, data_folder, model_folder, max_epochs):
    """Train a model on ``train_data`` and evaluate it on ``test_data``.

    Both arguments are sequences of ``(text, label)`` pairs. CSV files are
    written to ``data_folder`` and training artefacts to ``model_folder``.

    Returns:
        The ``(acc, precision, recall, f1)`` tuple produced by
        ``validate_model``, so results can be kept in variables instead of
        being copied by hand from the printed output.
    """
    classifier = train_model(train_data, data_folder, model_folder, max_epochs)
    return validate_model(test_data, classifier)
       

In [92]:
# Replace every Article object by its parsed content dict, keeping the label.
train_content = [(article.content, label) for article, label in train_data]
test_content = [(article.content, label) for article, label in test_data]

In [116]:
# Title of every article, paired with its label.
train_title = [(content['title'], label) for content, label in train_content]
test_title = [(content['title'], label) for content, label in test_content]

# Number of empty titles in each split.
print(sum(1 for title, _ in train_title if title == ''))
print(sum(1 for title, _ in test_title if title == ''))

# Drop the samples whose title is empty.
train_title = [pair for pair in train_title if pair[0] != '']
test_title = [pair for pair in test_title if pair[0] != '']

# Remaining sample counts.
print(len(train_title))
print(len(test_title))

35
6
608
156


In [94]:
# Title-only model, 1 epoch; CSV files and model artefacts share the same folder.
path = './test/title1'
make_test(train_title, test_title, data_folder=path, model_folder=path, max_epochs=1)

2019-11-23 23:28:52,755 Reading data from test/title1
2019-11-23 23:28:52,756 Train: test/title1/train.csv
2019-11-23 23:28:52,757 Dev: test/title1/dev.csv
2019-11-23 23:28:52,760 Test: test/title1/test.csv
2019-11-23 23:28:52,767 Computing label dictionary. Progress:


100%|██████████| 486/486 [00:00<00:00, 486.07it/s]

2019-11-23 23:28:55,276 [b'real', b'fake']





2019-11-23 23:28:56,730 ----------------------------------------------------------------------------------------------------
2019-11-23 23:28:56,731 Model: "TextClassifier(
  (document_embeddings): DocumentRNNEmbeddings(
    (embeddings): StackedEmbeddings(
      (list_embedding_0): WordEmbeddings('glove')
      (list_embedding_1): FlairEmbeddings(
        (lm): LanguageModel(
          (drop): Dropout(p=0.25, inplace=False)
          (encoder): Embedding(275, 100)
          (rnn): LSTM(100, 1024)
          (decoder): Linear(in_features=1024, out_features=275, bias=True)
        )
      )
      (list_embedding_2): FlairEmbeddings(
        (lm): LanguageModel(
          (drop): Dropout(p=0.25, inplace=False)
          (encoder): Embedding(275, 100)
          (rnn): LSTM(100, 1024)
          (decoder): Linear(in_features=1024, out_features=275, bias=True)
        )
      )
    )
    (word_reprojection_map): Linear(in_features=2148, out_features=256, bias=True)
    (rnn): GRU(256, 512, ba

  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "


2019-11-23 23:29:46,025 ----------------------------------------------------------------------------------------------------
2019-11-23 23:29:46,026 Testing using best model ...
2019-11-23 23:29:46,028 loading file test/title1/best-model.pt
2019-11-23 23:30:12,689 0.7869	0.7869	0.7869
2019-11-23 23:30:12,695 
MICRO_AVG: acc 0.6486 - f1-score 0.7869
MACRO_AVG: acc 0.6477 - f1-score 0.786
fake       tp: 26 - fp: 11 - fn: 2 - tn: 22 - precision: 0.7027 - recall: 0.9286 - accuracy: 0.6667 - f1-score: 0.8000
real       tp: 22 - fp: 2 - fn: 11 - tn: 26 - precision: 0.9167 - recall: 0.6667 - accuracy: 0.6286 - f1-score: 0.7720
2019-11-23 23:30:12,696 ----------------------------------------------------------------------------------------------------
2019-11-23 23:30:12,699 loading file ./test/title1/best-model.pt
acc:  0.6923076923076923
precision:  0.6355140186915887
recall:  0.8831168831168831
f1:  0.7391304347826086


In [95]:
# Title-only model, 10 epochs; CSV files and model artefacts share the same folder.
path = './test/title10'
make_test(train_title, test_title, data_folder=path, model_folder=path, max_epochs=10)

2019-11-23 23:32:31,004 Reading data from test/title10
2019-11-23 23:32:31,005 Train: test/title10/train.csv
2019-11-23 23:32:31,006 Dev: test/title10/dev.csv
2019-11-23 23:32:31,007 Test: test/title10/test.csv
2019-11-23 23:32:31,012 Computing label dictionary. Progress:


100%|██████████| 486/486 [00:00<00:00, 520.11it/s] 

2019-11-23 23:32:33,404 [b'real', b'fake']





2019-11-23 23:32:34,863 ----------------------------------------------------------------------------------------------------
2019-11-23 23:32:34,864 Model: "TextClassifier(
  (document_embeddings): DocumentRNNEmbeddings(
    (embeddings): StackedEmbeddings(
      (list_embedding_0): WordEmbeddings('glove')
      (list_embedding_1): FlairEmbeddings(
        (lm): LanguageModel(
          (drop): Dropout(p=0.25, inplace=False)
          (encoder): Embedding(275, 100)
          (rnn): LSTM(100, 1024)
          (decoder): Linear(in_features=1024, out_features=275, bias=True)
        )
      )
      (list_embedding_2): FlairEmbeddings(
        (lm): LanguageModel(
          (drop): Dropout(p=0.25, inplace=False)
          (encoder): Embedding(275, 100)
          (rnn): LSTM(100, 1024)
          (decoder): Linear(in_features=1024, out_features=275, bias=True)
        )
      )
    )
    (word_reprojection_map): Linear(in_features=2148, out_features=256, bias=True)
    (rnn): GRU(256, 512, ba

  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "


2019-11-23 23:33:17,329 ----------------------------------------------------------------------------------------------------


  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "


2019-11-23 23:33:20,947 epoch 2 - iter 0/16 - loss 0.51937228 - samples/sec: 12.71
2019-11-23 23:33:23,339 epoch 2 - iter 1/16 - loss 0.54447788 - samples/sec: 13.54
2019-11-23 23:33:25,443 epoch 2 - iter 2/16 - loss 0.55576207 - samples/sec: 15.35
2019-11-23 23:33:27,783 epoch 2 - iter 3/16 - loss 0.56155623 - samples/sec: 13.76
2019-11-23 23:33:30,042 epoch 2 - iter 4/16 - loss 0.55056062 - samples/sec: 14.36
2019-11-23 23:33:31,484 epoch 2 - iter 5/16 - loss 0.54723892 - samples/sec: 22.43
2019-11-23 23:33:33,931 epoch 2 - iter 6/16 - loss 0.56366252 - samples/sec: 13.18
2019-11-23 23:33:35,435 epoch 2 - iter 7/16 - loss 0.56681088 - samples/sec: 21.64
2019-11-23 23:33:36,831 epoch 2 - iter 8/16 - loss 0.56928225 - samples/sec: 23.19
2019-11-23 23:33:38,448 epoch 2 - iter 9/16 - loss 0.56456451 - samples/sec: 19.94
2019-11-23 23:33:39,876 epoch 2 - iter 10/16 - loss 0.56214343 - samples/sec: 22.88
2019-11-23 23:33:41,119 epoch 2 - iter 11/16 - loss 0.55706996 - samples/sec: 26.02
20

2019-11-23 23:36:46,251 epoch 6 - iter 14/16 - loss 0.41754213 - samples/sec: 12.48
2019-11-23 23:36:46,987 epoch 6 - iter 15/16 - loss 0.40156722 - samples/sec: 44.35
2019-11-23 23:36:47,568 ----------------------------------------------------------------------------------------------------
2019-11-23 23:36:47,570 EPOCH 6 done: loss 0.4016 - lr 0.1000
2019-11-23 23:36:54,343 DEV : loss 0.3670271039009094 - score 0.8197
2019-11-23 23:36:54,393 BAD EPOCHS (no improvement): 1
2019-11-23 23:36:58,985 ----------------------------------------------------------------------------------------------------
2019-11-23 23:37:02,392 epoch 7 - iter 0/16 - loss 0.29282936 - samples/sec: 15.52
2019-11-23 23:37:04,414 epoch 7 - iter 1/16 - loss 0.35789925 - samples/sec: 16.25
2019-11-23 23:37:06,515 epoch 7 - iter 2/16 - loss 0.39505283 - samples/sec: 15.34
2019-11-23 23:37:08,205 epoch 7 - iter 3/16 - loss 0.37502851 - samples/sec: 19.15
2019-11-23 23:37:12,020 epoch 7 - iter 4/16 - loss 0.39226651 - 

2019-11-23 23:40:12,517 ----------------------------------------------------------------------------------------------------
2019-11-23 23:40:12,522 loading file ./test/title10/best-model.pt
acc:  0.7435897435897436
precision:  0.6907216494845361
recall:  0.8701298701298701
f1:  0.7701149425287357


In [96]:
# Baseline metrics, entered by hand.
# NOTE(review): these differ from the figures printed by the 1-epoch title run
# recorded in this notebook (acc 0.6923, precision 0.6355, recall 0.8831,
# f1 0.7391) — confirm which run they were taken from.
acc = 0.691
precision = 0.638
recall = 0.789
f1 = 0.706

# Metrics entered by hand; they equal the printed output of the 10-epoch title run.
acc2 = 0.7435897435897436
precision2 = 0.6907216494845361
recall2 = 0.8701298701298701
f12 = 0.7701149425287357

In [107]:
def diff(old, new, name=''):
    """Print how ``new`` compares to ``old``.

    Output is ``name``, the absolute difference multiplied by 100, and the
    relative change in percent, separated by spaces.

    When ``old`` is zero the relative change is undefined; ``nan`` is printed
    for it instead of raising ``ZeroDivisionError``.
    """
    absolute = (new - old) * 100
    relative = (new / old - 1) * 100 if old != 0 else float('nan')
    print(name, absolute, relative)

In [108]:
# Change of each metric between the two sets of values defined above.
diff(old=acc, new=acc2, name='acc')
diff(old=precision, new=precision2, name='precision')
diff(old=recall, new=recall2, name='recall')
diff(old=f1, new=f12, name='f1')

acc 5.258974358974367 7.610672010093156
precision 5.2721649484536055 8.263581423908484
recall 8.112987012987006 10.28261978832321
f1 6.411494252873573 9.081436618801098


In [111]:
# Show which fields the content dict of the first training sample has.
train_content[0][0].keys()

dict_keys(['url', 'text', 'images', 'top_img', 'keywords', 'authors', 'canonical_link', 'title', 'meta_data', 'movies', 'publish_date', 'source', 'summary'])

In [114]:
# Summary field of every article, paired with its label.
train_summary = [(content['summary'], label) for content, label in train_content]
test_summary = [(content['summary'], label) for content, label in test_content]

# Number of empty summaries in each split.
print(sum(1 for summary, _ in train_summary if summary == ''))
print(sum(1 for summary, _ in test_summary if summary == ''))

# Drop the samples whose summary is empty.
train_summary = [pair for pair in train_summary if pair[0] != '']
test_summary = [pair for pair in test_summary if pair[0] != '']

# Remaining sample counts.
print(len(train_summary))
print(len(test_summary))

643
162
0
0


In [118]:
# Text field of every article, paired with its label.
train_text = [(content['text'], label) for content, label in train_content]
test_text = [(content['text'], label) for content, label in test_content]

# Number of empty texts in each split.
print(sum(1 for body, _ in train_text if body == ''))
print(sum(1 for body, _ in test_text if body == ''))

# Drop the samples whose text is empty.
train_text = [pair for pair in train_text if pair[0] != '']
test_text = [pair for pair in test_text if pair[0] != '']

# Remaining sample counts.
print(len(train_text))
print(len(test_text))

62
11
581
151


In [120]:
# Full-text model, 1 epoch; CSV files and model artefacts share the same folder.
# NOTE(review): the recorded run of this cell ended with "DataLoader worker ...
# is killed by signal: Killed" while computing the label dictionary — presumably
# the process ran out of memory on the long article texts; confirm before re-running.
path = './test/text1'
make_test(train_text, test_text, data_folder=path, model_folder=path, max_epochs=1)

2019-11-23 23:58:03,519 Reading data from test/text1
2019-11-23 23:58:03,521 Train: test/text1/train.csv
2019-11-23 23:58:03,522 Dev: test/text1/dev.csv
2019-11-23 23:58:03,523 Test: test/text1/test.csv
2019-11-23 23:58:03,585 Computing label dictionary. Progress:



  0%|          | 0/464 [00:00<?, ?it/s][A
  3%|▎         | 14/464 [00:00<00:08, 51.15it/s][A
  7%|▋         | 31/464 [00:00<00:07, 61.81it/s][A
  8%|▊         | 36/464 [00:00<00:08, 52.63it/s][A
  8%|▊         | 36/464 [03:20<00:08, 52.63it/s][A

RuntimeError: DataLoader worker (pid 8377) is killed by signal: Killed. 