In [1]:
import os
import json
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import pandas as pd
from flair.embeddings import WordEmbeddings, FlairEmbeddings, DocumentPoolEmbeddings, Sentence, DocumentRNNEmbeddings
from flair.models import TextClassifier
from flair.trainers import ModelTrainer
from flair.datasets import CSVClassificationCorpus
import os
from sklearn.metrics import f1_score, accuracy_score, recall_score, precision_score

In [2]:
DATA_PATH = '../fakenewsnet_dataset/dataset'
DATASET_NAME = 'politifact'
DATASET_PATH = '{}/{}'.format(DATA_PATH, DATASET_NAME)
REAL_DATA_PATH = '{}_real.csv'.format(DATASET_PATH)
FAKE_DATA_PATH = '{}_fake.csv'.format(DATASET_PATH)

In [3]:
fake_arts = pd.read_csv(FAKE_DATA_PATH, na_values=['nan'], keep_default_na=False)
real_arts = pd.read_csv(REAL_DATA_PATH, na_values=['nan'], keep_default_na=False)

In [4]:
def parse_art_data_frame(df):
    return [{'id': id, 'url': url, 'title': title} for id, url, title, tweets in df.values]
    
fake_arts_with_content = parse_art_data_frame(fake_arts)
real_arts_with_content = parse_art_data_frame(real_arts)

In [5]:
fake_data = [(art, 'fake') for art in fake_arts_with_content]
real_data = [(art, 'real') for art in real_arts_with_content]

In [6]:
# np.random.shuffle(fake_data)
# np.random.shuffle(real_data)

In [6]:
train_data = fake_data[0:int(len(fake_data)*0.8)] + real_data[0:int(len(real_data)*0.8)]
test_data = fake_data[int(len(fake_data)*0.8):] + real_data[int(len(real_data)*0.8):]
# np.random.shuffle(train_data)
# np.random.shuffle(test_data)

In [7]:
print(len(train_data))
print(len(test_data))
print(len(train_data) + len(test_data))
print(len(fake_data) + len(real_data))

844
212
1056
1056


In [8]:
def clear_text(text):
    return ' '.join(text.split())

class Classifier():
    def __init__(self, classifier):
        self.classifier = classifier
        
    def predict(self, text):
        text = clear_text(text)
        sentence = Sentence(text)
        self.classifier.predict(sentence)
        return sentence.labels[0]

def transform_data(data):
    return [{'label': label, 'text': clear_text(x)} for x, label in data]

def save_data(data, data_folder = '.'):
    if not os.path.exists(data_folder):
        os.makedirs(data_folder)
    data = transform_data(data)
    frame_data = pd.DataFrame(data)
    train_path = '{}/train.csv'.format(data_folder)
    test_path = '{}/test.csv'.format(data_folder)
    dev_path = '{}/dev.csv'.format(data_folder)
    frame_data.iloc[0:int(len(data)*0.8)].to_csv(train_path, sep='\t', index = False, header = False)
    frame_data.iloc[int(len(data)*0.8):int(len(data)*0.9)].to_csv(test_path, sep='\t', index = False, header = False)
    frame_data.iloc[int(len(data)*0.9):].to_csv(dev_path, sep='\t', index = False, header = False)

def load_corpus(data_folder = '.'):
    column_name_map = {1: "text", 0: "label"}
    return CSVClassificationCorpus(data_folder,
                                     column_name_map,
                                     delimiter='\t',
                                  test_file='test.csv',
                                  dev_file='dev.csv',
                                  train_file='train.csv')
    
def train_classifier(corpus, model_folder = '.', max_epochs = 1):
    label_dict = corpus.make_label_dictionary()

    word_embeddings = [
        WordEmbeddings('glove'),
        FlairEmbeddings('news-forward-fast'),
        FlairEmbeddings('news-backward-fast')
    ]

    document_embeddings = DocumentRNNEmbeddings(word_embeddings,
                                                hidden_size=512,
                                                reproject_words=True,
                                                reproject_words_dimension=256)

    classifier = TextClassifier(document_embeddings,
                                label_dictionary=label_dict)

    trainer = ModelTrainer(classifier, corpus)

    trainer.train(model_folder, max_epochs=max_epochs)
    
    return TextClassifier.load('{}/best-model.pt'.format(model_folder))
    
def train_model(train_data,
               data_folder = '.',
               model_folder = '.',
               max_epochs=1
               ):
    save_data(train_data, data_folder)
    corpus = load_corpus(data_folder)
    classifier = train_classifier(corpus, model_folder, max_epochs)
    return Classifier(classifier)

def calculate_metrics(y_true, y_pred, pos_label = 'fake'):
    acc = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, pos_label=pos_label)
    recall = recall_score(y_true, y_pred, pos_label=pos_label)
    f1 = f1_score(y_true, y_pred, pos_label=pos_label)
    return acc, precision, recall, f1

def validate_model(test_data, classifier):
    y_true = [label for x, label in test_data]
    y_pred = [classifier.predict(x).value for x, label in test_data]
    acc, precision, recall, f1 = calculate_metrics(y_true, y_pred)
    print("acc: ", acc)
    print("precision: ", precision)
    print("recall: ", recall)
    print("f1: ", f1)
    return acc, precision, recall, f1

def make_test(train_data, test_data, data_folder, model_folder, max_epochs):
    classifier = train_model(train_data, data_folder, model_folder, max_epochs)
    validate_model(test_data, classifier)
       

In [9]:
train_content = [(x, label) for x, label in train_data] 
test_content = [(x, label) for x, label in test_data]

In [10]:
train_title = [(x['title'], label) for x, label in train_content] 
test_title = [(x['title'], label) for x, label in test_content]
print(len([x for x, label in train_title if x == '']))
print(len([x for x, label in test_title if x == '']))
train_title = [(x, label) for x, label in train_title if x != ''] 
test_title = [(x, label) for x, label in test_title if x != '']
print(len(train_title))
print(len(test_title))

0
0
844
212


In [11]:
path = './test_csv/title1'
make_test(train_title, test_title, path, path, 1)

2019-11-27 11:33:23,010 Reading data from test_csv/title1
2019-11-27 11:33:23,011 Train: test_csv/title1/train.csv
2019-11-27 11:33:23,012 Dev: test_csv/title1/dev.csv
2019-11-27 11:33:23,012 Test: test_csv/title1/test.csv
2019-11-27 11:33:23,016 Computing label dictionary. Progress:


100%|██████████| 675/675 [00:00<00:00, 4950.52it/s]

2019-11-27 11:33:23,222 [b'fake', b'real']





2019-11-27 11:33:24,606 ----------------------------------------------------------------------------------------------------
2019-11-27 11:33:24,607 Model: "TextClassifier(
  (document_embeddings): DocumentRNNEmbeddings(
    (embeddings): StackedEmbeddings(
      (list_embedding_0): WordEmbeddings('glove')
      (list_embedding_1): FlairEmbeddings(
        (lm): LanguageModel(
          (drop): Dropout(p=0.25, inplace=False)
          (encoder): Embedding(275, 100)
          (rnn): LSTM(100, 1024)
          (decoder): Linear(in_features=1024, out_features=275, bias=True)
        )
      )
      (list_embedding_2): FlairEmbeddings(
        (lm): LanguageModel(
          (drop): Dropout(p=0.25, inplace=False)
          (encoder): Embedding(275, 100)
          (rnn): LSTM(100, 1024)
          (decoder): Linear(in_features=1024, out_features=275, bias=True)
        )
      )
    )
    (word_reprojection_map): Linear(in_features=2148, out_features=256, bias=True)
    (rnn): GRU(256, 512, ba

  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "


2019-11-27 11:34:00,771 ----------------------------------------------------------------------------------------------------
2019-11-27 11:34:00,775 Testing using best model ...
2019-11-27 11:34:00,777 loading file test_csv/title1/best-model.pt
2019-11-27 11:34:05,279 0.1905	0.1905	0.1905
2019-11-27 11:34:05,280 
MICRO_AVG: acc 0.1053 - f1-score 0.1905
MACRO_AVG: acc 0.0953 - f1-score 0.16
fake       tp: 0 - fp: 68 - fn: 0 - tn: 16 - precision: 0.0000 - recall: 0.0000 - accuracy: 0.0000 - f1-score: 0.0000
real       tp: 16 - fp: 0 - fn: 68 - tn: 0 - precision: 1.0000 - recall: 0.1905 - accuracy: 0.1905 - f1-score: 0.3200
2019-11-27 11:34:05,282 ----------------------------------------------------------------------------------------------------
2019-11-27 11:34:05,284 loading file ./test_csv/title1/best-model.pt
acc:  0.5094339622641509
precision:  0.45549738219895286
recall:  1.0
f1:  0.6258992805755396


In [12]:
path = './test_csv/title10'
make_test(train_title, test_title, path, path, 10)

2019-11-27 11:34:38,105 Reading data from test_csv/title10
2019-11-27 11:34:38,106 Train: test_csv/title10/train.csv
2019-11-27 11:34:38,107 Dev: test_csv/title10/dev.csv
2019-11-27 11:34:38,108 Test: test_csv/title10/test.csv
2019-11-27 11:34:38,111 Computing label dictionary. Progress:


100%|██████████| 675/675 [00:00<00:00, 3098.62it/s]

2019-11-27 11:34:38,470 [b'fake', b'real']





2019-11-27 11:34:39,927 ----------------------------------------------------------------------------------------------------
2019-11-27 11:34:39,928 Model: "TextClassifier(
  (document_embeddings): DocumentRNNEmbeddings(
    (embeddings): StackedEmbeddings(
      (list_embedding_0): WordEmbeddings('glove')
      (list_embedding_1): FlairEmbeddings(
        (lm): LanguageModel(
          (drop): Dropout(p=0.25, inplace=False)
          (encoder): Embedding(275, 100)
          (rnn): LSTM(100, 1024)
          (decoder): Linear(in_features=1024, out_features=275, bias=True)
        )
      )
      (list_embedding_2): FlairEmbeddings(
        (lm): LanguageModel(
          (drop): Dropout(p=0.25, inplace=False)
          (encoder): Embedding(275, 100)
          (rnn): LSTM(100, 1024)
          (decoder): Linear(in_features=1024, out_features=275, bias=True)
        )
      )
    )
    (word_reprojection_map): Linear(in_features=2148, out_features=256, bias=True)
    (rnn): GRU(256, 512, ba

  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "


2019-11-27 11:35:11,117 ----------------------------------------------------------------------------------------------------
2019-11-27 11:35:12,488 epoch 2 - iter 0/22 - loss 0.56040251 - samples/sec: 53.97
2019-11-27 11:35:14,145 epoch 2 - iter 2/22 - loss 0.52227975 - samples/sec: 39.04
2019-11-27 11:35:16,060 epoch 2 - iter 4/22 - loss 0.49964541 - samples/sec: 34.24
2019-11-27 11:35:18,717 epoch 2 - iter 6/22 - loss 0.48989876 - samples/sec: 24.22
2019-11-27 11:35:21,777 epoch 2 - iter 8/22 - loss 0.49891116 - samples/sec: 21.03
2019-11-27 11:35:24,361 epoch 2 - iter 10/22 - loss 0.49492020 - samples/sec: 24.96
2019-11-27 11:35:26,303 epoch 2 - iter 12/22 - loss 0.49768907 - samples/sec: 33.25
2019-11-27 11:35:28,908 epoch 2 - iter 14/22 - loss 0.49331783 - samples/sec: 24.74
2019-11-27 11:35:31,439 epoch 2 - iter 16/22 - loss 0.49326972 - samples/sec: 25.54
2019-11-27 11:35:34,183 epoch 2 - iter 18/22 - loss 0.48789199 - samples/sec: 23.44
2019-11-27 11:35:36,643 epoch 2 - iter 2

2019-11-27 11:38:14,768 ----------------------------------------------------------------------------------------------------
2019-11-27 11:38:17,149 epoch 8 - iter 0/22 - loss 0.47846365 - samples/sec: 31.48
2019-11-27 11:38:19,439 epoch 8 - iter 2/22 - loss 0.40870012 - samples/sec: 28.21
2019-11-27 11:38:21,175 epoch 8 - iter 4/22 - loss 0.34859429 - samples/sec: 37.14
2019-11-27 11:38:23,537 epoch 8 - iter 6/22 - loss 0.37342864 - samples/sec: 27.30
2019-11-27 11:38:27,586 epoch 8 - iter 8/22 - loss 0.36093609 - samples/sec: 15.87
2019-11-27 11:38:30,625 epoch 8 - iter 10/22 - loss 0.36863031 - samples/sec: 21.15
2019-11-27 11:38:33,066 epoch 8 - iter 12/22 - loss 0.35695172 - samples/sec: 26.45
2019-11-27 11:38:36,174 epoch 8 - iter 14/22 - loss 0.35123262 - samples/sec: 20.69
2019-11-27 11:38:38,897 epoch 8 - iter 16/22 - loss 0.36283661 - samples/sec: 23.63
2019-11-27 11:38:40,796 epoch 8 - iter 18/22 - loss 0.37077698 - samples/sec: 33.94
2019-11-27 11:38:43,387 epoch 8 - iter 2

In [13]:
train_url = [(x['url'], label) for x, label in train_content] 
test_url = [(x['url'], label) for x, label in test_content]
print(len([x for x, label in train_url if x == '']))
print(len([x for x, label in test_url if x == '']))
train_url = [(x, label) for x, label in train_url if x != ''] 
test_url = [(x, label) for x, label in test_url if x != '']
print(len(train_url))
print(len(test_url))

48
13
796
199


In [14]:
path = './test_csv/url1'
make_test(train_url, test_url, path, path, 1)

2019-11-27 11:40:39,631 Reading data from test_csv/url1
2019-11-27 11:40:39,632 Train: test_csv/url1/train.csv
2019-11-27 11:40:39,633 Dev: test_csv/url1/dev.csv
2019-11-27 11:40:39,633 Test: test_csv/url1/test.csv
2019-11-27 11:40:39,643 Computing label dictionary. Progress:


100%|██████████| 636/636 [00:00<00:00, 910.76it/s] 

2019-11-27 11:40:41,117 [b'fake', b'real']





2019-11-27 11:40:42,707 ----------------------------------------------------------------------------------------------------
2019-11-27 11:40:42,708 Model: "TextClassifier(
  (document_embeddings): DocumentRNNEmbeddings(
    (embeddings): StackedEmbeddings(
      (list_embedding_0): WordEmbeddings('glove')
      (list_embedding_1): FlairEmbeddings(
        (lm): LanguageModel(
          (drop): Dropout(p=0.25, inplace=False)
          (encoder): Embedding(275, 100)
          (rnn): LSTM(100, 1024)
          (decoder): Linear(in_features=1024, out_features=275, bias=True)
        )
      )
      (list_embedding_2): FlairEmbeddings(
        (lm): LanguageModel(
          (drop): Dropout(p=0.25, inplace=False)
          (encoder): Embedding(275, 100)
          (rnn): LSTM(100, 1024)
          (decoder): Linear(in_features=1024, out_features=275, bias=True)
        )
      )
    )
    (word_reprojection_map): Linear(in_features=2148, out_features=256, bias=True)
    (rnn): GRU(256, 512, ba

  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "


2019-11-27 11:41:29,032 ----------------------------------------------------------------------------------------------------
2019-11-27 11:41:29,034 Testing using best model ...
2019-11-27 11:41:29,037 loading file test_csv/url1/best-model.pt
2019-11-27 11:41:34,251 0.6875	0.6875	0.6875
2019-11-27 11:41:34,253 
MICRO_AVG: acc 0.5238 - f1-score 0.6875
MACRO_AVG: acc 0.3438 - f1-score 0.4074
fake       tp: 0 - fp: 25 - fn: 0 - tn: 55 - precision: 0.0000 - recall: 0.0000 - accuracy: 0.0000 - f1-score: 0.0000
real       tp: 55 - fp: 0 - fn: 25 - tn: 0 - precision: 1.0000 - recall: 0.6875 - accuracy: 0.6875 - f1-score: 0.8148
2019-11-27 11:41:34,253 ----------------------------------------------------------------------------------------------------
2019-11-27 11:41:34,255 loading file ./test_csv/url1/best-model.pt
acc:  0.6934673366834171
precision:  0.8787878787878788
recall:  0.3372093023255814
f1:  0.48739495798319327


In [None]:
path = './test_csv/url10'
make_test(train_url, test_url, path, path, 10)

2019-11-27 11:42:15,261 Reading data from test_csv/url10
2019-11-27 11:42:15,263 Train: test_csv/url10/train.csv
2019-11-27 11:42:15,263 Dev: test_csv/url10/dev.csv
2019-11-27 11:42:15,264 Test: test_csv/url10/test.csv
2019-11-27 11:42:15,268 Computing label dictionary. Progress:


100%|██████████| 636/636 [00:00<00:00, 1400.55it/s]

2019-11-27 11:42:16,385 [b'fake', b'real']





2019-11-27 11:42:17,107 ----------------------------------------------------------------------------------------------------
2019-11-27 11:42:17,108 Model: "TextClassifier(
  (document_embeddings): DocumentRNNEmbeddings(
    (embeddings): StackedEmbeddings(
      (list_embedding_0): WordEmbeddings('glove')
      (list_embedding_1): FlairEmbeddings(
        (lm): LanguageModel(
          (drop): Dropout(p=0.25, inplace=False)
          (encoder): Embedding(275, 100)
          (rnn): LSTM(100, 1024)
          (decoder): Linear(in_features=1024, out_features=275, bias=True)
        )
      )
      (list_embedding_2): FlairEmbeddings(
        (lm): LanguageModel(
          (drop): Dropout(p=0.25, inplace=False)
          (encoder): Embedding(275, 100)
          (rnn): LSTM(100, 1024)
          (decoder): Linear(in_features=1024, out_features=275, bias=True)
        )
      )
    )
    (word_reprojection_map): Linear(in_features=2148, out_features=256, bias=True)
    (rnn): GRU(256, 512, ba

  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "


2019-11-27 11:42:57,829 ----------------------------------------------------------------------------------------------------
2019-11-27 11:42:59,850 epoch 2 - iter 0/20 - loss 0.51298398 - samples/sec: 43.79
2019-11-27 11:43:02,981 epoch 2 - iter 2/20 - loss 0.45409431 - samples/sec: 20.58
2019-11-27 11:43:06,845 epoch 2 - iter 4/20 - loss 0.49931511 - samples/sec: 16.63
2019-11-27 11:43:10,623 epoch 2 - iter 6/20 - loss 0.50904400 - samples/sec: 17.03
2019-11-27 11:43:13,581 epoch 2 - iter 8/20 - loss 0.50336256 - samples/sec: 21.83
2019-11-27 11:43:16,344 epoch 2 - iter 10/20 - loss 0.50507953 - samples/sec: 23.26
2019-11-27 11:43:18,997 epoch 2 - iter 12/20 - loss 0.51089454 - samples/sec: 24.22
2019-11-27 11:43:22,152 epoch 2 - iter 14/20 - loss 0.49423039 - samples/sec: 20.43
2019-11-27 11:43:25,135 epoch 2 - iter 16/20 - loss 0.47709250 - samples/sec: 21.53
2019-11-27 11:43:28,245 epoch 2 - iter 18/20 - loss 0.46806451 - samples/sec: 20.67
2019-11-27 11:43:30,223 ----------------

In [None]:
train_mix = [(x['url'] + ', ' + x['title'], label) for x, label in train_content] 
test_mix = [(x['url'] + ', ' + x['title'], label) for x, label in test_content]
print(len([x for x, label in train_mix if x == '']))
print(len([x for x, label in test_mix if x == '']))
train_mix = [(x, label) for x, label in train_mix if x != ''] 
test_mix = [(x, label) for x, label in test_mix if x != '']
print(len(train_mix))
print(len(test_mix))

In [None]:
path = './test_csv/mix1'
make_test(train_mix, test_mix, path, path, 1)

In [None]:
path = './test_csv/mix10'
make_test(train_mix, test_mix, path, path, 10)