In [1]:
import os
import json
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import pandas as pd
from flair.embeddings import WordEmbeddings, FlairEmbeddings, DocumentPoolEmbeddings, Sentence, DocumentRNNEmbeddings
from flair.models import TextClassifier
from flair.trainers import ModelTrainer
from flair.datasets import CSVClassificationCorpus
import os
from sklearn.metrics import f1_score, accuracy_score, recall_score, precision_score

In [2]:
# Dataset layout: <DATA_PATH>/<DATASET_NAME>/{real,fake}
DATA_PATH = '../fakenewsnet_dataset'
DATASET_NAME = 'politifact'
DATASET_PATH = '/'.join((DATA_PATH, DATASET_NAME))
# One sub-directory per class label.
REAL_DATA_PATH = DATASET_PATH + '/real'
FAKE_DATA_PATH = DATASET_PATH + '/fake'

In [3]:
def load_json_from_file(path):
    """Parse the JSON document stored at ``path`` and return the decoded object.

    The file is opened explicitly as UTF-8 so decoding does not depend on the
    platform's default locale encoding (which is not UTF-8 everywhere).

    Args:
        path: Location of the JSON file to read.

    Returns:
        Whatever ``json.load`` produces for the file (dict, list, ...).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    with open(path, encoding='utf-8') as json_file:
        return json.load(json_file)

In [4]:
class Article():
    """One news article backed by a directory on disk.

    ``content`` stays ``None`` and ``tweets`` stays empty until the matching
    ``load_*`` method finds the corresponding data under ``path``.
    """

    def __init__(self, name, path):
        self.name = name
        self.path = path
        self.content = None
        self.tweets = []

    def load_content(self):
        """Read ``<path>/news content.json`` into ``content`` if that file exists."""
        content_file = f"{self.path}/news content.json"
        if not os.path.isfile(content_file):
            return
        self.content = load_json_from_file(content_file)

    def load_tweets(self):
        """Read every file in ``<path>/tweets`` as JSON into ``tweets``.

        Does nothing when the ``tweets`` directory is missing.
        """
        tweets_dir = f"{self.path}/tweets"
        if not os.path.isdir(tweets_dir):
            return
        self.tweets = [
            load_json_from_file(f"{tweets_dir}/{tweet_file}")
            for tweet_file in os.listdir(tweets_dir)
        ]

In [5]:
def load_single_article(name, path):
    """Create the ``Article`` for ``path`` and load its news content.

    Tweet loading is currently disabled: ``load_tweets()`` is not called, so
    the returned article always has an empty ``tweets`` list.
    """
    article = Article(name, path)
    article.load_content()
    return article

def load_all_articles(path):
    """Load one article per entry of the directory ``path``.

    Each entry name is used both as the article name and as the last
    component of its path. Returns an empty list when ``path`` is not an
    existing directory.
    """
    if not os.path.isdir(path):
        return []
    return [
        load_single_article(entry, "{}/{}".format(path, entry))
        for entry in os.listdir(path)
    ]

In [6]:
# Load every article directory of each class (fake first, then real).
fake_arts, real_arts = (
    load_all_articles(FAKE_DATA_PATH),
    load_all_articles(REAL_DATA_PATH),
)

In [7]:
# Keep only the articles whose content was actually loaded (is not None).
fake_arts_with_content = list(filter(lambda art: art.content is not None, fake_arts))
real_arts_with_content = list(filter(lambda art: art.content is not None, real_arts))

In [8]:
# Attach the class label to every article as an (article, label) tuple.
fake_data = list(zip(fake_arts_with_content, ['fake'] * len(fake_arts_with_content)))
real_data = list(zip(real_arts_with_content, ['real'] * len(real_arts_with_content)))

In [9]:
# Seed NumPy's global RNG so these shuffles -- and the shuffles in the
# train/test split that follows -- are repeatable after a kernel restart
# (given the same input order of fake_data / real_data).
np.random.seed(42)
np.random.shuffle(fake_data)
np.random.shuffle(real_data)

In [10]:
# Per-class cut points: each class contributes its first 80% to the training
# set and the remainder to the test set.
fake_cut = int(len(fake_data)*0.8)
real_cut = int(len(real_data)*0.8)
train_data = fake_data[:fake_cut] + real_data[:real_cut]
test_data = fake_data[fake_cut:] + real_data[real_cut:]
# Mix the two classes inside each split.
np.random.shuffle(train_data)
np.random.shuffle(test_data)

In [11]:
# Sanity check: split sizes, their sum, and the number of labelled articles
# (the last two numbers should match).
print(
    len(train_data),
    len(test_data),
    len(train_data) + len(test_data),
    len(fake_data) + len(real_data),
    sep='\n',
)

643
162
805
805


In [12]:
def clear_text(text):
    """Collapse each run of whitespace in ``text`` to one space and trim both ends."""
    tokens = text.split()
    return ' '.join(tokens)

class Classifier():
    """Wrapper that runs a trained model on one raw text at a time."""

    def __init__(self, classifier):
        # Wrapped model; it must offer ``predict(sentence)``.
        self.classifier = classifier

    def predict(self, text):
        """Return the first label the wrapped model assigns to ``text``.

        The text is whitespace-normalised with ``clear_text`` and wrapped in a
        ``Sentence`` before being handed to the model.
        """
        sample = Sentence(clear_text(text))
        self.classifier.predict(sample)
        return sample.labels[0]

def transform_data(data):
    """Turn ``(text, label)`` pairs into ``{'label', 'text'}`` records.

    Every text is whitespace-normalised with ``clear_text``.
    """
    records = []
    for raw_text, label in data:
        records.append({'label': label, 'text': clear_text(raw_text)})
    return records

def save_data(data, data_folder = '.'):
    """Write ``data`` as tab-separated train/test/dev files in ``data_folder``.

    The records keep their input order and are cut positionally: the first 80%
    go to ``train.csv``, the next 10% to ``test.csv`` and the rest to
    ``dev.csv``. Files have no header and no index column; the columns are
    label, then text.

    Args:
        data: Iterable of ``(text, label)`` pairs.
        data_folder: Target directory; created (with parents) when missing.
    """
    # exist_ok avoids the race between a separate exists() check and makedirs().
    os.makedirs(data_folder, exist_ok=True)
    records = transform_data(data)
    frame_data = pd.DataFrame(records)
    # Positional split boundaries, computed once.
    train_end = int(len(records)*0.8)
    test_end = int(len(records)*0.9)
    train_path = '{}/train.csv'.format(data_folder)
    test_path = '{}/test.csv'.format(data_folder)
    dev_path = '{}/dev.csv'.format(data_folder)
    frame_data.iloc[0:train_end].to_csv(train_path, sep='\t', index = False, header = False)
    frame_data.iloc[train_end:test_end].to_csv(test_path, sep='\t', index = False, header = False)
    frame_data.iloc[test_end:].to_csv(dev_path, sep='\t', index = False, header = False)

def load_corpus(data_folder = '.'):
    """Build a ``CSVClassificationCorpus`` from the files in ``data_folder``.

    Expects tab-delimited ``train.csv``, ``dev.csv`` and ``test.csv`` whose
    column 0 is the label and column 1 the text.
    """
    split_files = {
        'train_file': 'train.csv',
        'dev_file': 'dev.csv',
        'test_file': 'test.csv',
    }
    return CSVClassificationCorpus(
        data_folder,
        {1: "text", 0: "label"},
        delimiter='\t',
        **split_files
    )
    
def train_classifier(corpus, model_folder = '.', max_epochs = 1):
    """Train a ``TextClassifier`` on ``corpus`` and return the saved best model.

    Args:
        corpus: Corpus providing the label dictionary and the training data.
        model_folder: Directory passed to the trainer; ``best-model.pt`` is
            loaded back from it after training.
        max_epochs: Upper bound on training epochs.

    Returns:
        The ``TextClassifier`` loaded from ``<model_folder>/best-model.pt``.
    """
    labels = corpus.make_label_dictionary()

    # GloVe word vectors combined with the fast forward/backward Flair embeddings.
    token_embeddings = [
        WordEmbeddings('glove'),
        FlairEmbeddings('news-forward-fast'),
        FlairEmbeddings('news-backward-fast'),
    ]

    # RNN over the (re-projected) token embeddings yields one vector per document.
    rnn_embeddings = DocumentRNNEmbeddings(
        token_embeddings,
        hidden_size=512,
        reproject_words=True,
        reproject_words_dimension=256,
    )

    model = TextClassifier(rnn_embeddings, label_dictionary=labels)
    ModelTrainer(model, corpus).train(model_folder, max_epochs=max_epochs)

    return TextClassifier.load('{}/best-model.pt'.format(model_folder))
    
def train_model(train_data, data_folder = '.', model_folder = '.', max_epochs=1):
    """Persist ``train_data``, train on the resulting corpus, wrap the model.

    Pipeline: ``save_data`` -> ``load_corpus`` -> ``train_classifier``; the
    trained model is returned inside a ``Classifier`` wrapper.
    """
    save_data(train_data, data_folder)
    trained = train_classifier(load_corpus(data_folder), model_folder, max_epochs)
    return Classifier(trained)

def calculate_metrics(y_true, y_pred, pos_label = 'fake'):
    """Return ``(accuracy, precision, recall, f1)`` for the predictions.

    Precision, recall and F1 are computed with ``pos_label`` as the positive
    class; accuracy does not use it.
    """
    acc = accuracy_score(y_true, y_pred)
    precision, recall, f1 = (
        scorer(y_true, y_pred, pos_label=pos_label)
        for scorer in (precision_score, recall_score, f1_score)
    )
    return acc, precision, recall, f1

def validate_model(test_data, classifier):
    """Score ``classifier`` on ``(text, label)`` pairs, print and return the metrics.

    Returns:
        Tuple ``(acc, precision, recall, f1)`` from ``calculate_metrics``.
    """
    expected = [label for _, label in test_data]
    predicted = [classifier.predict(text).value for text, _ in test_data]
    acc, precision, recall, f1 = calculate_metrics(expected, predicted)
    report = (
        ("acc: ", acc),
        ("precision: ", precision),
        ("recall: ", recall),
        ("f1: ", f1),
    )
    for caption, score in report:
        print(caption, score)
    return acc, precision, recall, f1

def make_test(train_data, test_data, data_folder, model_folder, max_epochs):
    """Train a model on ``train_data`` and evaluate it on ``test_data``.

    Args:
        train_data: ``(text, label)`` pairs used for training.
        test_data: ``(text, label)`` pairs used for evaluation.
        data_folder: Directory where the corpus files are written.
        model_folder: Directory where the trained model is stored.
        max_epochs: Upper bound on training epochs.

    Returns:
        The ``(acc, precision, recall, f1)`` tuple from ``validate_model``, so
        experiments can be compared in code rather than only via printed
        output. End the call with ``;`` in a notebook cell to hide the echo.
    """
    classifier = train_model(train_data, data_folder, model_folder, max_epochs)
    return validate_model(test_data, classifier)
       

In [13]:
# Swap each Article object for its loaded content, keeping the label.
train_content = [(article.content, label) for article, label in train_data]
test_content = [(article.content, label) for article, label in test_data]

In [17]:
# Feature: article title.
train_title = [(content['title'], label) for content, label in train_content]
test_title = [(content['title'], label) for content, label in test_content]
# How many titles are empty in each split ...
print(sum(1 for title, label in train_title if title == ''))
print(sum(1 for title, label in test_title if title == ''))
# ... drop those, then report what is left.
train_title = [pair for pair in train_title if pair[0] != '']
test_title = [pair for pair in test_title if pair[0] != '']
print(len(train_title))
print(len(test_title))

33
8
610
154


In [18]:
# Title experiment, 1 epoch; corpus files and model share one folder.
path = './test/title1'
make_test(train_title, test_title, data_folder=path, model_folder=path, max_epochs=1)

2019-11-24 09:34:43,752 Reading data from test/title1
2019-11-24 09:34:43,752 Train: test/title1/train.csv
2019-11-24 09:34:43,753 Dev: test/title1/dev.csv
2019-11-24 09:34:43,754 Test: test/title1/test.csv
2019-11-24 09:34:43,758 Computing label dictionary. Progress:


100%|██████████| 488/488 [00:00<00:00, 3931.48it/s]

2019-11-24 09:34:43,975 [b'real', b'fake']





2019-11-24 09:34:45,466 ----------------------------------------------------------------------------------------------------
2019-11-24 09:34:45,467 Model: "TextClassifier(
  (document_embeddings): DocumentRNNEmbeddings(
    (embeddings): StackedEmbeddings(
      (list_embedding_0): WordEmbeddings('glove')
      (list_embedding_1): FlairEmbeddings(
        (lm): LanguageModel(
          (drop): Dropout(p=0.25, inplace=False)
          (encoder): Embedding(275, 100)
          (rnn): LSTM(100, 1024)
          (decoder): Linear(in_features=1024, out_features=275, bias=True)
        )
      )
      (list_embedding_2): FlairEmbeddings(
        (lm): LanguageModel(
          (drop): Dropout(p=0.25, inplace=False)
          (encoder): Embedding(275, 100)
          (rnn): LSTM(100, 1024)
          (decoder): Linear(in_features=1024, out_features=275, bias=True)
        )
      )
    )
    (word_reprojection_map): Linear(in_features=2148, out_features=256, bias=True)
    (rnn): GRU(256, 512, ba

  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "


2019-11-24 09:35:16,027 ----------------------------------------------------------------------------------------------------
2019-11-24 09:35:16,028 Testing using best model ...
2019-11-24 09:35:16,029 loading file test/title1/best-model.pt
2019-11-24 09:35:20,531 0.6721	0.6721	0.6721
2019-11-24 09:35:20,533 
MICRO_AVG: acc 0.5062 - f1-score 0.6721
MACRO_AVG: acc 0.4445 - f1-score 0.59225
fake       tp: 7 - fp: 0 - fn: 20 - tn: 34 - precision: 1.0000 - recall: 0.2593 - accuracy: 0.2593 - f1-score: 0.4118
real       tp: 34 - fp: 20 - fn: 0 - tn: 7 - precision: 0.6296 - recall: 1.0000 - accuracy: 0.6296 - f1-score: 0.7727
2019-11-24 09:35:20,533 ----------------------------------------------------------------------------------------------------
2019-11-24 09:35:20,535 loading file ./test/title1/best-model.pt
acc:  0.6428571428571429
precision:  0.9285714285714286
recall:  0.3291139240506329
f1:  0.48598130841121495


In [19]:
# Title experiment, 10 epochs; corpus files and model share one folder.
path = './test/title10'
make_test(train_title, test_title, data_folder=path, model_folder=path, max_epochs=10)

2019-11-24 09:36:59,771 Reading data from test/title10
2019-11-24 09:36:59,772 Train: test/title10/train.csv
2019-11-24 09:36:59,773 Dev: test/title10/dev.csv
2019-11-24 09:36:59,773 Test: test/title10/test.csv
2019-11-24 09:36:59,776 Computing label dictionary. Progress:


100%|██████████| 488/488 [00:00<00:00, 2336.75it/s]

2019-11-24 09:37:00,221 [b'real', b'fake']





2019-11-24 09:37:01,406 ----------------------------------------------------------------------------------------------------
2019-11-24 09:37:01,407 Model: "TextClassifier(
  (document_embeddings): DocumentRNNEmbeddings(
    (embeddings): StackedEmbeddings(
      (list_embedding_0): WordEmbeddings('glove')
      (list_embedding_1): FlairEmbeddings(
        (lm): LanguageModel(
          (drop): Dropout(p=0.25, inplace=False)
          (encoder): Embedding(275, 100)
          (rnn): LSTM(100, 1024)
          (decoder): Linear(in_features=1024, out_features=275, bias=True)
        )
      )
      (list_embedding_2): FlairEmbeddings(
        (lm): LanguageModel(
          (drop): Dropout(p=0.25, inplace=False)
          (encoder): Embedding(275, 100)
          (rnn): LSTM(100, 1024)
          (decoder): Linear(in_features=1024, out_features=275, bias=True)
        )
      )
    )
    (word_reprojection_map): Linear(in_features=2148, out_features=256, bias=True)
    (rnn): GRU(256, 512, ba

  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "


2019-11-24 09:37:28,617 ----------------------------------------------------------------------------------------------------
2019-11-24 09:37:30,140 epoch 2 - iter 0/16 - loss 0.82085043 - samples/sec: 24.43
2019-11-24 09:37:31,368 epoch 2 - iter 1/16 - loss 0.70614460 - samples/sec: 26.64
2019-11-24 09:37:32,388 epoch 2 - iter 2/16 - loss 0.67093490 - samples/sec: 31.74
2019-11-24 09:37:34,225 epoch 2 - iter 3/16 - loss 0.65052827 - samples/sec: 17.58
2019-11-24 09:37:35,422 epoch 2 - iter 4/16 - loss 0.62875946 - samples/sec: 27.01
2019-11-24 09:37:37,100 epoch 2 - iter 5/16 - loss 0.61309786 - samples/sec: 19.40
2019-11-24 09:37:38,233 epoch 2 - iter 6/16 - loss 0.61597365 - samples/sec: 28.69
2019-11-24 09:37:39,181 epoch 2 - iter 7/16 - loss 0.58863124 - samples/sec: 34.50
2019-11-24 09:37:41,714 epoch 2 - iter 8/16 - loss 0.56685092 - samples/sec: 12.70
2019-11-24 09:37:43,576 epoch 2 - iter 9/16 - loss 0.56555685 - samples/sec: 17.36
2019-11-24 09:37:44,867 epoch 2 - iter 10/16 

2019-11-24 09:39:48,775 epoch 6 - iter 12/16 - loss 0.39615053 - samples/sec: 33.10
2019-11-24 09:39:50,503 epoch 6 - iter 13/16 - loss 0.39545979 - samples/sec: 18.72
2019-11-24 09:39:52,306 epoch 6 - iter 14/16 - loss 0.38564175 - samples/sec: 17.89
2019-11-24 09:39:52,848 epoch 6 - iter 15/16 - loss 0.38875379 - samples/sec: 60.79
2019-11-24 09:39:53,194 ----------------------------------------------------------------------------------------------------
2019-11-24 09:39:53,195 EPOCH 6 done: loss 0.3888 - lr 0.1000
2019-11-24 09:39:57,181 DEV : loss 0.5257899165153503 - score 0.7541
2019-11-24 09:39:57,224 BAD EPOCHS (no improvement): 0
2019-11-24 09:40:00,363 ----------------------------------------------------------------------------------------------------
2019-11-24 09:40:02,762 epoch 7 - iter 0/16 - loss 0.38311625 - samples/sec: 15.88
2019-11-24 09:40:03,782 epoch 7 - iter 1/16 - loss 0.33764593 - samples/sec: 31.82
2019-11-24 09:40:04,782 epoch 7 - iter 2/16 - loss 0.33280908 

2019-11-24 09:41:59,754 ----------------------------------------------------------------------------------------------------
2019-11-24 09:41:59,755 loading file ./test/title10/best-model.pt
acc:  0.7337662337662337
precision:  0.7261904761904762
recall:  0.7721518987341772
f1:  0.7484662576687117


In [111]:
# Inspect which fields the content of the first training article exposes.
train_content[0][0].keys()

dict_keys(['url', 'text', 'images', 'top_img', 'keywords', 'authors', 'canonical_link', 'title', 'meta_data', 'movies', 'publish_date', 'source', 'summary'])

In [15]:
# Feature: article summary.
# NOTE(review): in the recorded run every summary was empty, leaving both
# filtered lists with 0 items.
train_summary = [(content['summary'], label) for content, label in train_content]
test_summary = [(content['summary'], label) for content, label in test_content]
# How many summaries are empty in each split ...
print(sum(1 for summary, label in train_summary if summary == ''))
print(sum(1 for summary, label in test_summary if summary == ''))
# ... drop those, then report what is left.
train_summary = [pair for pair in train_summary if pair[0] != '']
test_summary = [pair for pair in test_summary if pair[0] != '']
print(len(train_summary))
print(len(test_summary))

643
162
0
0


In [43]:
# Feature: article body, truncated to its first 100 characters.
train_text = [(content['text'][:100], label) for content, label in train_content]
test_text = [(content['text'][:100], label) for content, label in test_content]
# How many bodies are empty in each split ...
print(sum(1 for body, label in train_text if body == ''))
print(sum(1 for body, label in test_text if body == ''))
# ... drop those, then report what is left.
train_text = [pair for pair in train_text if pair[0] != '']
test_text = [pair for pair in test_text if pair[0] != '']
print(len(train_text))
print(len(test_text))

61
12
582
150


In [44]:
# Body-text experiment, 1 epoch; corpus files and model share one folder.
path = './test/text1'
make_test(train_text, test_text, data_folder=path, model_folder=path, max_epochs=1)

2019-11-24 10:09:17,387 Reading data from test/text1
2019-11-24 10:09:17,388 Train: test/text1/train.csv
2019-11-24 10:09:17,389 Dev: test/text1/dev.csv
2019-11-24 10:09:17,390 Test: test/text1/test.csv
2019-11-24 10:09:17,395 Computing label dictionary. Progress:


100%|██████████| 465/465 [00:00<00:00, 2855.60it/s]

2019-11-24 10:09:17,665 [b'real', b'fake']





2019-11-24 10:09:19,204 ----------------------------------------------------------------------------------------------------
2019-11-24 10:09:19,205 Model: "TextClassifier(
  (document_embeddings): DocumentRNNEmbeddings(
    (embeddings): StackedEmbeddings(
      (list_embedding_0): WordEmbeddings('glove')
      (list_embedding_1): FlairEmbeddings(
        (lm): LanguageModel(
          (drop): Dropout(p=0.25, inplace=False)
          (encoder): Embedding(275, 100)
          (rnn): LSTM(100, 1024)
          (decoder): Linear(in_features=1024, out_features=275, bias=True)
        )
      )
      (list_embedding_2): FlairEmbeddings(
        (lm): LanguageModel(
          (drop): Dropout(p=0.25, inplace=False)
          (encoder): Embedding(275, 100)
          (rnn): LSTM(100, 1024)
          (decoder): Linear(in_features=1024, out_features=275, bias=True)
        )
      )
    )
    (word_reprojection_map): Linear(in_features=2148, out_features=256, bias=True)
    (rnn): GRU(256, 512, ba

  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "


2019-11-24 10:09:42,279 ----------------------------------------------------------------------------------------------------
2019-11-24 10:09:42,280 Testing using best model ...
2019-11-24 10:09:42,281 loading file test/text1/best-model.pt
2019-11-24 10:09:45,363 0.569	0.569	0.569
2019-11-24 10:09:45,364 
MICRO_AVG: acc 0.3976 - f1-score 0.569
MACRO_AVG: acc 0.2999 - f1-score 0.39659999999999995
fake       tp: 1 - fp: 0 - fn: 25 - tn: 32 - precision: 1.0000 - recall: 0.0385 - accuracy: 0.0385 - f1-score: 0.0741
real       tp: 32 - fp: 25 - fn: 0 - tn: 1 - precision: 0.5614 - recall: 1.0000 - accuracy: 0.5614 - f1-score: 0.7191
2019-11-24 10:09:45,364 ----------------------------------------------------------------------------------------------------
2019-11-24 10:09:45,366 loading file ./test/text1/best-model.pt
acc:  0.5
precision:  0.6666666666666666
recall:  0.02631578947368421
f1:  0.05063291139240506


In [45]:
# Body-text experiment, 10 epochs; corpus files and model share one folder.
# NOTE(review): the saved output of this cell reads from test/text1, so it
# appears to predate this path -- re-run the cell to refresh it.
path = './test/text10'
make_test(train_text, test_text, data_folder=path, model_folder=path, max_epochs=10)

2019-11-24 10:10:29,469 Reading data from test/text1
2019-11-24 10:10:29,470 Train: test/text1/train.csv
2019-11-24 10:10:29,471 Dev: test/text1/dev.csv
2019-11-24 10:10:29,472 Test: test/text1/test.csv
2019-11-24 10:10:29,478 Computing label dictionary. Progress:


100%|██████████| 465/465 [00:00<00:00, 2025.16it/s]

2019-11-24 10:10:29,929 [b'real', b'fake']





2019-11-24 10:10:31,102 ----------------------------------------------------------------------------------------------------
2019-11-24 10:10:31,103 Model: "TextClassifier(
  (document_embeddings): DocumentRNNEmbeddings(
    (embeddings): StackedEmbeddings(
      (list_embedding_0): WordEmbeddings('glove')
      (list_embedding_1): FlairEmbeddings(
        (lm): LanguageModel(
          (drop): Dropout(p=0.25, inplace=False)
          (encoder): Embedding(275, 100)
          (rnn): LSTM(100, 1024)
          (decoder): Linear(in_features=1024, out_features=275, bias=True)
        )
      )
      (list_embedding_2): FlairEmbeddings(
        (lm): LanguageModel(
          (drop): Dropout(p=0.25, inplace=False)
          (encoder): Embedding(275, 100)
          (rnn): LSTM(100, 1024)
          (decoder): Linear(in_features=1024, out_features=275, bias=True)
        )
      )
    )
    (word_reprojection_map): Linear(in_features=2148, out_features=256, bias=True)
    (rnn): GRU(256, 512, ba

  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "


2019-11-24 10:10:53,512 ----------------------------------------------------------------------------------------------------
2019-11-24 10:10:54,982 epoch 2 - iter 0/15 - loss 0.58215338 - samples/sec: 25.49
2019-11-24 10:10:56,353 epoch 2 - iter 1/15 - loss 0.60061797 - samples/sec: 23.83
2019-11-24 10:10:57,427 epoch 2 - iter 2/15 - loss 0.63327116 - samples/sec: 30.30
2019-11-24 10:10:58,443 epoch 2 - iter 3/15 - loss 0.64355038 - samples/sec: 32.81
2019-11-24 10:10:59,511 epoch 2 - iter 4/15 - loss 0.63696632 - samples/sec: 30.32
2019-11-24 10:11:00,594 epoch 2 - iter 5/15 - loss 0.63926754 - samples/sec: 29.99
2019-11-24 10:11:01,624 epoch 2 - iter 6/15 - loss 0.63069227 - samples/sec: 31.58
2019-11-24 10:11:02,695 epoch 2 - iter 7/15 - loss 0.64185501 - samples/sec: 30.21
2019-11-24 10:11:03,717 epoch 2 - iter 8/15 - loss 0.64677330 - samples/sec: 32.04
2019-11-24 10:11:04,845 epoch 2 - iter 9/15 - loss 0.64320197 - samples/sec: 29.16
2019-11-24 10:11:05,979 epoch 2 - iter 10/15 

2019-11-24 10:12:37,543 EPOCH 6 done: loss 0.5750 - lr 0.1000
2019-11-24 10:12:39,926 DEV : loss 0.7283656597137451 - score 0.5593
2019-11-24 10:12:39,968 BAD EPOCHS (no improvement): 1
2019-11-24 10:12:39,970 ----------------------------------------------------------------------------------------------------
2019-11-24 10:12:41,443 epoch 7 - iter 0/15 - loss 0.63462293 - samples/sec: 27.74
2019-11-24 10:12:42,554 epoch 7 - iter 1/15 - loss 0.56112288 - samples/sec: 29.31
2019-11-24 10:12:43,679 epoch 7 - iter 2/15 - loss 0.53502975 - samples/sec: 28.73
2019-11-24 10:12:44,704 epoch 7 - iter 3/15 - loss 0.54088059 - samples/sec: 31.82
2019-11-24 10:12:45,840 epoch 7 - iter 4/15 - loss 0.52440161 - samples/sec: 28.62
2019-11-24 10:12:46,871 epoch 7 - iter 5/15 - loss 0.53437337 - samples/sec: 31.41
2019-11-24 10:12:47,883 epoch 7 - iter 6/15 - loss 0.52869647 - samples/sec: 32.10
2019-11-24 10:12:49,461 epoch 7 - iter 7/15 - loss 0.51449392 - samples/sec: 20.51
2019-11-24 10:12:50,683 e

In [46]:
# Feature: article URL, truncated to its first 100 characters.
train_url = [(content['url'][:100], label) for content, label in train_content]
test_url = [(content['url'][:100], label) for content, label in test_content]
# How many URLs are empty in each split ...
print(sum(1 for url, label in train_url if url == ''))
print(sum(1 for url, label in test_url if url == ''))
# ... drop those, then report what is left.
train_url = [pair for pair in train_url if pair[0] != '']
test_url = [pair for pair in test_url if pair[0] != '']
print(len(train_url))
print(len(test_url))

0
0
643
162


In [47]:
# URL experiment, 1 epoch; corpus files and model share one folder.
path = './test/url1'
make_test(train_url, test_url, data_folder=path, model_folder=path, max_epochs=1)

2019-11-24 10:34:08,310 Reading data from test/url1
2019-11-24 10:34:08,311 Train: test/url1/train.csv
2019-11-24 10:34:08,312 Dev: test/url1/dev.csv
2019-11-24 10:34:08,314 Test: test/url1/test.csv
2019-11-24 10:34:08,318 Computing label dictionary. Progress:


100%|██████████| 514/514 [00:00<00:00, 1674.55it/s]

2019-11-24 10:34:09,158 [b'real', b'fake']





2019-11-24 10:34:10,727 ----------------------------------------------------------------------------------------------------
2019-11-24 10:34:10,728 Model: "TextClassifier(
  (document_embeddings): DocumentRNNEmbeddings(
    (embeddings): StackedEmbeddings(
      (list_embedding_0): WordEmbeddings('glove')
      (list_embedding_1): FlairEmbeddings(
        (lm): LanguageModel(
          (drop): Dropout(p=0.25, inplace=False)
          (encoder): Embedding(275, 100)
          (rnn): LSTM(100, 1024)
          (decoder): Linear(in_features=1024, out_features=275, bias=True)
        )
      )
      (list_embedding_2): FlairEmbeddings(
        (lm): LanguageModel(
          (drop): Dropout(p=0.25, inplace=False)
          (encoder): Embedding(275, 100)
          (rnn): LSTM(100, 1024)
          (decoder): Linear(in_features=1024, out_features=275, bias=True)
        )
      )
    )
    (word_reprojection_map): Linear(in_features=2148, out_features=256, bias=True)
    (rnn): GRU(256, 512, ba

  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "


2019-11-24 10:34:39,586 ----------------------------------------------------------------------------------------------------
2019-11-24 10:34:39,587 Testing using best model ...
2019-11-24 10:34:39,587 loading file test/url1/best-model.pt
2019-11-24 10:34:42,661 0.8281	0.8281	0.8281
2019-11-24 10:34:42,662 
MICRO_AVG: acc 0.7067 - f1-score 0.8281
MACRO_AVG: acc 0.7053 - f1-score 0.8271
fake       tp: 24 - fp: 2 - fn: 9 - tn: 29 - precision: 0.9231 - recall: 0.7273 - accuracy: 0.6857 - f1-score: 0.8136
real       tp: 29 - fp: 9 - fn: 2 - tn: 24 - precision: 0.7632 - recall: 0.9355 - accuracy: 0.7250 - f1-score: 0.8406
2019-11-24 10:34:42,663 ----------------------------------------------------------------------------------------------------
2019-11-24 10:34:42,664 loading file ./test/url1/best-model.pt
acc:  0.5370370370370371
precision:  0.8888888888888888
recall:  0.0975609756097561
f1:  0.17582417582417584


In [48]:
# URL experiment, 10 epochs; corpus files and model share one folder.
path = './test/url10'
make_test(train_url, test_url, data_folder=path, model_folder=path, max_epochs=10)

2019-11-24 10:35:24,737 Reading data from test/url10
2019-11-24 10:35:24,738 Train: test/url10/train.csv
2019-11-24 10:35:24,739 Dev: test/url10/dev.csv
2019-11-24 10:35:24,739 Test: test/url10/test.csv
2019-11-24 10:35:24,742 Computing label dictionary. Progress:


100%|██████████| 514/514 [00:00<00:00, 1084.66it/s]

2019-11-24 10:35:25,770 [b'real', b'fake']





2019-11-24 10:35:26,576 ----------------------------------------------------------------------------------------------------
2019-11-24 10:35:26,577 Model: "TextClassifier(
  (document_embeddings): DocumentRNNEmbeddings(
    (embeddings): StackedEmbeddings(
      (list_embedding_0): WordEmbeddings('glove')
      (list_embedding_1): FlairEmbeddings(
        (lm): LanguageModel(
          (drop): Dropout(p=0.25, inplace=False)
          (encoder): Embedding(275, 100)
          (rnn): LSTM(100, 1024)
          (decoder): Linear(in_features=1024, out_features=275, bias=True)
        )
      )
      (list_embedding_2): FlairEmbeddings(
        (lm): LanguageModel(
          (drop): Dropout(p=0.25, inplace=False)
          (encoder): Embedding(275, 100)
          (rnn): LSTM(100, 1024)
          (decoder): Linear(in_features=1024, out_features=275, bias=True)
        )
      )
    )
    (word_reprojection_map): Linear(in_features=2148, out_features=256, bias=True)
    (rnn): GRU(256, 512, ba

  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "


2019-11-24 10:35:53,191 ----------------------------------------------------------------------------------------------------


  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "


2019-11-24 10:35:55,437 epoch 2 - iter 0/17 - loss 0.63785821 - samples/sec: 17.72
2019-11-24 10:35:56,686 epoch 2 - iter 1/17 - loss 0.56365119 - samples/sec: 26.14
2019-11-24 10:35:57,774 epoch 2 - iter 2/17 - loss 0.58134416 - samples/sec: 29.83
2019-11-24 10:35:58,870 epoch 2 - iter 3/17 - loss 0.57210638 - samples/sec: 29.53
2019-11-24 10:36:00,055 epoch 2 - iter 4/17 - loss 0.57372680 - samples/sec: 27.38
2019-11-24 10:36:01,107 epoch 2 - iter 5/17 - loss 0.58562698 - samples/sec: 30.88
2019-11-24 10:36:02,244 epoch 2 - iter 6/17 - loss 0.59531125 - samples/sec: 28.68
2019-11-24 10:36:03,376 epoch 2 - iter 7/17 - loss 0.57621308 - samples/sec: 28.71
2019-11-24 10:36:04,422 epoch 2 - iter 8/17 - loss 0.56678752 - samples/sec: 30.94
2019-11-24 10:36:05,541 epoch 2 - iter 9/17 - loss 0.56836219 - samples/sec: 28.91
2019-11-24 10:36:06,658 epoch 2 - iter 10/17 - loss 0.57183625 - samples/sec: 29.01
2019-11-24 10:36:07,982 epoch 2 - iter 11/17 - loss 0.58069562 - samples/sec: 24.57
20

2019-11-24 10:37:41,817 epoch 6 - iter 9/17 - loss 0.44683191 - samples/sec: 27.68
2019-11-24 10:37:42,901 epoch 6 - iter 10/17 - loss 0.46142967 - samples/sec: 30.06
2019-11-24 10:37:43,958 epoch 6 - iter 11/17 - loss 0.45018169 - samples/sec: 30.74
2019-11-24 10:37:45,063 epoch 6 - iter 12/17 - loss 0.45047573 - samples/sec: 29.43
2019-11-24 10:37:46,232 epoch 6 - iter 13/17 - loss 0.45270126 - samples/sec: 27.73
2019-11-24 10:37:47,536 epoch 6 - iter 14/17 - loss 0.45893447 - samples/sec: 24.83
2019-11-24 10:37:48,848 epoch 6 - iter 15/17 - loss 0.45978569 - samples/sec: 24.74
2019-11-24 10:37:49,459 epoch 6 - iter 16/17 - loss 0.45627174 - samples/sec: 53.59
2019-11-24 10:37:49,686 ----------------------------------------------------------------------------------------------------
2019-11-24 10:37:49,687 EPOCH 6 done: loss 0.4563 - lr 0.0500
2019-11-24 10:37:53,074 DEV : loss 0.5907722115516663 - score 0.6769
2019-11-24 10:37:53,117 BAD EPOCHS (no improvement): 1
2019-11-24 10:37:5

2019-11-24 10:39:34,979 EPOCH 10 done: loss 0.4018 - lr 0.0500
2019-11-24 10:39:38,084 DEV : loss 0.4357866942882538 - score 0.6769
2019-11-24 10:39:38,124 BAD EPOCHS (no improvement): 3
2019-11-24 10:39:40,826 ----------------------------------------------------------------------------------------------------
2019-11-24 10:39:40,827 Testing using best model ...
2019-11-24 10:39:40,828 loading file test/url10/best-model.pt
2019-11-24 10:39:44,329 0.875	0.875	0.875
2019-11-24 10:39:44,330 
MICRO_AVG: acc 0.7778 - f1-score 0.875
MACRO_AVG: acc 0.7778 - f1-score 0.875
fake       tp: 28 - fp: 3 - fn: 5 - tn: 28 - precision: 0.9032 - recall: 0.8485 - accuracy: 0.7778 - f1-score: 0.8750
real       tp: 28 - fp: 5 - fn: 3 - tn: 28 - precision: 0.8485 - recall: 0.9032 - accuracy: 0.7778 - f1-score: 0.8750
2019-11-24 10:39:44,331 ----------------------------------------------------------------------------------------------------
2019-11-24 10:39:44,333 loading file ./test/url10/best-model.pt
acc