In [1]:
import os
import json
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import pandas as pd
from flair.embeddings import WordEmbeddings, FlairEmbeddings, DocumentPoolEmbeddings, Sentence, DocumentRNNEmbeddings
from flair.models import TextClassifier
from flair.trainers import ModelTrainer
from flair.datasets import CSVClassificationCorpus
import os
from sklearn.metrics import f1_score, accuracy_score, recall_score, precision_score

In [2]:
# Folder holding the dataset CSV files, relative to this notebook.
DATA_PATH = '../fakenewsnet_dataset/dataset'
# File-name prefix selecting which pair of CSV files is loaded.
DATASET_NAME = 'politifact'

# '<DATA_PATH>/<DATASET_NAME>' is the common prefix of both CSV paths.
DATASET_PATH = '/'.join((DATA_PATH, DATASET_NAME))
REAL_DATA_PATH = DATASET_PATH + '_real.csv'
FAKE_DATA_PATH = DATASET_PATH + '_fake.csv'

In [3]:
# Load both article tables with identical parsing options. Only the literal
# string 'nan' is treated as missing; pandas' default NA markers are switched
# off, so empty cells are kept as '' (later cells test for x == '').
fake_arts, real_arts = (
    pd.read_csv(csv_path, na_values=['nan'], keep_default_na=False)
    for csv_path in (FAKE_DATA_PATH, REAL_DATA_PATH)
)

In [4]:
def parse_art_data_frame(df):
    """Convert an article table into a list of ``{'id', 'url', 'title'}`` dicts.

    Rows are unpacked positionally, so ``df`` must have exactly four columns
    in the order (id, url, title, tweets). The fourth column is read but not
    kept in the result.
    """
    articles = []
    for art_id, art_url, art_title, _tweets in df.values:
        articles.append({'id': art_id, 'url': art_url, 'title': art_title})
    return articles
    
# Turn each raw table into a list of per-article dicts (fake first, then real).
fake_arts_with_content, real_arts_with_content = (
    parse_art_data_frame(frame) for frame in (fake_arts, real_arts)
)

In [5]:
# Pair every article dict with its class label: (article, 'fake' | 'real').
fake_data = list(zip(fake_arts_with_content, ['fake'] * len(fake_arts_with_content)))
real_data = list(zip(real_arts_with_content, ['real'] * len(real_arts_with_content)))

In [6]:
# Seed NumPy's global RNG so the shuffles in this notebook (and therefore the
# train/test split built from them) are identical on every Restart & Run All.
# Note: this does not make the model training itself deterministic.
SPLIT_SEED = 42
np.random.seed(SPLIT_SEED)

# Shuffle each class separately, in place.
np.random.shuffle(fake_data)
np.random.shuffle(real_data)

In [7]:
# Share of each class that goes into the training set; the rest is test data.
TRAIN_FRACTION = 0.8

# Cut both classes at the same relative position so the train and test sets
# keep the same fake/real proportions as the full dataset.
fake_cut = int(len(fake_data) * TRAIN_FRACTION)
real_cut = int(len(real_data) * TRAIN_FRACTION)

train_data = fake_data[:fake_cut] + real_data[:real_cut]
test_data = fake_data[fake_cut:] + real_data[real_cut:]

# The concatenations above leave each set as a run of fake items followed by
# a run of real items; shuffle in place to interleave the classes.
np.random.shuffle(train_data)
np.random.shuffle(test_data)

In [8]:
# Sanity check: train size, test size, their sum, and the full dataset size
# (the last two numbers should match).
print(
    len(train_data),
    len(test_data),
    len(train_data) + len(test_data),
    len(fake_data) + len(real_data),
    sep='\n',
)

844
212
1056
1056


In [9]:
def clear_text(text):
    """Collapse every run of whitespace in ``text`` into a single space.

    Leading and trailing whitespace is dropped as well, because
    ``str.split()`` without a separator discards empty fields.
    """
    tokens = text.split()
    return ' '.join(tokens)

class Classifier:
    """Wrapper that lets a sentence-level classifier label raw strings.

    In this notebook the wrapped object is the flair ``TextClassifier``
    returned by ``train_classifier``.
    """

    def __init__(self, classifier):
        # Object whose ``predict(sentence)`` is called on a flair ``Sentence``.
        self.classifier = classifier

    def predict(self, text):
        """Return the first label attached to ``text`` by the wrapped classifier.

        The text is whitespace-normalised with ``clear_text`` before being
        wrapped in a ``Sentence``. ``sentence.labels[0]`` is returned as is,
        so this fails if the classifier attached no label at all.
        """
        sentence = Sentence(clear_text(text))
        self.classifier.predict(sentence)
        first_label = sentence.labels[0]
        return first_label

def transform_data(data):
    """Turn ``(text, label)`` pairs into ``{'label': ..., 'text': ...}`` records.

    The text of every pair is whitespace-normalised with ``clear_text``.
    'label' is inserted before 'text' in each record.
    """
    records = []
    for raw_text, label in data:
        records.append({'label': label, 'text': clear_text(raw_text)})
    return records

def save_data(data, data_folder = '.'):
    """Write ``data`` into ``data_folder`` as train/test/dev tab-separated files.

    ``data`` is a sequence of ``(text, label)`` pairs. It is converted with
    ``transform_data`` and split in its given order (no shuffling here):
    the first 80% of the rows go to ``train.csv``, the next 10% to
    ``test.csv`` and the remainder to ``dev.csv``. The files are written
    with a tab separator and without header or index column.

    ``data_folder`` is created if it does not exist yet.
    """
    # exist_ok=True replaces the exists()-then-makedirs() pair, which could
    # fail if the folder appeared between the check and the creation.
    os.makedirs(data_folder, exist_ok=True)

    records = transform_data(data)
    frame_data = pd.DataFrame(records)

    # Row indices where the train part and the test part end.
    total = len(records)
    train_end = int(total * 0.8)
    test_end = int(total * 0.9)

    parts = (
        ('train.csv', frame_data.iloc[:train_end]),
        ('test.csv', frame_data.iloc[train_end:test_end]),
        ('dev.csv', frame_data.iloc[test_end:]),
    )
    for file_name, part in parts:
        part.to_csv(os.path.join(data_folder, file_name),
                    sep='\t', index=False, header=False)

def load_corpus(data_folder = '.'):
    """Build a ``CSVClassificationCorpus`` from the CSV files in ``data_folder``.

    Reads ``train.csv``, ``dev.csv`` and ``test.csv`` as tab-delimited files
    in which column 0 is mapped to the label and column 1 to the text.
    """
    column_name_map = {1: "text", 0: "label"}
    corpus = CSVClassificationCorpus(
        data_folder,
        column_name_map,
        train_file='train.csv',
        dev_file='dev.csv',
        test_file='test.csv',
        delimiter='\t',
    )
    return corpus
    
def train_classifier(corpus, model_folder = '.', max_epochs = 1):
    """Train a flair ``TextClassifier`` on ``corpus`` and reload the saved model.

    Args:
        corpus: corpus providing ``make_label_dictionary()`` and the data
            passed to ``ModelTrainer``.
        model_folder: folder handed to ``ModelTrainer.train``; the returned
            model is loaded from ``<model_folder>/best-model.pt``.
        max_epochs: forwarded to ``ModelTrainer.train``.

    Returns:
        The classifier loaded from ``best-model.pt`` after training.
    """
    label_dict = corpus.make_label_dictionary()

    # Token-level inputs: GloVe vectors plus the fast forward and backward
    # Flair embeddings.
    token_embeddings = [
        WordEmbeddings('glove'),
        FlairEmbeddings('news-forward-fast'),
        FlairEmbeddings('news-backward-fast'),
    ]

    # An RNN over the (re-projected) token embeddings yields one vector per text.
    doc_embeddings = DocumentRNNEmbeddings(
        token_embeddings,
        hidden_size=512,
        reproject_words=True,
        reproject_words_dimension=256,
    )

    model = TextClassifier(doc_embeddings, label_dictionary=label_dict)
    ModelTrainer(model, corpus).train(model_folder, max_epochs=max_epochs)

    best_model_path = '{}/best-model.pt'.format(model_folder)
    return TextClassifier.load(best_model_path)
    
def train_model(train_data, data_folder='.', model_folder='.', max_epochs=1):
    """Run the whole training pipeline and return a ready-to-use ``Classifier``.

    Steps, in order: write ``train_data`` to CSV files in ``data_folder``
    (``save_data``), load them back as a corpus (``load_corpus``), train in
    ``model_folder`` for ``max_epochs`` epochs (``train_classifier``) and wrap
    the resulting model in ``Classifier``.
    """
    save_data(train_data, data_folder)
    trained = train_classifier(load_corpus(data_folder), model_folder, max_epochs)
    return Classifier(trained)

def calculate_metrics(y_true, y_pred, pos_label = 'fake'):
    """Return ``(accuracy, precision, recall, f1)`` for the given predictions.

    ``pos_label`` names the class treated as positive by precision, recall
    and F1; accuracy is computed without it.
    """
    acc = accuracy_score(y_true, y_pred)
    # The three remaining scorers share one call signature.
    precision, recall, f1 = (
        scorer(y_true, y_pred, pos_label=pos_label)
        for scorer in (precision_score, recall_score, f1_score)
    )
    return acc, precision, recall, f1

def validate_model(test_data, classifier):
    """Score ``classifier`` on ``test_data``, print the scores and return them.

    ``test_data`` is a sequence of ``(text, label)`` pairs; it is iterated
    twice. ``classifier.predict(text)`` must return an object with a
    ``.value`` attribute holding the predicted label.

    Returns the tuple ``(acc, precision, recall, f1)``.
    """
    expected = [label for _, label in test_data]
    predicted = [classifier.predict(text).value for text, _ in test_data]

    acc, precision, recall, f1 = calculate_metrics(expected, predicted)

    report = (
        ("acc: ", acc),
        ("precision: ", precision),
        ("recall: ", recall),
        ("f1: ", f1),
    )
    for caption, score in report:
        print(caption, score)

    return acc, precision, recall, f1

def make_test(train_data, test_data, data_folder, model_folder, max_epochs):
    """Train on ``train_data``, evaluate on ``test_data`` and return the scores.

    Args:
        train_data: ``(text, label)`` pairs passed to ``train_model``.
        test_data: ``(text, label)`` pairs passed to ``validate_model``.
        data_folder: folder the training CSV files are written to.
        model_folder: folder the model is trained in.
        max_epochs: number of training epochs.

    Returns:
        The ``(acc, precision, recall, f1)`` tuple from ``validate_model``,
        which also prints the four values.
    """
    classifier = train_model(train_data, data_folder, model_folder, max_epochs)
    # Hand the scores back instead of dropping them, so that different runs
    # can be collected and compared in code rather than only read from logs.
    return validate_model(test_data, classifier)
       

In [10]:
# Fresh lists of (article, label) pairs that the feature cells build on.
train_content = [(article, label) for article, label in train_data]
test_content = [(article, label) for article, label in test_data]

In [11]:
# Use only the headline of each article as the classifier input.
train_title = [(art['title'], label) for art, label in train_content]
test_title = [(art['title'], label) for art, label in test_content]

# Number of empty headlines in each split ...
print(sum(1 for title, _ in train_title if title == ''))
print(sum(1 for title, _ in test_title if title == ''))

# ... which are dropped before reporting the remaining sizes.
train_title = [(title, label) for title, label in train_title if title != '']
test_title = [(title, label) for title, label in test_title if title != '']
print(len(train_title))
print(len(test_title))

0
0
844
212


In [13]:
# Headline-only experiment, 1 epoch; CSV files and model share one folder.
path = './test_csv/title1'
make_test(train_title, test_title, data_folder=path, model_folder=path, max_epochs=1)

2019-11-24 12:12:31,661 Reading data from test_csv/title1
2019-11-24 12:12:31,662 Train: test_csv/title1/train.csv
2019-11-24 12:12:31,663 Dev: test_csv/title1/dev.csv
2019-11-24 12:12:31,665 Test: test_csv/title1/test.csv
2019-11-24 12:12:31,672 Computing label dictionary. Progress:


100%|██████████| 675/675 [00:00<00:00, 2709.81it/s]

2019-11-24 12:12:32,142 [b'real', b'fake']





2019-11-24 12:12:33,751 ----------------------------------------------------------------------------------------------------
2019-11-24 12:12:33,752 Model: "TextClassifier(
  (document_embeddings): DocumentRNNEmbeddings(
    (embeddings): StackedEmbeddings(
      (list_embedding_0): WordEmbeddings('glove')
      (list_embedding_1): FlairEmbeddings(
        (lm): LanguageModel(
          (drop): Dropout(p=0.25, inplace=False)
          (encoder): Embedding(275, 100)
          (rnn): LSTM(100, 1024)
          (decoder): Linear(in_features=1024, out_features=275, bias=True)
        )
      )
      (list_embedding_2): FlairEmbeddings(
        (lm): LanguageModel(
          (drop): Dropout(p=0.25, inplace=False)
          (encoder): Embedding(275, 100)
          (rnn): LSTM(100, 1024)
          (decoder): Linear(in_features=1024, out_features=275, bias=True)
        )
      )
    )
    (word_reprojection_map): Linear(in_features=2148, out_features=256, bias=True)
    (rnn): GRU(256, 512, ba

  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "


2019-11-24 12:13:14,913 ----------------------------------------------------------------------------------------------------
2019-11-24 12:13:14,915 Testing using best model ...
2019-11-24 12:13:14,919 loading file test_csv/title1/best-model.pt
2019-11-24 12:13:19,557 0.75	0.75	0.75
2019-11-24 12:13:19,559 
MICRO_AVG: acc 0.6 - f1-score 0.75
MACRO_AVG: acc 0.5834 - f1-score 0.73335
fake       tp: 21 - fp: 3 - fn: 18 - tn: 42 - precision: 0.8750 - recall: 0.5385 - accuracy: 0.5000 - f1-score: 0.6667
real       tp: 42 - fp: 18 - fn: 3 - tn: 21 - precision: 0.7000 - recall: 0.9333 - accuracy: 0.6667 - f1-score: 0.8000
2019-11-24 12:13:19,559 ----------------------------------------------------------------------------------------------------
2019-11-24 12:13:19,561 loading file ./test_csv/title1/best-model.pt
acc:  0.7547169811320755
precision:  0.8888888888888888
recall:  0.45977011494252873
f1:  0.6060606060606061


In [14]:
# Headline-only experiment, 10 epochs; CSV files and model share one folder.
path = './test_csv/title10'
make_test(train_title, test_title, data_folder=path, model_folder=path, max_epochs=10)

2019-11-24 12:13:55,353 Reading data from test_csv/title10
2019-11-24 12:13:55,354 Train: test_csv/title10/train.csv
2019-11-24 12:13:55,355 Dev: test_csv/title10/dev.csv
2019-11-24 12:13:55,356 Test: test_csv/title10/test.csv
2019-11-24 12:13:55,361 Computing label dictionary. Progress:


100%|██████████| 675/675 [00:00<00:00, 1783.95it/s]

2019-11-24 12:13:56,093 [b'real', b'fake']





2019-11-24 12:13:57,721 ----------------------------------------------------------------------------------------------------
2019-11-24 12:13:57,725 Model: "TextClassifier(
  (document_embeddings): DocumentRNNEmbeddings(
    (embeddings): StackedEmbeddings(
      (list_embedding_0): WordEmbeddings('glove')
      (list_embedding_1): FlairEmbeddings(
        (lm): LanguageModel(
          (drop): Dropout(p=0.25, inplace=False)
          (encoder): Embedding(275, 100)
          (rnn): LSTM(100, 1024)
          (decoder): Linear(in_features=1024, out_features=275, bias=True)
        )
      )
      (list_embedding_2): FlairEmbeddings(
        (lm): LanguageModel(
          (drop): Dropout(p=0.25, inplace=False)
          (encoder): Embedding(275, 100)
          (rnn): LSTM(100, 1024)
          (decoder): Linear(in_features=1024, out_features=275, bias=True)
        )
      )
    )
    (word_reprojection_map): Linear(in_features=2148, out_features=256, bias=True)
    (rnn): GRU(256, 512, ba

  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "


2019-11-24 12:14:33,986 ----------------------------------------------------------------------------------------------------


  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "


2019-11-24 12:14:36,127 epoch 2 - iter 0/22 - loss 0.83200014 - samples/sec: 34.47
2019-11-24 12:14:39,152 epoch 2 - iter 2/22 - loss 0.60657870 - samples/sec: 21.31
2019-11-24 12:14:41,253 epoch 2 - iter 4/22 - loss 0.56062165 - samples/sec: 30.73
2019-11-24 12:14:43,446 epoch 2 - iter 6/22 - loss 0.56072778 - samples/sec: 29.44
2019-11-24 12:14:45,830 epoch 2 - iter 8/22 - loss 0.53798973 - samples/sec: 26.98
2019-11-24 12:14:47,950 epoch 2 - iter 10/22 - loss 0.50823598 - samples/sec: 30.63
2019-11-24 12:14:50,729 epoch 2 - iter 12/22 - loss 0.50241212 - samples/sec: 23.14
2019-11-24 12:14:54,127 epoch 2 - iter 14/22 - loss 0.50276741 - samples/sec: 18.90
2019-11-24 12:14:56,614 epoch 2 - iter 16/22 - loss 0.49032348 - samples/sec: 25.99
2019-11-24 12:15:00,199 epoch 2 - iter 18/22 - loss 0.49319046 - samples/sec: 17.93
2019-11-24 12:15:03,699 epoch 2 - iter 20/22 - loss 0.48475509 - samples/sec: 18.43
2019-11-24 12:15:04,759 ---------------------------------------------------------

2019-11-24 12:18:23,027 epoch 8 - iter 2/22 - loss 0.25025801 - samples/sec: 27.59
2019-11-24 12:18:26,214 epoch 8 - iter 4/22 - loss 0.29075018 - samples/sec: 20.23
2019-11-24 12:18:29,271 epoch 8 - iter 6/22 - loss 0.26344068 - samples/sec: 21.11
2019-11-24 12:18:31,161 epoch 8 - iter 8/22 - loss 0.24776872 - samples/sec: 34.20
2019-11-24 12:18:33,903 epoch 8 - iter 10/22 - loss 0.27362899 - samples/sec: 23.55
2019-11-24 12:18:36,990 epoch 8 - iter 12/22 - loss 0.30034924 - samples/sec: 20.87
2019-11-24 12:18:40,108 epoch 8 - iter 14/22 - loss 0.29200192 - samples/sec: 20.62
2019-11-24 12:18:43,150 epoch 8 - iter 16/22 - loss 0.31749090 - samples/sec: 21.21
2019-11-24 12:18:46,267 epoch 8 - iter 18/22 - loss 0.31883654 - samples/sec: 20.64
2019-11-24 12:18:48,601 epoch 8 - iter 20/22 - loss 0.31931001 - samples/sec: 27.60
2019-11-24 12:18:49,279 ----------------------------------------------------------------------------------------------------
2019-11-24 12:18:49,280 EPOCH 8 done: l

In [15]:
# Use only the URL of each article as the classifier input.
train_url = [(art['url'], label) for art, label in train_content]
test_url = [(art['url'], label) for art, label in test_content]

# Number of articles without a URL in each split ...
print(sum(1 for url, _ in train_url if url == ''))
print(sum(1 for url, _ in test_url if url == ''))

# ... which are dropped before reporting the remaining sizes.
train_url = [(url, label) for url, label in train_url if url != '']
test_url = [(url, label) for url, label in test_url if url != '']
print(len(train_url))
print(len(test_url))

55
6
789
206


In [16]:
# URL-only experiment, 1 epoch; CSV files and model share one folder.
path = './test_csv/url1'
make_test(train_url, test_url, data_folder=path, model_folder=path, max_epochs=1)

2019-11-24 12:20:50,843 Reading data from test_csv/url1
2019-11-24 12:20:50,845 Train: test_csv/url1/train.csv
2019-11-24 12:20:50,846 Dev: test_csv/url1/dev.csv
2019-11-24 12:20:50,849 Test: test_csv/url1/test.csv
2019-11-24 12:20:50,854 Computing label dictionary. Progress:


100%|██████████| 631/631 [00:00<00:00, 1204.56it/s]

2019-11-24 12:20:52,213 [b'real', b'fake']





2019-11-24 12:20:53,423 ----------------------------------------------------------------------------------------------------
2019-11-24 12:20:53,424 Model: "TextClassifier(
  (document_embeddings): DocumentRNNEmbeddings(
    (embeddings): StackedEmbeddings(
      (list_embedding_0): WordEmbeddings('glove')
      (list_embedding_1): FlairEmbeddings(
        (lm): LanguageModel(
          (drop): Dropout(p=0.25, inplace=False)
          (encoder): Embedding(275, 100)
          (rnn): LSTM(100, 1024)
          (decoder): Linear(in_features=1024, out_features=275, bias=True)
        )
      )
      (list_embedding_2): FlairEmbeddings(
        (lm): LanguageModel(
          (drop): Dropout(p=0.25, inplace=False)
          (encoder): Embedding(275, 100)
          (rnn): LSTM(100, 1024)
          (decoder): Linear(in_features=1024, out_features=275, bias=True)
        )
      )
    )
    (word_reprojection_map): Linear(in_features=2148, out_features=256, bias=True)
    (rnn): GRU(256, 512, ba

  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "


2019-11-24 12:21:54,010 ----------------------------------------------------------------------------------------------------
2019-11-24 12:21:54,010 Testing using best model ...
2019-11-24 12:21:54,011 loading file test_csv/url1/best-model.pt
2019-11-24 12:22:01,503 0.7089	0.7089	0.7089
2019-11-24 12:22:01,505 
MICRO_AVG: acc 0.549 - f1-score 0.7089
MACRO_AVG: acc 0.5447 - f1-score 0.70415
fake       tp: 23 - fp: 7 - fn: 16 - tn: 33 - precision: 0.7667 - recall: 0.5897 - accuracy: 0.5000 - f1-score: 0.6667
real       tp: 33 - fp: 16 - fn: 7 - tn: 23 - precision: 0.6735 - recall: 0.8250 - accuracy: 0.5893 - f1-score: 0.7416
2019-11-24 12:22:01,506 ----------------------------------------------------------------------------------------------------
2019-11-24 12:22:01,508 loading file ./test_csv/url1/best-model.pt
acc:  0.5776699029126213
precision:  0.0
recall:  0.0
f1:  0.0


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [17]:
# URL-only experiment, 10 epochs; CSV files and model share one folder.
path = './test_csv/url10'
make_test(train_url, test_url, data_folder=path, model_folder=path, max_epochs=10)

2019-11-24 12:22:56,141 Reading data from test_csv/url10
2019-11-24 12:22:56,142 Train: test_csv/url10/train.csv
2019-11-24 12:22:56,143 Dev: test_csv/url10/dev.csv
2019-11-24 12:22:56,143 Test: test_csv/url10/test.csv
2019-11-24 12:22:56,150 Computing label dictionary. Progress:


100%|██████████| 631/631 [00:01<00:00, 519.12it/s] 

2019-11-24 12:22:58,546 [b'real', b'fake']





2019-11-24 12:23:01,499 ----------------------------------------------------------------------------------------------------
2019-11-24 12:23:01,500 Model: "TextClassifier(
  (document_embeddings): DocumentRNNEmbeddings(
    (embeddings): StackedEmbeddings(
      (list_embedding_0): WordEmbeddings('glove')
      (list_embedding_1): FlairEmbeddings(
        (lm): LanguageModel(
          (drop): Dropout(p=0.25, inplace=False)
          (encoder): Embedding(275, 100)
          (rnn): LSTM(100, 1024)
          (decoder): Linear(in_features=1024, out_features=275, bias=True)
        )
      )
      (list_embedding_2): FlairEmbeddings(
        (lm): LanguageModel(
          (drop): Dropout(p=0.25, inplace=False)
          (encoder): Embedding(275, 100)
          (rnn): LSTM(100, 1024)
          (decoder): Linear(in_features=1024, out_features=275, bias=True)
        )
      )
    )
    (word_reprojection_map): Linear(in_features=2148, out_features=256, bias=True)
    (rnn): GRU(256, 512, ba

  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "


2019-11-24 12:23:55,715 ----------------------------------------------------------------------------------------------------


  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "


2019-11-24 12:23:58,291 epoch 2 - iter 0/20 - loss 0.48626927 - samples/sec: 34.59
2019-11-24 12:24:01,466 epoch 2 - iter 2/20 - loss 0.44420094 - samples/sec: 20.34
2019-11-24 12:24:05,637 epoch 2 - iter 4/20 - loss 0.45653231 - samples/sec: 15.39
2019-11-24 12:24:09,732 epoch 2 - iter 6/20 - loss 0.48278280 - samples/sec: 15.70
2019-11-24 12:24:14,451 epoch 2 - iter 8/20 - loss 0.48724232 - samples/sec: 13.63
2019-11-24 12:24:18,707 epoch 2 - iter 10/20 - loss 0.48095619 - samples/sec: 15.09
2019-11-24 12:24:23,062 epoch 2 - iter 12/20 - loss 0.48030254 - samples/sec: 14.79
2019-11-24 12:24:27,948 epoch 2 - iter 14/20 - loss 0.48882272 - samples/sec: 13.16
2019-11-24 12:24:32,958 epoch 2 - iter 16/20 - loss 0.47248939 - samples/sec: 12.81
2019-11-24 12:24:37,197 epoch 2 - iter 18/20 - loss 0.46479473 - samples/sec: 15.16
2019-11-24 12:24:39,287 ----------------------------------------------------------------------------------------------------
2019-11-24 12:24:39,288 EPOCH 2 done: lo

2019-11-24 12:29:22,282 epoch 8 - iter 14/20 - loss 0.33203665 - samples/sec: 15.94
2019-11-24 12:29:25,706 epoch 8 - iter 16/20 - loss 0.33968798 - samples/sec: 18.76
2019-11-24 12:29:28,980 epoch 8 - iter 18/20 - loss 0.34494895 - samples/sec: 19.64
2019-11-24 12:29:30,512 ----------------------------------------------------------------------------------------------------
2019-11-24 12:29:30,513 EPOCH 8 done: loss 0.3557 - lr 0.1000
2019-11-24 12:29:36,236 DEV : loss 0.543102502822876 - score 0.8481
2019-11-24 12:29:36,277 BAD EPOCHS (no improvement): 1
2019-11-24 12:29:39,347 ----------------------------------------------------------------------------------------------------
2019-11-24 12:29:41,858 epoch 9 - iter 0/20 - loss 0.30012253 - samples/sec: 36.82
2019-11-24 12:29:45,286 epoch 9 - iter 2/20 - loss 0.39369624 - samples/sec: 18.77
2019-11-24 12:29:48,799 epoch 9 - iter 4/20 - loss 0.44359489 - samples/sec: 18.31
2019-11-24 12:29:52,297 epoch 9 - iter 6/20 - loss 0.41848513 - 

In [12]:
# Combine URL and headline into one input text, joined with ', '.
# Only non-empty parts are joined: plain `url + ', ' + title` always contains
# the separator, so an article without a URL became ', <title>' and the
# empty-text filter below could never match anything.
train_mix = [
    (', '.join(part for part in (art['url'], art['title']) if part != ''), label)
    for art, label in train_content
]
test_mix = [
    (', '.join(part for part in (art['url'], art['title']) if part != ''), label)
    for art, label in test_content
]

# Number of articles with neither URL nor headline in each split ...
print(sum(1 for text, _ in train_mix if text == ''))
print(sum(1 for text, _ in test_mix if text == ''))

# ... which are dropped before reporting the remaining sizes.
train_mix = [(text, label) for text, label in train_mix if text != '']
test_mix = [(text, label) for text, label in test_mix if text != '']
print(len(train_mix))
print(len(test_mix))

0
0
844
212


In [22]:
# URL + headline experiment, 1 epoch; CSV files and model share one folder.
path = './test_csv/mix1'
make_test(train_mix, test_mix, data_folder=path, model_folder=path, max_epochs=1)

2019-11-24 12:40:23,281 Reading data from test_csv/mix1
2019-11-24 12:40:23,282 Train: test_csv/mix1/train.csv
2019-11-24 12:40:23,283 Dev: test_csv/mix1/dev.csv
2019-11-24 12:40:23,284 Test: test_csv/mix1/test.csv
2019-11-24 12:40:23,289 Computing label dictionary. Progress:


100%|██████████| 675/675 [00:01<00:00, 415.61it/s] 

2019-11-24 12:40:26,444 [b'real', b'fake']





2019-11-24 12:40:28,183 ----------------------------------------------------------------------------------------------------
2019-11-24 12:40:28,184 Model: "TextClassifier(
  (document_embeddings): DocumentRNNEmbeddings(
    (embeddings): StackedEmbeddings(
      (list_embedding_0): WordEmbeddings('glove')
      (list_embedding_1): FlairEmbeddings(
        (lm): LanguageModel(
          (drop): Dropout(p=0.25, inplace=False)
          (encoder): Embedding(275, 100)
          (rnn): LSTM(100, 1024)
          (decoder): Linear(in_features=1024, out_features=275, bias=True)
        )
      )
      (list_embedding_2): FlairEmbeddings(
        (lm): LanguageModel(
          (drop): Dropout(p=0.25, inplace=False)
          (encoder): Embedding(275, 100)
          (rnn): LSTM(100, 1024)
          (decoder): Linear(in_features=1024, out_features=275, bias=True)
        )
      )
    )
    (word_reprojection_map): Linear(in_features=2148, out_features=256, bias=True)
    (rnn): GRU(256, 512, ba

  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "


2019-11-24 12:42:05,598 ----------------------------------------------------------------------------------------------------
2019-11-24 12:42:05,599 Testing using best model ...
2019-11-24 12:42:05,600 loading file test_csv/mix1/best-model.pt
2019-11-24 12:42:16,177 0.5357	0.5357	0.5357
2019-11-24 12:42:16,178 
MICRO_AVG: acc 0.3659 - f1-score 0.5357
MACRO_AVG: acc 0.2678 - f1-score 0.34885
fake       tp: 0 - fp: 0 - fn: 39 - tn: 45 - precision: 0.0000 - recall: 0.0000 - accuracy: 0.0000 - f1-score: 0.0000
real       tp: 45 - fp: 39 - fn: 0 - tn: 0 - precision: 0.5357 - recall: 1.0000 - accuracy: 0.5357 - f1-score: 0.6977
2019-11-24 12:42:16,179 ----------------------------------------------------------------------------------------------------
2019-11-24 12:42:16,181 loading file ./test_csv/mix1/best-model.pt
acc:  0.6084905660377359
precision:  1.0
recall:  0.04597701149425287
f1:  0.08791208791208792


In [13]:
# URL + headline experiment, 10 epochs; CSV files and model share one folder.
path = './test_csv/mix10'
make_test(train_mix, test_mix, data_folder=path, model_folder=path, max_epochs=10)

2019-11-24 12:51:15,602 Reading data from test_csv/mix10
2019-11-24 12:51:15,603 Train: test_csv/mix10/train.csv
2019-11-24 12:51:15,603 Dev: test_csv/mix10/dev.csv
2019-11-24 12:51:15,604 Test: test_csv/mix10/test.csv
2019-11-24 12:51:15,608 Computing label dictionary. Progress:


100%|██████████| 675/675 [00:00<00:00, 2648.10it/s]

2019-11-24 12:51:15,962 [b'real', b'fake']





2019-11-24 12:51:17,488 ----------------------------------------------------------------------------------------------------
2019-11-24 12:51:17,489 Model: "TextClassifier(
  (document_embeddings): DocumentRNNEmbeddings(
    (embeddings): StackedEmbeddings(
      (list_embedding_0): WordEmbeddings('glove')
      (list_embedding_1): FlairEmbeddings(
        (lm): LanguageModel(
          (drop): Dropout(p=0.25, inplace=False)
          (encoder): Embedding(275, 100)
          (rnn): LSTM(100, 1024)
          (decoder): Linear(in_features=1024, out_features=275, bias=True)
        )
      )
      (list_embedding_2): FlairEmbeddings(
        (lm): LanguageModel(
          (drop): Dropout(p=0.25, inplace=False)
          (encoder): Embedding(275, 100)
          (rnn): LSTM(100, 1024)
          (decoder): Linear(in_features=1024, out_features=275, bias=True)
        )
      )
    )
    (word_reprojection_map): Linear(in_features=2148, out_features=256, bias=True)
    (rnn): GRU(256, 512, ba

  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "


2019-11-24 12:52:34,710 ----------------------------------------------------------------------------------------------------
2019-11-24 12:52:37,888 epoch 2 - iter 0/22 - loss 0.33519256 - samples/sec: 21.99
2019-11-24 12:52:44,030 epoch 2 - iter 2/22 - loss 0.39973787 - samples/sec: 10.44
2019-11-24 12:52:50,705 epoch 2 - iter 4/22 - loss 0.43782155 - samples/sec: 9.61
2019-11-24 12:52:55,908 epoch 2 - iter 6/22 - loss 0.48082323 - samples/sec: 12.41
2019-11-24 12:53:02,305 epoch 2 - iter 8/22 - loss 0.47924662 - samples/sec: 10.03
2019-11-24 12:53:07,976 epoch 2 - iter 10/22 - loss 0.47223765 - samples/sec: 11.33
2019-11-24 12:53:15,072 epoch 2 - iter 12/22 - loss 0.47226195 - samples/sec: 9.06
2019-11-24 12:53:24,344 epoch 2 - iter 14/22 - loss 0.49208934 - samples/sec: 6.91
2019-11-24 12:53:29,415 epoch 2 - iter 16/22 - loss 0.49555907 - samples/sec: 12.66
2019-11-24 12:53:35,999 epoch 2 - iter 18/22 - loss 0.50252665 - samples/sec: 10.47
2019-11-24 12:53:42,699 epoch 2 - iter 20/2

2019-11-24 13:00:18,445 epoch 8 - iter 0/22 - loss 0.51767123 - samples/sec: 28.86
2019-11-24 13:00:22,837 epoch 8 - iter 2/22 - loss 0.53006965 - samples/sec: 14.69
2019-11-24 13:00:26,833 epoch 8 - iter 4/22 - loss 0.48883155 - samples/sec: 16.09
2019-11-24 13:00:31,043 epoch 8 - iter 6/22 - loss 0.45756572 - samples/sec: 15.25
2019-11-24 13:00:34,854 epoch 8 - iter 8/22 - loss 0.43291649 - samples/sec: 16.95
2019-11-24 13:00:40,815 epoch 8 - iter 10/22 - loss 0.40402703 - samples/sec: 10.78
2019-11-24 13:00:46,028 epoch 8 - iter 12/22 - loss 0.40344779 - samples/sec: 12.32
2019-11-24 13:00:51,045 epoch 8 - iter 14/22 - loss 0.39504810 - samples/sec: 12.84
2019-11-24 13:00:55,450 epoch 8 - iter 16/22 - loss 0.38915904 - samples/sec: 14.58
2019-11-24 13:01:00,171 epoch 8 - iter 18/22 - loss 0.37311395 - samples/sec: 13.59
2019-11-24 13:01:04,463 epoch 8 - iter 20/22 - loss 0.37794078 - samples/sec: 14.98
2019-11-24 13:01:05,227 ---------------------------------------------------------