In [1]:
import os
import json
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import pandas as pd
from flair.embeddings import WordEmbeddings, FlairEmbeddings, DocumentPoolEmbeddings, Sentence, DocumentRNNEmbeddings
from flair.models import TextClassifier
from flair.trainers import ModelTrainer
from flair.datasets import CSVClassificationCorpus
import os
from sklearn.metrics import f1_score, accuracy_score, recall_score, precision_score

from collections import Counter

In [2]:
# Locations of the FakeNewsNet CSV exports used by this notebook.
DATA_PATH = '../fakenewsnet_dataset/dataset'
DATASET_NAME = 'politifact'
# Common prefix shared by the per-label CSV files: <DATA_PATH>/<DATASET_NAME>
DATASET_PATH = '/'.join((DATA_PATH, DATASET_NAME))
REAL_DATA_PATH = DATASET_PATH + '_real.csv'
FAKE_DATA_PATH = DATASET_PATH + '_fake.csv'

In [3]:
# Load both label files with the same parsing options: only the literal
# string 'nan' is treated as missing, so empty cells stay as empty strings.
fake_arts, real_arts = (
    pd.read_csv(csv_path, na_values=['nan'], keep_default_na=False)
    for csv_path in (FAKE_DATA_PATH, REAL_DATA_PATH)
)

In [4]:
# Display the frame loaded from REAL_DATA_PATH to eyeball its columns and rows.
real_arts

Unnamed: 0,id,news_url,title,tweet_ids
0,politifact14984,http://www.nfib-sbet.org/,National Federation of Independent Business,967132259869487105\t967164368768196609\t967215...
1,politifact12944,http://www.cq.com/doc/newsmakertranscripts-494...,comments in Fayetteville NC,942953459\t8980098198\t16253717352\t1668513250...
2,politifact333,https://web.archive.org/web/20080204072132/htt...,"Romney makes pitch, hoping to close deal : Ele...",
3,politifact4358,https://web.archive.org/web/20110811143753/htt...,Democratic Leaders Say House Democrats Are Uni...,
4,politifact779,https://web.archive.org/web/20070820164107/htt...,"Budget of the United States Government, FY 2008",89804710374154240\t91270460595109888\t96039619...
...,...,...,...,...
619,politifact14731,https://www.flake.senate.gov/public/index.cfm/...,Flake: “Religious tests should have no place i...,
620,politifact329,https://web.archive.org/web/20080131000131/htt...,Change We Can Believe In,634287923135909888\t946743411100536832\t946816...
621,politifact1576,http://www.youtube.com/watch?v=4O8CxZ1OD58,deputy director of national health statistics ...,
622,politifact4720,http://www.youtube.com/watch?v=EhyMplwY6HY,Romneys ProLife Conversion Myth or Reality Jun...,188871706637647874


In [5]:
def parse_art_data_frame(df):
    """Turn an article DataFrame into a list of plain article records.

    Columns are looked up by name, so the frame may carry additional columns
    (for example ``tweet_ids`` or a saved index column) in any order.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least the columns ``id``, ``news_url`` and
        ``title``.

    Returns
    -------
    list of dict
        One ``{'id': ..., 'url': ..., 'title': ...}`` dict per row, in row
        order. An empty frame yields an empty list.

    Raises
    ------
    KeyError
        If any of the required columns is absent.
    """
    required = ('id', 'news_url', 'title')
    missing = [col for col in required if col not in df.columns]
    if missing:
        raise KeyError('article frame is missing column(s): {}'.format(', '.join(missing)))
    # Iterate the three named columns in lockstep instead of unpacking whole
    # rows positionally; avoids shadowing the builtin `id` as well.
    return [
        {'id': art_id, 'url': url, 'title': title}
        for art_id, url, title in zip(df['id'], df['news_url'], df['title'])
    ]
    
# Run the parser over both frames (fake first, then real).
fake_arts_with_content, real_arts_with_content = map(
    parse_art_data_frame, (fake_arts, real_arts)
)

In [6]:
# Pair every parsed article with the label of the file it came from.
fake_data = [(article, 'fake') for article in fake_arts_with_content]
real_data = [(article, 'real') for article in real_arts_with_content]

In [7]:
# Seed NumPy's global RNG before shuffling so that the order (and therefore
# the train/test split derived from it) is the same on every
# Restart & Run All. Later np.random calls draw from the same seeded stream.
np.random.seed(42)
# Shuffle each class in place, independently of the other.
np.random.shuffle(fake_data)
np.random.shuffle(real_data)

In [8]:
# Per-class cut points: the first 80% of each class goes to training and the
# remainder to testing, so both splits keep the class proportions.
n_fake_train = int(len(fake_data)*0.8)
n_real_train = int(len(real_data)*0.8)

train_data = fake_data[:n_fake_train] + real_data[:n_real_train]
test_data = fake_data[n_fake_train:] + real_data[n_real_train:]

# Mix the two classes within each split (in place).
np.random.shuffle(train_data)
np.random.shuffle(test_data)

In [9]:
# Sanity check: split sizes, their sum, and the total number of labelled
# articles (the last two numbers should match).
print(
    len(train_data),
    len(test_data),
    len(train_data) + len(test_data),
    len(fake_data) + len(real_data),
    sep='\n',
)

844
212
1056
1056


In [10]:
# Rebuild each split as a fresh list of (article, label) tuples.
train_content = [(article, tag) for article, tag in train_data]
test_content = [(article, tag) for article, tag in test_data]

In [11]:
# Full labelled collection: training pairs followed by test pairs.
all_content = [*train_content, *test_content]

In [12]:
# Inspect which fields the first article record carries.
all_content[0][0].keys()

dict_keys(['id', 'url', 'title'])

In [13]:
# Count how often each title occurs across all labelled articles and list
# them from most to least frequent.
title = Counter(article['title'] for article, _label in all_content)
title.most_common()

[('Outlook, Office, Skype, Bing, Breaking News, and Latest Videos', 13),
 ('Transcripts', 10),
 ('Political TV Ad Archive » PolAd', 4),
 ('LexisNexis(R) Publisher', 4),
 ('The Democratic Debate in Cleveland', 3),
 ('John McCain for President', 3),
 ('Change We Can Believe In', 3),
 ('MoveOn.org Political Action: 10 things to know about McCain', 3),
 ('Account Suspended', 3),
 ('Fox News Sunday', 3),
 ('Employment, Hours, and Earnings from the Current Employment Statistics survey (National) Home Page',
  3),
 ('Organizing for Action', 2),
 ('Democratic National Convention Committee', 2),
 ('Interest Group Ratings', 2),
 ("'This Week' Transcript: Geithner", 2),
 ("Transcript of Wednesday's presidential debate", 2),
 ('Information for the Nation', 2),
 ('Remarks by the President on the Economy -- Knox College, Galesburg, IL', 2),
 ('Debbie Wasserman Schultz on Twitter', 2),
 ('Obama for America TV Ad: "Firms"', 2),
 ('American News is under construction', 2),
 ('CQ Vote Studies', 2),
 ('P

In [14]:
# Show every (article, label) pair carrying the most frequent title.
search = 'Outlook, Office, Skype, Bing, Breaking News, and Latest Videos'
title_search = [
    (article, tag)
    for article, tag in all_content
    if article['title'] == search
]
title_search

[({'id': 'politifact3468',
   'url': 'http://www.msnbc.msn.com/id/21134540/#42160354',
   'title': 'Outlook, Office, Skype, Bing, Breaking News, and Latest Videos'},
  'real'),
 ({'id': 'politifact539',
   'url': 'http://www.msnbc.msn.com/id/7003226/',
   'title': 'Outlook, Office, Skype, Bing, Breaking News, and Latest Videos'},
  'real'),
 ({'id': 'politifact959',
   'url': 'http://www.msnbc.msn.com/id/29705720/',
   'title': 'Outlook, Office, Skype, Bing, Breaking News, and Latest Videos'},
  'real'),
 ({'id': 'politifact2131',
   'url': 'http://www.msnbc.msn.com/id/38384219/ns/meet_the_press-transcripts',
   'title': 'Outlook, Office, Skype, Bing, Breaking News, and Latest Videos'},
  'real'),
 ({'id': 'politifact340',
   'url': 'http://www.msnbc.msn.com/id/3080247/',
   'title': 'Outlook, Office, Skype, Bing, Breaking News, and Latest Videos'},
  'real'),
 ({'id': 'politifact1741',
   'url': 'http://www.msnbc.msn.com/id/21134540/vp/36687523#36687523',
   'title': 'Outlook, Office,

In [15]:
# Check whether any article is titled exactly 'YouTube'.
search = 'YouTube'
title_search = [
    (article, tag)
    for article, tag in all_content
    if article['title'] == search
]
title_search

[]

In [16]:
# Count how often each URL occurs across all labelled articles and list
# them from most to least frequent.
text = Counter(article['url'] for article, _label in all_content)
text.most_common()

[('', 61),
 ('https://web.archive.org/web/20050322064340/http://www6.lexisnexis.com:80/publisher/EndUser?Action=UserDisplayFullDocument',
  4),
 ('https://web.archive.org/web/20080506120114/http://pol.moveon.org:80/mccain10/email.html?',
  3),
 ('http://www.bls.gov/ces/', 3),
 ('http://www.cnn.com/2012/10/03/politics/debate-transcript/index.html', 2),
 ('http://www.whitehouse.gov/the-press-office/2013/07/24/remarks-president-economy-knox-college-galesburg-il',
  2),
 ('http://www.youtube.com/watch?v=Ud3mMj0AZZk', 2),
 ('https://web.archive.org/web/20080817004723/http://www.cqpolitics.com:80/cq-assets/cqmultimedia/flash/votestudy/index.html',
  2),
 ('http://politicaladarchive.org/ad/polad_donaldtrump_k1mkc/', 2),
 ('http://transcripts.cnn.com/TRANSCRIPTS/0706/05/se.01.html', 2),
 ('http://www.youtube.com/watch?v=mGur36uVWxA', 2),
 ('http://www.desmoinesregister.com/article/20100324/OPINION01/3250323/1036',
  2),
 ('https://www.politico.com/story/2018/01/17/full-text-jeff-flake-on-trump

In [17]:
# List the articles whose URL field is the empty string.
search = ''
url_search = [
    (article, tag)
    for article, tag in all_content
    if article['url'] == search
]
url_search

[({'id': 'politifact649',
   'url': '',
   'title': 'Democratic National Convention Committee'},
  'real'),
 ({'id': 'politifact10889',
   'url': '',
   'title': 'Declining Business Dynamism in the United States A Look at States and Metros'},
  'real'),
 ({'id': 'politifact906',
   'url': '',
   'title': 'Report on Geithners tax issues'},
  'real'),
 ({'id': 'politifact648',
   'url': '',
   'title': 'Democratic National Convention Committee'},
  'real'),
 ({'id': 'politifact914',
   'url': '',
   'title': 'Press release Rep Barton Democrats stimulus will waste money abuse people and lead to unintended consequences'},
  'real'),
 ({'id': 'politifact93',
   'url': '',
   'title': 'Social Security Trustees Report 2007'},
  'real'),
 ({'id': 'politifact86',
   'url': '',
   'title': 'New York Election Results November 7'},
  'real'),
 ({'id': 'politifact15242',
   'url': '',
   'title': ' Actress Emma Stone ‘For the first time in history we have a president that…'},
  'fake'),
 ({'id': 'p

In [18]:
# Show the articles that all point at the same archived LexisNexis URL.
search = 'https://web.archive.org/web/20050322064340/http://www6.lexisnexis.com:80/publisher/EndUser?Action=UserDisplayFullDocument'
url_search = [
    (article, tag)
    for article, tag in all_content
    if article['url'] == search
]
url_search

[({'id': 'politifact3228',
   'url': 'https://web.archive.org/web/20050322064340/http://www6.lexisnexis.com:80/publisher/EndUser?Action=UserDisplayFullDocument',
   'title': 'LexisNexis(R) Publisher'},
  'real'),
 ({'id': 'politifact2298',
   'url': 'https://web.archive.org/web/20050322064340/http://www6.lexisnexis.com:80/publisher/EndUser?Action=UserDisplayFullDocument',
   'title': 'LexisNexis(R) Publisher'},
  'real'),
 ({'id': 'politifact954',
   'url': 'https://web.archive.org/web/20050322064340/http://www6.lexisnexis.com:80/publisher/EndUser?Action=UserDisplayFullDocument',
   'title': 'LexisNexis(R) Publisher'},
  'real'),
 ({'id': 'politifact681',
   'url': 'https://web.archive.org/web/20050322064340/http://www6.lexisnexis.com:80/publisher/EndUser?Action=UserDisplayFullDocument',
   'title': 'LexisNexis(R) Publisher'},
  'real')]

In [19]:
# Keep only the articles whose URL is at least 100 characters long.
url_search = [
    (article, tag)
    for article, tag in all_content
    if len(article['url']) >= 100
]
url_search

[({'id': 'politifact15158',
   'url': 'https://web.archive.org/web/20180327213804/http://stluciantimes.com:80/breaking-irish-superstar-saoirse-ronan-dies-after-on-set-accident-in-st-lucia/',
   'title': 'BREAKING: Irish superstar Saoirse Ronan dies after on-set accident in St. Lucia'},
  'fake'),
 ({'id': 'politifact379',
   'url': 'http://www.nytimes.com/2008/02/26/us/politics/26text-debate.html?pagewanted=13&_r=1&adxnnlx=1204132144-xyVy8pN0n1xCgMNXI73z6A',
   'title': 'The Democratic Debate in Cleveland'},
  'real'),
 ({'id': 'politifact12945',
   'url': 'https://medium.com/hillary-for-america/to-young-people-who-are-undocumented-this-is-your-country-too-e0184e858b40#.2uv6onq62',
   'title': 'To young people who are undocumented: This is your country, too.'},
  'real'),
 ({'id': 'politifact201',
   'url': 'http://query.nytimes.com/gst/fullpage.html?res=940DEFDD133AF933A25755C0A962958260&sec=&spon=&partner=permalink&exprod=permalink',
   'title': 'New York Officials Welcome Immigrants