In [16]:
import pandas as pd
import numpy as np
from nltk.tokenize import RegexpTokenizer
from nltk import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
pd.options.display.max_rows = 999

from sklearn.model_selection import train_test_split

from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.callbacks import EarlyStopping
from keras.utils import to_categorical

## Reading in data

In [2]:
filepath = '../datasets/news_cleaned_2018_02_13.csv'
nlinesfile = 9408908
nlinesrandomsample = 100_000
lines2skip = np.random.choice(np.arange(1,nlinesfile+1), (nlinesfile-nlinesrandomsample), replace=False)
df = pd.read_csv(filepath, skiprows=lines2skip)


df.shape

  interactivity=interactivity, compiler=compiler, result=result)


(90698, 17)

In [4]:
df.sample(5)

Unnamed: 0.1,Unnamed: 0,id,domain,type,url,content,scraped_at,inserted_at,updated_at,title,authors,keywords,meta_keywords,meta_description,tags,summary,source
48048,1506,5213061,dailykos.com,political,https://www.dailykos.com/stories/2008/10/8/624...,My response to the Republican email blaming th...,2017-11-27T01:14:21.395055,2018-02-07 23:39:33.852671,2018-02-07 23:39:33.852696,Refuting GOP attempts to blame Dems for meltdown,"Backgroundurl Avatar_Large, Nickname, Joined, ...",,[''],,,,
2690,8742,302589,lifenews.com,bias,https://consciouslifenews.com/tag/antibiotic-r...,FAIR USE NOTICE. Many of the stories on this s...,2018-01-25 20:13:50.426130,2018-02-02 01:19:41.756632,2018-02-02 01:19:41.756664,antibiotic-resistent bacteria Archives,,,[''],,,,
6802,9766,765779,dailykos.com,political,https://www.dailykos.com/user/willy%20be%20fra...,"Canada 2015- from Conservative hell, to ""Sunny...",2018-01-25 20:13:50.426130,2018-02-02 01:19:41.756632,2018-02-02 01:19:41.756664,willy be frantic's Followed People,"Happy Cog Studios - Http, Www.Happycog.Com, Da...",,[''],,,,
7300,7629,822290,beforeitsnews.com,fake,http://beforeitsnews.com/environment/2012/01/n...,"New Overnight Diapers Promise Peaceful Nights,...",2018-01-25 20:13:50.426130,2018-02-02 01:19:41.756632,2018-02-02 01:19:41.756664,"New Overnight Diapers Promise Peaceful Nights,...",Seventh Generation,,[''],,,,
2534,2593,285557,dcclothesline.com,conspiracy,http://www.dcclothesline.com/2015/02/03/muslim...,"With Barack Obama in office, you knew this was...",2018-01-25 20:13:50.426130,2018-02-02 01:19:41.756632,2018-02-02 01:19:41.756664,Muslim Brotherhood Political Party formed in U...,Posted On,,[''],,,,


In [5]:
df.isna().mean()

Unnamed: 0          0.000000
id                  0.000000
domain              0.000000
type                0.046594
url                 0.000000
content             0.000011
scraped_at          0.000011
inserted_at         0.000000
updated_at          0.000000
title               0.008501
authors             0.446625
keywords            1.000000
meta_keywords       0.040045
meta_description    0.527983
tags                0.767360
summary             1.000000
source              0.781340
dtype: float64

In [6]:
df = df.drop(columns = ['authors', 'keywords', 'meta_description', 'tags', 'summary', 'source', 'id', 'Unnamed: 0', 'url', 'scraped_at', 'inserted_at', 'updated_at'])

In [7]:
df.sample(5)

Unnamed: 0,domain,type,content,title,meta_keywords
10737,express.co.uk,rumor,SUPPLIED CCTV The money was stolen before it c...,"Vicar’s fury as two women steal £8,000 destine...",['']
35833,americanthinker.com,bias,They ought to put that on a travel brochure. T...,"Blog: Welcome to Yemen, al-Qaeda terrorists",['']
16382,theonion.com,satire,"""Hopefully, Bush has learned a great deal on t...",Bush In Jordan,"['American Voices', 'Vol 42 Issue 48', 'The On..."
23236,conservapedia.com,bias,Jump to: navigation\n\nWhat links here Page: N...,"Pages that link to ""Omar Torrijos""",['']
44272,theeventchronicle.com,conspiracy,984 SHARES Share Tweet Google+ Mail Reddit Buf...,Vladimir Putin – Agent of the Awakening?,['']


In [8]:
df = df.drop(columns = ['meta_keywords'])

In [9]:
df.isna().mean()

domain     0.000000
type       0.046594
content    0.000011
title      0.008501
dtype: float64

In [11]:
df.dropna(inplace = True)

In [14]:
df = df[df['type'] != 'rumor']
df = df[df['type'] != 'unknown']
df = df[df['type'] != 'satire']

In [15]:
df['type'].value_counts(normalize = True)

reliable      0.267789
political     0.234307
bias          0.160657
fake          0.126711
conspiracy    0.108524
unreliable    0.042144
clickbait     0.032487
junksci       0.016874
hate          0.010506
Name: type, dtype: float64

In [24]:
df['label'] = df['type'].map({'reliable' : 0, 'political' : 1, 'bias' : 2, 'fake' : 3, 'conspiracy' : 4, 'unreliable' : 5, 'clickbait' : 6, 'junksci' : 7, 'hate' : 8})

In [25]:
df['label'].value_counts(normalize = True)

0    0.267789
1    0.234307
2    0.160657
3    0.126711
4    0.108524
5    0.042144
6    0.032487
7    0.016874
8    0.010506
Name: label, dtype: float64

In [45]:
y = to_categorical(df['label'])

## Processing input

In [17]:
def tokenize(x):
    tokenizer = RegexpTokenizer(r'\w+')
    return tokenizer.tokenize(x)

df['tokens'] = df['title'].map(tokenize)
    
def lemmatize(x):
    lemmatizer = WordNetLemmatizer()
    return ' '.join([lemmatizer.lemmatize(word) for word in x])

df['lemma'] = df['tokens'].map(lemmatize)



In [46]:
cvec = CountVectorizer(stop_words='english', ngram_range=(1, 2), max_df = .8, min_df = 3)
X = cvec.fit_transform(df['title'])



## Basic model

In [47]:
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    random_state=42, stratify = y)

In [48]:
X_train.shape

(56538, 23884)

In [49]:
X_test.shape

(18846, 23884)

In [50]:
y_train.shape

(56538, 9)

185746.0

In [59]:
model = Sequential()

model.add(Dense(16384, activation='relu', input_shape=(23884,)))
model.add(Dropout(.5))
model.add(Dense(8192, activation='relu'))
model.add(Dropout(.5))

model.add(Dense(9, activation = 'softmax'))

In [60]:
model.compile(optimizer='adam',
              loss = 'categorical_crossentropy',
              metrics = ['accuracy'])

In [61]:
h = model.fit(X_train, y_train, 
              epochs = 8, batch_size= 2048, 
              validation_data= (X_test, y_test))

Train on 56538 samples, validate on 18846 samples
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8

KeyboardInterrupt: 