In [24]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn import feature_extraction, model_selection, preprocessing
from keras.layers import Activation, Dropout, Input, Embedding
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
import glob 
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.tokenize import word_tokenize
import spacy
from collections import Counter
import re
import random
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
from keras.callbacks import EarlyStopping
from keras.layers import Dropout
from string import punctuation
import h5py
import pickle

for package in ['punkt','stopwords','wordnet','punkt']:
    try:
        nltk.data.find('tokenizers/'+package)                
    except LookupError:
         nltk.download(package)

lemmatizer = WordNetLemmatizer()
stop = set(stopwords.words('english'))
stop.update(list(punctuation))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/mohnishdevadiga/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/mohnishdevadiga/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
from tqdm.auto import tqdm
tqdm.pandas()

In [3]:
def get_datasets(shuffle=False,processed=False):
    df = pd.DataFrame()
    path = 'Datasets/*/*_*.csv'
    if processed:
        path = 'processed_dataset/*.csv'
    for file in tqdm(glob.glob(path)):
        df = df.append(pd.read_csv(file), ignore_index=True)
    if shuffle:
    	df = df.reindex(np.random.permutation(df.index)).reset_index(drop=True)
    return df

In [4]:
def normalize(df,difference=300):
    df_list = [df[df['fake']==0],df[df['fake']==1]] # [fact,fake]
    lst = [len(df_list[0]),len(df_list[1])]
    if not bool(lst.index(min(lst))):
        """ if fake is larger then swap """
        df_list[0], df_list[1] = df_list[1], df_list[0]
    size = len(df_list[0]) - len(df_list[1]) + difference
    to_delete = random.sample(range(0, len(df_list[0])), size)
    df_list[0] = df_list[0].drop(df_list[0].index[to_delete])
    return df_list[0].append(df_list[1], ignore_index=True).sample(frac=1)

In [5]:
def text_clean(text):
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'http\S+', '', text)
    text = " ".join(x for x in word_tokenize(str(text)) if x.strip().lower() not in stop)
    text = re.sub(r'\d+', '', text)
    text = " ".join(lemmatizer.lemmatize(x.lower()) for x in text.split())
    return(text)

In [6]:
def pre_process(norm=True):
    if norm:
        df = normalize(get_datasets())
        print("Normalized")
    else:
        df = get_datasets(shuffle=True)
    df = df.replace(np.nan, '', regex=True)
    df['news'] = df['title'].str.cat(df['text'],sep=" ")
    print("Cleaning")
    df['news'] = df['news'].progress_apply(text_clean)
    df = df.drop(['title','text'], axis=1)
    return df

In [60]:
def format_data(x, y=None, train=True, tokenizer=None,MAX_NB_WORDS=50000,MAX_SEQUENCE_LENGTH=300):
    if not tokenizer:
        tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~')
        tokenizer.fit_on_texts(x)
        print('Found %s unique tokens.' % len(tokenizer.word_index))
    x = tokenizer.texts_to_sequences(x)
    x = pad_sequences(x, maxlen=MAX_SEQUENCE_LENGTH)
    if not train:
        return x, tokenizer
    return x, y, tokenizer

In [8]:
df = get_datasets(shuffle=True,processed=True)
df['news'] = df['news'].apply(str)
df.head()

HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))




Unnamed: 0,fake,news
0,0,congressman met sanctioned putin friend moscow...
1,0,johnny cash manager holiff dy former manager j...
2,0,devos reveals chat fiercely critical teacher u...
3,0,keith schiller man trump sent fire comey washi...
4,1,rush finish obama slap billion regulation amer...


Getting the size of Vocab

In [9]:
vocab = set()
threshold = 5

def get_vocab(df, threshold):
    global vocab
    for i in tqdm(df['news'].values):
        for key,val in {key: count for key, count in Counter(i.split()).items() if count >= threshold}.items():
            vocab.add(key) 

get_vocab(df, threshold)
print(len(vocab))

HBox(children=(FloatProgress(value=0.0, max=209367.0), HTML(value='')))


53291


<h3>Using GloVe to try and improve accuracy <a href='https://nlp.stanford.edu/projects/glove/'>Read More</a></h3>

In [10]:
def loadGloveModel(path):
    print("Loading Glove Model")
    gloveModel = {}
    for File in tqdm(glob.glob(path+'*')):
        f = open(File,'r')
        for line in f:
            splitLines = line.split()
            word = splitLines[0]
            wordEmbedding = np.array([float(value) for value in splitLines[1:]])
            gloveModel[word] = wordEmbedding
    print(len(gloveModel)," words loaded!")
    return gloveModel

embeddings_index = loadGloveModel('Glove/')

Loading Glove Model


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


400000  words loaded!


In [11]:
X, y = df['news'].values, df['fake'].values
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=2020)
x_dev, x_test, y_dev, y_test = train_test_split(x_test, y_test, test_size=0.5, random_state=2020)
print("Split data into\ntest: {}, {} \ndev: {}, {}\ntest: {}, {}".format(len(y_train),Counter(y_train),len(y_dev),Counter(y_dev),len(y_test),Counter(y_test)))

Split data into
test: 146556, Counter({0: 121021, 1: 25535}) 
dev: 31405, Counter({0: 25890, 1: 5515})
test: 31406, Counter({0: 25976, 1: 5430})


In [12]:
x_train, y_train, tokenizer = format_data(x_train, y_train, MAX_NB_WORDS=len(vocab))
x_dev, y_dev, tokenizer = format_data(x_dev, y_dev, MAX_NB_WORDS=len(vocab),tokenizer=tokenizer)

Found 341289 unique tokens.


In [44]:
all_embs = np.stack(embeddings_index.values())
emb_mean,emb_std = all_embs.mean(), all_embs.std()
embed_size = all_embs.shape[1]
max_features = len(vocab)
word_index = tokenizer.word_index
nb_words = min(max_features, len(word_index))
embedding_matrix = embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
for word, i in word_index.items():
    if i >= max_features: continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None: embedding_matrix[i] = embedding_vector

<h3>Trying 2 different models</h3>

In [51]:
def sequential(inp):
    model = Sequential()
    model.add(Embedding(len(vocab), 100, input_length=inp,weights=[embedding_matrix]))
    model.add(SpatialDropout1D(0.2))
    model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
    model.add(Dense(50, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [49]:
from keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding
from keras.models import Model
from keras.optimizers import RMSprop
from keras.callbacks import EarlyStopping
def RNN():
    inputs = Input(name='inputs',shape=[300])
    layer = Embedding(max_features,output_dim=embed_size,weights=[embedding_matrix],input_length=300)(inputs)
    layer = LSTM(50)(layer)
    layer = Dense(25,name='FC1')(layer)
    layer = Activation('relu')(layer)
    layer = Dropout(0.5)(layer)
    layer = Dense(1,name='out_layer')(layer)
    layer = Activation('sigmoid')(layer)
    model = Model(inputs=[inputs],outputs=[layer])
    model.compile(loss='binary_crossentropy',optimizer=RMSprop(),metrics=['accuracy'])
    return model

In [52]:
from keras.callbacks import EarlyStopping

callback = EarlyStopping(monitor='val_accuracy', mode='max', verbose=1)
              
epochs = 10
batch_size = 500
try:
    model_1 = sequential(x_train.shape[1])
    model_2 = RNN()
    history_1, history_2 = [], []
    batches = 10
    i = 0
    for batch_x, batch_y in tqdm(zip(np.array_split(x_train, batches),np.array_split(y_train, batches))):
        print('*'*5,i,'*'*5)
        h_1 = model_1.fit(batch_x, batch_y, epochs=epochs, batch_size=batch_size,validation_data=(x_dev, y_dev), callbacks=[callback])
        history_1.append(h_1)
        print("RNN")
        h_2 = model_1.fit(batch_x, batch_y, epochs=epochs, batch_size=batch_size,validation_data=(x_dev, y_dev), callbacks=[callback])
        history_2.append(h_2)
        i+=1
except Exception as e:
    print(e)

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

***** 0 *****
Train on 14656 samples, validate on 31405 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 00010: early stopping
RNN
Train on 14656 samples, validate on 31405 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 00004: early stopping
***** 1 *****
Train on 14656 samples, validate on 31405 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 00009: early stopping
RNN
Train on 14656 samples, validate on 31405 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 00003: early stopping
***** 2 *****
Train on 14656 samples, validate on 31405 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 00003: early stopping
RNN
Train on 14656 samples, validate on 31405 samples
Epoch 1/10
Epoch 2/10
Epoch 00002: early stopping
***** 3 *****
Train on 14656 samples, validate on 31405 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 00005: earl

In [68]:
pd.DataFrame({'news': x_test,'fake':y_test}).head()

Unnamed: 0,news,fake
0,trump japan stress unity north korea talk trad...,0
1,melania trump ’ ‘ america first ’ inaugural wa...,0
2,house democrat frustrated trump ’ national-sec...,0
3,investigation chairman house russia probe step...,0
4,dem sen manchin ’ truly believe ’ trump ’ abil...,0


In [65]:
x_trial, tokenizer = format_data(x_test, train=False, tokenizer=tokenizer)

In [81]:
print("Accuracy Score Model_1: ",accuracy_score(y_test,model_1.predict_classes(x_trial)))
print("Accuracy Score Model_2: ",accuracy_score(y_test,[1 if x > 0.6 else 0 for x in model_2.predict(x_trial)]))

Accuracy Score Model_1:  0.9857988919314781
Accuracy Score Model_2:  0.8271031013182194


<h3>Saving and testing</h3>

In [87]:
fact, fake = [], []
for txt, target in zip(x_test,y_test):
    (fact,fake)[target == 1].append(txt)
print("fact,fake : ",len(fact),len(fake))

fact,fake :  25976 5430


In [88]:
with open("test_data/fact.txt", 'w') as output:
    for row in fact:
        output.write(str(row) + '\n')
with open("test_data/fake.txt", 'w') as output:
    for row in fake:
        output.write(str(row) + '\n')

In [89]:
import pickle

with open('model_data/tokenizer.pickle', 'wb') as file:
    pickle.dump(tokenizer, file, protocol=pickle.HIGHEST_PROTOCOL)

In [91]:
with open('model_data/tokenizer.pickle', 'rb') as file:
    tk = pickle.load(file)

In [94]:
x_trial1 , tk = format_data(x_test,train=False,tokenizer=tk)

In [105]:
np.any(x_trial != x_trial1)

False

In [107]:
model_1.save('model_data/model.h5')
model_json = model_1.to_json()
with open("model_data/model.json", "w") as json_file:
    json_file.write(model_json)

In [112]:
from keras.models import model_from_json

def load_model():
    with open('model_data/model.json', 'r') as json_file:
        model = model_from_json(json_file.read())
    model.load_weights("model_data/model.h5")
    model._make_predict_function()
    return model

model_trial = load_model()

In [113]:
print("Accuracy Score Model_1: ",accuracy_score(y_test,model_trial.predict_classes(x_trial)))

Accuracy Score Model_1:  0.9857988919314781
