In [18]:
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn import feature_extraction, model_selection, preprocessing
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
import glob 
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.tokenize import word_tokenize
import spacy
from collections import Counter
import re
import random
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
from keras.callbacks import EarlyStopping
from keras.layers import Dropout
from string import punctuation
import h5py
import pickle
# Fetch the NLTK resources used below: the stop-word list, the WordNet data
# behind the lemmatizer, and the 'punkt' models used by word_tokenize.
for resource in ('stopwords', 'wordnet', 'punkt'):
    nltk.download(resource)

# Shared by text_clean(): one lemmatizer instance and one lookup set holding
# the English stop words plus every ASCII punctuation character.
lemmatizer = WordNetLemmatizer()
stop = set(stopwords.words('english')) | set(punctuation)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [0]:
from tqdm.auto import tqdm
# Register `progress_apply` on pandas objects so long-running applies
# (see pre_process) show a progress bar.
tqdm.pandas()

In [0]:
# Vocabulary cap: passed as `num_words` to the Tokenizer and as the input
# dimension of the Embedding layer.
MAX_NB_WORDS = 50000
# `maxlen` handed to pad_sequences, i.e. the length of every model input.
MAX_SEQUENCE_LENGTH = 250
# Output dimension of the Embedding layer.
EMBEDDING_DIM = 100

In [0]:
def get_datasets(shuffle=False,processed=False):
    """Read every matching CSV file and return them as one DataFrame.

    Args:
        shuffle: when True, return the rows in a random order (drawn from
            NumPy's global random state) with a fresh 0..n-1 index.
        processed: when True, read 'processed_dataset/*.csv'; otherwise read
            'Datasets/*/*_*.csv'.

    Returns:
        A DataFrame with the rows of all matched files stacked in sorted
        file-name order, or an empty DataFrame when no file matches.
    """
    path = 'processed_dataset/*.csv' if processed else 'Datasets/*/*_*.csv'
    # glob returns files in an OS-dependent order; sorting keeps the row
    # order (before any shuffle) the same from run to run.
    files = sorted(glob.glob(path))
    # Read everything first and concatenate once: appending inside the loop
    # re-copies the accumulated frame on every file, and DataFrame.append no
    # longer exists in pandas >= 2.0.
    frames = [pd.read_csv(file) for file in tqdm(files)]
    # pd.concat raises on an empty list, so keep the "no files" result explicit.
    df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
    if shuffle:
        df = df.reindex(np.random.permutation(df.index)).reset_index(drop=True)
    return df

In [0]:
def normalize(df,difference=300):
    """Down-sample the larger class of `df` and return the shuffled result.

    Rows are split on the 'fake' column (0 = fact, 1 = fake). Randomly chosen
    rows are removed from the larger class until it holds
    ``len(smaller) - difference`` rows; the smaller class is kept whole. When
    both classes have the same size, 'fake' is the class that gets trimmed.

    Args:
        df: DataFrame with a 'fake' column containing 0/1 labels.
        difference: offset applied to the number of rows removed from the
            larger class (see above).

    Returns:
        A new DataFrame with the trimmed larger class followed by the smaller
        class, re-indexed from 0 and then shuffled with ``sample(frac=1)``.

    Note:
        The number of rows to remove is clamped to ``[0, len(larger)]``. If
        the smaller class has fewer than `difference` rows, the larger class
        is therefore removed entirely instead of raising ValueError from
        ``random.sample``.
    """
    fact = df[df['fake'] == 0]
    fake = df[df['fake'] == 1]
    # Ties go to 'fake' being the class that is trimmed.
    if len(fact) <= len(fake):
        majority, minority = fake, fact
    else:
        majority, minority = fact, fake

    size = len(majority) - len(minority) + difference
    # random.sample rejects a negative count or one above the population size.
    size = max(0, min(size, len(majority)))
    to_delete = random.sample(range(0, len(majority)), size)

    # Drop by position rather than by index label so that duplicate labels in
    # the incoming frame cannot remove more rows than were sampled.
    drop_mask = np.zeros(len(majority), dtype=bool)
    drop_mask[np.asarray(to_delete, dtype=int)] = True
    kept = majority[~drop_mask]

    # DataFrame.append was removed in pandas 2.0; concat is the replacement.
    return pd.concat([kept, minority], ignore_index=True).sample(frac=1)

In [0]:
def text_clean(text):
    """Return a lower-cased, lemmatized version of `text` with noise removed.

    Steps, in order: strip ``<...>`` tags, strip ``http...`` URLs, tokenize and
    drop tokens found in the module-level `stop` set, delete digit runs, then
    lower-case and lemmatize each remaining whitespace-separated token.

    Args:
        text: the raw text; any non-string value is converted with ``str()``.

    Returns:
        The cleaned tokens joined by single spaces.
    """
    # Coerce before the first regex: re.sub raises TypeError on a non-string,
    # so converting only at the tokenize step (as before) protected nothing.
    text = str(text)
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'http\S+', '', text)
    text = " ".join(tok for tok in word_tokenize(text) if tok.strip().lower() not in stop)
    text = re.sub(r'\d+', '', text)
    return " ".join(lemmatizer.lemmatize(tok.lower()) for tok in text.split())

In [0]:
def pre_process(norm=True):
    """Load the raw datasets and produce a frame with a cleaned 'news' column.

    Args:
        norm: when True the raw data is passed through normalize(); when
            False it is loaded with shuffle=True instead.

    Returns:
        The DataFrame with 'title' and 'text' replaced by a single 'news'
        column holding the output of text_clean().
    """
    if not norm:
        frame = get_datasets(shuffle=True)
    else:
        frame = normalize(get_datasets())
        print("Normalized")

    # Blank out missing values so the string concatenation below never sees NaN.
    frame = frame.replace(np.nan, '', regex=True)
    frame['news'] = frame['title'].str.cat(frame['text'], sep=" ")

    print("Cleaning")
    frame['news'] = frame['news'].progress_apply(text_clean)
    return frame.drop(['title', 'text'], axis=1)

In [0]:
def format_data(df, train=True, tokenizer=None):
    """Turn the 'news' column into padded integer sequences.

    Args:
        df: DataFrame with a 'news' column (and a 'fake' column when
            `train` is True).
        train: when True the labels from df['fake'] are returned as well.
        tokenizer: an already fitted Tokenizer to reuse. When None, a new one
            is fitted on df['news'] and the vocabulary size is printed.

    Returns:
        ``(x, y, tokenizer)`` when `train` is True, otherwise
        ``(x, tokenizer)``; `x` is the output of pad_sequences with
        maxlen=MAX_SEQUENCE_LENGTH.
    """
    x = df["news"].values
    if train:
        y = df['fake'].values
    if tokenizer is None:
        # The backslash is doubled so the literal contains exactly one '\':
        # a bare '\]' is an invalid escape sequence that Python warns about.
        tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~', lower=True)
        tokenizer.fit_on_texts(x)
        print('Found %s unique tokens.' % len(tokenizer.word_index))
    x = tokenizer.texts_to_sequences(x)
    x = pad_sequences(x, maxlen=MAX_SEQUENCE_LENGTH)
    if not train:
        return x, tokenizer
    return x, y, tokenizer

The dataset has already been preprocessed in XGBoost.ipynb, so we load the processed files directly.

In [26]:
# Load the already-cleaned CSVs (processed=True) with the rows shuffled.
df = get_datasets(shuffle=True,processed=True)
# Make every entry of 'news' a Python string.
# NOTE(review): str() turns a missing value (NaN) into the literal text 'nan';
# confirm whether empty articles exist and should be blanked instead.
df['news'] = df['news'].apply(str)
df.head()

Unnamed: 0,fake,news
0,0,israel approves building plan settler home wes...
1,0,america wrong conversation income inequality w...
2,0,sen. rand paul lawmaker must take back power b...
3,0,ted cruz count army volunteer latest poll show...
4,0,louisville took wake forest ’ play ’ fault may...


In [29]:
# Fit a tokenizer on df['news'] and convert every article to a padded sequence.
# NOTE(review): the tokenizer is fitted before the split below, so its
# vocabulary also includes the test articles.
X, y, tokenizer = format_data(df)
# Hold out 20% of the rows for testing; random_state fixes the split.
# NOTE(review): the split is not stratified although the classes are
# imbalanced (see the counts printed in the next cell) — consider stratify=y.
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2020)

Found 411098 unique tokens.


In [30]:
# Report how many fake (1) and fact (0) labels ended up in each split.
train_labels, test_labels = y_train.tolist(), y_test.tolist()
print("train: 1:{}, 0:{} | test: 1:{}, 0:{}".format(
    train_labels.count(1), train_labels.count(0),
    test_labels.count(1), test_labels.count(0),
))

train: 1:29276, 0:138217 | test: 1:7204, 0:34670


<h3>Keras Sequential with Early Stopping</h3>

Model

In [0]:
def sequential(inp):
    """Build, compile and summarize the LSTM binary classifier.

    Layers, in order: Embedding(MAX_NB_WORDS, EMBEDDING_DIM),
    SpatialDropout1D(0.2), LSTM(100) with input and recurrent dropout of 0.2,
    Dense(50, relu) and a single sigmoid output unit. The model is compiled
    with binary cross-entropy, the Adam optimizer and the accuracy metric.

    Args:
        inp: passed to the Embedding layer as `input_length`.

    Returns:
        The compiled Sequential model.
    """
    model = Sequential()
    model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=inp))
    model.add(SpatialDropout1D(0.2))
    model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
    model.add(Dense(50, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    # summary() prints the table itself and returns None, so wrapping it in
    # print() only added a stray "None" line to the output.
    model.summary()
    return model

Training in batches, as the full dataset does not fit in memory.

In [32]:
from keras.callbacks import EarlyStopping

# Ends a fit() call early based on the 'accuracy' metric (higher is better).
# NOTE(review): 'accuracy' is the training metric, not 'val_accuracy', and the
# validation_data passed to fit() below is the test split — confirm that both
# choices are intended.
callback = EarlyStopping(monitor='accuracy', mode='max', verbose=1)

epochs = 10        # maximum epochs per chunk
batch_size = 500   # mini-batch size inside each fit() call
batches = 10       # number of chunks the training set is split into

model_1 = sequential(x_train.shape[1])
history_1 = []  # one History object per chunk, in training order

# Train on one chunk of the training set at a time. Errors are no longer
# caught and printed: a failed run must stop the notebook, otherwise the
# accuracy cell below would silently score a partially trained model.
# `total` lets tqdm show real progress, since zip() has no length.
for i, (batch_x, batch_y) in enumerate(tqdm(
        zip(np.array_split(x_train, batches), np.array_split(y_train, batches)),
        total=batches)):
    print('*'*5,i,'*'*5)
    h = model_1.fit(batch_x, batch_y, epochs=epochs, batch_size=batch_size,validation_data=(x_test, y_test), callbacks=[callback])
    history_1.append(h)

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 250, 100)          5000000   
_________________________________________________________________
spatial_dropout1d_2 (Spatial (None, 250, 100)          0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dense_3 (Dense)              (None, 50)                5050      
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 51        
Total params: 5,085,501
Trainable params: 5,085,501
Non-trainable params: 0
_________________________________________________________________
None


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

***** 0 *****


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 16750 samples, validate on 41874 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
***** 1 *****
Train on 16750 samples, validate on 41874 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
***** 2 *****
Train on 16750 samples, validate on 41874 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 00007: early stopping
***** 3 *****
Train on 16749 samples, validate on 41874 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
***** 4 *****
Train on 16749 samples, validate on 41874 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
***** 5 *****
Train on 16749 samples, validate on 41874 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoc

Accuracy of the model on the test set

In [38]:
# Sequential.predict_classes was removed in TensorFlow 2.6. For a single
# sigmoid output it returned (predict(x) > 0.5) as int32, so thresholding the
# predicted probabilities gives the same labels on old and new versions.
y_pred = (model_1.predict(x_test) > 0.5).astype("int32").ravel()
accuracy_score(y_test, y_pred)

0.9716530544012991