In [5]:
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn import feature_extraction, model_selection, preprocessing
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
import glob 
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.tokenize import word_tokenize
import spacy
from collections import Counter
import re
import random
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
from keras.callbacks import EarlyStopping
from keras.layers import Dropout
from string import punctuation
import h5py
import pickle
# Fetch the NLTK resources needed for stop-word filtering, lemmatization and
# tokenization (same order as before: stopwords, wordnet, punkt).
for _resource in ('stopwords', 'wordnet', 'punkt'):
    nltk.download(_resource)

# Shared lemmatizer instance used by the text-cleaning step.
lemmatizer = WordNetLemmatizer()

# Tokens to discard while cleaning: English stop words plus every single
# punctuation character.
stop = set(stopwords.words('english')) | set(punctuation)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/mohnishdevadiga/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/mohnishdevadiga/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/mohnishdevadiga/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [14]:
from tqdm.auto import tqdm
# Register tqdm with pandas so that Series/DataFrame objects gain
# `progress_apply`, which pre_process uses to show cleaning progress.
tqdm.pandas()

  from pandas import Panel


In [6]:
# Vocabulary cap: passed as `num_words` to the Tokenizer and used as the
# input dimension of the Embedding layer.
MAX_NB_WORDS = 50000
# Length every tokenized article is brought to by pad_sequences (`maxlen`).
MAX_SEQUENCE_LENGTH = 250
# Output dimension of the Embedding layer (size of each word vector).
EMBEDDING_DIM = 100

In [7]:
def get_datasets(shuffle=False,processed=False):
    """Load every matching CSV file into a single DataFrame.

    Parameters
    ----------
    shuffle : bool, default False
        When True, the rows are put in a random order (drawn with
        ``np.random.permutation``) and the index is reset afterwards.
    processed : bool, default False
        When True, read ``processed_dataset/*.csv`` instead of
        ``Datasets/*/*_*.csv``.

    Returns
    -------
    pandas.DataFrame
        All files concatenated under a fresh 0..n-1 index, or an empty
        DataFrame when no file matches the pattern.
    """
    path = 'processed_dataset/*.csv' if processed else 'Datasets/*/*_*.csv'
    # glob returns files in an OS-dependent order; sorting makes the row order
    # (and therefore any seeded split made later) repeatable across machines.
    files = sorted(glob.glob(path))
    # Read everything first and concatenate once: appending to a DataFrame
    # inside the loop re-copies all rows on each iteration, and
    # DataFrame.append no longer exists in pandas >= 2.0.
    frames = [pd.read_csv(file) for file in tqdm(files)]
    if not frames:
        # pd.concat raises on an empty list; keep returning an empty frame.
        return pd.DataFrame()
    df = pd.concat(frames, ignore_index=True)
    if shuffle:
        df = df.reindex(np.random.permutation(df.index)).reset_index(drop=True)
    return df

In [9]:
def normalize(df,difference=300):
    """Down-sample the larger ``fake`` class and return the shuffled result.

    The rows are split into ``fake == 0`` and ``fake == 1``. From the larger
    group (the ``fake == 1`` group on a tie),
    ``len(larger) - len(smaller) + difference`` randomly chosen rows are
    removed, which leaves it with ``len(smaller) - difference`` rows. The two
    groups are then concatenated and shuffled.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``fake`` column with 0/1 labels.
    difference : int, default 300
        Number of rows removed from the larger group in addition to the size
        gap between the two groups.

    Returns
    -------
    pandas.DataFrame
        The shuffled rows; index labels are a permutation of 0..n-1.

    Raises
    ------
    ValueError
        If the number of rows to remove is negative or larger than the
        bigger group.
    """
    fact, fake = df[df['fake'] == 0], df[df['fake'] == 1]
    if len(fact) <= len(fake):
        larger, smaller = fake, fact
    else:
        larger, smaller = fact, fake

    size = len(larger) - len(smaller) + difference
    if not 0 <= size <= len(larger):
        raise ValueError(
            "cannot remove {} rows from a group of {} rows; "
            "check the 'difference' argument".format(size, len(larger))
        )

    to_delete = random.sample(range(len(larger)), size)
    # Select by position rather than dropping index labels: with duplicated
    # labels, label-based drop would remove more rows than were sampled.
    keep = np.ones(len(larger), dtype=bool)
    keep[to_delete] = False
    larger = larger.iloc[keep]

    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
    return pd.concat([larger, smaller], ignore_index=True).sample(frac=1)

In [10]:
def text_clean(text):
    """Return a cleaned, lower-cased, lemmatized version of ``text``.

    Steps, in order: remove ``<...>`` tags, remove ``http...`` URLs, tokenize
    and drop every token found in the module-level ``stop`` set, remove digit
    runs, then lower-case and lemmatize each remaining whitespace-separated
    token.

    Parameters
    ----------
    text : object
        The raw text; any non-string value is converted with ``str()``.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces.
    """
    # Coerce before the first regex: re.sub raises TypeError on non-string
    # input (for example a float NaN read from a CSV), and the str() call used
    # to happen only after the first two substitutions.
    text = str(text)
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'http\S+', '', text)
    text = " ".join(x for x in word_tokenize(text) if x.strip().lower() not in stop)
    text = re.sub(r'\d+', '', text)
    text = " ".join(lemmatizer.lemmatize(x.lower()) for x in text.split())
    return text

In [11]:
def pre_process(norm=True):
    """Load the raw datasets and build a cleaned ``news`` column.

    With ``norm=True`` the loaded rows go through ``normalize``; otherwise
    they are loaded with ``shuffle=True``. Missing values are replaced by
    empty strings, ``title`` and ``text`` are joined with a space into
    ``news``, ``text_clean`` is applied to every ``news`` value, and the
    ``title`` / ``text`` columns are dropped from the returned frame.
    """
    frame = normalize(get_datasets()) if norm else get_datasets(shuffle=True)
    print("Normalized")

    frame = frame.replace(np.nan, '', regex=True)
    combined = frame['title'].str.cat(frame['text'], sep=" ")
    frame['news'] = combined

    print("Cleaning")
    cleaned = frame['news'].progress_apply(text_clean)
    frame['news'] = cleaned

    return frame.drop(['title', 'text'], axis=1)

In [12]:
def format_data(df, train=True, tokenizer=None):
    """Turn the ``news`` column into padded integer sequences.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a ``news`` column, plus a ``fake`` column when ``train`` is True.
    train : bool, default True
        When True the labels from ``df['fake']`` are returned as well.
    tokenizer : Tokenizer or None, default None
        An already fitted tokenizer to reuse. When None, a new Tokenizer
        limited to ``MAX_NB_WORDS`` words is fitted on ``df['news']``
        (this happens regardless of ``train``).

    Returns
    -------
    tuple
        ``(x, y, tokenizer)`` when ``train`` is True, else ``(x, tokenizer)``,
        where ``x`` is the output of ``pad_sequences`` with
        ``maxlen=MAX_SEQUENCE_LENGTH``.
    """
    x = df["news"].values
    if train:
        y = df['fake'].values
    # Compare against None explicitly instead of relying on the truthiness of
    # whatever tokenizer object the caller passes in.
    if tokenizer is None:
        # The backslash is written as '\\' so the literal has no invalid
        # escape sequence; the resulting filter string is unchanged.
        tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~', lower=True)
        tokenizer.fit_on_texts(x)
        print('Found %s unique tokens.' % len(tokenizer.word_index))
    x = tokenizer.texts_to_sequences(x)
    x = pad_sequences(x, maxlen=MAX_SEQUENCE_LENGTH)
    if not train:
        return x, tokenizer
    return x, y, tokenizer

The data was already cleaned in XGBoost.ipynb, so we load the preprocessed dataset directly instead of running `pre_process` again.

In [15]:
# Load the already-cleaned CSVs from processed_dataset/ (the alternative,
# pre_process(norm=False), would rebuild them from the raw files) and make
# sure every `news` value is a string.
df = get_datasets(processed=True).assign(news=lambda frame: frame['news'].apply(str))
df.head()

HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))




Unnamed: 0,fake,news
0,0,ok google burger king hijacked speaker ... fai...
1,0,box office ‘ doctor strange ’ lucky marvel hit...
2,0,supreme court issue stay transgender bathroom ...
3,1,man nostalgic simpler era hour ago onion ameri...
4,0,nobel peace prize recognize u continue writing...


In [16]:
# Tokenize and pad the cleaned articles; `tokenizer` is the Tokenizer that
# format_data fitted on them and is kept for saving later.
X, y, tokenizer = format_data(df)
# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split repeatable.
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2020)

Found 410973 unique tokens.


In [17]:
# Report how many 1-labels and 0-labels ended up in each split
# (order of the values: train 1, train 0, test 1, test 0).
print("train: 1:{}, 0:{} | test: 1:{}, 0:{}".format(
    *(labels.tolist().count(cls) for labels in (y_train, y_test) for cls in (1, 0))
))

train: 1:29120, 0:138373 | test: 1:7360, 0:34514


In [18]:
def sequential(inp):
    """Build and compile the LSTM classifier.

    Architecture: Embedding(MAX_NB_WORDS, EMBEDDING_DIM) -> SpatialDropout1D(0.2)
    -> LSTM(100, dropout and recurrent dropout 0.2) -> Dense(50, relu)
    -> Dense(1, sigmoid). The model is compiled with binary cross-entropy,
    the Adam optimizer and the accuracy metric, and its summary is printed.

    Parameters
    ----------
    inp : int
        Sequence length, passed to the Embedding layer as ``input_length``.

    Returns
    -------
    Sequential
        The compiled model.
    """
    model = Sequential()
    model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=inp))
    model.add(SpatialDropout1D(0.2))
    model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
    model.add(Dense(50, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    # summary() prints the table itself and returns None, so wrapping it in
    # print() only added a stray "None" line to the output.
    model.summary()
    return model

In [14]:
# Train the LSTM on the training split cut into `batches` consecutive chunks:
# each chunk is fitted for `epochs` epochs (mini-batches of `batch_size`)
# before the next chunk is used, with the test split as validation data.
# One return value of fit() per chunk is collected in `history`.
epochs = 10
batch_size = 500
try:
    model = sequential(x_train.shape[1])
    history = []
    batches = 10
    i = 0
    for batch_x, batch_y in zip(np.array_split(x_train, batches),np.array_split(y_train, batches)):
        # Progress marker showing the index of the chunk being fitted.
        print('*'*5,i,'*'*5)
        h = model.fit(batch_x, batch_y, epochs=epochs, batch_size=batch_size,validation_data=(x_test, y_test))
        history.append(h)
        i+=1
except Exception as e:
    # NOTE(review): a training error is only printed here, so later cells can
    # run against a partially trained (or missing) `model` -- confirm this
    # best-effort behaviour is intended.
    print(e)

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 250, 100)          5000000   
_________________________________________________________________
spatial_dropout1d_1 (Spatial (None, 250, 100)          0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dense_1 (Dense)              (None, 50)                5050      
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 51        
Total params: 5,085,501
Trainable params: 5,085,501
Non-trainable params: 0
_________________________________________________________________
None
***** 0 ***

KeyboardInterrupt: 

In [None]:
def save(vector_model, NN_model, NN_model_data, Version):
    """Write the vectorizer, the training record and the network to disk.

    Three files are created in the current working directory:

    * ``VEC_model_<Version>.sav`` -- pickle of ``vector_model``
    * ``MODEL_data_<Version>.txt`` -- pickle of ``NN_model_data``
    * ``model<Version>.h5`` -- written by ``NN_model.save``

    Parameters
    ----------
    vector_model : object
        Picklable object (the notebook passes the fitted tokenizer).
    NN_model : object
        Object exposing ``save(path)``.
    NN_model_data : object
        Picklable object stored alongside the model.
    Version : object
        Converted with ``str()`` and used as the file-name suffix.
    """
    suffix = str(Version)
    # Context managers guarantee each file is flushed and closed, even if
    # pickling fails part-way; the handles used to be left open.
    with open("VEC_model_" + suffix + ".sav", 'wb') as vec_file:
        pickle.dump(vector_model, vec_file)
    with open("MODEL_data_" + suffix + ".txt", 'wb') as data_file:
        pickle.dump(NN_model_data, data_file)
    NN_model.save("model" + suffix + ".h5")
    print("Saved model and Data")

# NOTE(review): `version` is not assigned in any cell visible in this
# notebook -- define it before running this cell.
save(tokenizer, model, history, version)

In [None]:
from keras.callbacks import EarlyStopping

# NOTE(review): this callback is not used by the loop in this cell -- the
# `callbacks=[callback]` argument on the train_on_batch line is commented out.
callback = EarlyStopping(monitor='accuracy', mode='max', verbose=1)
              
# Second experiment: build a fresh model and call train_on_batch once per
# chunk of the training split, collecting each return value in `history_1`.
# `epochs` and `batch_size` are assigned but not used by train_on_batch here
# (the arguments that referenced them are commented out on that line).
epochs = 10
batch_size = 500
try:
    model_1 = sequential(x_train.shape[1])
    history_1 = []
    batches = 10
    i = 0
    # NOTE(review): zip() has no length, so tqdm cannot display a total for
    # this loop; pass total=batches if a full progress bar is wanted.
    for batch_x, batch_y in tqdm(zip(np.array_split(x_train, batches),np.array_split(y_train, batches))):
        print('*'*5,i,'*'*5)
        h = model_1.train_on_batch(batch_x, batch_y)#, epochs=epochs, batch_size=batch_size,validation_data=(x_test, y_test), callbacks=[callback])
        history_1.append(h)
        i+=1
except Exception as e:
    # NOTE(review): errors are only printed, so the loop can stop early
    # without the cell failing -- confirm this is intended.
    print(e)

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 250, 100)          5000000   
_________________________________________________________________
spatial_dropout1d_1 (Spatial (None, 250, 100)          0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dense_1 (Dense)              (None, 50)                5050      
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 51        
Total params: 5,085,501
Trainable params: 5,085,501
Non-trainable params: 0
_________________________________________________________________
None


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

***** 0 *****

***** 1 *****
***** 2 *****
***** 3 *****
***** 4 *****
***** 5 *****
***** 6 *****


In [None]:
from keras.callbacks import EarlyStopping

# Early stopping driven by the training-accuracy metric.
callback = EarlyStopping(monitor='accuracy', mode='max', verbose=1)

# Third experiment: a single fit() call over the whole training split.
epochs = 10
batch_size = 1000
try:
    model_0 = sequential(x_train.shape[1])
    # Fit on x_train / y_train. batch_x / batch_y are loop variables left over
    # from an earlier cell: they hold only the final chunk of the training
    # data and do not exist at all on a fresh kernel.
    history_0 = model_0.fit(x_train, y_train, epochs=epochs, batch_size=batch_size,validation_data=(x_test, y_test), callbacks=[callback])
except Exception as e:
    # NOTE(review): errors are only printed, so `history_0` may be undefined
    # afterwards -- confirm this best-effort behaviour is intended.
    print(e)