In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
import re
import string
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import f1_score, accuracy_score

In [2]:
T = pd.read_csv("True.csv")
F = pd.read_csv("Fake.csv")

In [3]:
T["label"] = 0
F["label"] = 1

Merge the 2 datasets, drop the duplicates and shuffle them

In [4]:
data = pd.concat([T, F])

In [5]:
data = data.drop_duplicates()

In [6]:
data = data.sample(frac = 1)
data = data.reset_index(drop = True)

Clean the text from all the possible impurities, like punctuation and errors in formatting. We also removed the word "reuters" because it was linked almost pefectly with the "True" dataset

In [7]:
def clean_text(text):
    '''Make text lowercase, remove text in square brackets,remove links,remove punctuation
    and remove words containing numbers.'''
    text = text.lower()
    text = re.sub('\[.*?\]', ' ', text)
    text = re.sub('https?://\S+|www\.\S+', ' ', text)
    text = re.sub('<.*?>+', ' ', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), ' ', text)
    text = re.sub('\n', ' ', text)
    text = re.sub('\w*\d\w*', ' ', text)
    text = re.sub("\xa0", ' ', text)
    text = re.sub(' +', ' ', text)
    text = re.sub('reuters', '', text)
    return text

In [8]:
stop_words = set(stopwords.words('english'))

We tokenize the data, remove stopwords, stem the words using the Snowball stemmer (to reduce the dimentionality of the data) and then rejoin the words to get a cleaned text

In [9]:
def preprocess_text(text, stop_words):
    cleaned = clean_text(text)
    tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
    tokenized = tokenizer.tokenize(cleaned)
    no_stop = [w for w in tokenized if w not in stop_words]
    snowball = SnowballStemmer('english')
    stemmed = [snowball.stem(x) for x in no_stop]
    combined_text = ' '.join(stemmed)
    return combined_text

In [10]:
data["text"] = data["text"].apply(lambda x: preprocess_text(x, stop_words))

In [11]:
data["title"] = data["title"].apply(lambda x: preprocess_text(x, stop_words))

We merge the text and the title in the same text, then we retokenize it so we can make a list of all the words in the dataset

In [12]:
data['full'] = data['title'] + ' ' + data['text']

In [13]:
tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
data["full"] = data['full'].apply(lambda x: tokenizer.tokenize(x))

In [14]:
list_of_words = []
for i in data.full:
    for j in i:
        list_of_words.append(j)

We find the total number of words 

In [15]:
total_words = len(list(set(list_of_words)))

In [16]:
data['full'] = data['full'].apply(lambda x: " ".join(x))

In [17]:
from tensorflow.keras.preprocessing.text import one_hot, Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, Embedding, Input, LSTM, Conv1D, MaxPool1D, Bidirectional
from tensorflow.keras.models import Model

We use the text as X of the model and the labels as y

In [18]:
X_train, X_test, y_train, y_test = train_test_split(data["full"], data["label"], test_size = 0.2, random_state = 101)

We tokenize the dataset into sequences

In [19]:
tokenizer = Tokenizer(num_words = total_words)
tokenizer.fit_on_texts(X_train)
train_sequences = tokenizer.texts_to_sequences(X_train)
test_sequences = tokenizer.texts_to_sequences(X_test)

We pad the sequences 

In [20]:
padded_train = pad_sequences(train_sequences,maxlen = 170, padding = 'post', truncating = 'post')
padded_test = pad_sequences(test_sequences,maxlen = 170, truncating = 'post')

We write a Bidirectional LSTM model

In [36]:
model = Sequential()

# Embeddidng layer
model.add(Embedding(total_words, output_dim = 128))


# Bi-Directional RNN and LSTM
model.add(Bidirectional(LSTM(128)))

# Dense layers
model.add(Dense(128, activation = 'relu'))
model.add(Dense(1,activation= 'sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])

In [37]:
y_train = np.asarray(y_train)
y_test = np.asarray(y_test)

In [38]:
model.fit(padded_train, y_train, batch_size = 512, validation_data = (padded_test, y_test), epochs = 3)

Train on 35751 samples, validate on 8938 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x219d64611c8>

In [39]:
pred = model.predict_classes(padded_test)

In [40]:
f1_score(y_test, pred)

0.8603863407488787