In [2]:
import os
import pickle
import re
import time
import zlib

import nltk
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from tqdm import tqdm

In [3]:
# Load the prepared dataset and show its (rows, columns) size
data = pd.read_csv(filepath_or_buffer='Data_Preperation/final_data')
data.shape

(89886, 3)

In [4]:
# Preview the first five rows as loaded from disk
data.head(n=5)

Unnamed: 0.1,Unnamed: 0,text,labels
0,66717,Pentagon making list of Iraqis who worked alon...,real
1,75363,"Two weeks to unlock Brexit, EU tells May GOTHE...",real
2,82633,One Direction Is my pick for http://t.co/q2eBl...,fake
3,82203,Downfall of ex-Samsung strategy chief leaves '...,real
4,26430,Texas has a higher incarceration rate than Rus...,real


In [5]:
# Drop the leftover index column written by an earlier to_csv call.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps the
# cell idempotent: running it a second time is a no-op rather than a KeyError.
data = data.drop(columns=['Unnamed: 0'], errors='ignore')

In [6]:
# Preview the first five rows now that the stray index column is gone
data.head(n=5)

Unnamed: 0,text,labels
0,Pentagon making list of Iraqis who worked alon...,real
1,"Two weeks to unlock Brexit, EU tells May GOTHE...",real
2,One Direction Is my pick for http://t.co/q2eBl...,fake
3,Downfall of ex-Samsung strategy chief leaves '...,real
4,Texas has a higher incarceration rate than Rus...,real


In [7]:
# Count the missing entries in the two columns used by the model

nan = data.loc[data['text'].isna(), 'text']
print(" There are {} NaN values in text".format(nan.size))

nan = data.loc[data['labels'].isna(), 'labels']
print(" There are {} NaN values in labels".format(nan.size))

 There are 597 NaN values in text
 There are 0 NaN values in labels


In [8]:
# Discard every row that has a missing value in any column
data = data.dropna(axis=0, how='any')

In [9]:
# Re-count the missing entries to verify that the drop removed all of them

nan = data['text'][data['text'].isna()]
print(" There are {} NaN values in text".format(nan.shape[0]))

nan = data['labels'][data['labels'].isna()]
print(" There are {} NaN values in labels".format(nan.shape[0]))

 There are 0 NaN values in text
 There are 0 NaN values in labels


In [10]:
# (rows, columns) of the frame at this point in the notebook
data.shape

(89289, 2)

In [11]:
# Clean the 'text' column of the whole dataset:
#   1. blank out the literal two-character sequences \r, \" and \n
#   2. collapse every run of non-alphanumeric characters into one space
#   3. lowercase and trim surrounding whitespace
# Stop words are not removed.

non_alnum = re.compile('[^A-Za-z0-9]+')


def _clean(raw):
    """Return `raw` with escape tokens and punctuation replaced by spaces, lowercased and stripped."""
    for token in ('\\r', '\\"', '\\n'):
        raw = raw.replace(token, ' ')
    return non_alnum.sub(' ', raw).lower().strip()


# tqdm wraps the iterable so a progress bar is printed while cleaning
preprocessed_text = [_clean(doc) for doc in tqdm(data['text'].values)]

data['text'] = preprocessed_text

100%|██████████████████████████████████████████████████████████████████████████| 89289/89289 [00:27<00:00, 3244.49it/s]


In [12]:
# Preview the first five rows after the text clean-up
data.head(n=5)

Unnamed: 0,text,labels
0,pentagon making list of iraqis who worked alon...,real
1,two weeks to unlock brexit eu tells may gothen...,real
2,one direction is my pick for http t co q2eblok...,fake
3,downfall of ex samsung strategy chief leaves s...,real
4,texas has a higher incarceration rate than rus...,real


In [13]:
# Separate the feature frame from the target column
x_train = data.drop(columns=['labels'])
y_train = data.loc[:, 'labels']

In [14]:
# First five rows of the feature frame
x_train.head(n=5)

Unnamed: 0,text
0,pentagon making list of iraqis who worked alon...
1,two weeks to unlock brexit eu tells may gothen...
2,one direction is my pick for http t co q2eblok...
3,downfall of ex samsung strategy chief leaves s...
4,texas has a higher incarceration rate than rus...


In [15]:
# First five target labels
y_train.head(n=5)

0    real
1    real
2    fake
3    real
4    real
Name: labels, dtype: object

In [16]:
# Train test split (80/20), stratified on the label so both parts keep the
# same class ratio.
# The inputs are taken from `data` rather than from the x_train / y_train names
# this cell overwrites, so re-running the cell never re-splits an already
# split training set. random_state pins the shuffle, which makes the split and
# every score computed from it reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    data.drop(['labels'], axis=1),
    data['labels'],
    test_size=0.2,
    stratify=data['labels'],
    random_state=1,
)

In [17]:
# Sizes of the train and test feature frames, one per line
print(x_train.shape, x_test.shape, sep='\n')

(71431, 1)
(17858, 1)


In [18]:
# Sizes of the train and test label series, one per line
print(y_train.shape, y_test.shape, sep='\n')

(71431,)
(17858,)


In [19]:
# First five rows of the training features after the split
x_train.head(n=5)

Unnamed: 0,text
74652,says that three of these five republicans ben ...
37214,north carolina lawmaker dismisses u s deadline...
24784,week of clashes in eastern ethiopia kill 50 di...
9832,the u n arms treatyinfringes upon our right to...
23004,sean spicer fires off tweet after resignation ...


In [20]:
# Label counts of both splits; they should match the feature row counts
print(f"{y_train.shape}\n{y_test.shape}")

(71431,)
(17858,)


In [21]:
# TF-IDF encoding of the 'text' column, limited to 5000 features.
# The vectorizer is fitted on the training rows only; the test rows are
# transformed with that already-fitted vectorizer.

vectorizer = TfidfVectorizer(max_features=5000)

train_text = vectorizer.fit_transform(x_train.loc[:, 'text'])
test_text = vectorizer.transform(x_test.loc[:, 'text'])

In [22]:
# Shapes of the train and test TF-IDF matrices, one per line
print(train_text.shape, test_text.shape, sep='\n')

(71431, 5000)
(17858, 5000)


In [23]:
# Saving the vectorizer into disk

# os.path.join uses the separator of the running OS; a hard-coded '\\' only
# forms a sub-directory path on Windows.
filepath = os.path.join('Deployment', 'vectorizer.pkl')
os.makedirs(os.path.dirname(filepath), exist_ok=True)

# The context manager flushes and closes the file even if pickling fails.
with open(filepath, 'wb') as handle:
    pickle.dump(vectorizer, handle)

In [33]:
# Machine Learning Model: Random Forest

%%time
clf  = RandomForestClassifier(n_estimators=100, random_state=1, n_jobs=-1, class_weight='balanced')
clf.fit(train_text, y_train)

Wall time: 6min 9s


In [34]:
# Mean accuracy on the data the forest was fitted on
clf.score(X=train_text, y=y_train)

0.9983200571180579

In [35]:
# Mean accuracy on the held-out test split
clf.score(X=test_text, y=y_test)

0.8913652144697054

In [None]:
# Saving the classifier into disk

# os.path.join uses the separator of the running OS; a hard-coded '\\' only
# forms a sub-directory path on Windows.
filepath = os.path.join('Deployment', 'classifier.pkl')
os.makedirs(os.path.dirname(filepath), exist_ok=True)

# The context manager flushes and closes the file even if pickling fails.
with open(filepath, 'wb') as handle:
    pickle.dump(clf, handle)