In [None]:
import tensorflow as tf
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

Found GPU at: /device:GPU:0


In [None]:
import tensorflow as tf
print(tf.__version__)

In [None]:
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import nltk
import re
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize

from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

import tensorflow as tf
from tensorflow.keras.preprocessing import text, sequence
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, LSTM, Dropout
from tensorflow.keras.callbacks import Callback, ReduceLROnPlateau, EarlyStopping
from tensorflow.keras.callbacks import History

from wordcloud import WordCloud, STOPWORDS

In [None]:
fake = pd.read_csv('/content/drive/My Drive/data/Fake.csv', delimiter = ',')
true = pd.read_csv('/content/drive/My Drive/data/True.csv', delimiter = ',')

In [None]:
fake_and_true = pd.read_csv('/content/drive/My Drive/data/fake_or_real_news.csv', delimiter=',')
fake_and_true.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [None]:
fake_and_true["text"] = fake_and_true["title"] + " " + fake_and_true["text"]

fake_and_true.loc[fake_and_true["label"]=='FAKE',"label"] = 0
fake_and_true.loc[fake_and_true["label"]=='REAL',"label"]= 1
fake_and_true.drop(columns= ['title','Unnamed: 0'], inplace=True)

In [None]:
fake['label']= 0
true['label']= 1

dataset =pd.DataFrame()
dataset = true.append(fake)

In [None]:
fake_and_true.head()

Unnamed: 0,text,label
0,You Can Smell Hillary’s Fear Daniel Greenfield...,0
1,Watch The Exact Moment Paul Ryan Committed Pol...,0
2,Kerry to go to Paris in gesture of sympathy U....,1
3,Bernie supporters on Twitter erupt in anger ag...,0
4,The Battle of New York: Why This Primary Matte...,1


In [None]:
dataset.head()

Unnamed: 0,title,text,subject,date,label
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017",1
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017",1
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017",1
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017",1
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017",1


In [None]:
dataset["text"] = dataset["title"] + " " + dataset["text"]
dataset.drop(columns= ['title','subject', 'date'], inplace=True)

dataset = pd.concat([fake_and_true, dataset], ignore_index=True)
dataset.info

<bound method DataFrame.info of                                                     text label
0      You Can Smell Hillary’s Fear Daniel Greenfield...     0
1      Watch The Exact Moment Paul Ryan Committed Pol...     0
2      Kerry to go to Paris in gesture of sympathy U....     1
3      Bernie supporters on Twitter erupt in anger ag...     0
4      The Battle of New York: Why This Primary Matte...     1
...                                                  ...   ...
51228  McPain: John McCain Furious That Iran Treated ...     0
51229  JUSTICE? Yahoo Settles E-mail Privacy Class-ac...     0
51230  Sunnistan: US and Allied ‘Safe Zone’ Plan to T...     0
51231  How to Blow $700 Million: Al Jazeera America F...     0
51232  10 U.S. Navy Sailors Held by Iranian Military ...     0

[51233 rows x 2 columns]>

In [None]:
dataset.head()

Unnamed: 0,text,label
0,You Can Smell Hillary’s Fear Daniel Greenfield...,0
1,Watch The Exact Moment Paul Ryan Committed Pol...,0
2,Kerry to go to Paris in gesture of sympathy U....,1
3,Bernie supporters on Twitter erupt in anger ag...,0
4,The Battle of New York: Why This Primary Matte...,1


In [None]:
dataset['label'] = dataset['label'].astype('int')

### clean data

In [None]:
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
nltk.download('wordnet')
porter_stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

stemmed_text = []
lemmatized_text = []
for text in dataset['text']:
    result = re.sub('[^a-zA-Z]', ' ', text)
    result = result.lower()
    result = result.split()
    result = [r for r in result if r not in set(stopwords.words('english'))]
    stemmed_result = [porter_stemmer.stem(r) for r in result]
    stemmed_text.append(" ".join(stemmed_result))
    lemmatized_result = [lemmatizer.lemmatize(r) for r in result]
    lemmatized_text.append(" ".join(lemmatized_result))


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [None]:
nltk.download('stopwords')
porter_stemmer = PorterStemmer()

stemmed_text = []
for text in dataset['text']:
    result = re.sub('[^a-zA-Z]', ' ', text)
    result = result.lower()
    result = result.split()
    result = [r for r in result if r not in set(stopwords.words('english'))]
    stemmed_result = [porter_stemmer.stem(r) for r in result]
    stemmed_text.append(" ".join(stemmed_result))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


## pipeline for selecting classifier and text feature extractor

In [None]:
def get_prediction(vectorizer, classifier, X_train, X_test, y_train, y_test):
    pipe = Pipeline([('vector', vectorizer),
                    ('model', classifier)])
    model = pipe.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print("Accuarcy: {}".format(round(accuracy_score(y_test, y_pred)*100,2)))
    cm = confusion_matrix(y_test, y_pred)
    print("Confusion Matrix: \n", cm)
    print("Classification Report: \n", classification_report(y_test, y_pred))

In [None]:
print(len(stemmed_text))
print(len(lemmatized_text))

51233
51233


### use stemmed text

In [None]:
X_train, X_test, y_train, y_test = train_test_split(stemmed_text, dataset['label'], test_size = 0.3, random_state= 42)
classifiers = [LogisticRegression(), SGDClassifier(), MultinomialNB(), BernoulliNB(), LinearSVC(),
              KNeighborsClassifier(n_neighbors=5), DecisionTreeClassifier(), GradientBoostingClassifier(), 
               RandomForestClassifier(), XGBClassifier()]
for classifier in classifiers:
    print("\n\n", classifier)
    print("***********Usng Count Vectorizer****************")
    get_prediction(CountVectorizer(), classifier, X_train, X_test, y_train, y_test)
    print("***********Usng TFIDF Vectorizer****************")
    get_prediction(TfidfVectorizer(), classifier, X_train, X_test, y_train, y_test)
    



 LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)
***********Usng Count Vectorizer****************


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Accuarcy: 97.49
Confusion Matrix: 
 [[7847  155]
 [ 231 7137]]
Classification Report: 
               precision    recall  f1-score   support

           0       0.97      0.98      0.98      8002
           1       0.98      0.97      0.97      7368

    accuracy                           0.97     15370
   macro avg       0.98      0.97      0.97     15370
weighted avg       0.97      0.97      0.97     15370

***********Usng TFIDF Vectorizer****************
Accuarcy: 96.43
Confusion Matrix: 
 [[7780  222]
 [ 326 7042]]
Classification Report: 
               precision    recall  f1-score   support

           0       0.96      0.97      0.97      8002
           1       0.97      0.96      0.96      7368

    accuracy                           0.96     15370
   macro avg       0.96      0.96      0.96     15370
weighted avg       0.96      0.96      0.96     15370



 SGDClassifier(alpha=0.0001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.



Accuarcy: 96.66
Confusion Matrix: 
 [[7787  215]
 [ 298 7070]]
Classification Report: 
               precision    recall  f1-score   support

           0       0.96      0.97      0.97      8002
           1       0.97      0.96      0.96      7368

    accuracy                           0.97     15370
   macro avg       0.97      0.97      0.97     15370
weighted avg       0.97      0.97      0.97     15370

***********Usng TFIDF Vectorizer****************
Accuarcy: 97.81
Confusion Matrix: 
 [[7872  130]
 [ 206 7162]]
Classification Report: 
               precision    recall  f1-score   support

           0       0.97      0.98      0.98      8002
           1       0.98      0.97      0.98      7368

    accuracy                           0.98     15370
   macro avg       0.98      0.98      0.98     15370
weighted avg       0.98      0.98      0.98     15370



 KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jo

### use lemmatized text

In [None]:
X_train, X_test, y_train, y_test = train_test_split(lemmatized_text, dataset['label'], test_size = 0.3, random_state= 0)
classifiers = [LogisticRegression(), SGDClassifier(), MultinomialNB(), BernoulliNB(), LinearSVC(),
              KNeighborsClassifier(n_neighbors=5), DecisionTreeClassifier(), GradientBoostingClassifier(), 
               RandomForestClassifier(), XGBClassifier()]
for classifier in classifiers:
    print("\n\n", classifier)
    print("***********Usng Count Vectorizer****************")
    get_prediction(CountVectorizer(), classifier, X_train, X_test, y_train, y_test)
    print("***********Usng TFIDF Vectorizer****************")
    get_prediction(TfidfVectorizer(), classifier, X_train, X_test, y_train, y_test)



 LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)
***********Usng Count Vectorizer****************


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Accuarcy: 97.46
Confusion Matrix: 
 [[7814  163]
 [ 228 7165]]
Classification Report: 
               precision    recall  f1-score   support

           0       0.97      0.98      0.98      7977
           1       0.98      0.97      0.97      7393

    accuracy                           0.97     15370
   macro avg       0.97      0.97      0.97     15370
weighted avg       0.97      0.97      0.97     15370

***********Usng TFIDF Vectorizer****************
Accuarcy: 96.47
Confusion Matrix: 
 [[7734  243]
 [ 300 7093]]
Classification Report: 
               precision    recall  f1-score   support

           0       0.96      0.97      0.97      7977
           1       0.97      0.96      0.96      7393

    accuracy                           0.96     15370
   macro avg       0.96      0.96      0.96     15370
weighted avg       0.96      0.96      0.96     15370



 SGDClassifier(alpha=0.0001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.



Accuarcy: 96.81
Confusion Matrix: 
 [[7768  209]
 [ 281 7112]]
Classification Report: 
               precision    recall  f1-score   support

           0       0.97      0.97      0.97      7977
           1       0.97      0.96      0.97      7393

    accuracy                           0.97     15370
   macro avg       0.97      0.97      0.97     15370
weighted avg       0.97      0.97      0.97     15370

***********Usng TFIDF Vectorizer****************
Accuarcy: 97.62
Confusion Matrix: 
 [[7822  155]
 [ 211 7182]]
Classification Report: 
               precision    recall  f1-score   support

           0       0.97      0.98      0.98      7977
           1       0.98      0.97      0.98      7393

    accuracy                           0.98     15370
   macro avg       0.98      0.98      0.98     15370
weighted avg       0.98      0.98      0.98     15370



 KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jo

from above, we can see that stemmed text with KNeighborsClassifier gives the best accuracy: 97.81
***********Usng TFIDF Vectorizer****************
Accuarcy: 97.81
Confusion Matrix: 
 [[7872  130]
 [ 206 7162]]
Classification Report: 
               precision    recall  f1-score   support

           0       0.97      0.98      0.98      8002
           1       0.98      0.97      0.98      7368

    accuracy                           0.98     15370
   macro avg       0.98      0.98      0.98     15370
weighted avg       0.98      0.98      0.98     15370



 KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

## pipeline for hyper parameter tuning

In [None]:
from pprint import pprint
from time import time
import logging

from sklearn.model_selection import GridSearchCV

In [None]:
pipeline = Pipeline([
    #('vect', CountVectorizer()),
    ('tfidf', TfidfVectorizer()),
    ('clf', KNeighborsClassifier()),
])

In [None]:
parameters = {
    #'vect__max_df': (0.5, 0.75, 1.0),
    #'vect__max_features': (None, 5000, 10000, 50000),
    #'vect__ngram_range': ((1, 1), (2, 2)),
    #'tfidf__use_idf': (True, False),
    'tfidf__norm': ('l1', 'l2'),
    'clf__leaf_size': (20, 30, 50),
    'clf__n_neighbors': list(range(2,8,2)),
    #'clf__p': (1,2),
}

In [None]:
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)

print("Performing grid search...")
print("pipeline:", [name for name, _ in pipeline.steps])
print("parameters:")
pprint(parameters)
t0 = time()
grid_search.fit(stemmed_text, dataset['label'])
print("done in %0.3fs" % (time() - t0))
print()

print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Performing grid search...
pipeline: ['tfidf', 'clf']
parameters:
{'clf__leaf_size': (20, 30, 50),
 'clf__n_neighbors': [2, 4, 6],
 'tfidf__norm': ('l1', 'l2')}
Fitting 5 folds for each of 18 candidates, totalling 90 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed: 25.2min
[Parallel(n_jobs=-1)]: Done  90 out of  90 | elapsed: 49.4min finished


done in 2972.196s

Best score: 0.766
Best parameters set:
	clf__leaf_size: 20
	clf__n_neighbors: 2
	tfidf__norm: 'l2'


In [None]:
get_prediction(TfidfVectorizer(norm = 'l2'), KNeighborsClassifier(leaf_size = 20, n_neighbor), X_train, X_test, y_train, y_test)