In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
import re
import string
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import f1_score, accuracy_score
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [85]:
T = pd.read_csv("True.csv")
F = pd.read_csv("Fake.csv")

In [86]:
T["label"] = 0
F["label"] = 1

Merge the 2 datasets, drop the duplicates and shuffle them

In [87]:
data = pd.concat([T, F])

In [88]:
data = data.drop_duplicates()

In [89]:
data = data.sample(frac = 1)
data = data.reset_index(drop = True)

Clean the text from all the possible impurities, like punctuation and errors in formatting. We also removed the word "reuters" because it was linked almost pefectly with the "True" datasets

In [90]:
def clean_text(text):
    '''Make text lowercase, remove text in square brackets,remove links,remove punctuation
    and remove words containing numbers.'''
    text = text.lower()
    text = re.sub('\[.*?\]', ' ', text)
    text = re.sub('https?://\S+|www\.\S+', ' ', text)
    text = re.sub('<.*?>+', ' ', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), ' ', text)
    text = re.sub('\n', ' ', text)
    text = re.sub('\w*\d\w*', ' ', text)
    text = re.sub("\xa0", ' ', text)
    text = re.sub(' +', ' ', text)
    text = re.sub('reuters', '', text)
    return text

In [91]:
stop_words = set(stopwords.words('english'))

We tokenize the data, remove stopwords, stem the words using the Snowball stemmer (to reduce the dimentionality of the data) and then rejoin the words to get a cleaned text

In [92]:
def preprocess_text(text, stop_words):
    cleaned = clean_text(text)
    tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
    tokenized = tokenizer.tokenize(cleaned)
    no_stop = [w for w in tokenized if w not in stop_words]
    snowball = SnowballStemmer('english')
    stemmed = [snowball.stem(x) for x in no_stop]
    combined_text = ' '.join(stemmed)
    return combined_text

In [93]:
data["text"] = data["text"].apply(lambda x: preprocess_text(x, stop_words))

We applied the same processes on a second dataset, completely of fake news

In [94]:
new_fake = pd.read_csv("new_data_fake.csv")

In [95]:
new_fake["label"] = 1

In [96]:
ff = new_fake[["text", "label"]]

In [97]:
ff.dropna(inplace = True)
ff = ff.reset_index(drop = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [98]:
ff['text'] = ff['text'].apply(lambda x : preprocess_text(x, stop_words))

We applied the same process of a third dataset, more balanced

In [99]:
third_try = pd.read_csv("fake_or_real_news.csv")

In [100]:
third_try = third_try[["text", "label"]]

In [101]:
third_try["label"] = third_try.label.map({"FAKE": 1, "REAL": 0})

In [102]:
third_try['text'] = third_try['text'].apply(lambda x : preprocess_text(x, stop_words))

We Vectorize the text in order to get an array that we can feed to our model  
We tried both with ngram_range = (1, 1) and = (1,3) but writing the same script twice seemed redundant  
The double script can be found in the TFIDF

In [148]:
count_vectorizer = CountVectorizer(ngram_range = (1, 3))
count_vectors = count_vectorizer.fit_transform(data['text'])
count_vectors2 = count_vectorizer.transform(ff['text'])
count_vectors3 = count_vectorizer.transform(third_try['text'])

We use the countvectors as the X of our model and the label as the y

In [149]:
X_train, X_test, y_train, y_test = train_test_split(count_vectors, data["label"], test_size = 0.2, random_state = 101)

Applied Logistic Regression training on the first dataset and testing on all 3 of them

In [150]:
lg1 = LogisticRegression()
lg1.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression()

In [181]:
pred = lg1.predict(X_test)

In [182]:
f1_score(y_test, pred)

0.9999734063771508

In [153]:
pred2 = lg1.predict(count_vectors2)

In [154]:
f1_score(ff["label"], pred2)

0.9528275193015077

In [155]:
pred3 = lg1.predict(count_vectors3)

In [156]:
f1_score(third_try["label"], pred3)

0.7084367245657568

Applied a Multinomial Naive Bayes training on the first dataset and testing on all 3 of them

In [157]:
nb = MultinomialNB()

In [158]:
nb.fit(X_train, y_train)

MultinomialNB()

In [183]:
pred = nb.predict(X_test)

In [184]:
f1_score(y_test, pred)

0.9992283334663793

In [161]:
pred2 = nb.predict(count_vectors2)

In [162]:
f1_score(ff["label"], pred2)

0.9095424506461254

In [163]:
pred3 = nb.predict(count_vectors3)

In [164]:
f1_score(third_try["label"], pred3)

0.6658243840808591

Applied GridSearchCV to a Random Forest Classifier training on the first dataset, to find the best parameters

In [59]:
rfc = RandomForestClassifier()

In [82]:
params ={"n_estimators": [50, 100, 200],
         "min_impurity_decrease": [0.01, 0.05, 0.1, 0.0], 
         "max_depth": [5, 10, 20, None],
         "min_samples_split": [ 5, 10, 20]}

In [85]:
model = GridSearchCV(rfc, params, cv=5)

In [None]:
model.fit(X_train, y_train)

In [None]:
model.best_params_

Applied a Random Forest Classifier training on the first dataset and testing on all 3 of them using the best_params_ found before

In [165]:
rfc2 = RandomForestClassifier(max_depth =  None,
 min_impurity_decrease =  0.0,
 min_samples_split = 20,
 n_estimators = 200)

In [166]:
rfc2.fit(X_train, y_train)

RandomForestClassifier(min_samples_split=20, n_estimators=200)

In [185]:
pred = rfc2.predict(X_test)

In [186]:
f1_score(y_test, pred)

0.9999734063771508

In [169]:
pred2 = rfc2.predict(count_vectors2)

In [170]:
f1_score(ff["label"], pred2)

0.9137034552163703

In [171]:
pred3 = rfc2.predict(count_vectors3)

In [172]:
f1_score(third_try["label"], pred3)

0.6646263708237695

Applied a Support Vector Classifier training on the first dataset and testing on all 3 of them

In [173]:
svc = SVC()

In [174]:
svc.fit(X_train, y_train)

SVC()

In [None]:
pred = svc.predict(X_test)

In [188]:
f1_score(y_test, pred)

0.9967806305707064

In [177]:
pred2 = svc.predict(count_vectors2)

In [178]:
f1_score(ff["label"], pred2)

0.9543491422805247

In [179]:
pred3 = svc.predict(count_vectors3)

In [180]:
f1_score(third_try["label"], pred3)

0.7039230199851962