In [2]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
import re
import string
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import f1_score, accuracy_score
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [5]:
T = pd.read_csv("True.csv")
F = pd.read_csv("Fake.csv")

In [6]:
T["label"] = 0
F["label"] = 1

Merge the 2 datasets, drop the duplicates and shuffle them

In [7]:
data = pd.concat([T, F])

In [8]:
data = data.drop_duplicates()

In [9]:
data = data.sample(frac = 1)
data = data.reset_index(drop = True)

Clean the text from all the possible impurities, like punctuation and errors in formatting. We also removed the word "reuters" because it was linked almost pefectly with the "True" datasets

In [11]:
def clean_text(text):
    '''Make text lowercase, remove text in square brackets,remove links,remove punctuation
    and remove words containing numbers.'''
    text = text.lower()
    text = re.sub('\[.*?\]', ' ', text)
    text = re.sub('https?://\S+|www\.\S+', ' ', text)
    text = re.sub('<.*?>+', ' ', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), ' ', text)
    text = re.sub('\n', ' ', text)
    text = re.sub('\w*\d\w*', ' ', text)
    text = re.sub("\xa0", ' ', text)
    text = re.sub(' +', ' ', text)
    text = re.sub('reuters', '', text)
    return text

In [12]:
stop_words = set(stopwords.words('english'))  

We tokenize the data, remove stopwords, stem the words using the Snowball stemmer (to reduce the dimentionality of the data) and then rejoin the words to get a cleaned text

In [13]:
def preprocess_text(text, stop_words):
    cleaned = clean_text(text)
    tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
    tokenized = tokenizer.tokenize(cleaned)
    no_stop = [w for w in tokenized if w not in stop_words]
    snowball = SnowballStemmer('english')
    stemmed = [snowball.stem(x) for x in no_stop]
    combined_text = ' '.join(stemmed)
    return combined_text

In [14]:
data["text"] = data["text"].apply(lambda x: preprocess_text(x, stop_words))

We applied the same processes on a second dataset, completely of fake news

In [15]:
new_fake = pd.read_csv("new_data_fake.csv")

In [16]:
new_fake["label"] = 1

In [17]:
ff = new_fake[["text", "label"]]

In [18]:
ff.dropna(inplace = True)
ff = ff.reset_index(drop = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [19]:
ff['text'] = ff['text'].apply(lambda x : preprocess_text(x, stop_words))

We applied the same process of a third dataset, more balanced

In [20]:
third_try = pd.read_csv("fake_or_real_news.csv")

In [21]:
third_try = third_try[["text", "label"]]

In [22]:
third_try["label"] = third_try.label.map({"FAKE": 1, "REAL": 0})

In [23]:
third_try['text'] = third_try['text'].apply(lambda x : preprocess_text(x, stop_words))

We Vectorize the text in order to get an array that we can feed to our model  
We tried both with ngram_range = (1, 1) and = (1,3) but writing the same script twice seemed redundant  
The double script can be found in the TFIDF

In [24]:
hash_vectorizer = HashingVectorizer(ngram_range = (1, 1), stop_words = 'english')
hash_vectors = hash_vectorizer.fit_transform(data['text'])
hash_vectors2 = hash_vectorizer.transform(ff['text'])
hash_vectors3 = hash_vectorizer.transform(third_try['text'])

We use the hashvectors as the X of our model and the label as the y

In [25]:
X_train, X_test, y_train, y_test = train_test_split(hash_vectors, data["label"], test_size = 0.2, random_state = 101)

Applied Logistic Regression training on the first dataset and testing on all 3 of them

In [26]:
lg1 = LogisticRegression()
lg1.fit(X_train, y_train)

LogisticRegression()

In [27]:
pred = lg1.predict(X_test)

In [28]:
f1_score(y_test, pred)

0.9884935009588749

In [29]:
pred2 = lg1.predict(hash_vectors2)

In [30]:
f1_score(ff["label"], pred2)

0.9814021153619313

In [31]:
pred3 = lg1.predict(hash_vectors3)

In [32]:
f1_score(third_try["label"], pred3)

0.6947731188971856

Applied GridSearchCV to a Random Forest Classifier training on the first dataset, to find the best parameters

In [33]:
rfc = RandomForestClassifier()

In [46]:
params ={"n_estimators": [50, 100, 200],
         "min_impurity_decrease": [0.01, 0.05, 0.1, 0.0], 
         "max_depth": [5, 10, 20, None],
         "min_samples_split": [ 5, 10, 20]}

In [47]:
model = GridSearchCV(rfc, params, cv=5)

In [48]:
model.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=RandomForestClassifier(),
             param_grid={'max_depth': [5, 10, 20, None],
                         'min_impurity_decrease': [0.01, 0.05, 0.1, 0.0],
                         'min_samples_split': [5, 10, 20],
                         'n_estimators': [50, 100, 200]})

In [49]:
model.best_params_

{'max_depth': None,
 'min_impurity_decrease': 0.0,
 'min_samples_split': 20,
 'n_estimators': 200}

Applied a Random Forest Classifier training on the first dataset and testing on all 3 of them using the best_params_ found before

In [34]:
rfc2 = RandomForestClassifier(max_depth =  None,
 min_impurity_decrease =  0.0,
 min_samples_split = 20,
 n_estimators = 200)

In [35]:
rfc2.fit(X_train, y_train)

RandomForestClassifier(min_samples_split=20, n_estimators=200)

In [36]:
pred = rfc2.predict(X_test)

In [37]:
f1_score(y_test, pred)

0.979609175870858

In [38]:
pred2 = rfc2.predict(hash_vectors2)

In [39]:
f1_score(ff["label"], pred2)

0.9224241920053242

In [40]:
pred3 = rfc2.predict(hash_vectors3)

In [41]:
f1_score(third_try["label"], pred3)

0.6657462030877368

Applied a Support Vector Classifier training on the first dataset and testing on all 3 of them

In [44]:
svc = SVC()

In [45]:
svc.fit(X_train, y_train)

SVC()

In [46]:
pred = svc.predict(X_test)

In [47]:
f1_score(y_test, pred)

0.9951042997020009

In [48]:
pred2 = svc.predict(hash_vectors2)

In [49]:
f1_score(ff["label"], pred2)

0.9841581052466473

In [50]:
pred3 = svc.predict(hash_vectors3)

In [51]:
f1_score(third_try["label"], pred3)

0.685823754789272