In [3]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
import re
import string
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import f1_score, accuracy_score
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [4]:
T = pd.read_csv("True.csv")
F = pd.read_csv("Fake.csv")

In [5]:
T["label"] = 0
F["label"] = 1

Merge the 2 datasets, drop the duplicates and shuffle them

In [6]:
data = pd.concat([T, F])

In [7]:
data = data.drop_duplicates()

In [8]:
data = data.sample(frac = 1)
data = data.reset_index(drop = True)

Clean the text from all the possible impurities, like punctuation and errors in formatting. We also removed the word "reuters" because it was linked almost pefectly with the "True" dataset

In [9]:
def clean_text(text):
    '''Make text lowercase, remove text in square brackets,remove links,remove punctuation
    and remove words containing numbers.'''
    text = text.lower()
    text = re.sub('\[.*?\]', ' ', text)
    text = re.sub('https?://\S+|www\.\S+', ' ', text)
    text = re.sub('<.*?>+', ' ', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), ' ', text)
    text = re.sub('\n', ' ', text)
    text = re.sub('\w*\d\w*', ' ', text)
    text = re.sub("\xa0", ' ', text)
    text = re.sub(' +', ' ', text)
    text = re.sub('reuters', '', text)
    return text

In [10]:
stop_words = set(stopwords.words('english'))  

We tokenize the data, remove stopwords, stem the words using the Snowball stemmer (to reduce the dimentionality of the data) and then rejoin the words to get a cleaned text

In [11]:
def preprocess_text(text, stop_words):
    cleaned = clean_text(text)
    tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
    tokenized = tokenizer.tokenize(cleaned)
    no_stop = [w for w in tokenized if w not in stop_words]
    snowball = SnowballStemmer('english')
    stemmed = [snowball.stem(x) for x in no_stop]
    combined_text = ' '.join(stemmed)
    return combined_text

In [12]:
data["text"] = data["text"].apply(lambda x: preprocess_text(x, stop_words))

We applied the same processes on a second dataset, completely of fake news

In [13]:
new_fake = pd.read_csv("new_data_fake.csv")

In [14]:
new_fake["label"] = 1

In [15]:
ff = new_fake[["text", "label"]]

In [16]:
ff.dropna(inplace = True)
ff = ff.reset_index(drop = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [17]:
ff['text'] = ff['text'].apply(lambda x : preprocess_text(x, stop_words))

We applied the same process of a third dataset, more balanced

In [18]:
third_try = pd.read_csv("fake_or_real_news.csv")

In [19]:
third_try = third_try[["text", "label"]]

In [20]:
third_try["label"] = third_try.label.map({"FAKE": 1, "REAL": 0})

In [21]:
third_try['text'] = third_try['text'].apply(lambda x : preprocess_text(x, stop_words))

We Vectorize the text in order to get an array that we can feed to our model  

In [82]:
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1,1))
tfidf_vectors = tfidf_vectorizer.fit_transform(data['text'])
tfidf_vectors2 = tfidf_vectorizer.transform(ff['text'])
tfidf_vectors3 = tfidf_vectorizer.transform(third_try['text'])

We use the tfidfvectors as the X of our model and the label as the y

In [83]:
X_train, X_test, y_train, y_test = train_test_split(tfidf_vectors, data["label"], test_size = 0.2, random_state = 101)

Applied Logistic Regression training on the first dataset and testing on all 3 of them

In [116]:
lg1 = LogisticRegression()
lg1.fit(X_train, y_train)

LogisticRegression()

In [121]:
pred0 = lg1.predict(X_train)

In [123]:
f1_score(y_train, pred0)

0.9860179120438444

In [117]:
pred = lg1.predict(X_test)

In [86]:
f1_score(y_test, pred)

0.9823895391753664

In [87]:
pred2 = lg1.predict(tfidf_vectors2)

In [88]:
f1_score(ff["label"], pred2)

0.9590549282758067

In [89]:
pred3 = lg1.predict(tfidf_vectors3)

In [90]:
f1_score(third_try["label"], pred3)

0.7110509475756829

Applied a Multinomial Naive Bayes training on the first dataset and testing on all 3 of them

In [124]:
nb = MultinomialNB()

In [125]:
nb.fit(X_train, y_train)

MultinomialNB()

In [126]:
pred0 = nb.predict(X_train)

In [127]:
f1_score(y_train, pred0)

0.9376993856943235

In [93]:
pred = nb.predict(X_train)

In [94]:
f1_score(y_train, pred)

0.9376993856943235

In [95]:
pred2 = nb.predict(tfidf_vectors2)

In [96]:
f1_score(ff["label"], pred2)

0.8949837911619178

In [97]:
pred3 = nb.predict(tfidf_vectors3)

In [98]:
f1_score(third_try["label"], pred3)

0.6466616654163541

Applied GridSearchCV to a Random Forest Classifier training on the first dataset, to find the best parameters

In [128]:
rfc = RandomForestClassifier()

In [38]:
params ={"n_estimators": [50, 100, 200],
         "min_impurity_decrease": [0.01, 0.05, 0.1, 0.0], 
         "max_depth": [5, 10, 20, None],
         "min_samples_split": [ 5, 10, 20]}

In [39]:
model = GridSearchCV(rfc, params, cv=5)

In [40]:
model.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=RandomForestClassifier(),
             param_grid={'max_depth': [5, 10, 20, None],
                         'min_impurity_decrease': [0.01, 0.05, 0.1, 0.0],
                         'min_samples_split': [5, 10, 20],
                         'n_estimators': [50, 100, 200]})

In [41]:
model.best_params_

{'max_depth': None,
 'min_impurity_decrease': 0.0,
 'min_samples_split': 20,
 'n_estimators': 200}

Applied a Random Forest Classifier training on the first dataset and testing on all 3 of them using the best_params_ found before

In [129]:
rfc2 = RandomForestClassifier(max_depth =  None,
 min_impurity_decrease =  0.0,
 min_samples_split = 20,
 n_estimators = 200)

In [130]:
rfc2.fit(X_train, y_train)

RandomForestClassifier(min_samples_split=20, n_estimators=200)

In [133]:
pred0 = rfc2.predict(X_train)

In [134]:
f1_score(y_train, pred0)

0.999946606866357

In [102]:
pred = rfc2.predict(X_test)

In [103]:
f1_score(y_test, pred)

0.983458012854283

In [104]:
pred2 = rfc2.predict(tfidf_vectors2)

In [105]:
f1_score(ff["label"], pred2)

0.9178328250971219

In [106]:
pred3 = rfc2.predict(tfidf_vectors3)

In [107]:
f1_score(third_try["label"], pred3)

0.6720471432231617

Applied a Support Vector Classifier training on the first dataset and testing on all 3 of them

In [135]:
svc = SVC()

In [None]:
svc.fit(X_train, y_train)

In [None]:
pred0 = svc.predict(X_train)

In [None]:
f1_score(y_train, pred0)

In [110]:
pred = svc.predict(X_test)

In [111]:
f1_score(y_test, pred)

0.9898840885142254

In [112]:
pred2 = svc.predict(tfidf_vectors2)

In [113]:
f1_score(ff["label"], pred2)

0.9534197875005049

In [114]:
pred3 = svc.predict(tfidf_vectors3)

In [115]:
f1_score(third_try["label"], pred3)

0.7025168815224064

We now repeated the same process again using the ngram_range (1,3)

# trying with different n-grams

In [50]:
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1,3))
tfidf_vectors = tfidf_vectorizer.fit_transform(data['text'])
tfidf_vectors2 = tfidf_vectorizer.transform(ff['text'])
tfidf_vectors3 = tfidf_vectorizer.transform(third_try['text'])

In [51]:
X_train, X_test, y_train, y_test = train_test_split(tfidf_vectors, data["label"], test_size = 0.2, random_state = 101)

In [52]:
lg1 = LogisticRegression()
lg1.fit(X_train, y_train)

LogisticRegression()

In [53]:
pred = lg1.predict(X_test)

In [54]:
f1_score(y_test, pred)

0.9823008849557522

In [55]:
pred2 = lg1.predict(tfidf_vectors2)

In [56]:
f1_score(ff["label"], pred2)

0.9344356696281673

In [57]:
pred3 = lg1.predict(tfidf_vectors3)

In [58]:
f1_score(third_try["label"], pred3)

0.7048673705897502

In [59]:
nb = MultinomialNB()

In [60]:
nb.fit(X_train, y_train)

MultinomialNB()

In [61]:
pred = nb.predict(X_train)

In [62]:
f1_score(y_train, pred)

0.9851271389732928

In [63]:
pred2 = nb.predict(tfidf_vectors2)

In [64]:
f1_score(ff["label"], pred2)

0.9113296352328122

In [65]:
pred3 = nb.predict(tfidf_vectors3)

In [66]:
f1_score(third_try["label"], pred3)

0.6540366518078257

In [67]:
rfc = RandomForestClassifier()

In [70]:
params ={"n_estimators": [50, 100, 200],
         "min_impurity_decrease": [0.01, 0.05, 0.1, 0.0], 
         "max_depth": [5, 10, 20, None],
         "min_samples_split": [ 5, 10, 20]}

In [71]:
model = GridSearchCV(rfc, params, cv=5)

In [None]:
model.fit(X_train, y_train)

In [None]:
model.best_params_

In [73]:
rfc2 = RandomForestClassifier(max_depth =  None,
 min_impurity_decrease =  0.0,
 min_samples_split = 20,
 n_estimators = 200)

In [75]:
rfc2.fit(X_train, y_train)

RandomForestClassifier(min_samples_split=20, n_estimators=200)

In [76]:
pred = rfc2.predict(X_test)

In [77]:
f1_score(y_test, pred)

0.9815225424981523

In [78]:
pred2 = rfc2.predict(tfidf_vectors2)

In [79]:
f1_score(ff["label"], pred2)

0.9035889622481801

In [80]:
pred3 = rfc2.predict(tfidf_vectors3)

In [81]:
f1_score(third_try["label"], pred3)

0.6584798345398138

In [54]:
svc = SVC()

In [55]:
svc.fit(X_train, y_train)

SVC()

In [56]:
pred = svc.predict(X_test)

In [57]:
f1_score(y_test, pred)

0.9930458970792767

In [58]:
pred2 = svc.predict(tfidf_vectors2)

In [59]:
f1_score(ff["label"], pred2)

0.9659934541390596

In [60]:
pred3 = svc.predict(tfidf_vectors3)

In [61]:
f1_score(third_try["label"], pred3)

0.6845182413470533