In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
import re
import string
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import f1_score, accuracy_score
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [2]:
T = pd.read_csv("True.csv")
F = pd.read_csv("Fake.csv")

In [3]:
T["label"] = 0
F["label"] = 1

Merge the 2 datasets, drop the duplicates and shuffle them

In [4]:
data = pd.concat([T, F])

In [5]:
data = data.drop_duplicates()

In [6]:
data = data.sample(frac = 1)
data = data.reset_index(drop = True)

Clean the text from all the possible impurities, like punctuation and errors in formatting. We also removed the word "reuters" because it was linked almost pefectly with the "True" datasets

In [7]:
def clean_text(text):
    '''Make text lowercase, remove text in square brackets,remove links,remove punctuation
    and remove words containing numbers.'''
    text = text.lower()
    text = re.sub('\[.*?\]', ' ', text)
    text = re.sub('https?://\S+|www\.\S+', ' ', text)
    text = re.sub('<.*?>+', ' ', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), ' ', text)
    text = re.sub('\n', ' ', text)
    text = re.sub('\w*\d\w*', ' ', text)
    text = re.sub("\xa0", ' ', text)
    text = re.sub(' +', ' ', text)
    text = re.sub('reuters', '', text)
    return text

In [8]:
stop_words = set(stopwords.words('english'))

We tokenize the data, remove stopwords, stem the words using the Snowball stemmer (to reduce the dimentionality of the data) and then rejoin the words to get a cleaned text

In [9]:
def preprocess_text(text, stop_words):
    cleaned = clean_text(text)
    tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
    tokenized = tokenizer.tokenize(cleaned)
    no_stop = [w for w in tokenized if w not in stop_words]
    snowball = SnowballStemmer('english')
    stemmed = [snowball.stem(x) for x in no_stop]
    combined_text = ' '.join(stemmed)
    return combined_text

In [10]:
data["text"] = data["text"].apply(lambda x: preprocess_text(x, stop_words))

We used the vader package for sentiment analysis to get the sentiment of each text and use it as feature for our models

In [11]:
sentiment = SentimentIntensityAnalyzer()

In [13]:
data[['neg', 'neu', 'pos', 'compound']] = data['text'].apply(sentiment.polarity_scores).apply(pd.Series)

We applied the same processes on a second dataset, completely of fake news

In [48]:
new_fake = pd.read_csv("new_data_fake.csv")

In [49]:
new_fake["label"] = 1

In [50]:
ff = new_fake[["text", "label"]]

In [51]:
ff.dropna(inplace = True)
ff = ff.reset_index(drop = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [52]:
ff['text'] = ff['text'].apply(lambda x : preprocess_text(x, stop_words))

In [53]:
ff[['neg', 'neu', 'pos', 'compound']] = ff['text'].apply(sentiment.polarity_scores).apply(pd.Series)

We applied the same process of a third dataset, more balanced

In [54]:
third_try = pd.read_csv("fake_or_real_news.csv")

In [55]:
third_try = third_try[["text", "label"]]

In [56]:
third_try["label"] = third_try.label.map({"FAKE": 1, "REAL": 0})

In [57]:
third_try['text'] = third_try['text'].apply(lambda x : preprocess_text(x, stop_words))

In [58]:
third_try[['neg', 'neu', 'pos', 'compound']] = third_try['text'].apply(sentiment.polarity_scores).apply(pd.Series)

In [59]:
sent = ['neg', 'neu', 'pos', 'compound']

We used the sentiment as X for the model and the labels as the y

In [133]:
X_train, X_test, y_train, y_test = train_test_split(data[sent], data["label"], test_size = 0.2, random_state = 101)

Applied Logistic Regression training on the first dataset and testing on all 3 of them    

In [134]:
lg1 = LogisticRegression()
lg1.fit(X_train, y_train)

LogisticRegression()

In [135]:
pred = lg1.predict(X_test)

In [136]:
f1_score(y_test, pred)

0.6933445661331087

In [137]:
pred2 = lg1.predict(ff[sent])

In [138]:
f1_score(ff["label"], pred2)

0.7417553062314829

In [139]:
pred3 = lg1.predict(third_try[sent])

In [140]:
f1_score(third_try["label"], pred3)

0.5353863540048028

Applied a Random Forest Classifier training on the first dataset and testing on all 3 of them

In [141]:
rfc2 = RandomForestClassifier(max_depth =  None,
 min_impurity_decrease =  0.0,
 min_samples_split = 20,
 n_estimators = 200)

In [142]:
rfc2.fit(X_train, y_train)

RandomForestClassifier(min_samples_split=20, n_estimators=200)

In [143]:
pred = rfc2.predict(X_test)

In [144]:
f1_score(y_test, pred)

0.7338545571382672

In [145]:
pred2 = rfc2.predict(ff[sent])

In [146]:
f1_score(ff["label"], pred2)

0.6989101501682486

In [147]:
pred3 = rfc2.predict(third_try[sent])

In [148]:
f1_score(third_try["label"], pred3)

0.5304240934234787

Applied a Support Vector Classifier training on the first dataset and testing on all 3 of them

In [129]:
svc = SVC()

In [78]:
svc.fit(X_train, y_train)

SVC()

In [85]:
pred = svc.predict(X_test)

In [86]:
f1_score(y_test, pred)

0.6740006286253108

In [81]:
pred2 = svc.predict(ff[sent])

In [82]:
f1_score(ff["label"], pred2)

0.6910211712394524

In [83]:
pred3 = svc.predict(third_try[sent])

In [84]:
f1_score(third_try["label"], pred3)

0.5184961497810661