# Importing the data

In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from nltk.tokenize import word_tokenize
import nltk
from nltk.stem import WordNetLemmatizer

true = pd.read_csv('../raw_data/True.csv')
fake = pd.read_csv('../raw_data/Fake.csv')

In [2]:
true.drop(columns = ['subject','date'], inplace = True)
fake.drop(columns = ['subject','date'], inplace = True)

In [3]:
true_copy = true.copy()
fake_copy = fake.copy()

# Removing Getty

In [4]:
stop_words =['/Getty Images']
pat = '|'.join(r"\b{}\b".format(x) for x in stop_words)
true_copy['text'] = true_copy['text'].str.replace(pat, '')

stop_words =['/Getty Images']
pat = '|'.join(r"\b{}\b".format(x) for x in stop_words)
fake_copy['text'] = fake_copy['text'].str.replace(pat, '')

# Feature Engineering


In [5]:
true_copy['title_length_char'] = true_copy.title.str.len()
fake_copy['title_length_char'] = fake_copy.title.str.len()

In [6]:
true_copy['title_Upper'] = true_copy['title'].str.count(r'[A-Z]')

fake_copy['title_Upper'] = fake_copy['title'].str.count(r'[A-Z]')

In [7]:
true_copy['title_lower_ratio'] = true_copy.title_Upper / true_copy.title_length_char
true_copy.drop(columns = 'title_Upper', inplace = True)





In [8]:
fake_copy['title_lower_ratio'] = fake_copy.title_Upper / fake_copy.title_length_char
fake_copy.drop(columns = 'title_Upper', inplace = True)

## Setting the target

In [9]:
true_copy['score'] = 1
fake_copy['score'] = 0

In [10]:
data = pd.concat([true_copy, fake_copy], ignore_index=True)

## Creating text + title

In [11]:
data['title_text'] = data['title'] + data['text']

In [12]:
data.drop(columns = ['title','text'], inplace = True)

In [13]:
data.head()

Unnamed: 0,title_length_char,title_lower_ratio,score,title_text
0,64,0.0625,1,"As U.S. budget fight looms, Republicans flip t..."
1,64,0.0625,1,U.S. military to accept transgender recruits o...
2,60,0.116667,1,Senior U.S. Republican senator: 'Let Mr. Muell...
3,59,0.135593,1,FBI Russia probe helped by Australian diplomat...
4,69,0.057971,1,Trump wants Postal Service to charge 'much mor...


# Preprocessing

In [14]:
def lower_case(text):
    text = text.lower()
    return text
data['title_text'] = data['title_text'].map(lower_case)



In [15]:
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
data['title_text'] = data['title_text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))



In [16]:
import string
punc = string.punctuation + '“' + '”' + '’' + '‘'
def remove_punctuation(text):
    for punctuation in punc:
        text = text.replace(punctuation, '')
    return text


data['title_text'] = data['title_text'].apply(remove_punctuation)




In [17]:
def remove_numbers(text):
    text = ''.join(word for word in text if not word.isdigit())
    return text
data['title_text'] = data['title_text'].map(remove_numbers)




In [18]:
data = data[['title_text', 'title_length_char', 'title_lower_ratio', 'score']]
data.head()

Unnamed: 0,title_text,title_length_char,title_lower_ratio,score
0,us budget fight looms republicans flip fiscal ...,64,0.0625,1
1,us military accept transgender recruits monday...,64,0.0625,1
2,senior us republican senator let mr mueller jo...,60,0.116667,1
3,fbi russia probe helped australian diplomat ti...,59,0.135593,1
4,trump wants postal service charge much more am...,69,0.057971,1


# Lemmatizer

In [19]:
def lemmatize_text(text):
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(w) for w in text]

data['title_text'].apply(lemmatize_text)

0        [u, s,  , b, u, d, g, e, t,  , f, i, g, h, t, ...
1        [u, s,  , m, i, l, i, t, a, r, y,  , a, c, c, ...
2        [s, e, n, i, o, r,  , u, s,  , r, e, p, u, b, ...
3        [f, b, i,  , r, u, s, s, i, a,  , p, r, o, b, ...
4        [t, r, u, m, p,  , w, a, n, t, s,  , p, o, s, ...
5        [w, h, i, t, e,  , h, o, u, s, e,  , c, o, n, ...
6        [t, r, u, m, p,  , s, a, y, s,  , r, u, s, s, ...
7        [f, a, c, t, b, o, x,  , t, r, u, m, p,  , t, ...
8        [t, r, u, m, p,  , t, w, i, t, t, e, r,  , d, ...
9        [a, l, a, b, a, m, a,  , o, f, f, i, c, i, a, ...
10       [j, o, n, e, s,  , c, e, r, t, i, f, i, e, d, ...
11       [n, e, w,  , y, o, r, k,  , g, o, v, e, r, n, ...
12       [f, a, c, t, b, o, x,  , t, r, u, m, p,  , t, ...
13       [t, r, u, m, p,  , t, w, i, t, t, e, r,  , d, ...
14       [m, a, n,  , s, a, y, s,  , d, e, l, i, v, e, ...
15       [v, i, r, g, i, n, i, a,  , o, f, f, i, c, i, ...
16       [u, s,  , l, a, w, m, a, k, e, r, s,  , q, u, .

## X and Y

In [20]:
x = data.drop(columns = 'score', axis=1)
y = data['score']

## Splitting the dataset

In [21]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,random_state=0,test_size=0.3)

# Initializing the model

In [32]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.svm import SVC

preprocessor = ColumnTransformer([
    ('vectorizer_title_text', CountVectorizer(), 'title_text'),
    ('MinMaxScaler', MinMaxScaler(), ['title_length_char'])
])
final_pipe = Pipeline([
    ('preprocessing', preprocessor),
    ('randomforrest', RandomForestClassifier())])




In [35]:
from sklearn.model_selection import GridSearchCV
parameters = {
    'randomforrest__n_estimators' : (100, 200),
    'randomforrest__oob_score' : ('True', 'False'),
    'randomforrest__min_samples_split' : (2,4),
    'randomforrest__max_depth' : (1,2,3)}
    
grid_search = GridSearchCV(final_pipe,
                           parameters,
                           scoring = ["f1", "accuracy", "recall", "precision"],
                           refit= "accuracy",
                           cv=3,
                           verbose = 1)

In [36]:
import time

start = time.time()
grid_search.fit(x_train,y_train)
stop = time.time()
print(f"Training time: {stop - start}s")

Fitting 3 folds for each of 24 candidates, totalling 72 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_f

  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return

  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
[Parallel(n_jobs=1)]: Done  72 out of  72 | elapsed: 52.8min finished
  return self.partial_fit(X, y)


Training time: 3182.88547205925s


In [37]:
tuned_pipe = grid_search.best_estimator_

In [38]:
tuned_pipe

Pipeline(memory=None,
     steps=[('preprocessing', ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
         transformer_weights=None,
         transformers=[('vectorizer_title_text', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf...bs=None,
            oob_score='True', random_state=None, verbose=0,
            warm_start=False))])

In [39]:
grid_search.best_params_

{'randomforrest__max_depth': 3,
 'randomforrest__min_samples_split': 2,
 'randomforrest__n_estimators': 200,
 'randomforrest__oob_score': 'True'}

In [40]:
grid_search.score(x_test,y_test)

0.8067557535263549