# Importing datasets

In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from nltk.tokenize import word_tokenize
import nltk
from nltk.stem import WordNetLemmatizer

true = pd.read_csv('../raw_data/True.csv')
fake = pd.read_csv('../raw_data/Fake.csv')

In [2]:
true.drop(columns = ['subject','date'], inplace = True)
fake.drop(columns = ['subject','date'], inplace = True)

In [3]:
true_copy = true.copy()
fake_copy = fake.copy()

# Processing

In [4]:
stop_words =['/Getty Images']
pat = '|'.join(r"\b{}\b".format(x) for x in stop_words)
true_copy['text'] = true_copy['text'].str.replace(pat, '')

stop_words =['/Getty Images']
pat = '|'.join(r"\b{}\b".format(x) for x in stop_words)
fake_copy['text'] = fake_copy['text'].str.replace(pat, '')

In [5]:
true_copy['title_length_char'] = true_copy.title.str.len()
fake_copy['title_length_char'] = fake_copy.title.str.len()

In [6]:
true_copy['title_Upper'] = true_copy['title'].str.count(r'[A-Z]')

fake_copy['title_Upper'] = fake_copy['title'].str.count(r'[A-Z]')


In [7]:
true_copy['title_lower_ratio'] = true_copy.title_Upper / true_copy.title_length_char
true_copy.drop(columns = 'title_Upper', inplace = True)


In [8]:
fake_copy['title_lower_ratio'] = fake_copy.title_Upper / fake_copy.title_length_char
fake_copy.drop(columns = 'title_Upper', inplace = True)


# Data Visualization

# Starting a new model - Preparing the data

In [19]:
true_copy['score'] = 1
fake_copy['score'] = 0

In [20]:
data = pd.concat([true_copy, fake_copy], ignore_index=True)

In [21]:
def lower_case(text):
    text = text.lower()
    return text
data['text'] = data['text'].map(lower_case)
data['title'] = data['title'].map(lower_case)

In [22]:
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
data['title'] = data['title'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))
data['text'] = data['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))

In [23]:
import string
punc = string.punctuation + '“' + '”' + '’' + '‘'
def remove_punctuation(text):
    for punctuation in punc:
        text = text.replace(punctuation, '')
    return text


data['text'] = data['text'].apply(remove_punctuation)
data['title'] = data['title'].apply(remove_punctuation)

In [24]:
def remove_numbers(text):
    text = ''.join(word for word in text if not word.isdigit())
    return text
data['text'] = data['text'].map(remove_numbers)
data['title'] = data['title'].map(remove_numbers)

In [25]:
data.text[0]

'washington reuters  head conservative republican faction us congress voted month huge expansion national debt pay tax cuts called fiscal conservative sunday urged budget restraint  keeping sharp pivot way among republicans us representative mark meadows speaking cbs face nation drew hard line federal spending lawmakers bracing battle january return holidays wednesday lawmakers begin trying pass federal budget fight likely linked issues immigration policy even november congressional election campaigns approach republicans seek keep control congress president donald trump republicans want big budget increase military spending democrats also want proportional increases nondefense discretionary spending programs support education scientific research infrastructure public health environmental protection the trump administration already willing say were going increase nondefense discretionary spending   percent meadows chairman small influential house freedom caucus said program now democra

In [26]:
data['title_text'] = data['title'] + data['text']
data.drop(columns = ['title', 'text'], inplace=True)

In [27]:
data = data[['title_text', 'title_length_char' ,'title_lower_ratio', 'score']]

In [28]:
data.head()

Unnamed: 0,title_text,title_length_char,title_lower_ratio,score
0,us budget fight looms republicans flip fiscal ...,64,0.0625,1
1,us military accept transgender recruits monday...,64,0.0625,1
2,senior us republican senator let mr mueller jo...,60,0.116667,1
3,fbi russia probe helped australian diplomat ti...,59,0.135593,1
4,trump wants postal service charge much more am...,69,0.057971,1


# Initialize the model

In [40]:
x = data.drop(columns = 'score')
y = data['score']

In [41]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,random_state=0,test_size=0.3)

In [42]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer

preprocessor = ColumnTransformer([
    ('vectorizer_title_text', TfidfVectorizer(), 'title_text'),
    ('MinMaxScaler', MinMaxScaler(), ['title_length_char'])
])
final_pipe = Pipeline([
    ('preprocessing', preprocessor),
    ('nb', MultinomialNB())])

In [43]:
from sklearn.model_selection import GridSearchCV
parameters = {
    'preprocessing__vectorizer_title_text__ngram_range': ((2,2),(3,3)),
    'preprocessing__vectorizer_title_text__ngram_range': ((2,2),(3,3)),
    'preprocessing__vectorizer_title_text__max_df' : [0.8,1.0],
    'preprocessing__vectorizer_title_text__min_df' : [0.05, 0.1],
    'nb__alpha': (2.0,2.5,3.0)}
grid_search = GridSearchCV(final_pipe,
                           parameters,
                           scoring = ["f1", "accuracy", "recall"],
                           refit= "accuracy",
                           cv=3,
                           verbose = 1)

In [44]:
import time

start = time.time()
grid_search.fit(x_train,y_train)
stop = time.time()
print(f"Training time: {stop - start}s")

Fitting 3 folds for each of 24 candidates, totalling 72 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_f

  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return

  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
[Parallel(n_jobs=1)]: Done  72 out of  72 | elapsed: 59.8min finished


Training time: 3614.813202381134s


  return self.partial_fit(X, y)


In [45]:
tuned_pipe = grid_search.best_estimator_

In [46]:
tuned_pipe

Pipeline(memory=None,
     steps=[('preprocessing', ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
         transformer_weights=None,
         transformers=[('vectorizer_title_text', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='u... 1)), ['title_length_char'])])), ('nb', MultinomialNB(alpha=2.0, class_prior=None, fit_prior=True))])

In [47]:
grid_search.best_params_

{'nb__alpha': 2.0,
 'preprocessing__vectorizer_title_text__max_df': 0.8,
 'preprocessing__vectorizer_title_text__min_df': 0.05,
 'preprocessing__vectorizer_title_text__ngram_range': (2, 2)}

In [48]:
pd.DataFrame(grid_search.cv_results_)



Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_nb__alpha,param_preprocessing__vectorizer_title_text__max_df,param_preprocessing__vectorizer_title_text__min_df,param_preprocessing__vectorizer_title_text__ngram_range,params,split0_test_f1,...,split1_test_recall,split2_test_recall,mean_test_recall,std_test_recall,rank_test_recall,split0_train_recall,split1_train_recall,split2_train_recall,mean_train_recall,std_train_recall
0,18.748703,0.06364,9.033744,0.08129,2.0,0.8,0.05,"(2, 2)","{'nb__alpha': 2.0, 'preprocessing__vectorizer_...",0.836936,...,0.789211,0.796803,0.794219,0.003542,1,0.796603,0.799321,0.787134,0.794353,0.005223
1,26.595128,0.584025,9.160397,0.503306,2.0,0.8,0.05,"(3, 3)","{'nb__alpha': 2.0, 'preprocessing__vectorizer_...",0.439131,...,0.287712,0.293706,0.294086,0.005366,13,0.290709,0.297273,0.294276,0.294086,0.002683
2,18.495689,0.848289,9.142374,0.10816,2.0,0.8,0.1,"(2, 2)","{'nb__alpha': 2.0, 'preprocessing__vectorizer_...",0.575223,...,0.458541,0.463936,0.466103,0.007222,7,0.461538,0.470283,0.465788,0.46587,0.00357
3,28.838066,0.822704,9.166054,0.231735,2.0,0.8,0.1,"(3, 3)","{'nb__alpha': 2.0, 'preprocessing__vectorizer_...",0.341401,...,0.206993,0.205994,0.209044,0.003629,19,0.206494,0.210069,0.210568,0.209044,0.001815
4,18.878147,0.417137,8.997642,0.115161,2.0,1.0,0.05,"(2, 2)","{'nb__alpha': 2.0, 'preprocessing__vectorizer_...",0.836936,...,0.789211,0.796803,0.794219,0.003542,1,0.796603,0.799321,0.787134,0.794353,0.005223
5,27.692537,0.563324,9.022271,0.129751,2.0,1.0,0.05,"(3, 3)","{'nb__alpha': 2.0, 'preprocessing__vectorizer_...",0.439131,...,0.287712,0.293706,0.294086,0.005366,13,0.290709,0.297273,0.294276,0.294086,0.002683
6,18.569958,0.383439,8.532561,0.040233,2.0,1.0,0.1,"(2, 2)","{'nb__alpha': 2.0, 'preprocessing__vectorizer_...",0.575223,...,0.458541,0.463936,0.466103,0.007222,7,0.461538,0.470283,0.465788,0.46587,0.00357
7,26.76019,0.586269,9.097304,0.287524,2.0,1.0,0.1,"(3, 3)","{'nb__alpha': 2.0, 'preprocessing__vectorizer_...",0.341401,...,0.206993,0.205994,0.209044,0.003629,19,0.206494,0.210069,0.210568,0.209044,0.001815
8,18.199466,0.128835,8.466021,0.046717,2.5,0.8,0.05,"(2, 2)","{'nb__alpha': 2.5, 'preprocessing__vectorizer_...",0.836936,...,0.789011,0.796603,0.794086,0.003589,3,0.796503,0.799221,0.787134,0.794286,0.005177
9,26.871384,0.310311,8.74067,0.069025,2.5,0.8,0.05,"(3, 3)","{'nb__alpha': 2.5, 'preprocessing__vectorizer_...",0.439131,...,0.287712,0.293706,0.294086,0.005366,13,0.290709,0.297273,0.294276,0.294086,0.002683


In [49]:
results = pd.DataFrame(grid_search.cv_results_)

In [50]:
results.iloc[0]

mean_fit_time                                                                                        18.7487
std_fit_time                                                                                       0.0636397
mean_score_time                                                                                      9.03374
std_score_time                                                                                     0.0812901
param_nb__alpha                                                                                            2
param_preprocessing__vectorizer_title_text__max_df                                                       0.8
param_preprocessing__vectorizer_title_text__min_df                                                      0.05
param_preprocessing__vectorizer_title_text__ngram_range                                               (2, 2)
params                                                     {'nb__alpha': 2.0, 'preprocessing__vectorizer_...
split0_test_f1     

In [None]:
import pickle
# Export pipeline as pickle file
with open("pipeline.pkl", "wb") as file:
    pickle.dump(tuned_pipe, file )