# Importing the dataset

In [7]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from nltk.tokenize import word_tokenize
import nltk
from nltk.stem import WordNetLemmatizer

true = pd.read_csv('../raw_data/True.csv')
fake = pd.read_csv('../raw_data/Fake.csv')

In [8]:
true.drop(columns = ['subject','date'], inplace = True)
fake.drop(columns = ['subject','date'], inplace = True)

In [9]:
true_copy = true.copy()
fake_copy = fake.copy()

# Removing Getty

In [10]:
stop_words =['/Getty Images']
pat = '|'.join(r"\b{}\b".format(x) for x in stop_words)
true_copy['text'] = true_copy['text'].str.replace(pat, '')

stop_words =['/Getty Images']
pat = '|'.join(r"\b{}\b".format(x) for x in stop_words)
fake_copy['text'] = fake_copy['text'].str.replace(pat, '')

# Feature Engineering

In [11]:
true_copy['title_length_char'] = true_copy.title.str.len()
fake_copy['title_length_char'] = fake_copy.title.str.len()

In [12]:
true_copy['title_Upper'] = true_copy['title'].str.count(r'[A-Z]')

fake_copy['title_Upper'] = fake_copy['title'].str.count(r'[A-Z]')

In [13]:
true_copy['title_lower_ratio'] = true_copy.title_Upper / true_copy.title_length_char
true_copy.drop(columns = 'title_Upper', inplace = True)



In [14]:
fake_copy['title_lower_ratio'] = fake_copy.title_Upper / fake_copy.title_length_char
fake_copy.drop(columns = 'title_Upper', inplace = True)

# Setting the target

In [15]:
true_copy['score'] = 1
fake_copy['score'] = 0

In [16]:
data = pd.concat([true_copy, fake_copy], ignore_index=True)

# Data title + Data text

In [17]:
data['title_text'] = data['title'] + data['text']

In [18]:
data.drop(columns = ['title','text'], inplace = True)


In [19]:
data.head()

Unnamed: 0,title_length_char,title_lower_ratio,score,title_text
0,64,0.0625,1,"As U.S. budget fight looms, Republicans flip t..."
1,64,0.0625,1,U.S. military to accept transgender recruits o...
2,60,0.116667,1,Senior U.S. Republican senator: 'Let Mr. Muell...
3,59,0.135593,1,FBI Russia probe helped by Australian diplomat...
4,69,0.057971,1,Trump wants Postal Service to charge 'much mor...


# Preprocessing

In [20]:
def lower_case(text):
    text = text.lower()
    return text
data['title_text'] = data['title_text'].map(lower_case)


In [21]:
data['title_text'][0]

'as u.s. budget fight looms, republicans flip their fiscal scriptwashington (reuters) - the head of a conservative republican faction in the u.s. congress, who voted this month for a huge expansion of the national debt to pay for tax cuts, called himself a “fiscal conservative” on sunday and urged budget restraint in 2018. in keeping with a sharp pivot under way among republicans, u.s. representative mark meadows, speaking on cbs’ “face the nation,” drew a hard line on federal spending, which lawmakers are bracing to do battle over in january. when they return from the holidays on wednesday, lawmakers will begin trying to pass a federal budget in a fight likely to be linked to other issues, such as immigration policy, even as the november congressional election campaigns approach in which republicans will seek to keep control of congress. president donald trump and his republicans want a big budget increase in military spending, while democrats also want proportional increases for non-

In [22]:
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
data['title_text'] = data['title_text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))


In [23]:
import string
punc = string.punctuation + '“' + '”' + '’' + '‘'
def remove_punctuation(text):
    for punctuation in punc:
        text = text.replace(punctuation, '')
    return text


data['title_text'] = data['title_text'].apply(remove_punctuation)


In [24]:
def remove_numbers(text):
    text = ''.join(word for word in text if not word.isdigit())
    return text
data['title_text'] = data['title_text'].map(remove_numbers)


# Tokenizing

In [25]:
#from nltk.tokenize import word_tokenize
#def tokenize(text):
    #token= word_tokenize(text)
    #return token
#data['title_text'] = data['title_text'].apply(tokenize)

# Lemmatizer

In [26]:
def lemmatize_text(text):
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(w) for w in text]

data['title_text'].apply(lemmatize_text)

0        [u, s,  , b, u, d, g, e, t,  , f, i, g, h, t, ...
1        [u, s,  , m, i, l, i, t, a, r, y,  , a, c, c, ...
2        [s, e, n, i, o, r,  , u, s,  , r, e, p, u, b, ...
3        [f, b, i,  , r, u, s, s, i, a,  , p, r, o, b, ...
4        [t, r, u, m, p,  , w, a, n, t, s,  , p, o, s, ...
5        [w, h, i, t, e,  , h, o, u, s, e,  , c, o, n, ...
6        [t, r, u, m, p,  , s, a, y, s,  , r, u, s, s, ...
7        [f, a, c, t, b, o, x,  , t, r, u, m, p,  , t, ...
8        [t, r, u, m, p,  , t, w, i, t, t, e, r,  , d, ...
9        [a, l, a, b, a, m, a,  , o, f, f, i, c, i, a, ...
10       [j, o, n, e, s,  , c, e, r, t, i, f, i, e, d, ...
11       [n, e, w,  , y, o, r, k,  , g, o, v, e, r, n, ...
12       [f, a, c, t, b, o, x,  , t, r, u, m, p,  , t, ...
13       [t, r, u, m, p,  , t, w, i, t, t, e, r,  , d, ...
14       [m, a, n,  , s, a, y, s,  , d, e, l, i, v, e, ...
15       [v, i, r, g, i, n, i, a,  , o, f, f, i, c, i, ...
16       [u, s,  , l, a, w, m, a, k, e, r, s,  , q, u, .

In [27]:
data.title_text[4]



In [28]:
data.shape

(44898, 4)

In [29]:
x = data.drop(columns = 'score', axis=1)
y = data['score']

In [30]:
data.head()

Unnamed: 0,title_length_char,title_lower_ratio,score,title_text
0,64,0.0625,1,us budget fight looms republicans flip fiscal ...
1,64,0.0625,1,us military accept transgender recruits monday...
2,60,0.116667,1,senior us republican senator let mr mueller jo...
3,59,0.135593,1,fbi russia probe helped australian diplomat ti...
4,69,0.057971,1,trump wants postal service charge much more am...


# Splitting the dataset

In [31]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,random_state=0,test_size=0.3)

# Initializing the model

In [None]:
#from sklearn.pipeline import Pipeline
#from sklearn.preprocessing import MinMaxScaler
#from sklearn.compose import ColumnTransformer
#from sklearn.linear_model import SGDClassifier
#from sklearn.feature_extraction.text import CountVectorizer


#preprocessor = ColumnTransformer([
   # ('vectorizer_title_text', CountVectorizer(), 'title_text'),
    #('MinMaxScaler', MinMaxScaler(), ['title_length_char'])
#])
#final_pipe = Pipeline([
    #('preprocessing', preprocessor),
    #('clf', SGDClassifier())])
    
   

In [32]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC

preprocessor = ColumnTransformer([
    ('vectorizer_title_text', CountVectorizer(), 'title_text'),
    ('MinMaxScaler', MinMaxScaler(), ['title_length_char'])
])
final_pipe = Pipeline([
    ('preprocessing', preprocessor),
    ('svc', SVC())])


In [2]:
#final_pipe.fit(x_train,y_train)

In [3]:
#final_pipe.score(x_test,y_test)

In [33]:
from sklearn.model_selection import GridSearchCV
parameters = {
    'preprocessing__vectorizer_title_text__ngram_range'
grid_search = GridSearchCV(final_pipe,
                           parameters,
                           scoring = ["f1", "accuracy", "recall"],
                           refit= "accuracy",
                           cv=5,
                           verbose = 1)

In [34]:
import time

start = time.time()
grid_search.fit(x_train,y_train)
stop = time.time()
print(f"Training time: {stop - start}s")

Fitting 3 folds for each of 4 candidates, totalling 12 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  return self.partial_fit(X, y)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  return self.partial_fit(X, y)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
[Parallel(n_jobs=1)]: Done  12 out of  12 | elapsed: 402.5min finished
  return self.partial_fit(X, y)


Training time: 25017.920446157455s


# Evaluation

In [35]:
tuned_pipe = grid_search.best_estimator_

In [36]:
tuned_pipe

Pipeline(memory=None,
     steps=[('preprocessing', ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
         transformer_weights=None,
         transformers=[('vectorizer_title_text', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf...,
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])

In [37]:
grid_search.best_params_

{'svc__gamma': 'scale', 'svc__kernel': 'rbf'}

In [38]:
grid_search.score(x_test,y_test)

0.9899777282850779

In [39]:
pd.DataFrame(grid_search.cv_results_)



Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_svc__gamma,param_svc__kernel,params,split0_test_f1,split1_test_f1,split2_test_f1,...,split1_test_recall,split2_test_recall,mean_test_recall,std_test_recall,rank_test_recall,split0_train_recall,split1_train_recall,split2_train_recall,mean_train_recall,std_train_recall
0,430.398347,8.139596,260.213829,16.872329,scale,rbf,"{'svc__gamma': 'scale', 'svc__kernel': 'rbf'}",0.986386,0.988868,0.986461,...,0.985015,0.982617,0.98395,0.000997,1,0.997103,0.996604,0.997003,0.996903,0.000216
1,585.193385,57.060192,450.068181,8.362079,scale,poly,"{'svc__gamma': 'scale', 'svc__kernel': 'poly'}",0.738305,0.719053,0.734582,...,0.57043,0.591409,0.585375,0.010633,3,0.643057,0.643192,0.6364,0.640883,0.003171
2,390.36107,2.584382,551.068369,2.030831,auto,rbf,"{'svc__gamma': 'auto', 'svc__kernel': 'rbf'}",0.871544,0.853392,0.860732,...,0.772827,0.787213,0.786561,0.010956,2,0.798801,0.785836,0.787534,0.790723,0.005754
3,561.625118,18.329634,740.88148,0.355263,auto,poly,"{'svc__gamma': 'auto', 'svc__kernel': 'poly'}",0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,4,0.0,0.0,0.0,0.0,0.0


In [42]:
results = pd.DataFrame(grid_search.cv_results_)