# Importing the data

In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from nltk.tokenize import word_tokenize
import nltk
from nltk.stem import WordNetLemmatizer

true = pd.read_csv('../raw_data/True.csv')
fake = pd.read_csv('../raw_data/Fake.csv')

In [2]:
true.drop(columns = ['subject','date'], inplace = True)
fake.drop(columns = ['subject','date'], inplace = True)

In [3]:
true_copy = true.copy()
fake_copy = fake.copy()

# Removing Getty

In [4]:
stop_words =['/Getty Images']
pat = '|'.join(r"\b{}\b".format(x) for x in stop_words)
true_copy['text'] = true_copy['text'].str.replace(pat, '')

stop_words =['/Getty Images']
pat = '|'.join(r"\b{}\b".format(x) for x in stop_words)
fake_copy['text'] = fake_copy['text'].str.replace(pat, '')

# Feature Engineering

In [5]:
true_copy['title_length_char'] = true_copy.title.str.len()
fake_copy['title_length_char'] = fake_copy.title.str.len()

In [6]:
true_copy['title_Upper'] = true_copy['title'].str.count(r'[A-Z]')

fake_copy['title_Upper'] = fake_copy['title'].str.count(r'[A-Z]')

In [7]:
true_copy['title_lower_ratio'] = true_copy.title_Upper / true_copy.title_length_char
true_copy.drop(columns = 'title_Upper', inplace = True)




In [8]:
fake_copy['title_lower_ratio'] = fake_copy.title_Upper / fake_copy.title_length_char
fake_copy.drop(columns = 'title_Upper', inplace = True)

# Setting the target

In [9]:
true_copy['score'] = 1
fake_copy['score'] = 0

In [10]:
data = pd.concat([true_copy, fake_copy], ignore_index=True)

# Creating text + title

In [11]:
data['title_text'] = data['title'] + data['text']

In [12]:
data.drop(columns = ['title','text'], inplace = True)

In [13]:
data.head()

Unnamed: 0,title_length_char,title_lower_ratio,score,title_text
0,64,0.0625,1,"As U.S. budget fight looms, Republicans flip t..."
1,64,0.0625,1,U.S. military to accept transgender recruits o...
2,60,0.116667,1,Senior U.S. Republican senator: 'Let Mr. Muell...
3,59,0.135593,1,FBI Russia probe helped by Australian diplomat...
4,69,0.057971,1,Trump wants Postal Service to charge 'much mor...


# Preprocessing

In [14]:
def lower_case(text):
    text = text.lower()
    return text
data['title_text'] = data['title_text'].map(lower_case)



In [15]:
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
data['title_text'] = data['title_text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))



In [16]:
import string
punc = string.punctuation + '“' + '”' + '’' + '‘'
def remove_punctuation(text):
    for punctuation in punc:
        text = text.replace(punctuation, '')
    return text


data['title_text'] = data['title_text'].apply(remove_punctuation)



In [17]:
def remove_numbers(text):
    text = ''.join(word for word in text if not word.isdigit())
    return text
data['title_text'] = data['title_text'].map(remove_numbers)



In [18]:
data.head()

Unnamed: 0,title_length_char,title_lower_ratio,score,title_text
0,64,0.0625,1,us budget fight looms republicans flip fiscal ...
1,64,0.0625,1,us military accept transgender recruits monday...
2,60,0.116667,1,senior us republican senator let mr mueller jo...
3,59,0.135593,1,fbi russia probe helped australian diplomat ti...
4,69,0.057971,1,trump wants postal service charge much more am...


In [20]:
data = data[['title_text', 'title_length_char', 'title_lower_ratio', 'score']]
data.head()

Unnamed: 0,title_text,title_length_char,title_lower_ratio,score
0,us budget fight looms republicans flip fiscal ...,64,0.0625,1
1,us military accept transgender recruits monday...,64,0.0625,1
2,senior us republican senator let mr mueller jo...,60,0.116667,1
3,fbi russia probe helped australian diplomat ti...,59,0.135593,1
4,trump wants postal service charge much more am...,69,0.057971,1


In [None]:
from sklearn.feature_extraction.text import HashingVectorizer
corpus = data['title_text']
vectorizer = HashingVectorizer(n_features=2**4)
X = vectorizer.fit_transform(corpus)
print(X.shape)

# Tokenizer

In [None]:
#from nltk.tokenize import word_tokenize
#def tokenize(text):
    #token= word_tokenize(text)
    #return token
#data['title_text'] = data['title_text'].apply(tokenize)

# Lemmatizer

In [None]:
def lemmatize_text(text):
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(w) for w in text]

data['title_text'].apply(lemmatize_text)

# X and Y


In [None]:
x = data.drop(columns = 'score', axis=1)
y = data['score']

# Splitting the dataset

In [None]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,random_state=0,test_size=0.3)

# Initializing the model

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC

preprocessor = ColumnTransformer([
    ('vectorizer_title_text', CountVectorizer(), 'title_text'),
    ('MinMaxScaler', MinMaxScaler(), ['title_length_char'])
])
final_pipe = Pipeline([
    ('preprocessing', preprocessor),
    ('sgdc', SGDC())])



In [None]:
from sklearn.model_selection import GridSearchCV
parameters = {
    'preprocessing__vectorizer_text__max_df' : [0.8,1.0],
    'preprocessing__vectorizer_text__min_df' : [0.05,0.1],
    'sgdc__kernel' : ('rbf', 'poly', 'linear'),
    'sgdc__gamma' : ('scale', 'auto')}
grid_search = GridSearchCV(final_pipe,
                           parameters,
                           scoring = ["f1", "accuracy", "recall", "precision"],
                           refit= “accuracy”,
                           cv=5,
                           verbose = 1)