# Importing data

In [1]:
import pandas as pd
# Load the two raw CSV files (real and fake news) into separate frames.
true = pd.read_csv(filepath_or_buffer='../raw_data/True.csv')
fake = pd.read_csv(filepath_or_buffer='../raw_data/Fake.csv')

# Dropping some columns 

In [2]:
# Remove the 'date' and 'subject' columns from both raw frames (mutates them in place).
true.drop(['date', 'subject'], axis='columns', inplace=True)
fake.drop(['date', 'subject'], axis='columns', inplace=True)

# Creating copies of the datasets so we don't modify the originals

In [38]:
# Work on independent deep copies so later edits do not touch `true` / `fake`.
true_copy, fake_copy = true.copy(deep=True), fake.copy(deep=True)

# Brief Data Cleaning

In [4]:
# Strip photo-credit residue from the article bodies.
stop_words = ['/Getty Images']
# Each entry is wrapped in word-boundary anchors; note the leading \b only matches
# when the '/' directly follows a word character (e.g. "Name/Getty Images").
pat = '|'.join(r"\b{}\b".format(x) for x in stop_words)
# regex=True must be explicit: since pandas 2.0 `str.replace` treats the pattern as a
# literal string by default, in which case the \b anchors never match and nothing is removed.
fake_copy['text'] = fake_copy['text'].str.replace(pat, '', regex=True)
true_copy['text'] = true_copy['text'].str.replace(pat, '', regex=True)

# Adding title features (length and uppercase ratio)

## Length Feature

In [10]:
# Number of characters in each real-news title.
true_copy['title_length_char'] = true_copy['title'].str.len()

In [11]:
# Number of characters in each fake-news title.
fake_copy['title_length_char'] = fake_copy['title'].str.len()

## Uppercase Ratio

In [7]:
# Temporary column: how many ASCII uppercase letters (A-Z) each title contains.
true_copy['title_Upper'] = true_copy['title'].str.count(pat=r'[A-Z]')
fake_copy['title_Upper'] = fake_copy['title'].str.count(pat=r'[A-Z]')

In [8]:
# Share of uppercase letters in each real-news title.
# NOTE: despite its name, 'title_lower_ratio' stores uppercase count / title length.
true_copy['title_lower_ratio'] = true_copy['title_Upper'].div(true_copy['title_length_char'])
# The raw count was only needed to build the ratio.
true_copy.drop(columns=['title_Upper'], inplace=True)

In [9]:
# Share of uppercase letters in each fake-news title.
# NOTE: despite its name, 'title_lower_ratio' stores uppercase count / title length.
fake_copy['title_lower_ratio'] = fake_copy['title_Upper'].div(fake_copy['title_length_char'])
# The raw count was only needed to build the ratio.
fake_copy.drop(columns=['title_Upper'], inplace=True)

# Starting a model

## First, create the target and concatenate the datasets

In [12]:
# Binary target column: rows of true_copy get 1, rows of fake_copy get 0.
true_copy['score'], fake_copy['score'] = 1, 0

In [13]:
# Stack both labelled frames into one dataset with a fresh 0..n-1 index.
data = pd.concat(objs=[true_copy, fake_copy], axis=0, ignore_index=True)

In [14]:
# Preview the first five rows of the combined dataset.
data.head(n=5)

Unnamed: 0,title,text,title_length_char,title_lower_ratio,score
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,64,0.0625,1
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,64,0.0625,1
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,60,0.116667,1
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,59,0.135593,1
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,69,0.057971,1


## Let's initialize a model

In [15]:
# Separate the target ('score') from the feature columns.
y = data['score']
x = data.drop(columns=['score'])

In [16]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=0,
)

In [20]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer


# Column-wise preprocessing: TF-IDF on the two text columns and min-max scaling on the
# numeric title length. A single column name (string) hands the vectorizer a 1-D series
# of documents; the list form hands the scaler a 2-D frame.
# NOTE(review): 'title_lower_ratio' is not listed here, so with ColumnTransformer's
# default remainder='drop' it is not used by the model — confirm this is intended.
preprocessor = ColumnTransformer([
    ('vectorizer_title', TfidfVectorizer(), 'title'),
    # Must be an instance, not the class itself: the transformer is cloned/fitted and
    # its parameters are set through the grid search.
    ('vectorizer_text', TfidfVectorizer(), 'text'),
    ('MinMaxScaler', MinMaxScaler(), ['title_length_char'])
])

# Full model: preprocessing followed by a multinomial naive Bayes classifier.
final_pipe = Pipeline([
    ('preprocessing', preprocessor),
    ('nb', MultinomialNB())])

In [36]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid; keys follow the <pipeline step>__<transformer name>__<param> convention.
parameters = {
    'preprocessing__vectorizer_title__ngram_range': ((2,2),(3,3)),
    'preprocessing__vectorizer_text__ngram_range': ((2,2),(3,3)),
    # max_df must be a float to mean "proportion of documents". An integer 1 would mean
    # "at most one document", which conflicts with the fractional min_df values below.
    'preprocessing__vectorizer_text__max_df' : ([0.5,1.0]),
    # NOTE(review): min_df of 0.2/0.4 keeps only n-grams present in >= 20%/40% of the
    # documents; with bi-/tri-grams this may prune most of the vocabulary — verify.
    'preprocessing__vectorizer_text__min_df' : ([0.2,0.4]),
    'nb__alpha': (2,3,4,5)}

# Exhaustive search with 3-fold CV. Several metrics are recorded, and the best
# configuration by accuracy is refit on the full training data.
grid_search = GridSearchCV(final_pipe,
                           parameters,
                           scoring = ["f1", "accuracy", "recall"], 
                           refit= "accuracy",
                           cv=3,
                           verbose = 0)

In [37]:
import time
# Record wall-clock timestamps before and after the search so its duration can be derived.
start = time.time()
grid_search.fit(X=x_train, y=y_train)
stop = time.time()

Fitting 5 folds for each of 64 candidates, totalling 320 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
  return self.partial_fit(X, y)


KeyboardInterrupt: 

In [None]:
# Best pipeline found by the grid search, refit on the full training set.
# (The search object in this notebook is `grid_search`; `random_search` is never defined.)
tuned_pipe = grid_search.best_estimator_

In [None]:
# Display the parameter combination selected by the grid search.
grid_search.best_params_

In [None]:
# Tabulate the per-candidate cross-validation results.
pd.DataFrame(data=grid_search.cv_results_)

In [None]:
import pickle

# Serialize the tuned pipeline to 'pipeline.pkl' in the current working directory.
with open("pipeline.pkl", mode="wb") as file:
    pickle.dump(tuned_pipe, file)