# Importing Data

In [10]:
import pandas as pd
true = pd.read_csv('../raw_data/True.csv', nrows = 1000)
fake = pd.read_csv('../raw_data/Fake.csv', nrows = 1000)

# Dropping rows not needed

In [11]:
true.drop(columns = ['date', 'subject'], inplace = True)
fake.drop(columns = ['date', 'subject'], inplace = True)

# Creating target

In [12]:
true['score'] = 1
fake['score'] = 0

# Creating onde dataset

In [13]:
data = pd.concat([true, fake], ignore_index=True)

# Data Cleaning

In [14]:
stop_words = ['/Getty Images']
pat = '|'.join(r"\b{}\b".format(x) for x in stop_words)
data['text'] = data['text'].str.replace(pat, '')
data['text']

0       WASHINGTON (Reuters) - The head of a conservat...
1       WASHINGTON (Reuters) - Transgender people will...
2       WASHINGTON (Reuters) - The special counsel inv...
3       WASHINGTON (Reuters) - Trump campaign adviser ...
4       SEATTLE/WASHINGTON (Reuters) - President Donal...
5       WEST PALM BEACH, Fla./WASHINGTON (Reuters) - T...
6       WEST PALM BEACH, Fla (Reuters) - President Don...
7       The following statements were posted to the ve...
8       The following statements were posted to the ve...
9       WASHINGTON (Reuters) - Alabama Secretary of St...
10      (Reuters) - Alabama officials on Thursday cert...
11      NEW YORK/WASHINGTON (Reuters) - The new U.S. t...
12      The following statements were posted to the ve...
13      The following statements were posted to the ve...
14       (In Dec. 25 story, in second paragraph, corre...
15      (Reuters) - A lottery drawing to settle a tied...
16      WASHINGTON (Reuters) - A Georgian-American bus...
17      The fo

In [15]:
import string

def remove_punctuation(text):
    for punctuation in string.punctuation:
        text = text.replace(punctuation, '')
    return text

data['text'] = data['text'].map(remove_punctuation)
data['title'] = data['title'].map(remove_punctuation)


def lower_case(text):
    text = text.lower()
    return text

data['text'] = data['text'].map(lower_case)
data['title'] = data['title'].map(lower_case)

def remove_numbers(text):
    text = ''.join(word for word in text if not word.isdigit())
    return text

data['text'] = data['text'].map(remove_numbers)
data['title'] = data['title'].map(remove_numbers)

# Grid Search for best params

In [16]:
x = data['text']
y = data['score']

In [17]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,random_state=0,test_size=0.3)

In [18]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

# Create Pipeline
pipeline = Pipeline([
    ('cvt', CountVectorizer()),
    ('nb', MultinomialNB()),
])

# Set parameters to search
parameters = {
    'cvt__ngram_range': ((1,1),(2,2),(3,3),(4,4)),
    'nb__alpha': (0.1,1,2,5,10),}

# Perform grid search
grid_search = GridSearchCV(pipeline, parameters,
                           scoring = "accuracy",
                           refit=True, cv=5, verbose = 1)

grid_search.fit(x_train,y_train)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


KeyboardInterrupt: 

# checking score

In [None]:
grid_search.best_score_

In [None]:
y_pred = grid_search.predict(x_test)

In [None]:
y_test = list(y_test)
y_test

In [None]:
y_test = list(y_test)
y_test
same = 0 
index = 0
for pred in y_pred:
    if pred == y_test[index]:
        same += 1
    index +=1
same

In [None]:
len(y_test)

In [1]:
593/600

0.9883333333333333

In [None]:
X_test
grid_search.predict()

In [None]:
grid_search.best_params_

# Creating a model based on best params

In [19]:
from sklearn.model_selection import train_test_split

x_train,x_test,y_train,y_test=train_test_split(x,y,random_state=0,test_size=0.3)

In [20]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(ngram_range= (2, 2))

x_train_bow = vectorizer.fit_transform(x_train)

In [21]:
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB(alpha = 10)

model.fit(x_train_bow,y_train)

MultinomialNB(alpha=10, class_prior=None, fit_prior=True)

In [22]:
x_test_bow = vectorizer.transform(x_test)

In [23]:
model.score(x_test_bow,y_test)

0.9883333333333333

# Doing some tests

In [31]:
test = 'kjaxnaxsa x saj j as xa s'
test = pd.Series(test)

In [32]:
test_T =  vectorizer.transform(test)

In [33]:
model.predict(test)

ValueError: could not convert string to float: 'hello howa red today'