In [3]:
import pandas as pd
true = pd.read_csv('../raw_data/True.csv', nrows = 1000)
fake = pd.read_csv('../raw_data/Fake.csv', nrows = 1000)

In [None]:
true.drop(columns = ['date', 'subject'], inplace = True)
fake.drop(columns = ['date', 'subject'], inplace = True)

In [4]:
true['score'] = 1
fake['score'] = 0

In [6]:
data = pd.concat([true, fake], ignore_index=True)

In [12]:
stop_words = ['/Getty Images']
pat = '|'.join(r"\b{}\b".format(x) for x in stop_words)
data['text'] = data['text'].str.replace(pat, '')
data['text']

0       WASHINGTON (Reuters) - The head of a conservat...
1       WASHINGTON (Reuters) - Transgender people will...
2       WASHINGTON (Reuters) - The special counsel inv...
3       WASHINGTON (Reuters) - Trump campaign adviser ...
4       SEATTLE/WASHINGTON (Reuters) - President Donal...
5       WEST PALM BEACH, Fla./WASHINGTON (Reuters) - T...
6       WEST PALM BEACH, Fla (Reuters) - President Don...
7       The following statements were posted to the ve...
8       The following statements were posted to the ve...
9       WASHINGTON (Reuters) - Alabama Secretary of St...
10      (Reuters) - Alabama officials on Thursday cert...
11      NEW YORK/WASHINGTON (Reuters) - The new U.S. t...
12      The following statements were posted to the ve...
13      The following statements were posted to the ve...
14       (In Dec. 25 story, in second paragraph, corre...
15      (Reuters) - A lottery drawing to settle a tied...
16      WASHINGTON (Reuters) - A Georgian-American bus...
17      The fo

In [24]:
import string

def remove_punctuation(text):
    for punctuation in string.punctuation:
        text = text.replace(punctuation, '')
    return text

data['text'] = data['text'].map(remove_punctuation)
data['title'] = data['title'].map(remove_punctuation)


def lower_case(text):
    text = text.lower()
    return text

data['text'] = data['text'].map(lower_case)
data['title'] = data['title'].map(lower_case)

def remove_numbers(text):
    text = ''.join(word for word in text if not word.isdigit())
    return text

data['text'] = data['text'].map(remove_numbers)
data['title'] = data['title'].map(remove_numbers)

In [54]:
x = data['text']
y = data['score']

In [55]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,random_state=0,test_size=0.3)

In [56]:
print(x_train.shape,x_test.shape,y_train.shape,y_test.shape)

(1400,) (600,) (1400,) (600,)


In [62]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

# Create Pipeline
pipeline = Pipeline([
    ('cvt', CountVectorizer()),
    ('nb', MultinomialNB()),
])

# Set parameters to search
parameters = {
    'cvt__ngram_range': ((1,1),(2,2),(3,3),(4,4)),
    'nb__alpha': (0.1,1,2,5,10),}

# Perform grid search
grid_search = GridSearchCV(pipeline, parameters,
                           scoring = "accuracy",
                           refit=True, cv=5, verbose = 1)

grid_search.fit(x_train,y_train)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:  6.8min finished


GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('cvt', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)), ('nb', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))]),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'cvt__ngram_range': ((1, 1), (2, 2), (3, 3), (4, 4)), 'nb__alpha': (0.1, 1, 2, 5, 10)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=1)

In [63]:
grid_search.best_score_

0.9842857142857143

In [66]:
y_pred = grid_search.predict(x_test)

In [75]:
y_test = list(y_test)
y_test

[1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,


In [76]:
y_test = list(y_test)
y_test
same = 0 
index = 0
for pred in y_pred:
    if pred == y_test[index]:
        same += 1
    index +=1
same

593

In [77]:
len(y_test)

600

In [78]:
593/600

0.9883333333333333

In [None]:
X_test
grid_search.predict()

In [81]:
grid_search.predict('Fake News, how are you today')

ValueError: Iterable over raw text documents expected, string object received.

In [83]:
grid_search.best_params_

{'cvt__ngram_range': (2, 2), 'nb__alpha': 10}

In [100]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(ngram_range= (2, 2))

x = vectorizer.fit_transform(x)

from sklearn.model_selection import train_test_split

x_train,x_test,y_train,y_test=train_test_split(x,y,random_state=0,test_size=0.3)

In [102]:
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB(alpha = 10)

model.fit(x_train,y_train)

MultinomialNB(alpha=10, class_prior=None, fit_prior=True)

In [103]:
model.score(x_test,y_test)

0.9883333333333333

In [107]:
model.predict(a)

ValueError: Expected 2D array, got scalar array instead:
array=how are you uisnsn x shsbsah  s DHhDA.
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.