# Import stuff

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import re,string
import tensorflow as tf
from string import punctuation
from nltk.corpus import stopwords
import enchant

from nltk.tokenize import word_tokenize
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
from nltk.corpus import stopwords

# Importing Data

In [2]:
true = pd.read_csv('../raw_data/True.csv')
fake = pd.read_csv('../raw_data/Fake.csv')

## Deleting non necessary columns

In [3]:
true.drop(columns = ['subject','date'], inplace = True)
fake.drop(columns = ['subject','date'], inplace = True)

## Creating Target

In [4]:
true['score'] = 1
fake['score'] = 0

## Concatenate everything to one dataset

In [5]:
data = pd.concat([true,fake],ignore_index=True)

# Cleaning some data

## Removing punctuation 

In [6]:
import string

punc = string.punctuation + '“' + '”' + '’' + '‘'
def remove_punctuation(text):
    for punctuation in punc:
        text = text.replace(punctuation, '')
    return text

In [7]:
data['title_clean']=data['title'].apply(remove_punctuation)
data['text_clean']=data['text'].apply(remove_punctuation)

## Removing Numbers

In [8]:
def remove_numbers(text):
    text = ''.join(word for word in text if not word.isdigit())
    return text

In [9]:
data['title_clean']=data['title_clean'].apply(remove_numbers)

data['text_clean']=data['text_clean'].apply(remove_numbers)

## Making everything lower case

In [10]:
def lower_case(text):
    text = text.lower()
    return text

In [11]:
data['title_clean']=data['title_clean'].apply(lower_case)
data['text_clean']=data['text_clean'].apply(lower_case)

## Removing Stop Words

In [12]:
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

data['title_clean'] = data['title_clean'].apply(lambda x: ' '.join([word for word in x.split() if word.lower() not in (stop_words)]))
data['text_clean'] = data['text_clean'].apply(lambda x: ' '.join([word for word in x.split() if word.lower() not in (stop_words)]))

## tokenizing text in order to count how many words we have to calculate ratio

In [13]:
def tokenize_text(text):
    return text.split()

data['title_tokens']=data['title_clean'].apply(tokenize_text)
data['text_tokens']=data['text_clean'].apply(tokenize_text)

## Function to count typos in text

In [240]:
english = enchant.DictWithPWL("en_US", "vocab.txt")
wrong_words={}
correct_words=set()
def get_typos_t(tokens):
     wrong_count=0
     for token in tokens:
            if token in wrong_words:
                wrong_words[token]+=1
                wrong_count+=1
            else:
                if not token in correct_words:    
                    if not english.check(token) and not english.check(token.capitalize()):
                        wrong_words[token]=1
                        wrong_count+=1
                    else:
                        correct_words.add(token)
     return wrong_count       

# Create typo ratio

In [15]:
data['typos_title_count']=data['title_tokens'].apply(get_typos_t)
data['typos_text_count']=data['text_tokens'].apply(get_typos_t)

In [16]:
data['title_typo_ratio']= data['typos_title_count']/len(data['title_tokens'])
data['text_typo_ratio']= data['typos_text_count']/len(data['text_tokens'])

In [17]:
data = data[['title', 'text', 'title_clean', 'text_clean', 'title_tokens',
       'text_tokens', 'typos_title_count', 'typos_text_count',
       'title_typo_ratio', 'text_typo_ratio','score']]

# Define Generic Function

def preparation(data):
    data['title_clean']=data['title'].apply(remove_numbers)
    data['text_clean']=data['text'].apply(remove_numbers)
    
    data['title_clean']=data['title_clean'].apply(remove_punctuation)
    data['text_clean']=data['text_clean'].apply(remove_punctuation)
    
    data['title_clean'] = data['title_clean'].apply(lambda x: ' '.join([word for word in x.split() if word.lower() not in (stop_words)]))
    data['text_clean'] = data['text_clean'].apply(lambda x: ' '.join([word for word in x.split() if word.lower() not in (stop_words)]))
    
    data['title_tokens']=data['title_clean'].apply(tokenize_text)
    data['text_tokens']=data['text_clean'].apply(tokenize_text)
    
    data['title_token_count']=data['title_tokens'].apply(lambda tokens:len(tokens))
    data['text_token_count']=data['text_tokens'].apply(lambda tokens:len(tokens))
    
    data['wrong_title_token_count']=data['title_tokens'].apply(get_typos_t)
    data['wrong_text_token_count']=data['text_tokens'].apply(get_typos_t)
    
    data['title_typo_ratio']= data['wrong_title_token_count']/data['title_token_count']
    data['text_typo_ratio']= data['wrong_text_token_count']/data['text_token_count']
    
    return data[['title_clean', 'text_clean','text_typo_ratio']]
    

# Time to run a cross_val and find the best params for the model

## separating the data

In [18]:
x = data[['title_clean', 'text_clean','text_typo_ratio']]
y=data['score']

## Splitting the data in two : Train/Test

In [19]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,random_state=0,test_size=0.3)

In [20]:
x_test

Unnamed: 0,title_clean,text_clean,text_typo_ratio
35305,rubio sides democrats giving whopping billion ...,democrats want spend whopping billion zika vir...,0.000290
29180,ted cruz says endorsing trump grave mistake pr...,republican presidential candidate texas senato...,0.000423
29805,responsible gun owner road rages threatens sho...,national rifle association tells us need ensur...,0.000290
38237,teachers union protest gets ugly protesters co...,case live midwest might noticed fight continue...,0.000379
5099,etihad advises checks us missions new trump order,dubai reuters etihad airways advising passenge...,0.000111
5812,factbox trump us supreme court pick could affe...,reuters president donald trumps nominee us sup...,0.000200
2747,trump asks supreme court block travel ban ruling,washington reuters us justice department frida...,0.000178
34681,breaking bombshell dem congressmen phone numbe...,wow keeps getting better better happened bette...,0.000512
12157,security tight germany marks anniversary chris...,berlin reuters germany tightened security chri...,0.000245
31537,altleft fake news media refuses tell video,media trying best control see see comes altlef...,0.000334


## Creating a pipeline to vectorize the text/title

In [25]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
preprocessor = ColumnTransformer([
    ('vectorizer_title', CountVectorizer(), 'title_clean'),
    ('vectorizer_text', CountVectorizer(), 'text_clean'),

    #insert function here
    
    
])
final_pipe = Pipeline([
    ('preprocessing', preprocessor),
    ('nb', MultinomialNB())])

# Grid searching for best params 

In [None]:
#from sklearn.pipeline import Pipeline
#from sklearn.preprocessing import MinMaxScaler
#from sklearn.compose import ColumnTransformer
#from sklearn.naive_bayes import MultinomialNB
#from sklearn.feature_extraction.text import CountVectorizer
#preprocessor = ColumnTransformer([
    #('vectorizer_title', CountVectorizer(), 'title'),
    #('vectorizer_text', CountVectorizer(), 'text')])
#final_pipe = Pipeline([
    #('preprocessing', preprocessor),
    #('nb', MultinomialNB())])

In [30]:
from sklearn.model_selection import GridSearchCV
parameters = {
    'nb__alpha': (1,2,5,10)}
grid_search = GridSearchCV(final_pipe,
                           parameters,
                           scoring = ["f1", "accuracy", "recall"],
                           refit= "accuracy",
                           cv=3,
                           verbose = 1)

In [31]:
import time
start = time.time()
grid_search.fit(x_train,y_train)
stop = time.time()

Fitting 3 folds for each of 4 candidates, totalling 12 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  12 out of  12 | elapsed:  5.0min finished


In [32]:
grid_search.best_params_

{'nb__alpha': 1}

# Training model on best params found in grid_search

In [34]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
preprocessor = ColumnTransformer([
    ('vectorizer_title', CountVectorizer(), 'title_clean'),
    ('vectorizer_text', CountVectorizer(), 'text_clean'),

    #insert function here
    
    
])
final_pipe = Pipeline([
    ('preprocessing', preprocessor),
    ('nb', MultinomialNB())])

In [35]:
final_pipe.fit(x_train,y_train)

Pipeline(memory=None,
     steps=[('preprocessing', ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
         transformer_weights=None,
         transformers=[('vectorizer_title', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', ...abulary=None), 'text_clean')])), ('nb', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [36]:
final_pipe.score(x_test,y_test)

0.9656273199703044

# Testing our model on some new data

In [220]:
import trafilatura
downloaded = trafilatura.fetch_url('https://enduringvision.com/news/us_071310.php')
text_test = trafilatura.extract(downloaded)

In [221]:
#text_test

In [222]:
title_test="CONSERVATIONISTS FEAR DWINDLING PARK SPACE REDUCES PLACES KIDS CAN SAFELY GET HIGH"

In [223]:
#text_test_2 = 

In [224]:
#title_text_2 = 

## Creating a function that cleans the test text

In [225]:
import string

punc = string.punctuation + '“' + '”' + '’' + '‘' +'—' +'€' + '❤'

from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

def clean_text(text):
    for punctuation in punc:
        text = text.replace(punctuation, '')
    text = ''.join(word for word in text if not word.isdigit())
    text = text.lower()
    text_split = text.split()
    for word in text_split:
        if word in stop_words:
            text_split.remove(word)
    text_string = ''
    for word in text_split:
        text_string = text_string + ' ' + word
    return text_string

In [226]:
text_clean = clean_text(text_test)
title_clean = clean_text(title_test)

In [227]:
text_clean

' conservationists fear dwindling park space reduces places kids safely get high letter obama administration published today naturalists conservationists expressed growing concern chronic reduction nations parks open spaces limiting safe areas americas children go get stoned decade nations youth could count darkened baseball diamond an abandoned picnic shelter they wanted smoke bowl safely without worrying crashing moving car getting caught dad walking basement read letter published latest issue high times foxnewscom havens disappearing thanks budget crises pushing children dark dangerous forests even innercity neighborhoods few tokes authors letter including michael brune executive director the sierra club argue local state governments mortgaging childrens futures save few bucks wall street elitists got us this financial mess theyre still rich kids continue have the means get high peace said brune theyre taking advantage wealthy parents lengthy vacations away home their kickass backya

In [228]:
title_clean

' conservationists fear dwindling park space reduces places kids safely get high'

## Creating a general function to calculate typo ratio

In [229]:
def typo_ratio(text):
    text_split = text.split()
    lenght = len(text_split)
    typos = get_typos_t(text)
    print()

In [230]:
typo_ratio(text_clean)




## Transforming the data as an input that the model accepts

In [231]:
test_series=pd.Series(text_clean)
title_series=pd.Series(title_clean)

In [232]:
test_df=pd.DataFrame({'title_clean':title_series,'text_clean':test_series, 'text_typo_ratio': get_typos_t(text_clean)})

In [233]:
final_pipe.predict(test_df)

array([0])

# Testing the model on a new dataset made up only of fake news

In [None]:
fake_test = pd.read_csv('../raw_data/fake_extra.csv')

In [None]:
fake_test = fake_test.replace(np.nan, '', regex=True)

In [None]:
result=final_pipe.predict(preparation(fake_test))

In [None]:
unique, counts = np.unique(result, return_counts=True)
dict(zip(unique, counts))

# Trying a logistic regression with uncleaned data

# Trying a logistic regression with more features

In [234]:
text_test

'Conservationists Fear Dwindling Park Space Reduces Places Kids Can Safely Get High\nIn a letter to the Obama administration published today, naturalists and conservationists expressed a growing concern that chronic reduction of the nation\'s parks and open spaces is limiting the safe areas that America\'s children have to go and get stoned.\n"For decade, our nation\'s youth could count on a darkened baseball diamond or an abandoned picnic shelter when they wanted to smoke a bowl safely, without worrying about crashing a moving car or getting caught by Dad walking into the basement," read the letter, published in the latest issue of High Times and on FoxNews.com.\n"Now, with these havens disappearing thanks to budget crises, we\'re pushing our children further into dark, dangerous forests -- or even inner-city neighborhoods -- for just a few tokes."\nThe authors of the letter, including Michael Brune, executive director of the Sierra Club, argue that local and state governments are "mo

In [241]:
get_typos_t('I my teh is curreelty living in Italy')

0

In [245]:
text_2 = 'I my teh is curreelty living in Italy'

In [246]:
def tokenize_text(text_2):
    return text.split()

In [247]:
text_2

'I my teh is curreelty living in Italy'

In [248]:
get_typos_t(text_2)

0

In [250]:
get_typos_t('I my teh is cuffrreelty living in Italy'.split())

2