# Importing modules

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import re,string
import tensorflow as tf
import enchant
import trafilatura

from string import punctuation
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
from nltk.corpus import stopwords

# Importing Data

In [2]:
true = pd.read_csv('../raw_data/True.csv')
fake = pd.read_csv('../raw_data/Fake.csv')

## Deleting non necessary columns

In [3]:
true.drop(columns = ['subject','date'], inplace = True)
fake.drop(columns = ['subject','date'], inplace = True)

## Brief data cleaning to fake dataset

In [4]:
stop_words = ['/Getty Images']
pat = '|'.join(r"\b{}\b".format(x) for x in stop_words)
fake['text'] = fake['text'].str.replace(pat, '')

## Creating Target

In [5]:
true['score'] = 1
fake['score'] = 0

## Concatenate everything to one dataset

In [6]:
data = pd.concat([true,fake],ignore_index=True)

# Cleaning some data

## Removing links from text

In [7]:
import re
def rem_urls(text):
    return re.sub('https?:\S+','',text)

In [8]:
data['title_clean']=data['title'].apply(rem_urls)
data['text_clean']=data['text'].apply(rem_urls)

## Removing punctuation 

In [9]:
import string

punc_no_sq = '!“#$%&\()*+,./:;<=>?@[\\]^_`{|}~“”—’-'

def remove_punctuation(text):
    for punctuation in punc_no_sq:
        text = text.replace(punctuation, '')
    return text

In [10]:
data['title_clean']=data['title_clean'].apply(remove_punctuation)
data['text_clean']=data['text_clean'].apply(remove_punctuation)

## Removing Numbers

In [11]:
def remove_numbers(text):
    text = ''.join(word for word in text if not word.isdigit())
    return text

In [12]:
data['title_clean']=data['title_clean'].apply(remove_numbers)

data['text_clean']=data['text_clean'].apply(remove_numbers)

## Making everything lower case

In [13]:
def lower_case(text):
    text = text.lower()
    return text

In [14]:
data['title_clean']=data['title_clean'].apply(lower_case)
data['text_clean']=data['text_clean'].apply(lower_case)

## Removing Stop Words

In [15]:
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

data['title_clean'] = data['title_clean'].apply(lambda x: ' '.join([word for word in x.split() if word.lower() not in (stop_words)]))
data['text_clean'] = data['text_clean'].apply(lambda x: ' '.join([word for word in x.split() if word.lower() not in (stop_words)]))

## tokenizing text in order to count how many words we have to calculate ratio

In [16]:
def tokenize_text(text):
    return text.split()

data['title_tokens']=data['title_clean'].apply(tokenize_text)
data['text_tokens']=data['text_clean'].apply(tokenize_text)

## Function to count typos in text

In [17]:
english = enchant.DictWithPWL("en_US", "vocab.txt")
wrong_words={}
correct_words=set()
def get_typos_t(tokens):
     wrong_count=0
     for token in tokens:
            if token in wrong_words:
                wrong_words[token]+=1
                wrong_count+=1
            else:
                if not token in correct_words:
                    if token[0].islower() and not '-' in token and not english.check(token) and not english.check(token.capitalize()):
                        wrong_words[token]=1
                        wrong_count+=1
                    else:
                        correct_words.add(token)
     return wrong_count    

# Create typo ratio

In [18]:
data['typos_title_count']=data['title_tokens'].apply(get_typos_t)
data['typos_text_count']=data['text_tokens'].apply(get_typos_t)

In [19]:
data['title_typo_ratio']= data['typos_title_count']/len(data['title_tokens'])
data['text_typo_ratio']= data['typos_text_count']/len(data['text_tokens'])

# Ordering the dataset

In [20]:
data = data[['title', 'text', 'title_clean', 'text_clean', 'title_tokens',
       'text_tokens', 'typos_title_count', 'typos_text_count',
       'title_typo_ratio', 'text_typo_ratio','score']]

# Define Generic Function

def preparation(data):
    data['title_clean']=data['title'].apply(remove_numbers)
    data['text_clean']=data['text'].apply(remove_numbers)
    
    data['title_clean']=data['title_clean'].apply(remove_punctuation)
    data['text_clean']=data['text_clean'].apply(remove_punctuation)
    
    data['title_clean'] = data['title_clean'].apply(lambda x: ' '.join([word for word in x.split() if word.lower() not in (stop_words)]))
    data['text_clean'] = data['text_clean'].apply(lambda x: ' '.join([word for word in x.split() if word.lower() not in (stop_words)]))
    
    data['title_tokens']=data['title_clean'].apply(tokenize_text)
    data['text_tokens']=data['text_clean'].apply(tokenize_text)
    
    data['title_token_count']=data['title_tokens'].apply(lambda tokens:len(tokens))
    data['text_token_count']=data['text_tokens'].apply(lambda tokens:len(tokens))
    
    data['wrong_title_token_count']=data['title_tokens'].apply(get_typos_t)
    data['wrong_text_token_count']=data['text_tokens'].apply(get_typos_t)
    
    data['title_typo_ratio']= data['wrong_title_token_count']/data['title_token_count']
    data['text_typo_ratio']= data['wrong_text_token_count']/data['text_token_count']
    
    return data[['title_clean', 'text_clean','text_typo_ratio']]
    

# Time to run a cross_val and find the best params for the model

## separating the data

In [21]:
x = data[['title_clean', 'text_clean','text_typo_ratio']]
y=data['score']

## Splitting the data in two : Train/Test

In [22]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,random_state=0,test_size=0.3)

## Creating a pipeline to vectorize the text/title

In [23]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
preprocessor = ColumnTransformer([
    
    ('vectorizer_title', CountVectorizer(), 'title_clean'),
    ('vectorizer_text', CountVectorizer(), 'text_clean'),

    #insert function here
    
])

final_pipe = Pipeline([
    ('preprocessing', preprocessor),
    ('Logistic', LogisticRegression())])

# Grid searching for best params 

In [24]:
from sklearn.model_selection import GridSearchCV

parameters = {
    
    'Logistic__solver': ('newton-cg', 'lbfgs', 'sag'),
    'Logistic__C': ([0.2, 0.4, 0.6, 0.8, 1.0])

}

grid_search = GridSearchCV(final_pipe,
                           parameters,
                           scoring = ["f1", "accuracy", "recall"], 
                           refit= "accuracy",
                           cv=3,
                           verbose = 0)

In [43]:
import time
start = time.time()
grid_search.fit(x_train,y_train)
stop = time.time()



In [45]:
grid_search.best_params_

{'Logistic__C': 0.4, 'Logistic__solver': 'newton-cg'}

**ok so now that we have best params lets train a model on them**

# Training model on best params found in grid_search with reuters in text

In [25]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

preprocessor = ColumnTransformer([
    
    ('vectorizer_title', CountVectorizer(), 'title_clean'),
    ('vectorizer_text', CountVectorizer(), 'text_clean'),

    #insert function here
])

final_pipe = Pipeline([
    ('preprocessing', preprocessor),
    ('Logistic', LogisticRegression(solver = 'newton-cg', C =0.4 ))])

In [26]:
final_pipe.fit(x_train,y_train)

Pipeline(memory=None,
     steps=[('preprocessing', ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
         transformer_weights=None,
         transformers=[('vectorizer_title', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', ...ty='l2', random_state=None, solver='newton-cg',
          tol=0.0001, verbose=0, warm_start=False))])

In [27]:
final_pipe.score(x_test,y_test)

0.9973273942093541

In [28]:
# this final pipe is fitted with training data cointaing reuters in the text

# Testing our model on some new data

## Creating a function that cleans the test text

In [29]:
import trafilatura
downloaded = trafilatura.fetch_url('https://www.rte.ie/news/ireland/2020/1121/1179617-zoo-funding/')

In [30]:
text_test = trafilatura.extract(downloaded)
title_test='€1.6m in funding secured to support zoo sector'

In [31]:
import string

punc = string.punctuation + '“' + '”' + '’' + '‘' +'—' +'€' + '❤'

from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

def clean_text(text):
    for punctuation in punc:
        text = text.replace(punctuation, '')
    text = ''.join(word for word in text if not word.isdigit())
    text = text.lower()
    text_split = text.split()
    for word in text_split:
        if word in stop_words:
            text_split.remove(word)
    text_string = ''
    for word in text_split:
        text_string = text_string + ' ' + word
    return text_string

In [32]:
text_clean = clean_text(text_test)
title_clean = clean_text(title_test)

## Creating a general function to calculate typo ratio

In [33]:
def typo_ratio(text):
    text_split = text.split()
    lenght = len(text_split)
    typos = get_typos_t(text_split)
    return typos/lenght

## Transforming the data as an input that the model accepts

In [34]:
test_series=pd.Series(text_clean)
title_series=pd.Series(title_clean)

In [35]:
test_df=pd.DataFrame({'title_clean':title_series,'text_clean':test_series, 'text_typo_ratio': typo_ratio(text_clean)})

In [36]:
final_pipe.predict(test_df)

array([0], dtype=int64)

In [37]:
# this is how we use the model to predict on one specific example(this pipe is fitted with reuters text)

# Trying a logistic regression with real news not having reuters in its text

## Lets create a new column in data that will have the same text/title without reuters

In [38]:
stop_words = ['reuters']
pat = '|'.join(r"\b{}\b".format(x) for x in stop_words)
data['text_clean_without_reuters'] = data['text_clean'].str.replace(pat, '')

In [39]:
x_2 = data[['title_clean', 'text_clean_without_reuters','text_typo_ratio']]
y_2 = data['score']

In [40]:
from sklearn.model_selection import train_test_split
x_train_2,x_test_2,y_train_2,y_test_2=train_test_split(x_2,y_2,random_state=0,test_size=0.3)

In [41]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

preprocessor_2 = ColumnTransformer([
    
    ('vectorizer_title', CountVectorizer(), 'title_clean'),
    ('vectorizer_text', CountVectorizer(), 'text_clean_without_reuters'),

    #insert function here
    
])

final_pipe_2 = Pipeline([
    ('preprocessing', preprocessor_2),
    ('Logistic', LogisticRegression())])

In [42]:
from sklearn.model_selection import GridSearchCV

parameters = {
    
    'Logistic__solver': ('newton-cg', 'lbfgs'), #didnt try sag because it wasnt converging
    'Logistic__C': ([0.2, 0.4, 0.6, 0.8, 1.0])

}

grid_search_2 = GridSearchCV(final_pipe_2,
                           parameters,
                           scoring = ["f1", "accuracy", "recall"], 
                           refit= "accuracy",
                           cv=3,
                           verbose = 0)

In [41]:
grid_search_2.fit(x_train_2,y_train_2)



GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('preprocessing', ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
         transformer_weights=None,
         transformers=[('vectorizer_title', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', ...penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False))]),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'Logistic__solver': ('newton-cg', 'lbfgs'), 'Logistic__C': [0.2, 0.4, 0.6, 0.8, 1.0]},
       pre_dispatch='2*n_jobs', refit='accuracy',
       return_train_score='warn', scoring=['f1', 'accuracy', 'recall'],
       verbose=0)

In [75]:
grid_search_2.best_score_

0.9887679775995927

In [45]:
grid_search_2.best_params_

{'Logistic__C': 0.4, 'Logistic__solver': 'newton-cg'}

In [36]:
#ok so even without reuters in the text the best params remain the same

## Training  a model on best params found

In [43]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

preprocessor_2 = ColumnTransformer([
    
    ('vectorizer_title', CountVectorizer(), 'title_clean'),
    ('vectorizer_text', CountVectorizer(), 'text_clean_without_reuters'),

    #insert function here  
    
])

final_pipe_2 = Pipeline([
    ('preprocessing', preprocessor_2),
    ('Logistic', LogisticRegression(solver = 'newton-cg', C =0.4 ))])

In [44]:
final_pipe_2.fit(x_train_2,y_train_2)

Pipeline(memory=None,
     steps=[('preprocessing', ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
         transformer_weights=None,
         transformers=[('vectorizer_title', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', ...ty='l2', random_state=None, solver='newton-cg',
          tol=0.0001, verbose=0, warm_start=False))])

In [45]:
final_pipe_2.score(x_test_2,y_test_2)

0.9925760950259837

## Testing the same example as before where we got fake when the news was real

In [46]:
downloaded = trafilatura.fetch_url('https://www.bbc.com/news/av/uk-england-leicestershire-55007262')
text_test_2 = trafilatura.extract(downloaded)
title_test_2 ='''Horse racing: 'It doesn't matter what colour you are'''

In [47]:
text_clean_2 = clean_text(text_test_2)
title_clean_2 = clean_text(title_test_2)

In [48]:
test_series_2=pd.Series(text_clean_2)
title_series_2=pd.Series(title_clean_2)

In [49]:
test_df_2 = pd.DataFrame({'title_clean':title_series_2,'text_clean_without_reuters':test_series_2, 'text_typo_ratio': typo_ratio(text_clean_2)})

In [50]:
#from sklearn.metrics import classification_report
#print(classification_report(y_true, y_pred, target_names=target_names)) work on this later

In [51]:
final_pipe_2.predict(test_df_2)

array([1], dtype=int64)

**okay so this model is generalising alot better**

## Checking how well the model generalise for fake news using the fake news only dataset (testing in pipeline fitted with reuters(pipeline) and fitted witout it (pipeline_2)

In [67]:
fake_test_pipeline_2 = pd.read_csv('../raw_data/fake_extra.csv')
fake_test_pipeline_2.dropna(axis = 0, inplace = True)
fake_test_pipeline_2 = fake_test_pipeline_2[['title','text']]
fake_test_pipeline_2['text_clean_without_reuters'] = fake_test_pipeline_2['text'].apply(remove_punctuation)
fake_test_pipeline_2['title_clean'] = fake_test_pipeline_2['title'].apply(remove_punctuation)
fake_test_pipeline_2['text_clean_without_reuters'] = fake_test_pipeline_2['text_clean_without_reuters'].apply(remove_numbers)
fake_test_pipeline_2['title_clean'] = fake_test_pipeline_2['title_clean'].apply(remove_numbers)
fake_test_pipeline_2['text_clean_without_reuters'] = fake_test_pipeline_2['text_clean_without_reuters'].apply(lower_case)
fake_test_pipeline_2['title_clean'] = fake_test_pipeline_2['title_clean'].apply(lower_case)
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
fake_test_pipeline_2['text_clean_without_reuters'] = fake_test_pipeline_2['text_clean_without_reuters'].apply(lambda x: ' '.join([word for word in x.split() if word.lower() not in (stop_words)]))
fake_test_pipeline_2['title_clean'] = fake_test_pipeline_2['title_clean'].apply(lambda x: ' '.join([word for word in x.split() if word.lower() not in (stop_words)]))
fake_test_pipeline_2['title_tokens']=fake_test_pipeline_2['title_clean'].apply(tokenize_text)
fake_test_pipeline_2['text_tokens']=fake_test_pipeline_2['text_clean_without_reuters'].apply(tokenize_text)
fake_test_pipeline_2['typos_title_count'] = fake_test_pipeline_2['title_tokens'].apply(get_typos_t)
fake_test_pipeline_2['typos_text_count'] = fake_test_pipeline_2['text_tokens'].apply(get_typos_t)
fake_test_pipeline_2['title_typo_ratio']= fake_test_pipeline_2['typos_title_count']/len(fake_test_pipeline_2['title_tokens'])
fake_test_pipeline_2['text_typo_ratio']= fake_test_pipeline_2['typos_text_count']/len(fake_test_pipeline_2['text_tokens'])
fake_test_pipeline_2 = fake_test_pipeline_2[['title_clean','text_clean_without_reuters', 'text_typo_ratio']]

In [67]:
result_fake_without_reuters = final_pipe_2.predict(fake_test_pipeline_2)
unique_fake_without_reuters, counts_fake_without_reuters = np.unique(result_fake_without_reuters, return_counts=True)
dict(zip(unique_fake_without_reuters, counts_fake_without_reuters))

{0: 4155, 1: 547}

In [69]:
4155/(4155+547)

0.8836665248830285

In [70]:
#this is the accuracy on fake dataset by the pipeline fitted without reuters

In [68]:
fake_test_pipeline = pd.read_csv('../raw_data/fake_extra.csv')
fake_test_pipeline.dropna(axis = 0, inplace = True)
fake_test_pipeline = fake_test_pipeline[['title','text']]
fake_test_pipeline['text_clean'] = fake_test_pipeline['text'].apply(remove_punctuation)
fake_test_pipeline['title_clean'] = fake_test_pipeline['title'].apply(remove_punctuation)
fake_test_pipeline['text_clean'] = fake_test_pipeline['text_clean'].apply(remove_numbers)
fake_test_pipeline['title_clean'] = fake_test_pipeline['title_clean'].apply(remove_numbers)
fake_test_pipeline['text_clean'] = fake_test_pipeline['text_clean'].apply(lower_case)
fake_test_pipeline['title_clean'] = fake_test_pipeline['title_clean'].apply(lower_case)
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
fake_test_pipeline['text_clean'] = fake_test_pipeline['text_clean'].apply(lambda x: ' '.join([word for word in x.split() if word.lower() not in (stop_words)]))
fake_test_pipeline['title_clean'] = fake_test_pipeline['title_clean'].apply(lambda x: ' '.join([word for word in x.split() if word.lower() not in (stop_words)]))
fake_test_pipeline['title_tokens']=fake_test_pipeline['title_clean'].apply(tokenize_text)
fake_test_pipeline['text_tokens']=fake_test_pipeline['text_clean'].apply(tokenize_text)
fake_test_pipeline['typos_title_count'] = fake_test_pipeline['title_tokens'].apply(get_typos_t)
fake_test_pipeline['typos_text_count'] = fake_test_pipeline['text_tokens'].apply(get_typos_t)
fake_test_pipeline['title_typo_ratio']= fake_test_pipeline['typos_title_count']/len(fake_test_pipeline['title_tokens'])
fake_test_pipeline['text_typo_ratio']= fake_test_pipeline['typos_text_count']/len(fake_test_pipeline['text_tokens'])
fake_test_pipeline = fake_test_pipeline[['title_clean','text_clean', 'text_typo_ratio']]

In [75]:
result_fake_with_reuters = final_pipe.predict(fake_test_pipeline)
unique_fake_with_reuters, counts_fake_with_reuters = np.unique(result_fake_with_reuters, return_counts=True)
dict(zip(unique_fake_with_reuters, counts_fake_with_reuters))

{0: 4546, 1: 156}

In [77]:
4546/(4546+156)

0.9668226286686517

In [None]:
#this is the accuracy on fake daset by the pipeline fitted with reuters

## Testing both pipelines on true news and checking which one perfoms best

In [66]:
true_test_pipeline_2 = pd.read_csv('../raw_data/new_york_real.csv')
true_test_pipeline_2.dropna(axis = 0, inplace = True)
true_test_pipeline_2 = true_test_pipeline_2[['title','text']]
true_test_pipeline_2['text_clean_without_reuters'] = true_test_pipeline_2['text'].apply(remove_punctuation)
true_test_pipeline_2['title_clean'] = true_test_pipeline_2['title'].apply(remove_punctuation)
true_test_pipeline_2['text_clean_without_reuters'] = true_test_pipeline_2['text_clean_without_reuters'].apply(remove_numbers)
true_test_pipeline_2['title_clean'] = true_test_pipeline_2['title_clean'].apply(remove_numbers)
true_test_pipeline_2['text_clean_without_reuters'] = true_test_pipeline_2['text_clean_without_reuters'].apply(lower_case)
true_test_pipeline_2['title_clean'] = true_test_pipeline_2['title_clean'].apply(lower_case)
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
true_test_pipeline_2['text_clean_without_reuters'] = true_test_pipeline_2['text_clean_without_reuters'].apply(lambda x: ' '.join([word for word in x.split() if word.lower() not in (stop_words)]))
true_test_pipeline_2['title_clean'] = true_test_pipeline_2['title_clean'].apply(lambda x: ' '.join([word for word in x.split() if word.lower() not in (stop_words)]))
true_test_pipeline_2['title_tokens']=true_test_pipeline_2['title_clean'].apply(tokenize_text)
true_test_pipeline_2['text_tokens']=true_test_pipeline_2['text_clean_without_reuters'].apply(tokenize_text)
true_test_pipeline_2['typos_title_count'] = true_test_pipeline_2['title_tokens'].apply(get_typos_t)
true_test_pipeline_2['typos_text_count'] = true_test_pipeline_2['text_tokens'].apply(get_typos_t)
true_test_pipeline_2['title_typo_ratio']= true_test_pipeline_2['typos_title_count']/len(true_test_pipeline_2['title_tokens'])
true_test_pipeline_2['text_typo_ratio']= true_test_pipeline_2['typos_text_count']/len(true_test_pipeline_2['text_tokens'])
true_test_pipeline_2 = true_test_pipeline_2[['title_clean','text_clean_without_reuters', 'text_typo_ratio']]

In [79]:
result_without_reuters_trained = final_pipe_2.predict(true_test_pipeline_2)
unique, counts = np.unique(result_without_reuters_trained, return_counts=True)
dict(zip(unique, counts))

{0: 4878, 1: 2925}

In [80]:
2925/(4878+2925)

0.3748558246828143

In [None]:
#this is the accuracy on true daset by the pipeline fitted without reuters

In [65]:
true_test_for_pipeline_1 = pd.read_csv('../raw_data/new_york_real.csv')
true_test_for_pipeline_1.dropna(axis = 0, inplace = True)
true_test_for_pipeline_1 = true_test_for_pipeline_1[['title','text']]
true_test_for_pipeline_1['text_clean'] = true_test_for_pipeline_1['text'].apply(remove_punctuation)
true_test_for_pipeline_1['title_clean'] = true_test_for_pipeline_1['title'].apply(remove_punctuation)
true_test_for_pipeline_1['text_clean'] = true_test_for_pipeline_1['text_clean'].apply(remove_numbers)
true_test_for_pipeline_1['title_clean'] = true_test_for_pipeline_1['title_clean'].apply(remove_numbers)
true_test_for_pipeline_1['text_clean'] = true_test_for_pipeline_1['text_clean'].apply(lower_case)
true_test_for_pipeline_1['title_clean'] = true_test_for_pipeline_1['title_clean'].apply(lower_case)
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
true_test_for_pipeline_1['text_clean'] = true_test_for_pipeline_1['text_clean'].apply(lambda x: ' '.join([word for word in x.split() if word.lower() not in (stop_words)]))
true_test_for_pipeline_1['title_clean'] = true_test_for_pipeline_1['title_clean'].apply(lambda x: ' '.join([word for word in x.split() if word.lower() not in (stop_words)]))
true_test_for_pipeline_1['title_tokens']=true_test_for_pipeline_1['title_clean'].apply(tokenize_text)
true_test_for_pipeline_1['text_tokens']=true_test_for_pipeline_1['text_clean'].apply(tokenize_text)
true_test_for_pipeline_1['typos_title_count'] = true_test_for_pipeline_1['title_tokens'].apply(get_typos_t)
true_test_for_pipeline_1['typos_text_count'] = true_test_for_pipeline_1['text_tokens'].apply(get_typos_t)
true_test_for_pipeline_1['title_typo_ratio']= true_test_for_pipeline_1['typos_title_count']/len(true_test_for_pipeline_1['title_tokens'])
true_test_for_pipeline_1['text_typo_ratio']= true_test_for_pipeline_1['typos_text_count']/len(true_test_for_pipeline_1['text_tokens'])
true_test_for_pipeline_1 = true_test_for_pipeline_1[['title_clean','text_clean', 'text_typo_ratio']]

In [82]:
result_with_reuters = final_pipe.predict(true_test_for_pipeline_1)
unique_2, counts_2 = np.unique(result_with_reuters, return_counts=True)
dict(zip(unique_2, counts_2))

{0: 5938, 1: 1865}

In [83]:
1865/(5938+1865)

0.23901063693451235

In [None]:
#this is the accuracy on true daset by the pipeline fitted with reuters

### Conclusion

**the model that was trained without reuters perfoms better than the one that trained with reuters for detecting real news**

# Trying a logistic regression with more features

In [47]:
data.head()

Unnamed: 0,title,text,title_clean,text_clean,title_tokens,text_tokens,typos_title_count,typos_text_count,title_typo_ratio,text_typo_ratio,score,text_clean_without_reuters
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,us budget fight looms republicans flip fiscal ...,washington reuters head conservative republica...,"[us, budget, fight, looms, republicans, flip, ...","[washington, reuters, head, conservative, repu...",0,15,0.0,0.000334,1,washington head conservative republican facti...
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,us military accept transgender recruits monday...,washington reuters transgender people allowed ...,"[us, military, accept, transgender, recruits, ...","[washington, reuters, transgender, people, all...",0,10,0.0,0.000223,1,washington transgender people allowed first t...
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,senior us republican senator 'let mr mueller job',washington reuters special counsel investigati...,"[senior, us, republican, senator, 'let, mr, mu...","[washington, reuters, special, counsel, invest...",1,8,2.2e-05,0.000178,1,washington special counsel investigation link...
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,fbi russia probe helped australian diplomat ti...,washington reuters trump campaign adviser geor...,"[fbi, russia, probe, helped, australian, diplo...","[washington, reuters, trump, campaign, adviser...",3,17,6.7e-05,0.000379,1,washington trump campaign adviser george papa...
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,trump wants postal service charge 'much more' ...,seattlewashington reuters president donald tru...,"[trump, wants, postal, service, charge, 'much,...","[seattlewashington, reuters, president, donald...",1,23,2.2e-05,0.000512,1,seattlewashington president donald trump call...


## Adding different features that we saw were relevant to distinguish fake from real news

In [53]:
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))

def stopwords_ratio(tokens):
    count_stop_words = 0
    amount_tokens = 0   
    for token in tokens:
        amount_tokens += 1
        if token in stop_words:
            count_stop_words += 1
    if amount_tokens == 0:
        return 0
    return count_stop_words / amount_tokens

In [54]:
data['title_length_char'] = data.title.str.len() #amount of char in title
data['title_Upper'] = data['title'].str.count(r'[A-Z]')
data['title_Upper_Ratio'] = data['title_Upper']/data['title_length_char'] #ratio of Uppercase Char in title
data.drop(columns = ['title_Upper'])
data['tex_tokens_with_stopwords']=data['text'].apply(tokenize_text)
data['text_stop_words_ratio'] = data['tex_tokens_with_stopwords'].apply(stopwords_ratio) #stopwords ratio in text

## Let's train a model on the different features we have

In [55]:
x_for_test = data[['title_clean', 'text_clean_without_reuters','text_typo_ratio','title_length_char','title_Upper_Ratio','text_stop_words_ratio']]
y_for_test = data['score']

In [56]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x_for_test,y_for_test,random_state=0,test_size=0.3)

In [60]:
x_train

Unnamed: 0,title_clean,text_clean_without_reuters,text_typo_ratio,title_length_char,title_Upper_Ratio,text_stop_words_ratio
33958,breaking finally new wikileaks email…we going ...,latest wikileaks email evidence smoke hillary ...,0.000601,86,0.220930,0.420118
19813,german liberals would expect finance ministry ...,berlin germany free democrats fdp would want ...,0.000891,65,0.030769,0.386935
25814,trump loses complete nervous breakdown worst w...,trump bad week first humiliated front millions...,0.000223,82,0.317073,0.409214
18689,merkel macron pledge lead eu forward postbrexit,tallinn french president emmanuel macron back...,0.000624,52,0.096154,0.382979
44673,american tragedy really killed jonbenét ramsey,roses know thorns hurt quote attributed jonben...,0.005435,55,0.418182,0.384701
22749,‘portal hell internet loses mysterious red lig...,earlier month sinkhole opened outside donald t...,0.001782,99,0.161616,0.291312
2628,man whose firm behind trump dossier testify se...,washington cofounder firm commissioned dossie...,0.000067,71,0.042254,0.422360
10117,biden ukraine's poroshenko meet thursday white...,washington us vice president joe biden ukrain...,0.000067,57,0.105263,0.347222
4388,us strikes syria show resolve chemical attacks...,brussels european council president donald tu...,0.000000,70,0.085714,0.254545
16056,tunnel collapse may killed north korea nuclear...,tokyo tunnel north korea nuclear test site co...,0.000089,88,0.045455,0.330935


In [67]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

preprocessor_all_features = ColumnTransformer([
    
    ('vectorizer_title', CountVectorizer(), 'title_clean'),
    ('vectorizer_text', CountVectorizer(), 'text_clean_without_reuters'),
     ('scaling_title_char', MinMaxScaler(), ['title_length_char']),
   
    #insert function here
])

final_pipe_all_features = Pipeline([
    ('preprocessing', preprocessor_all_features),
    ('Logistic', LogisticRegression(solver = 'newton-cg', C =0.4 ))])

In [68]:
final_pipe_all_features.fit(x_train,y_train)

  return self.partial_fit(X, y)


Pipeline(memory=None,
     steps=[('preprocessing', ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
         transformer_weights=None,
         transformers=[('vectorizer_title', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', ...ty='l2', random_state=None, solver='newton-cg',
          tol=0.0001, verbose=0, warm_start=False))])

In [75]:
import re
def rem_urls(text):
    return re.sub('https?:\S+','',text)

import string
punc_no_sq = '!“#$%&\()*+,./:;<=>?@[\\]^_`{|}~“”—’-'
def remove_punctuation(text):
    for punctuation in punc_no_sq:
        text = text.replace(punctuation, '')
    return text

def remove_numbers(text):
    text = ''.join(word for word in text if not word.isdigit())
    return text

def lower_case(text):
    text = text.lower()
    return text

from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))


def tokenize_text(text):
    return text.split()

import enchant
english = enchant.DictWithPWL("en_US", "vocab.txt")
wrong_words={}
correct_words=set()
def get_typos_t(tokens):
     wrong_count=0
     for token in tokens:
            if token in wrong_words:
                wrong_words[token]+=1
                wrong_count+=1
            else:
                if not token in correct_words:
                    if token[0].islower() and not '-' in token and not english.check(token) and not english.check(token.capitalize()):
                        wrong_words[token]=1
                        wrong_count+=1
                    else:
                        correct_words.add(token)
     return wrong_count    

def clean_data(data):
    
    df = data.copy()
    
    #drop nan values in df
    
    df.dropna(axis = 0, inplace = True)
    
    #add title_clean and text_clean to df
    
    df['title_clean'] = df['title'].apply(rem_urls)
    df['text_clean_without_reuters'] = df['text'].apply(rem_urls)
    df['title_clean'] = df['title_clean'].apply(remove_punctuation)
    df['text_clean_without_reuters'] = df['text_clean_without_reuters'].apply(remove_punctuation)
    df['title_clean'] = df['title_clean'].apply(remove_numbers)
    df['text_clean_without_reuters'] = df['text_clean_without_reuters'].apply(remove_numbers)
    df['title_clean'] = df['title_clean'].apply(lower_case)
    df['text_clean_without_reuters'] = df['text_clean_without_reuters'].apply(lower_case)
    df['title_clean'] = df['title_clean'].apply(lambda x: ' '.join([word for word in x.split() if word.lower() not in (stop_words)]))
    df['text_clean_without_reuters'] = df['text_clean_without_reuters'].apply(lambda x: ' '.join([word for word in x.split() if word.lower() not in (stop_words)]))
    
    #adding features to the df
    
    df['title_tokens'] = df['title_clean'].apply(tokenize_text)
    df['text_tokens'] = df['text_clean_without_reuters'].apply(tokenize_text)
    
    df['typos_title_count'] = df['title_tokens'].apply(get_typos_t)
    df['typos_text_count'] = df['text_tokens'].apply(get_typos_t)
    
    df['title_typo_ratio'] = df['typos_title_count']/len(df['title_tokens'])
    df['text_typo_ratio'] = df['typos_text_count']/len(df['text_tokens'])
    
    df['title_length_char'] = data.title.str.len()
    
    df['title_Upper'] = df['title'].str.count(r'[A-Z]')
    df['title_Upper_Ratio'] = df['title_Upper']/df['title_length_char']
    
    df['text_tokens_with_stopwords'] = df['text'].apply(tokenize_text)  
    df['text_stop_words_ratio'] = df['text_tokens_with_stopwords'].apply(stopwords_ratio)
    
    return df[['title_clean','text_clean_without_reuters','text_typo_ratio', 'title_length_char', 'title_Upper_Ratio', 'text_stop_words_ratio']]

In [77]:
first_test_new_york = pd.read_csv('../raw_data/new_york_real.csv')

In [78]:
result_with_all_features = final_pipe_all_features.predict(clean_data(first_test_new_york))
unique_with_all_features, counts_with_all_features = np.unique(result_with_all_features, return_counts=True)
dict(zip(unique_with_all_features, counts_with_all_features))

{0: 4919, 1: 2884}

# Adding more data for our model to generalize better

In [54]:
margarida_fake = pd.read_csv('../raw_data/fake_extra.csv')
margarida_fake.dropna(inplace = True)
margarida_fake.shape

(4702, 20)

In [55]:
business_insider_true = pd.read_csv('../raw_data/Business_Insider.csv', nrows = 5000)
business_insider_true.dropna(inplace = True)
business_insider_true.shape

(5000, 3)

In [56]:
washington_post_true = pd.read_csv('../raw_data/Washington_Post.csv', nrows = 5000)
washington_post_true.dropna(inplace = True)
washington_post_true.shape

(5000, 3)