1. Introduction

1.1 Domain-specific Area

1.2 Description of the selected dataset

1.3 Objectives

1.4 Evaluation Methodology

2. Implementation

2.1 Pre-processing

In [2]:
import pandas as pd
import nltk
import string
import random
import numpy as np
from numpy import hstack
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import PorterStemmer
from nltk import ngrams
from functools import reduce
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.utils.extmath import softmax


from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier


from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import RocCurveDisplay

nltk.download('stopwords')


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/mmenna/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
def text_processing(text, n = 1):
    """
    Takes in a string of text, then performs the following:
    1. Convert text to lower case and remove all punctuation
    2. Optionally apply stemming
    3. Apply Ngram Tokenisation
    4. Returns the tokenised text as a list 
    """
    
    stemmer = SnowballStemmer("english")
    stop = stopwords.words('english')
    #write steps here
    # lower function
    t_1 = lambda x : x.lower()
    # Remove punctuation function
    t_2 = lambda x : x.translate(str.maketrans('', '', string.punctuation))
    # Remove stopwords
    t_3 = lambda x : " ".join([w for w in x.split() if w not in stop])
    # Snowball stemming
    t_4 = lambda x : " ".join([stemmer.stem(w) for w in x.split()])
    # Ngrams with n number of grams
    t_5 = lambda x : [" ".join(ng) for ng in list(ngrams(x.split(), n))]
    
    
    #List of transformation functions
    t = [t_1, t_2, t_3, t_4, t_5]
    
    #Apply transformations
    tokenised = reduce(lambda r, f: f(r), t, text)
    
    return tokenised

In [4]:
n_1 = sum(1 for line in open('True.csv')) - 1
n_2 = sum(1 for line in open('Fake.csv')) - 1
s = 1500 #desired sample size
skip_1 = sorted(random.sample(range(1,n_1+1),n_1-s)) #the 0-indexed header will not be included in the skip list
skip_2 = sorted(random.sample(range(1,n_2+1),n_2-s)) #the 0-indexed header will not be included in the skip list

raw_data_true = pd.read_csv('True.csv', skiprows=skip_1)
raw_data_fake = pd.read_csv('Fake.csv', skiprows=skip_2)

In [5]:
raw_data_true['isFake'] = 0
raw_data_fake['isFake'] = 1
raw_data = raw_data_true.append(raw_data_fake)


In [6]:
data_t1 = pd.DataFrame()
data_t1['article'] = raw_data['title'] + ' ' + raw_data['text']
data_t1['isFake'] = raw_data['isFake']

In [7]:
bag = data_t1['article'].apply(text_processing, n=1)
bag

0       [top, democrat, say, trump, fire, mueller, cou...
1       [white, hous, expect, congress, waiv, spend, c...
2       [trump, strategi, document, say, russia, meddl...
3       [factbox, trump, twitter, decemb, 15, quantico...
4       [us, judg, lift, hous, arrest, former, trump, ...
                              ...                        
1493    [us, delta, forc, begin, target, isi, iraq, th...
1494    [final, control, tpp, ttip, tisa, global, corp...
1495    [ron, paul, burn, oregon, standoff, juri, null...
1496    [seven, iranian, freed, prison, swap, return, ...
1497    [blow, 700, million, al, jazeera, america, fin...
Name: article, Length: 2998, dtype: object

In [8]:
identity = lambda x : x
corpus = bag.values
print('Count Vectorizing...')
vectorizer = CountVectorizer(tokenizer = identity, preprocessor = identity)
count_vector = vectorizer.fit_transform(corpus).toarray()
print('Transforming to tfidf matrix...')
tfidfTransformer = TfidfTransformer()
text_tfidf = tfidfTransformer.fit_transform(count_vector)


Count Vectorizing...
Transforming to tfidf matrix...


In [9]:
X = pd.DataFrame(text_tfidf.toarray())
y = data_t1['isFake']

In [10]:
X_train.reset_index(inplace=True, drop=True)
X_test.reset_index(inplace=True, drop=True)
y_train.reset_index(inplace=True, drop=True)
y_test.reset_index(inplace=True, drop=True)

NameError: name 'X_train' is not defined

In [18]:
l_raw_data = pd.read_csv('liar_dataset/train.tsv', sep='\t', names= ['ID','Label','Statement', 'Subject', 'Speaker', 'Speaker Job', 'State', 'Party Aff', 'Credit', 'True', 'Half true', 'Mostly true', 'Pants on fire', 'Context'])

In [35]:
liar_mapper = {
    'false': 1,
    'half-true': 1,
    'mostly-true': 0,
    'true': 0,
    'barely-true': 1,
    'pants-fire': 1
}
reduce_fake = lambda x : liar_mapper[x]
l_data_t1 = pd.DataFrame()
l_data_t1['article'] = l_raw_data['Statement']
l_data_t1['isFake'] = l_raw_data['Label'].apply(reduce_fake)

l_data_t1

Unnamed: 0,article,isFake
0,Says the Annies List political group supports ...,1
1,When did the decline of coal start? It started...,1
2,"Hillary Clinton agrees with John McCain ""by vo...",0
3,Health care reform legislation is likely to ma...,1
4,The economic turnaround started at the end of ...,1
...,...,...
10235,There are a larger number of shark attacks in ...,0
10236,Democrats have now become the party of the [At...,0
10237,Says an alternative to Social Security that op...,1
10238,On lifting the U.S. Cuban embargo and allowing...,1


In [36]:
bag = l_data_t1['article'].apply(text_processing, n=1)
bag

0        [say, anni, list, polit, group, support, third...
1        [declin, coal, start, start, natur, gas, took,...
2        [hillari, clinton, agre, john, mccain, vote, g...
3        [health, care, reform, legisl, like, mandat, f...
4                   [econom, turnaround, start, end, term]
                               ...                        
10235    [larger, number, shark, attack, florida, case,...
10236    [democrat, becom, parti, atlanta, metro, area,...
10237    [say, altern, social, secur, oper, galveston, ...
10238      [lift, us, cuban, embargo, allow, travel, cuba]
10239    [depart, veteran, affair, manual, tell, vetera...
Name: article, Length: 10240, dtype: object

In [37]:
corpus = bag.values
print('Count Vectorizing...')
vectorizer = CountVectorizer(tokenizer = identity, preprocessor = identity)
count_vector = vectorizer.fit_transform(corpus).toarray()
print('Transforming to tfidf matrix...')
tfidfTransformer = TfidfTransformer()
text_tfidf = tfidfTransformer.fit_transform(count_vector)

Count Vectorizing...
Transforming to tfidf matrix...


In [43]:
l_X = pd.DataFrame(text_tfidf.toarray())
l_y = l_data_t1['isFake']

2.2 Baseline performance

In [44]:
class RidgeClassifierWithProba(RidgeClassifier):
    def predict_proba(self, X):
        d = self.decision_function(X)
        d_2d = np.c_[-d, d]
        
        return softmax(d_2d)
    
models= [LinearDiscriminantAnalysis(), 
         LogisticRegression(random_state=42), 
         SGDClassifier(max_iter=1000, tol=1e-3, loss='modified_huber'),
         RidgeClassifierWithProba()
         #SVC(probability=True)
        ]
model_names = [
    "Linear Discriminant Analysis",
    "Logistic Regression",
    "Stocasthic Gradient Descent",
    "Ridge"
    #"SVC"
]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
l_X_train, l_X_test, l_y_train, l_y_test = train_test_split(l_X, l_y, test_size=0.2)

In [45]:
scores = []
l_scores = []
for i,model in enumerate(models):
    print('Fitting model ', model_names[i], 'for Fake News dataset...')
    model.fit(X_train, y_train)
    yhat = model.predict(X_test)
    scores.append([
        model_names[i],
        yhat,
        roc_auc_score(y_test, yhat),
        f1_score(y_test, yhat),
        precision_score(y_test, yhat),
        recall_score(y_test, yhat),
        accuracy_score(y_test, yhat)
    ])
    print('Fitting model ', model_names[i], 'for Liar dataset...')
    model.fit(l_X_train, l_y_train)
    yhat = model.predict(l_X_test)
    l_scores.append([
        model_names[i],
        yhat,
        roc_auc_score(l_y_test, yhat),
        f1_score(l_y_test, yhat),
        precision_score(l_y_test, yhat),
        recall_score(l_y_test, yhat),
        accuracy_score(l_y_test, yhat)
    ])
    

Fitting model  Linear Discriminant Analysis  for Fake News dataset...
Fitting model  Linear Discriminant Analysis  for Liar dataset...
Fitting model  Logistic Regression  for Fake News dataset...
Fitting model  Logistic Regression  for Liar dataset...
Fitting model  Stocasthic Gradient Descent  for Fake News dataset...
Fitting model  Stocasthic Gradient Descent  for Liar dataset...
Fitting model  Ridge  for Fake News dataset...
Fitting model  Ridge  for Liar dataset...


In [12]:
scores_df = pd.DataFrame(scores, columns= ['Model', 'Predictions', 'ROC AUC', 'F1-Score', 'Precision', 'Recall', 'Accuracy'])

scores_df

Unnamed: 0,Model,Predictions,ROC AUC,F1-Score,Precision,Recall,Accuracy
0,Linear Discriminant Analysis,"[0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, ...",0.896742,0.891608,0.940959,0.847176,0.896494
1,Logistic Regression,"[0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, ...",0.96166,0.961345,0.972789,0.950166,0.961603
2,Stocasthic Gradient Descent,"[0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, ...",0.976627,0.976744,0.976744,0.976744,0.976628
3,Ridge,"[0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, ...",0.983305,0.983389,0.983389,0.983389,0.983306


2.3 Classification Approach

In [13]:
# fit the blending ensemble
def fit_ensemble(models, X_train, X_val, y_train, y_val, hard=True):
    # fit all models on the training set and predict on hold out set
    meta_X = list()
    for model in models:
        # fit in training set
        model.fit(X_train, y_train)
        # predict on hold out set
        yhat = model.predict(X_val) if hard else model.predict_proba(X_val)
        # reshape predictions into a matrix with one column
        if hard:
            yhat = yhat.reshape(len(yhat), 1)
        # store predictions as input for blending
        meta_X.append(yhat)
    # create 2d array from predictions, each set is an input feature
    meta_X = hstack(meta_X)
    # define blending model
    blender = LogisticRegression()
    # fit on predictions from base models
    blender.fit(meta_X, y_val)
    return blender

# make a prediction with the blending ensemble
def predict_ensemble(models, blender, X_test, hard=True):
    # make predictions with base models
    meta_X = list()
    for model in models:
        # predict with base model
        yhat = model.predict(X_test) if hard else model.predict_proba(X_test)
        # reshape predictions into a matrix with one column
        if hard: 
            yhat = yhat.reshape(len(yhat), 1)
        # store prediction
        meta_X.append(yhat)
    # create 2d array from predictions, each set is an input feature
    meta_X = hstack(meta_X)
    # predict
    return blender.predict(meta_X)

In [14]:
# split dataset into train and test sets
X_train_full, X_test, y_train_full, y_test = train_test_split(X, y, test_size=0.5, random_state=1)
# split training set into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train_full, y_train_full, test_size=0.33, random_state=1)

blender = fit_ensemble(models, X_train, X_val, y_train, y_val)
yhat = predict_ensemble(models, blender, X_test)

scores.append([
        'Hard Voting Blender',
        yhat,
        roc_auc_score(y_test, yhat),
        f1_score(y_test, yhat),
        precision_score(y_test, yhat),
        recall_score(y_test, yhat),
        accuracy_score(y_test, yhat)
    ])


In [15]:
blender = fit_ensemble(models, X_train, X_val, y_train, y_val, False)
yhat = predict_ensemble(models, blender, X_test, False)

scores.append([
        'Soft Voting Blender',
        yhat,
        roc_auc_score(y_test, yhat),
        f1_score(y_test, yhat),
        precision_score(y_test, yhat),
        recall_score(y_test, yhat),
        accuracy_score(y_test, yhat)
    ])

In [16]:
# evaluate each base model
def evaluate_models(models, X_train, X_val, y_train, y_val):
    # fit and evaluate the models
    scores = list()
    for model in models:
        # fit the model
        model.fit(X_train, y_train)
        # evaluate the model
        yhat = model.predict(X_val)
        acc = accuracy_score(y_val, yhat)
        # store the performance
        scores.append(acc)
    # report model performance
    return scores

In [17]:
accuracies = evaluate_models(models, X_train, X_val, y_train, y_val)
print(accuracies)
ensemble = VotingClassifier(estimators=list(zip(model_names, models)), voting='soft', weights=accuracies)
ensemble.fit(X_train, y_train)
yhat = ensemble.predict(X_test)

scores_bck = scores.copy()
scores.append([
        'Soft Weighted Ensemble',
        yhat,
        roc_auc_score(y_test, yhat),
        f1_score(y_test, yhat),
        precision_score(y_test, yhat),
        recall_score(y_test, yhat),
        accuracy_score(y_test, yhat)
    ])

[0.8, 0.9636363636363636, 0.9575757575757575, 0.9797979797979798]


In [18]:
scores_df = pd.DataFrame(scores, columns= ['Model', 'Predictions', 'ROC AUC', 'F1-Score', 'Precision', 'Recall', 'Accuracy'])

scores_df

Unnamed: 0,Model,Predictions,ROC AUC,F1-Score,Precision,Recall,Accuracy
0,Linear Discriminant Analysis,"[0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, ...",0.896742,0.891608,0.940959,0.847176,0.896494
1,Logistic Regression,"[0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, ...",0.96166,0.961345,0.972789,0.950166,0.961603
2,Stocasthic Gradient Descent,"[0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, ...",0.976627,0.976744,0.976744,0.976744,0.976628
3,Ridge,"[0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, ...",0.983305,0.983389,0.983389,0.983389,0.983306
4,Hard Voting Blender,"[1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.973907,0.974359,0.968627,0.980159,0.973965
5,Soft Voting Blender,"[1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.969277,0.969617,0.968338,0.970899,0.969292
6,Soft Weighted Ensemble,"[1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.971274,0.971617,0.969697,0.973545,0.971295


3. Conclusion

3.1 Evaluation

3.2 Summary and conclusions