# Applying Traditional Machine Learning Models

In [2]:
import numpy as np 
import pandas as pd 

from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC 
from sklearn.naive_bayes import GaussianNB
import pickle
from sklearn.model_selection import StratifiedKFold, GridSearchCV
import warnings
import pickle

%matplotlib inline

## 1. Data Preparation

In [5]:
# Loading Data
df = pd.read_csv("../data/processed/final_clean.csv")

In [2]:
# A function to vectorize features
def prep_features(df, use_clean, use_date): 
    if use_clean:
        text = df["clean_content"]
    else:
        text = df["content"]

    # Vectorizing text input
    vectorizer = TfidfVectorizer(min_df= 3, stop_words="english", sublinear_tf = True, norm = 'l2', ngram_range = (1, 2))
    text_processed = vectorizer.fit_transform(text).toarray()

    with open("vectorizer", "wb") as f:
        pickle.dump(vectorizer, f)

    if use_date:
        # One-Hot Encoding to process Dates
        date_processed = LabelBinarizer().fit_transform(df.date)
        features = np.concatenate((date_processed, text_processed), axis = 1)
    else:
        features = text_processed
    
    print(type(features))

    return features

# Finalize training and testing data
def prep_data(df, use_clean, use_date):
    X = prep_features(df, use_clean, use_date)
    Y = df["reliability"]
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.25)

    print(type(X_train))

    return X_train, X_test, y_train, y_test

In [5]:
# A helper function to train and test model.
# Returns dataframe of misclassified data

def train_test_model(model_name, df, save, use_clean, use_date):
    X_train, X_test, y_train, y_test = prep_data(df, use_clean, use_date)

    if (model_name == "LR"):
        model = generate_model_LR(X_train, y_train)
    
    if (model_name == "SVM"):
        model = generate_model_SVM(X_train, y_train)

    if (model_name == "NB"):
        model = generate_model_NB(X_train, y_train)

    model.fit(X_train, y_train)

    if (save):
        with open("model_instances/" + model_name, "wb") as f:
            pickle.dump(model, f)

    ytest = np.array(y_test)
    predictions = model.predict(X_test)

    # confusion matrix and classification report(precision, recall, F1-score)
    print("Model Accuracy: " + str(accuracy_score(ytest, predictions)))  
    print("Precision Score: " + str(precision_score(ytest, predictions)))
    print("Recall Score: " + str(recall_score(ytest, predictions)))
    print("F1 Score: " + str(f1_score(ytest, predictions)))

    print("Confusion Matrix: ")
    print(confusion_matrix(ytest, predictions))

## 2. Modeling

## 2.1 Logistic Regression

### 2.1.1 Hyperparameter Tuning  & Model Instance Generation

In [29]:
def generate_model_LR(X_train, y_train):
    warnings.filterwarnings("ignore")

    grid = {
        "penalty": ["none", "l1", "l2", "elasticnet"],
        "C":  [1e-3, 10],
        "solver": ["newton-cg", "lbfgs", "liblinear"]
    }
    cv = StratifiedKFold(n_splits=5)
    search = GridSearchCV(LogisticRegression(), grid, scoring = "accuracy", n_jobs = -1, cv = cv)
    result = search.fit(X_train, y_train)

    return result.best_estimator_

### 2.1.2 Train and Test Model

In [49]:
train_test_model("LR", df, save=False, use_clean=True, use_date=True)

Model Accuracy: 0.9188701923076923
Precision Score: 0.9233926128590971
Recall Score: 0.8952254641909815
F1 Score: 0.9090909090909092
Confusion Matrix: 
[[854  56]
 [ 79 675]]


In [71]:
train_test_model("LR", df, save=False, use_clean=True, use_date=False)

Model Accuracy: 0.9272836538461539
Precision Score: 0.9389736477115118
Recall Score: 0.8978779840848806
F1 Score: 0.9179661016949152
Confusion Matrix: 
[[866  44]
 [ 77 677]]


In [62]:
train_test_model("LR", df, save=False, use_clean=False, use_date=True)

Model Accuracy: 0.9134615384615384
Precision Score: 0.9284712482468443
Recall Score: 0.8768211920529801
F1 Score: 0.901907356948229
Confusion Matrix: 
[[858  51]
 [ 93 662]]


In [30]:
train_test_model("LR", df, save=False, use_clean=False, use_date=False)

Model Accuracy: 0.9140625
Precision Score: 0.9117259552042161
Recall Score: 0.9010416666666666
F1 Score: 0.9063523248199084
Confusion Matrix: 
[[829  67]
 [ 76 692]]


## 2.2. Support Vector Machine

### 2.2.1 Hyperparameter Tuning  & Model Instance Generation

In [5]:
def generate_model_SVM(X_train, y_train):
    warnings.filterwarnings("ignore")

    grid = {
        'C': [1, 0.01],
        'gamma': [1, 0.1, "scale"],
        'kernel': ['rbf', "linear"]
    }
    cv = StratifiedKFold(n_splits=5)
    search = GridSearchCV(SVC(), grid, scoring = "accuracy", n_jobs = -1, cv = cv)
    result = search.fit(X_train, y_train)

    return result.best_estimator_

### 2.2.2 Train and Test Model

In [9]:
train_test_model("SVM", df, save=False, use_clean=True, use_date=True)

{'C': 1, 'gamma': 1, 'kernel': 'linear'}
Model Accuracy: 0.9350961538461539
Precision Score: 0.9586114819759679
Recall Score: 0.9031446540880503
F1 Score: 0.9300518134715026
Confusion Matrix: 
[[838  31]
 [ 77 718]]


In [13]:
train_test_model("SVM", df, save=False, use_clean=True, use_date=False)

{'C': 1, 'gamma': 1, 'kernel': 'rbf'}
Model Accuracy: 0.9320913461538461
Precision Score: 0.9296551724137931
Recall Score: 0.9157608695652174
F1 Score: 0.9226557152635181
Confusion Matrix: 
[[877  51]
 [ 62 674]]


In [12]:
train_test_model("SVM", df, save=False, use_clean=False, use_date=True)

In [6]:
train_test_model("SVM", df, save=False, use_clean=False, use_date=False)

Model Accuracy: 0.9278846153846154
Precision Score: 0.9412550066755674
Recall Score: 0.9026888604353394
F1 Score: 0.9215686274509803
Confusion Matrix: 
[[839  44]
 [ 76 705]]


## 2.3 Naive Bayes

### 2.3.1 Hyperparameter Tuning and Model Generation

In [32]:
def generate_model_NB(X_train, y_train):
    warnings.filterwarnings("ignore")

    grid = {
        'var_smoothing': [1e-13, 1e-12, 1e-11, 1e-10, 1e-9]
    }
    cv = StratifiedKFold(n_splits=5)
    search = GridSearchCV(GaussianNB(), grid, scoring = "accuracy", n_jobs = -1, cv = cv)
    result = search.fit(X_train, y_train)
    
    return result.best_estimator_

### 2.3.2 Train and Test Model

In [16]:
train_test_model("NB", df, save=False, use_clean=True, use_date=True)

{'var_smoothing': 1e-13}
Model Accuracy: 0.8816105769230769
Precision Score: 0.8502994011976048
Recall Score: 0.907928388746803
F1 Score: 0.878169449598021
Confusion Matrix: 
[[757 125]
 [ 72 710]]


In [20]:
train_test_model("NB", df, save=False, use_clean=True, use_date=False)

{'var_smoothing': 1e-09}
Model Accuracy: 0.8731971153846154
Precision Score: 0.8419117647058824
Recall Score: 0.893368010403121
F1 Score: 0.8668769716088328
Confusion Matrix: 
[[766 129]
 [ 82 687]]


In [26]:
train_test_model("NB", df, save=False, use_clean=False, use_date=True)

{'var_smoothing': 1e-13}
Model Accuracy: 0.8876201923076923
Precision Score: 0.8768656716417911
Recall Score: 0.8890290037831021
F1 Score: 0.8829054477144647
Confusion Matrix: 
[[772  99]
 [ 88 705]]


In [33]:
train_test_model("NB", df, save=False, use_clean=False, use_date=False)

{'var_smoothing': 1e-13}
Model Accuracy: 0.8816105769230769
Precision Score: 0.8697368421052631
Recall Score: 0.8708827404479579
F1 Score: 0.8703094140882159
Confusion Matrix: 
[[806  99]
 [ 98 661]]


# 3. Misclassified Data

In [25]:
def get_misclassified(model_name, df, save):
    X = prep_features(df, False, False)
    Y = df["reliability"]

    with open("model_instances/" + model_name, "rb") as f:
        model = pickle.load(f)

    predictions = model.predict(X)
    misclassified = np.where(Y != predictions)
    misclassified_df = df.iloc[misclassified]

    if (save):
        misclassified_df.to_csv("misclassified_data/" + model_name + ".csv", index=False)

    return misclassified_df

## 3.1 Logistic Regression

In [34]:
get_misclassified("LR", df, save=False)

Unnamed: 0,date,content,reliability,clean_content
73,2020-04-02,Social Security recipients to automatically ge...,1,social secur recipi automat get coronaviru money
88,2020-04-01,Children in need census 2019 to 2020: guide,1,children need censu guid
90,2020-04-01,Harry Pottering around at home? Rowling to res...,1,harri potter around home rowl rescu bore kid l...
147,2020-03-30,"Coronavirus: Over 4,000 cases reported in Penn...",1,coronaviru over 4 case report pennsylvania
169,2020-03-29,M&A in Times of COVID-19,1,ma time covid
...,...,...,...,...
6366,2020-05-19,"""The new Dane County lockdown policy... (has)...",0,the new dane counti lockdown polici effect kil...
6405,2020-05-21,A blog article which says that the Italian do...,0,a blog articl say italian doctor expert charg ...
6436,2020-05-24,Coronavirus multiplies in sewage and pouring ...,0,coronaviru multipli sewag pour bleach sewag wa...
6531,2020-06-03,Claim that despite the fact that the crisis h...,0,claim despit fact crisi headquart north macedo...


## 3.2 Support Vector Machine

In [26]:
get_misclassified("SVM", df, save=False)

Unnamed: 0,date,content,reliability,clean_content
23,2020-04-03,Trump advises voluntary mask use against coron...,1,trump advis voluntari mask use coronaviru wear...
50,2020-04-03,Stamp Duty and Stamp Duty Reserve Tax: transfe...,1,stamp duti stamp duti reserv tax transfer sche...
73,2020-04-02,Social Security recipients to automatically ge...,1,social secur recipi automat get coronaviru money
110,2020-04-01,Regulatory status of equipment being used to h...,1,regulatori statu equip use help prevent corona...
135,2020-03-31,Indian doctors fight coronavirus with raincoat...,1,indian doctor fight coronaviru raincoat helmet...
...,...,...,...,...
6111,2020-05-08,"""The mainstream media pretended there was a d...",0,the mainstream medium pretend deadli surg covi...
6126,2020-05-09,"Under an 1866 Supreme Court ruling, stay-at-h...",0,under suprem court rule stay home order illeg ...
6338,2020-05-18,"Texas and Florida have a ""balanced budget""Â ...",0,texa florida balanc budget â california debt b...
6364,2020-05-19,The survival rate for COVID-19 is 98.54% in t...,0,the surviv rate covid uk


## 3.3 Naive Bayes

In [35]:
get_misclassified("NB", df, save=False)

Unnamed: 0,date,content,reliability,clean_content
40,2020-04-03,Europe's north-south lockdown divide revealed ...,1,europ north south lockdown divid reveal googl ...
50,2020-04-03,Stamp Duty and Stamp Duty Reserve Tax: transfe...,1,stamp duti stamp duti reserv tax transfer sche...
62,2020-04-02,"Mecca, Medina get 24-hour curfew; Gulf migrant...",1,mecca medina get hour curfew gulf migrant work...
101,2020-04-01,China clamps down on coronavirus test kit expo...,1,china clamp coronaviru test kit export accurac...
124,2020-03-31,River Thames: lock and weir fishing permit app...,1,river thame lock weir fish permit applic form
...,...,...,...,...
6609,2020-06-14,"""George Floyd's 'murder' filmed before COVID-...",0,georg floyd murder film covid â
6617,2020-06-15,The vaccine is not the final solution against...,0,the vaccin final solut novel coronaviru
6618,2020-06-15,"News stories referencing the number ""322""Â a...",0,new stori referenc number â covid proof case t...
6622,2020-06-16,Facebook is using your pictures and posts in ...,0,facebook use pictur post lawsuit compani amid ...


## 3.4 Misclassified by both LR and SVM 

In [36]:
LR_mis = pd.read_csv("misclassified_data/LR.csv")
SVM_mis = pd.read_csv("misclassified_data/SVM.csv")

temp = pd.concat([LR_mis, SVM_mis])

both_LR_SVM_mis = temp[temp.duplicated()]

both_LR_SVM_mis.to_csv("misclassified_data/both_LR_SVM.csv", index=False)