## Assignment 2
# Group 13
Mathieu Mailhot - Isabel Lougheed - Frank-Lucas Pantazis

In [233]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import csv
import os

## Checkout link: https://www.analyticsvidhya.com/blog/2021/11/a-guide-to-building-an-end-to-end-multiclass-text-classification-model/

In [100]:
# Hyperparameter
folds = 5 # between 5 and 10

# Loading Training data
df_train = pd.read_csv('train.csv', encoding='utf-8', encoding_errors='ignore') # errors were not pertinent characters
df_train["subreddit"] = df_train["subreddit"].map({"Boston": 0, "Canberra": 1,"Geneva":2,"Ottawa":3})

y = df_train["subreddit"]
X = df_train.drop("subreddit",axis=1)

# Loading Test Data
df_test = pd.read_csv('test.csv', encoding='utf-8', encoding_errors='ignore') # errors were not pertinent characters
X_test = df_test["body"] # Not what we should do with the ID

<h2>Text Preprocessing</h2>

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

# Observations
# - bigram -> worse performance
# - sublinear_tf -> seems to improve accuracy
# - decreasing max_features -> seems to decrease accuracy (feature reduction)

# TODO
# - Create custom stop word list since default one might not be suited for our case according to documentation: https://scikit-learn.org/stable/modules/feature_extraction.html#stop-words
# - explore different ways to extract features from text data

tfidf_uni = TfidfVectorizer(ngram_range=(1, 1), sublinear_tf=True, stop_words='english')
tfidf_uni_bi = TfidfVectorizer(ngram_range=(1, 2),sublinear_tf=True, stop_words='english')
tfidf_bi = TfidfVectorizer(ngram_range=(2, 2),  stop_words='english')

X_uni = tfidf_uni.fit_transform(df_train["body"]).toarray()
X_uni_bi = tfidf_uni_bi.fit_transform(df_train["body"]).toarray()
X_bi = tfidf_bi.fit_transform(df_train["body"]).toarray()

print("Unigram", "(size:",str(len(tfidf_uni.get_feature_names_out()))+")\n",tfidf_uni.get_feature_names_out(),"\n")
print("Unigram & Bigram", "(size:",str(len(tfidf_uni_bi.get_feature_names_out()))+")\n",tfidf_uni_bi.get_feature_names_out(),"\n")
print("Bigram", "(size:",str(len(tfidf_bi.get_feature_names_out()))+")\n", tfidf_bi.get_feature_names_out())

# To get a better idea of the extracted features
with open("features.csv", mode='w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    # Write a header (optional, if you want)
    writer.writerow(["Feature Name"])
    # Write the features from the array
    for feature in tfidf_uni.get_feature_names_out():
        writer.writerow([feature])

Unigram (size: 3000)
 ['00' '000' '01' ... 'zone' 'zoo' 'zurich'] 

Unigram & Bigram (size: 3000)
 ['00' '000' '01' ... 'zero' 'zone' 'zoo'] 

Bigram (size: 49610)
 ['00 33' '00 avec' '00 bit' ... 'zurich migrations' 'zurich tried'
 'zxwr7mvro1z3kabb year']


<h3>Hyperparamater Optimisation</h3>

In [None]:
import sklearn
from sklearn.model_selection import GridSearchCV

# This function does all the tunning for each model
def hyperparamaterTunning(X, param, folds, model):
    
    model_gridSearch = GridSearchCV(model, param_grid=param,cv=folds, verbose=True) # According to doc the data will be split the same way accross all calls

    model_best_clf = model_gridSearch.fit(X,y)

    print("Best Parameters:", model_best_clf.best_params_)

    print("Accuracy:", model_best_clf.best_score_)

    return model_best_clf



<h2>Naive Bayes</h2>

In [40]:
class naiveBayes:
    def __init__(self, x_all, y_all):
        self.x_all = x_all
        self.y_all = y_all
        self.features_probability = dict()

        self.folds_features_probability = 0 # array of dict
        self.folds_accuracy = 0
        self.avg_accuracy = 0

    
    def calc_probability(self): # Train/Fit # Mathieu
        pass
    
    def predict(self, x_i): # Issy
        pass # return y (0,1,2,3)

    def accu_eval(self, x, y): # Issy
        pass

    def crossValidation(self, k): # Issy (PS: I think we are allowed to use the method from sklearn)
        pass


<h2>Logistic Regression Models</h2>

In [None]:
from sklearn.linear_model import LogisticRegression

param_grid_logModel_1 = [
    {"penalty":["elasticnet"],
     "l1_ratio": np.arange(0, 1.2, 0.2), # 0 is only l2 penalty, 1 is only l1 penalty
     "solver":["saga"],
     "max_iter": [1000]
     }]

logModel_tunned_1 = hyperparamaterTunning(X_uni, param_grid_logModel_1, folds, LogisticRegression(fit_intercept=True))

In [266]:
from sklearn.linear_model import LogisticRegression

param_grid_logModel_2 = [
    {"penalty":["l2"],
     "solver":["sag","lbfgs","newton-cg"],
     "tol":[1e-4,1e-5],
     "max_iter": [1000,2000]
     }]
logModel_tunned_2 = hyperparamaterTunning(X_uni, param_grid_logModel_2, folds, LogisticRegression(fit_intercept=True))

Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best Parameters: {'max_iter': 1000, 'penalty': 'l2', 'solver': 'lbfgs', 'tol': 0.0001}
Accuracy: 0.6971428571428572


<h2>Linear SVC Models</h2>

In [None]:
from sklearn.svm import LinearSVC
param_grid_SVC_1 = [
    {"penalty":["l1","l2"],
     "loss": ["squared_hinge"],
     "tol":[1e-4,1e-5],
     "max_iter": [1000]
     }]

SVMModel_tunned_1 = hyperparamaterTunning(X_uni, param_grid_SVC_1, folds, LinearSVC(fit_intercept=True))

Fitting 5 folds for each of 4 candidates, totalling 20 fits
Best Parameters: {'loss': 'squared_hinge', 'max_iter': 1000, 'penalty': 'l2', 'tol': 0.0001}
Accuracy: 0.5014285714285714


In [281]:
from sklearn.svm import LinearSVC
param_grid_SVC_2 = [
    {"penalty":["l2"],
     "loss": ["hinge","squared_hinge"],
     "tol":[1e-4,1e-5],
     "max_iter": [1000]
     }]

SVMModel_tunned_2 = hyperparamaterTunning(X_uni_bi, param_grid_SVC_2, folds, LinearSVC(fit_intercept=True))

Fitting 5 folds for each of 4 candidates, totalling 20 fits
Best Parameters: {'loss': 'hinge', 'max_iter': 1000, 'penalty': 'l2', 'tol': 0.0001}
Accuracy: 0.7028571428571428


<h2>Random Forest Model</h2>

In [282]:
from sklearn.ensemble import RandomForestClassifier

param_grid_rf = [{
"criterion":["gini", "entropy", "log_loss"],
"max_features":["sqrt", "log2"],
"max_depth": range(10,20) # Need to look into what values to use here
}]
rF = hyperparamaterTunning(X_uni, param_grid_rf, folds, RandomForestClassifier())

Fitting 5 folds for each of 60 candidates, totalling 300 fits
Best Parameters: {'criterion': 'gini', 'max_depth': 17, 'max_features': 'sqrt'}
Accuracy: 0.6635714285714285
