# Assignment 2
## Group 13
Mathieu Mailhot - Isabel Lougheed - Frank-Lucas Pantazis

In [5]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import csv
import os
import re

## Checkout link: https://www.analyticsvidhya.com/blog/2021/11/a-guide-to-building-an-end-to-end-multiclass-text-classification-model/

In [28]:
# Number of cross-validation folds handed to every grid search below.
# (The original guidance was 5-10; 30 has given the best score so far.)
folds = 30

# Training set: bytes that cannot be decoded as UTF-8 are dropped on read,
# then the subreddit names are replaced by integer class ids.
df_train = pd.read_csv("train.csv", encoding="utf-8", encoding_errors="ignore")
df_train["subreddit"] = df_train["subreddit"].map(
    {"Boston": 0, "Canberra": 1, "Geneva": 2, "Ottawa": 3}
)

# Target vector, and the feature frame (every column except the label).
y = df_train["subreddit"]
X = df_train.drop(columns="subreddit")

print(X)

# Test set, read with the same decoding rules as the training set.
df_test = pd.read_csv("test.csv", encoding="utf-8", encoding_errors="ignore")
#X_test = df_test["body"] # Not what we should do with the ID

                                                   body
0     I had to put in a drain well/french drain, and...
1     I've worked with James from Prova accountants ...
2     https://lebonmelange.com.au/ is a Gungahlin ca...
3     What I love about Canberra are the town planni...
4     Canberra has a bigger issue with strata. Rates...
...                                                 ...
1395  Take the train to La pleine and walk to joncti...
1396  IIL alumni here!\n\nGeneva private schools are...
1397  I'm really sorry to hear about your bad experi...
1398  They quite easy to handle. Typically, they don...
1399  **Specialization is authentic.** You don?t go ...

[1400 rows x 1 columns]


<h2>Text Preprocessing</h2>

In [51]:
# Test different stop word libraries
# Checkout: https://towardsdatascience.com/text-pre-processing-stop-words-removal-using-different-libraries-f20bac19929a/
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import text


# Count unigrams (default English stop words removed, document frequency >= 2)
# so the per-city totals of every word can be compared.
vectorizer = CountVectorizer(stop_words='english', ngram_range=(1, 1), min_df=2)

X_disp = vectorizer.fit_transform(df_train["body"])
feature_names = vectorizer.get_feature_names_out()

Boston_counts = X_disp[y == 0].sum(axis=0).A1    # Sum occurrences for class 'Boston'
Canberra_counts = X_disp[y == 1].sum(axis=0).A1  # Sum occurrences for class 'Canberra'
Geneva_counts = X_disp[y == 2].sum(axis=0).A1    # Sum occurrences for class 'Geneva'
Ottawa_counts = X_disp[y == 3].sum(axis=0).A1    # Sum occurrences for class 'Ottawa'

# One row per city, one column per vocabulary word.
city_counts = np.vstack([Boston_counts, Canberra_counts, Geneva_counts, Ottawa_counts])
highest = city_counts.max(axis=0)
lowest = city_counts.min(axis=0)

# Relative gap between the city that uses a word most and the one that uses it
# least. Words with a gap under 20% are spread almost evenly over the four
# cities and are treated as extra stop words.
# NOTE(review): `highest` is assumed to be non-zero, i.e. every row carries one
# of the four labels so each retained word is counted for at least one city.
spread = (highest - lowest) / highest
stop_words = feature_names[spread < 0.20].tolist()

# NOTE(review): this list is derived from the labels of the full training set,
# so cross-validation scores computed later on the same rows may be optimistic.
print("Initial Stop Word Size:", len(text.ENGLISH_STOP_WORDS))
# "ve" and "don" are presumably the fragments left behind by contractions such
# as "I've" / "don't"; adding them through the union avoids duplicate entries.
stop_words_custom = sorted(text.ENGLISH_STOP_WORDS.union(stop_words, ["ve", "don"]))
print("Custom Stop Word Size:", len(stop_words_custom))


Initial Stop Word Size: 318
Custom Stop Word Size: 351


<h3>Train</h3>

In [52]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import KFold
from sklearn.decomposition import NMF
# Observations
# - bigram -> worse performance
# - sublinear_tf -> seems to improve accuracy
# - decreasing max_features -> seems to decrease accuracy (feature reduction)

# TODO
# - Create custom stop word list since default one might not be suited for our case according to documentation: https://scikit-learn.org/stable/modules/feature_extraction.html#stop-words
# - explore different ways to extract features from text data

# Instantiate Vectorizer
# Three TF-IDF variants (unigram, unigram+bigram, bigram) plus a raw count
# vectoriser limited to 3000 features for the hand-written Naive Bayes model.
tfidf_uni = TfidfVectorizer(ngram_range=(1, 1), sublinear_tf=True, min_df=2, stop_words=stop_words_custom)
tfidf_uni_bi = TfidfVectorizer(ngram_range=(1, 2), sublinear_tf=True, min_df=5, stop_words=stop_words_custom)
tfidf_bi = TfidfVectorizer(ngram_range=(2, 2), sublinear_tf=True, min_df=2, stop_words=stop_words_custom)
naiveBayes_uni = CountVectorizer(max_features=3000, ngram_range=(1, 1), stop_words=stop_words_custom)


# Fit Vectorizer from data (training matrices are converted to dense arrays)
X_uni = tfidf_uni.fit_transform(df_train["body"]).toarray()

X_uni_bi = tfidf_uni_bi.fit_transform(df_train["body"]).toarray()
X_bi = tfidf_bi.fit_transform(df_train["body"]).toarray()
X_naive_bayes = naiveBayes_uni.fit_transform(df_train["body"]).toarray()

# The test set is only transformed with the unigram vocabulary learnt above;
# unlike the training matrices it is left in sparse form.
X_test = tfidf_uni.transform(df_test["body"])


print("Unigram", "(size:",str(len(tfidf_uni.get_feature_names_out()))+")")
print(tfidf_uni.get_feature_names_out())
print("Unigram & Bigram", "(size:",str(len(tfidf_uni_bi.get_feature_names_out()))+")")
#print(tfidf_uni_bi.get_feature_names_out())
print("Bigram", "(size:",str(len(tfidf_bi.get_feature_names_out()))+")")
#print(tfidf_bi.get_feature_names_out())

# LOOK INTO NMF AND WHAT IT CAN DO TO HELP US
#nmf = NMF(100).fit(X_uni)

#for topic_idx, topic in enumerate(nmf.components_):
#    top_features_ind = topic.argsort()[-10:]
#    top_features = tfidf_uni.get_feature_names_out()[top_features_ind]
#    print(top_features)


def _write_feature_names(path, names):
    """Write `names` to `path` as a one-column CSV headed "Feature Name".

    The file is opened as UTF-8 explicitly so that tokens containing non-ASCII
    characters are written the same way on every platform.
    """
    with open(path, mode='w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["Feature Name"])
        writer.writerows([name] for name in names)


# To get a better idea of the extracted features
_write_feature_names("features.csv", tfidf_uni.get_feature_names_out())
_write_feature_names("featuresNaiveBayes.csv", naiveBayes_uni.get_feature_names_out())

Unigram (size: 5328)
['00' '000' '01' ... 'zoom' 'zucchini' 'zurich']
Unigram & Bigram (size: 2249)
Bigram (size: 2253)


In [None]:
# Draft of Feature Visualizer
# Maybe should put all of it in an excel and then display it?
from sklearn.feature_extraction.text import CountVectorizer

# Count unigrams that appear in at least 10 documents and total them per city.
vectorizer = CountVectorizer(stop_words='english',ngram_range=(1, 1),min_df=10)

X_disp = vectorizer.fit_transform(df_train["body"])
feature_names = vectorizer.get_feature_names_out()

Boston_counts = X_disp[y == 0].sum(axis=0).A1 # Sum occurrences for class 'Boston'
Canberra_counts = X_disp[y == 1].sum(axis=0).A1 # Sum occurrences for class 'Canberra'
Geneva_counts = X_disp[y == 2].sum(axis=0).A1 # Sum occurrences for class 'Geneva'
Ottawa_counts = X_disp[y == 3].sum(axis=0).A1 # Sum occurrences for class 'Ottawa'


# Every row carries the four per-city counts followed by their sum, so the
# header needs a sixth column for that total.
header = ["features","Boston","Canberra","Geneva","Ottawa","Total"]
table = []
for i, feature in enumerate(feature_names):
    k = [Boston_counts[i].item(),Canberra_counts[i].item(),Geneva_counts[i].item(),Ottawa_counts[i].item()]
    # Relative gap between the most and least frequent city for this word.
    crit = (max(k) - min(k)) / max(k)
    # Keep only words whose usage differs noticeably between cities.
    if crit > 0.20:
        table.append([feature, *k, sum(k)])

with open("featureVisualiser4.csv", mode='w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(header)
    writer.writerows(table)
print(table)
    
# Plot a grouped bar chart
# y_pos = np.arange(len(feature_names))*2 # Word indices
# width = 0.4  # Bar width

# fig, ax = plt.subplots(figsize=(12, 12))

# ax.barh(y_pos + 3*width/2, Boston_counts, width, label="Boston", color='red')
# ax.barh(y_pos + width/2, Canberra_counts, width, label="Canberra", color='orange')
# ax.barh(y_pos - width/2, Geneva_counts, width, label="Geneva", color='blue')
# ax.barh(y_pos - 3*width/2, Ottawa_counts, width, label="Ottawa", color='green')
# 
# # Formatting
# ax.set_yticks(y_pos, labels=feature_names)
# 
# ax.set_xlabel("Word Count")
# ax.set_title("Feature Appearance in Each Class")
# ax.legend()
# 
# plt.show()


<h3>Helper Functions</h3>

In [53]:
import sklearn
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score

# This function does all the hyperparameter tuning for each model
def hyperparamaterTunning(X, param, folds, model, verbose_val=1, targets=None):
    """Grid-search `model` over `param` and print a short report.

    Parameters
    ----------
    X : feature matrix passed to ``GridSearchCV.fit``.
    param : parameter grid (dict or list of dicts) passed as ``param_grid``.
    folds : value forwarded to ``GridSearchCV(cv=...)``.
    model : unfitted estimator to tune.
    verbose_val : verbosity level forwarded to ``GridSearchCV``.
    targets : labels to fit against; when omitted, the notebook-level training
        labels ``y`` are used (the original behaviour).

    Returns
    -------
    The fitted ``GridSearchCV`` object.
    """
    if targets is None:
        targets = y  # notebook-level training labels

    # According to doc the data will be split the same way across all calls
    model_gridSearch = GridSearchCV(model, param_grid=param, cv=folds, verbose=verbose_val)
    model_best_clf = model_gridSearch.fit(X, targets)

    cv_results = model_gridSearch.cv_results_
    best_index = model_gridSearch.best_index_

    print()
    print(f"Best Parameters: {model_best_clf.best_params_}")

    # Use the number of splits actually run rather than `folds`, so the report
    # also works when `folds` is not a plain integer; this removes the need to
    # swallow errors with a bare `except`.
    score = [
        cv_results[f"split{fold}_test_score"][best_index].item()
        for fold in range(model_gridSearch.n_splits_)
    ]
    print(f"Cross-validation Accuracies: {score}")
    print(f"Mean Accuracy: {model_best_clf.best_score_:.4f}")

    return model_best_clf


<h2>Naive Bayes</h2>

In [38]:
class NaiveBayes:
    """Hand-written Naive Bayes classifier for the four subreddit classes.

    Class ids are 0 = Boston, 1 = Canberra, 2 = Geneva, 3 = Ottawa. Training
    counts, for each class, in how many texts each vocabulary word appears
    (presence per text, not number of occurrences). The vocabulary is whatever
    ``feature_vectoriser.get_feature_names_out()`` returns.
    """

    def __init__(self, x_all, y_all, feature_vectoriser):
        """Store the labels and vectoriser and tokenise the raw texts.

        x_all: iterable of raw text strings.
        y_all: class ids aligned with `x_all` (indexed by position).
        feature_vectoriser: object exposing ``get_feature_names_out()``.
        """
        self.x_all = self.clean_text_data(x_all) # Make lists of strings
        self.y_all = y_all
        self.feature_vectoriser = feature_vectoriser

        self.folds_features_probability = 0 # array of dict
        self.folds_accuracy = 0
        self.avg_accuracy = 0

    def calc_probability(self, x, y): # Train/Fit # Mathieu
        """Build the per-class word-presence tables from tokenised texts.

        x: sequence of token lists.
        y: sequence of class ids aligned with `x`.

        Returns four dicts (Boston, Canberra, Geneva, Ottawa). Each maps every
        vocabulary word to the number of texts of that class containing it,
        plus two bookkeeping keys: "city_count" (texts in the class) and
        "city_probability" (class prior).

        Raises ValueError when no training sample is given.
        """
        vocabulary = self.feature_vectoriser.get_feature_names_out()
        city_dicts = [{word: 0 for word in vocabulary} for _ in range(4)]
        city_counts = [0, 0, 0, 0]

        for tokens, label in zip(x, y):
            # Each sample belongs to exactly one class. As in the original
            # chain, any label other than 0, 1 or 2 is treated as Ottawa; the
            # original `if/if/if/else` also counted Boston and Canberra
            # samples as Ottawa because the `else` was tied to the last `if`.
            city = int(label) if label in (0, 1, 2) else 3
            city_counts[city] += 1
            self.add_probability(city_dicts[city], tokens)

        total = sum(city_counts)
        if total == 0:
            raise ValueError("calc_probability needs at least one training sample")

        for city_dict, count in zip(city_dicts, city_counts):
            city_dict["city_count"] = count
            city_dict["city_probability"] = count / total

        return tuple(city_dicts) # (boston, canberra, geneva, ottawa)

    def clean_text_data(self, x): # Helper function to make a list of lists of words
        """Lower-case each text, drop everything except a-z, 0-9 and
        whitespace, and split on whitespace. Returns a list of token lists."""
        cleaned_data = [
            re.sub(r'[^a-z0-9\s]', '', text.lower()).split()
            for text in x
        ]
        if cleaned_data: # nothing to preview for an empty corpus
            print("This is the cleaned data", cleaned_data[0])
        return cleaned_data # lists rather than numpy arrays: the content is text

    def add_probability(self, city_dict, x): # Helper function to update counts given a dict and a list of words
        """Increment `city_dict[word]` once for every distinct word of `x`
        that is already a key of `city_dict`."""
        for word in set(x): # unique words only
            if word in city_dict:
                city_dict[word] += 1

    def predict(self, features_probability_boston, features_probability_canberra, features_probability_geneva, features_probability_ottawa, x_i):
        """Return the class id (0-3) with the highest score for token list `x_i`.

        The score of a class is its prior times, for every token of `x_i` that
        is in the vocabulary, the smoothed ratio (count + 1) / (city_count + 2).
        The product is accumulated as a sum of logarithms: multiplying the raw
        ratios underflows to 0.0 for long texts, which made every class tie
        and argmax fall back to class 0.
        """
        city_dicts = (
            features_probability_boston,
            features_probability_canberra,
            features_probability_geneva,
            features_probability_ottawa,
        )

        # A class with no training sample has prior 0 -> log prior -inf,
        # which simply can never win; silence the divide warning for it.
        with np.errstate(divide="ignore"):
            log_scores = np.log(
                np.array([d["city_probability"] for d in city_dicts], dtype=float)
            )

        for word in x_i:
            if word in features_probability_boston: # All have the same vocabulary
                for idx, city_dict in enumerate(city_dicts):
                    # Laplace smoothing
                    log_scores[idx] += np.log(
                        (city_dict[word] + 1) / (city_dict["city_count"] + 2)
                    )

        return np.argmax(log_scores)

    def accu_eval(self, x, y): # Issy
        """Train on (x, y) and return the accuracy measured on the same (x, y).

        Accuracy = 1 - Error. Because the model is scored on the data it was
        fitted on, this is a training accuracy.
        """
        num_correct_labels = 0

        # Get probabilities / train model
        tables = self.calc_probability(x, y)

        # Predict
        for i in range(len(x)):
            predicted_label = self.predict(*tables, x[i])
            if predicted_label == y[i]:
                num_correct_labels += 1

        # Get accuracy
        accuracy = num_correct_labels / len(y)
        return accuracy

    def crossValidation(self, k): # Issy (PS: I think we are allowed to use the method from sklearn)
        """Run shuffled k-fold cross-validation (random_state=9) over the stored
        data, store the per-fold accuracies in `folds_accuracy`, and return
        (and store in `avg_accuracy`) their mean."""
        kf = KFold(n_splits=k, shuffle=True, random_state=9)
        accuracies = []

        # Split into train and validation sets
        for train_indices, val_indices in kf.split(self.x_all):
            x_train = [self.x_all[i] for i in train_indices]
            x_val = [self.x_all[i] for i in val_indices]

            y_train = [self.y_all[i] for i in train_indices]
            y_val = [self.y_all[i] for i in val_indices]

            # Train on the training subset of this fold
            tables = self.calc_probability(x_train, np.array(y_train))

            # Check with the validation subset
            num_correct_labels = 0
            for i in range(len(y_val)):
                predicted_label = self.predict(*tables, x_val[i])
                if predicted_label == y_val[i]:
                    num_correct_labels += 1

            # Accuracy of this fold
            accuracy = num_correct_labels / len(y_val)
            accuracies.append(accuracy)

        self.folds_accuracy = accuracies
        self.avg_accuracy = np.mean(accuracies)
        return self.avg_accuracy


In [39]:
# Build the hand-written Naive Bayes model from the raw training texts and
# labels, then report its mean accuracy over 28 cross-validation folds.
train_texts = X.to_numpy().flatten()
train_labels = y.to_numpy()
naiveBayes = NaiveBayes(train_texts, train_labels, feature_vectoriser=naiveBayes_uni)

accuracy = naiveBayes.crossValidation(28)
print(accuracy)

This is the cleaned data ['i', 'had', 'to', 'put', 'in', 'a', 'drain', 'wellfrench', 'drain', 'and', 'the', 'ground', 'about', '6', 'inches', 'down', 'was', 'all', 'mud', 'and', 'clay', 'i', 'was', 'ass', 'over', 'end', 'in', 'this', 'hole', 'scooping', 'clay', 'mud', 'and', 'was', 'joined', 'by', 'probably', 'ten', 'of', 'these', 'mud', 'daubers', 'for', 'a', 'couple', 'hours', 'they', 'never', 'bothered', 'me', 'at', 'all', 'in', 'their', 'own', 'little', 'way', 'they', 'were', 'kinda', 'helping', 'out', 'i', 'suppose', 'theyd', 'build', 'nests', 'in', 'the', 'garage', 'where', 'i', 'workout', 'and', 'aside', 'from', 'almost', 'smacking', 'into', 'each', 'other', 'they', 'never', 'bothered', 'me', 'theyd', 'just', 'go', 'back', 'and', 'forth', 'building', 'their', 'mud', 'tubes', 'and', 'filling', 'them', 'with', 'paralyzed', 'spiders', 'i', 'think', 'they', 'helped', 'with', 'garden', 'pests', 'and', 'we', 'had', 'so', 'many', 'spiders', 'i', 'didnt', 'mind', 'them', 'culling', 'tha

In [57]:
# Compare with Naive Bayes model from sklearn
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

# Candidate values for MultinomialNB's `alpha`: 0.01, 0.11, ... in steps of 0.1.
alpha_candidates = np.arange(0.01, 1.11, 0.1)
param_grid_NB_1 = {'alpha': alpha_candidates}

# Tune sklearn's MultinomialNB on the unigram TF-IDF matrix.
NB = hyperparamaterTunning(
    X_uni,
    param_grid_NB_1,
    folds,
    MultinomialNB(),
)

Fitting 30 folds for each of 11 candidates, totalling 330 fits

Best Parameters: {'alpha': np.float64(1.01)}
Cross-validation Accuracies: [0.7021276595744681, 0.7021276595744681, 0.7021276595744681, 0.7021276595744681, 0.723404255319149, 0.8085106382978723, 0.7446808510638298, 0.7446808510638298, 0.7872340425531915, 0.6382978723404256, 0.6595744680851063, 0.8297872340425532, 0.7446808510638298, 0.8297872340425532, 0.7659574468085106, 0.7446808510638298, 0.5957446808510638, 0.7021276595744681, 0.7659574468085106, 0.7872340425531915, 0.6956521739130435, 0.6956521739130435, 0.6304347826086957, 0.6956521739130435, 0.8695652173913043, 0.7391304347826086, 0.7608695652173914, 0.6086956521739131, 0.8043478260869565, 0.717391304347826]
Mean Accuracy: 0.7299


<h2>Logistic Regression Models</h2>

In [None]:
from sklearn.linear_model import LogisticRegression

# Elastic-net logistic regression: sweep the L1/L2 mixing ratio with the saga solver.
elasticnet_grid = {
    "penalty": ["elasticnet"],
    "l1_ratio": np.arange(0, 1.2, 0.2),  # 0 is only l2 penalty, 1 is only l1 penalty
    "solver": ["saga"],
    "max_iter": [1000],
}
param_grid_logModel_1 = [elasticnet_grid]

# (a) unigram features, (b) unigram + bigram features
logModel_tunned_1a = hyperparamaterTunning(
    X_uni, param_grid_logModel_1, folds, LogisticRegression(fit_intercept=True)
)
logModel_tunned_1b = hyperparamaterTunning(
    X_uni_bi, param_grid_logModel_1, folds, LogisticRegression(fit_intercept=True)
)


Fitting 80 folds for each of 6 candidates, totalling 480 fits

Best Parameters: {'l1_ratio': np.float64(0.0), 'max_iter': 1000, 'penalty': 'elasticnet', 'solver': 'saga'}
Cross-validation Accuracies: [0.8333333333333334, 0.7222222222222222, 0.6111111111111112, 0.6111111111111112, 0.7222222222222222, 0.7222222222222222, 0.6111111111111112, 0.7222222222222222, 0.8333333333333334, 0.6111111111111112, 0.7777777777777778, 0.8333333333333334, 0.6666666666666666, 0.7222222222222222, 0.7777777777777778, 0.7222222222222222, 0.8333333333333334, 0.7777777777777778, 0.7777777777777778, 0.9444444444444444, 0.7222222222222222, 0.8333333333333334, 0.7777777777777778, 0.6666666666666666, 0.8333333333333334, 0.7222222222222222, 0.6111111111111112, 0.7222222222222222, 0.6111111111111112, 0.6111111111111112, 0.8888888888888888, 0.7777777777777778, 0.7222222222222222, 0.6111111111111112, 0.8888888888888888, 0.8888888888888888, 0.6111111111111112, 0.5555555555555556, 0.8888888888888888, 0.7777777777777778,

In [None]:
from sklearn.linear_model import LogisticRegression

# L2-penalised logistic regression: compare solvers, tolerances and iteration caps.
l2_grid = {
    "penalty": ["l2"],
    "solver": ["sag", "lbfgs", "newton-cg"],
    "tol": [1e-4, 1e-5],
    "max_iter": [1000, 2000],
}
param_grid_logModel_2 = [l2_grid]

# (a) unigram features
logModel_tunned_2a = hyperparamaterTunning(
    X_uni, param_grid_logModel_2, folds, LogisticRegression(fit_intercept=True)
)

# (b) unigram + bigram features
logModel_tunned_2b = hyperparamaterTunning(
    X_uni_bi, param_grid_logModel_2, folds, LogisticRegression(fit_intercept=True)
)

Fitting 80 folds for each of 12 candidates, totalling 960 fits

Best Parameters: {'max_iter': 1000, 'penalty': 'l2', 'solver': 'lbfgs', 'tol': 0.0001}
Cross-validation Accuracies: [0.8333333333333334, 0.7222222222222222, 0.6111111111111112, 0.6111111111111112, 0.7777777777777778, 0.7222222222222222, 0.6111111111111112, 0.7222222222222222, 0.8333333333333334, 0.6111111111111112, 0.7777777777777778, 0.8333333333333334, 0.6666666666666666, 0.7222222222222222, 0.7777777777777778, 0.7222222222222222, 0.8333333333333334, 0.7777777777777778, 0.7777777777777778, 0.9444444444444444, 0.7222222222222222, 0.8333333333333334, 0.7777777777777778, 0.6666666666666666, 0.8333333333333334, 0.7222222222222222, 0.6111111111111112, 0.7222222222222222, 0.6111111111111112, 0.6111111111111112, 0.8888888888888888, 0.7777777777777778, 0.7222222222222222, 0.6111111111111112, 0.8888888888888888, 0.8888888888888888, 0.6111111111111112, 0.5555555555555556, 0.8888888888888888, 0.7777777777777778, 0.6470588235294118,

<h2>SVM Models</h2>

In [47]:
from sklearn.svm import LinearSVC
# Linear SVM with squared hinge loss: sweep C from 0.1 to 1.0 for both penalties.
squared_hinge_grid = {
    "penalty": ["l1", "l2"],
    "C": np.arange(0.1, 1.1, 0.1).tolist(),
    "loss": ["squared_hinge"],
    "tol": [1e-4],
    "max_iter": [1000],
}
param_grid_SVC_1 = [squared_hinge_grid]

# (a) unigram features
SVMModel_tunned_1a = hyperparamaterTunning(
    X_uni, param_grid_SVC_1, folds, LinearSVC(fit_intercept=True)
)

# (b) unigram + bigram features
SVMModel_tunned_1b = hyperparamaterTunning(
    X_uni_bi, param_grid_SVC_1, folds, LinearSVC(fit_intercept=True)
)

Fitting 30 folds for each of 20 candidates, totalling 600 fits

Best Parameters: {'C': 0.1, 'loss': 'squared_hinge', 'max_iter': 1000, 'penalty': 'l2', 'tol': 0.0001}
Cross-validation Accuracies: [0.6808510638297872, 0.7021276595744681, 0.7021276595744681, 0.7021276595744681, 0.7021276595744681, 0.8085106382978723, 0.7446808510638298, 0.7446808510638298, 0.8297872340425532, 0.6808510638297872, 0.6595744680851063, 0.7872340425531915, 0.7659574468085106, 0.8723404255319149, 0.7021276595744681, 0.7659574468085106, 0.6170212765957447, 0.6808510638297872, 0.7659574468085106, 0.7659574468085106, 0.6956521739130435, 0.7391304347826086, 0.6521739130434783, 0.6521739130434783, 0.8043478260869565, 0.6956521739130435, 0.6739130434782609, 0.6739130434782609, 0.8260869565217391, 0.7391304347826086]
Mean Accuracy: 0.7278
Fitting 30 folds for each of 20 candidates, totalling 600 fits

Best Parameters: {'C': 0.8, 'loss': 'squared_hinge', 'max_iter': 1000, 'penalty': 'l1', 'tol': 0.0001}
Cross-validati

In [None]:
from sklearn.svm import LinearSVC
# Linear SVM with plain hinge loss and L2 penalty: sweep C from 0.1 to 1.0.
hinge_grid = {
    "penalty": ["l2"],
    "C": np.arange(0.1, 1.1, 0.1).tolist(),
    "loss": ["hinge"],
    "tol": [1e-4],
    "max_iter": [1000],
}
param_grid_SVC_2 = [hinge_grid]
# X_uni --> only discarded terms whose document frequency is smaller than 2

# MAX VALUE: 0.7443 -> Best Parameters: {'C': 0.6, 'loss': 'hinge', 'max_iter': 1000, 'penalty': 'l2', 'tol': 0.0001} using X1_uni and 70 folds

# (a) unigram features -- best one so far
SVMModel_tunned_2a = hyperparamaterTunning(
    X_uni, param_grid_SVC_2, folds, LinearSVC(fit_intercept=True)
)
# (b) unigram + bigram features
SVMModel_tunned_2b = hyperparamaterTunning(
    X_uni_bi, param_grid_SVC_2, folds, LinearSVC(fit_intercept=True)
)

Fitting 70 folds for each of 10 candidates, totalling 700 fits

Best Parameters: {'C': 0.5, 'loss': 'hinge', 'max_iter': 1000, 'penalty': 'l2', 'tol': 0.0001}
Cross-validation Accuracies: [0.9, 0.7, 0.65, 0.7, 0.8, 0.65, 0.75, 0.75, 0.85, 0.75, 0.7, 0.75, 0.75, 0.65, 0.75, 0.85, 0.75, 0.75, 0.8, 0.95, 0.75, 0.8, 0.7, 0.4, 0.7, 0.85, 0.9, 0.7, 0.75, 0.8, 0.8, 0.8, 0.7, 0.85, 0.65, 0.65, 0.8, 0.75, 0.65, 0.65, 0.7, 0.8, 0.75, 0.8, 0.85, 0.8, 0.8, 0.75, 0.7, 0.9, 0.7, 0.65, 0.65, 0.75, 0.5, 0.85, 0.7, 0.75, 0.8, 0.8, 0.75, 0.65, 0.8, 0.6, 0.7, 0.8, 0.9, 0.8, 0.7, 0.75]
Mean Accuracy: 0.7471
Fitting 70 folds for each of 10 candidates, totalling 700 fits

Best Parameters: {'C': 0.1, 'loss': 'hinge', 'max_iter': 1000, 'penalty': 'l2', 'tol': 0.0001}
Cross-validation Accuracies: [0.75, 0.65, 0.6, 0.65, 0.8, 0.7, 0.7, 0.65, 0.8, 0.75, 0.65, 0.8, 0.75, 0.7, 0.75, 0.75, 0.65, 0.8, 0.7, 0.85, 0.7, 0.8, 0.65, 0.4, 0.75, 0.85, 0.9, 0.65, 0.75, 0.75, 0.75, 0.75, 0.8, 0.8, 0.6, 0.55, 0.7, 0.7, 0.45, 

In [45]:
from sklearn.linear_model import SGDClassifier
# LOOK INTO THIS OR ELSE DELETE
# Wide exploratory grid over every SGDClassifier loss. It used to be assigned
# to `param_grid_SDG` and then overwritten immediately, so it was dead code;
# it is kept under its own name so the exploration is not lost.
param_grid_SDG_wide = [
    {"penalty":["l1","l2"],
     "loss": ["hinge", "log_loss", "modified_huber", "squared_hinge", "perceptron", "squared_error", "huber", "epsilon_insensitive", "squared_epsilon_insensitive"],
     "tol":[1e-4],
     "max_iter": [1000]
     }]

# Narrow grid that is actually in effect: a single hinge-loss / L2 configuration.
param_grid_SDG = [
    {"penalty":["l2"],
     "alpha":[1e-3],
     "loss": ["hinge"],
     "tol":[1e-4],
     "max_iter": [1000]
     }]

# SDGModel = hyperparamaterTunning(X_uni_bi, param_grid_SDG, folds, SGDClassifier(fit_intercept=True),3)

<h2>Random Forest Model</h2>

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest: compare split criteria and per-split feature subsampling at a
# fixed tree depth.
param_grid_rf = [{
"criterion":["gini", "entropy", "log_loss"],
"max_features":["sqrt", "log2"],
"max_depth": [18] # Need to look into what values to use here
}]
# A fixed random_state makes the forest, and therefore the reported scores,
# reproducible between runs (9 matches the seed used for the KFold split in
# the hand-written Naive Bayes cross-validation).
rF = hyperparamaterTunning(X_uni, param_grid_rf, folds, RandomForestClassifier(random_state=9))

Fitting 80 folds for each of 6 candidates, totalling 480 fits

Best Parameters: {'criterion': 'gini', 'max_depth': 18, 'max_features': 'sqrt'}
Cross-validation Accuracies: [0.7777777777777778, 0.5, 0.6666666666666666, 0.5555555555555556, 0.5, 0.6666666666666666, 0.5555555555555556, 0.6666666666666666, 0.7222222222222222, 0.6111111111111112, 0.7222222222222222, 0.5, 0.5, 0.7222222222222222, 0.7222222222222222, 0.7777777777777778, 0.7222222222222222, 0.7222222222222222, 0.8333333333333334, 0.8888888888888888, 0.6666666666666666, 0.8888888888888888, 0.8333333333333334, 0.5555555555555556, 0.8333333333333334, 0.6111111111111112, 0.7222222222222222, 0.6666666666666666, 0.7222222222222222, 0.6666666666666666, 0.8333333333333334, 0.7222222222222222, 0.7222222222222222, 0.7222222222222222, 0.8888888888888888, 0.7777777777777778, 0.8333333333333334, 0.6666666666666666, 0.7777777777777778, 0.7222222222222222, 0.5882352941176471, 0.5294117647058824, 0.47058823529411764, 0.5882352941176471, 0.6470

In [55]:
# Predict the test set with the tuned hinge-loss linear SVM (unigram features)
# and write one "<row number>,<city name>" line per test sample.
y_pred = SVMModel_tunned_2a.predict(X_test)

print(y_pred)

# Position in this list == integer class id assigned when the training labels
# were mapped. (Previously named `map`, which shadowed the builtin.)
label_names = ["Boston","Canberra","Geneva","Ottawa"]

with open("output2.csv", mode='w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    # The old header was a single "Feature Name" column copied from the feature
    # dumps, although every row has two fields.
    # NOTE(review): column names and the use of the row position as identifier
    # are assumptions -- confirm against the required submission format and
    # against any ID column present in test.csv.
    writer.writerow(["id", "subreddit"])
    for row_number, class_id in enumerate(y_pred):
        writer.writerow([row_number, label_names[class_id]])
    

[1 0 3 1 3 1 0 1 1 1 1 1 3 1 1 0 1 1 1 1 0 1 1 1 1 1 1 1 2 3 1 1 1 1 1 1 1
 1 1 1 1 1 1 2 1 2 1 0 0 1 1 3 1 2 1 3 0 1 1 1 2 1 1 1 1 1 0 1 1 1 1 1 2 1
 1 2 3 1 1 1 1 1 1 1 1 1 0 1 2 2 3 1 1 1 1 1 3 1 3 1 1 2 1 1 1 3 1 1 1 1 1
 1 2 1 1 1 1 2 0 1 1 1 2 1 1 0 1 0 1 1 3 1 1 1 1 1 3 1 0 1 1 1 3 1 3 1 1 2
 1 0 3 1 0 2 2 1 3 2 0 3 2 1 2 1 1 1 0 1 3 1 3 3 2 3 3 3 3 3 1 3 1 3 2 1 3
 2 0 0 3 1 3 3 3 0 0 3 3 3 3 3 3 3 3 3 3 0 3 3 3 1 3 3 3 0 3 3 2 3 0 3 3 3
 2 1 3 1 3 3 3 0 2 3 2 1 0 1 3 3 2 1 3 3 3 3 3 1 0 3 1 3 3 3 3 3 3 0 3 0 3
 3 3 2 3 3 0 3 3 3 1 3 3 1 3 3 3 1 2 1 0 1 3 3 3 3 1 2 3 3 3 3 3 3 3 3 0 0
 3 0 3 3 3 0 0 1 3 0 1 0 3 0 0 3 2 0 3 1 3 0 0 1 1 1 0 0 2 0 0 0 1 0 3 0 0
 3 0 0 0 0 0 1 0 0 0 0 0 3 0 0 0 0 0 0 3 0 0 0 0 0 0 0 0 0 0 0 3 0 0 1 2 0
 3 0 0 0 3 0 3 1 0 0 0 3 0 0 0 1 3 0 0 0 0 0 0 0 0 0 2 2 0 0 0 0 0 0 0 0 3
 0 2 0 0 0 0 2 0 0 0 0 2 0 0 0 0 0 3 2 0 0 0 3 0 0 3 0 0 0 0 0 3 1 0 3 0 2
 1 0 0 2 0 0 2 1 1 0 2 2 2 2 2 2 2 2 3 2 0 2 0 2 2 0 2 2 2 2 2 2 2 2 2 0 2
 2 2 2 1 2 2 1 2 2 2 3 3 