## Assignment 2
# Group 13
Mathieu Mailhot - Isabel Lougheed - Frank-Lucas Pantazis

In [45]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import csv
import os
import re

import nltk
from nltk import pos_tag
from nltk.corpus import wordnet as wn
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.tokenize import word_tokenize
from collections import defaultdict
from sklearn.feature_extraction import text
import spacy

# Load the small English spaCy pipeline; SynonymAug calls `nlp(text)` to
# tokenize and POS-tag a post.
nlp = spacy.load("en_core_web_sm")
# Fetch the NLTK resources used by this notebook: tokenizer models (punkt),
# WordNet data, POS-tagger models and the English stop-word list.
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\frank\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\frank\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\frank\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\frank\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\frank\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\frank\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nl

True

<h2>Parameters</h2>

In [None]:
# General
# Number of cross-validation folds: passed as `cv` to GridSearchCV (through
# hyperparamaterTunning) and as `k` to NaiveBayes.crossValidation.
folds = 10 

# For Linear SVM Model
# Regularisation value placed in the LinearSVC parameter grid.
C = 0.7
# The nlpaug synonym augmenter is built with aug_p = round(1 - aug_similarity, 1).
aug_similarity = 0.7
# Fraction of the preprocessed posts sampled for augmentation;
# 0 means no post is sampled and the reshuffle step is skipped.
aug_quantity = 0
stop_word_package = 1 # 0 is nltk and 1 is sklearn stop word package

In [47]:

# Loading Training data
df_train = pd.read_csv('train.csv', encoding='utf-8', encoding_errors='ignore') # errors were not pertinent characters

# Encode the subreddit names as integer class ids
# (0 = Boston, 1 = Canberra, 2 = Geneva, 3 = Ottawa).
subreddit_codes = df_train["subreddit"].map({"Boston": 0, "Canberra": 1,"Geneva":2,"Ottawa":3})

# Series.map leaves NaN for every value that is missing from the dictionary.
# Fail here, naming the offending values, instead of passing NaN labels on to
# the models (the hand-written Naive Bayes would count them as Ottawa).
unmapped_rows = subreddit_codes.isna()
if unmapped_rows.any():
    unexpected = sorted(df_train.loc[unmapped_rows, "subreddit"].astype(str).unique())
    raise ValueError(f"train.csv contains unmapped subreddit labels: {unexpected}")
df_train["subreddit"] = subreddit_codes

y = df_train["subreddit"]
X = df_train["body"]


# Loading Test Data
df_test = pd.read_csv('test.csv', encoding='utf-8', encoding_errors='ignore') # errors were not pertinent characters

<h2>Helper Functions</h2>
<h3>Preprocessing</h3>

In [48]:
def lemmatize_tokenizer2(subreddit_post, stop_word_catalogue=None):
    """Lower-case, tokenize and POS-aware lemmatize one post.

    Parameters
    ----------
    subreddit_post : str
        Raw text of the post.
    stop_word_catalogue : iterable of str, optional
        Words to discard (compared against the lower-cased token before
        lemmatization). ``None`` disables stop-word filtering.

    Returns
    -------
    list of str
        Lemmas of the alphanumeric tokens that are not stop words, in order.
    """
    words = word_tokenize(subreddit_post.lower())

    # WordNetLemmatizer needs a WordNet POS to know whether a word is a noun,
    # verb, adjective or adverb. The first letter of the tag returned by
    # pos_tag selects it; any other tag falls back to noun.
    tag_map = defaultdict(lambda: wn.NOUN)
    tag_map['J'] = wn.ADJ
    tag_map['V'] = wn.VERB
    tag_map['R'] = wn.ADV

    # Materialise the catalogue once: O(1) membership tests per token, and a
    # one-shot iterable (e.g. a generator) is not consumed by the first lookup.
    if stop_word_catalogue is None:
        stop_words = frozenset()
    else:
        stop_words = frozenset(stop_word_catalogue)

    lemmatizer = WordNetLemmatizer()
    lemmatized_words = []
    for word, tag in pos_tag(words):
        # Skip stop words and any token that is not purely alphanumeric.
        if word in stop_words or not word.isalnum():
            continue
        lemmatized_words.append(lemmatizer.lemmatize(word, tag_map[tag[0]]))
    return lemmatized_words

def lemmatize_tokenizer(subreddit_post, stop_word_catalogue=None):
    """Lower-case, tokenize and lemmatize one post without POS tagging.

    Parameters
    ----------
    subreddit_post : str
        Raw text of the post.
    stop_word_catalogue : iterable of str, optional
        Words to discard (compared against the lower-cased token before
        lemmatization). ``None`` (the default) keeps every alphabetic token.
        Previously this argument was accepted but ignored; it is now applied
        the same way lemmatize_tokenizer2 applies it.

    Returns
    -------
    list of str
        Lemmas (WordNetLemmatizer default POS) of the purely alphabetic
        tokens that are not stop words, in order.
    """
    words = word_tokenize(subreddit_post.lower())

    # Materialise the catalogue once so each membership test is O(1).
    if stop_word_catalogue is None:
        stop_words = frozenset()
    else:
        stop_words = frozenset(stop_word_catalogue)

    lemmatizer = WordNetLemmatizer()
    return [
        lemmatizer.lemmatize(word)
        for word in words
        if word.isalpha() and word not in stop_words
    ]

def preprocessing_data(X, stop_word_catalogue,version=True):
    """Lemmatize every post in ``X`` and re-join its lemmas with spaces.

    Parameters
    ----------
    X : iterable of str
        The posts (list, array or pandas Series). Items are visited in
        iteration order, so a Series with a non-default index is handled
        correctly (``X[i]`` would be a label lookup there).
    stop_word_catalogue : iterable of str or None
        Forwarded unchanged to the selected tokenizer.
    version : bool, default True
        Truthy selects lemmatize_tokenizer, falsy selects lemmatize_tokenizer2.

    Returns
    -------
    list of str
        One space-joined string of lemmas per post, in input order.
    """
    tokenizer = lemmatize_tokenizer if version else lemmatize_tokenizer2
    return [" ".join(tokenizer(post, stop_word_catalogue)) for post in X]

def feature_extraction(X,ngram_range,min_df,caption=""):
    """Fit a CountVectorizer on ``X`` and report the vocabulary it learned.

    ``ngram_range`` and ``min_df`` are forwarded to CountVectorizer. The
    number of features is printed, prefixed by ``caption``, and the array of
    feature names is returned.
    """
    count_vectorizer = CountVectorizer(ngram_range=ngram_range,min_df=min_df)
    # Only the learned vocabulary is needed; the document-term matrix is discarded.
    count_vectorizer.fit_transform(X)
    vocabulary = count_vectorizer.get_feature_names_out()
    print(caption,"features:", len(vocabulary))
    return vocabulary

<h3>Train and Tuning</h3>

In [49]:
import sklearn
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score

# This function does all the tuning for each model
def hyperparamaterTunning(X,param, folds, model, verbose_val=1, Y=None):
    """Grid-search ``model`` over ``param`` with cross-validation and report the result.

    Parameters
    ----------
    X : array-like or sparse matrix
        Training features handed to ``GridSearchCV.fit``.
    param : dict or list of dict
        Parameter grid (``param_grid`` of GridSearchCV).
    folds : int, cross-validation splitter or None
        Passed as ``cv`` to GridSearchCV.
    model : estimator
        Estimator to tune.
    verbose_val : int, default 1
        Verbosity level of GridSearchCV.
    Y : array-like, optional
        Training labels. When omitted, the notebook-level ``y`` is looked up
        at call time (not at definition time, so the default never goes stale
        and the function can be defined before ``y`` exists).

    Returns
    -------
    GridSearchCV
        The fitted search object (``GridSearchCV.fit`` returns itself).

    Prints the best parameters, the per-fold test scores of the best
    candidate and its mean score.
    """
    if Y is None:
        Y = y

    # According to doc the data will be split the same way across all calls
    model_gridSearch = GridSearchCV(model, param_grid=param, cv=folds, verbose=verbose_val)
    model_best_clf = model_gridSearch.fit(X, Y)

    print()
    print(f"Best Parameters: {model_best_clf.best_params_}")
    try:
        best_index = model_gridSearch.best_index_
        cv_results = model_gridSearch.cv_results_
        # n_splits_ is the number of splits actually used, so this also works
        # when `folds` is None or a splitter object rather than an int.
        score = [
            float(cv_results[f"split{fold}_test_score"][best_index])
            for fold in range(model_gridSearch.n_splits_)
        ]
        print(f"Cross-validation Accuracies: {np.round(score,2).tolist()}")
    except (AttributeError, KeyError, IndexError, TypeError) as err:
        # The per-fold breakdown is best-effort; say why it is missing
        # instead of hiding the problem.
        print(f"Per-fold accuracies unavailable: {err!r}")
    print(f"Mean Accuracy: {model_best_clf.best_score_:.4f}")

    return model_best_clf

<h2>Train</h2>

<h3> Preprocessing </h3>

In [50]:
# Stop-word catalogues offered by the two supported packages
nltk_stop_words = stopwords.words('english')
sklean_stop_words = list(text.ENGLISH_STOP_WORDS)

# Training data, lemmatized once per catalogue with the POS-aware tokenizer
# (version=False selects lemmatize_tokenizer2)
pp_df_train_1 = preprocessing_data(X, nltk_stop_words, False)
pp_df_train_2 = preprocessing_data(X, sklean_stop_words, False)

# Keep the variant requested by `stop_word_package`
# (1 selects the sklearn catalogue, any other value the nltk one)
if stop_word_package == 1:
    pp_train, stop_words_selected = pp_df_train_2, sklean_stop_words
else:
    pp_train, stop_words_selected = pp_df_train_1, nltk_stop_words

<h2>Data Augmentation</h2>

In [51]:

def get_synonyms(word, tag = None):
    """Return the sorted, de-duplicated WordNet lemma names for ``word``.

    ``tag`` is passed as ``pos`` to ``wn.synsets``. Lemma names whose first
    character is upper-case are left out.
    """
    lemma_names = {
        lemma.name()
        for synset in wn.synsets(word, pos=tag)
        for lemma in synset.lemmas()
        if not lemma.name()[0].isupper()
    }
    return sorted(lemma_names)

def spacyWordnetMapping(tag):
    """Translate a spaCy POS label into the matching WordNet POS letter.

    "VERB" -> "v", "ADJ" -> "a", "ADV" -> "r", "NOUN" -> "n".
    Every other value yields -1.
    """
    label_to_letter = (("VERB", "v"), ("ADJ", "a"), ("ADV", "r"), ("NOUN", "n"))
    for spacy_label, wordnet_letter in label_to_letter:
        if tag == spacy_label:
            return wordnet_letter
    return -1
# Replace a sampled share of the words with their first synonym, deterministically
def SynonymAug(text, aug_p, random_state = 42):
    """Return ``text`` with a fraction of its tokens replaced by a WordNet synonym.

    Parameters
    ----------
    text : str
        Post to augment; assumed to be lemmatized already.
    aug_p : float
        Fraction of the tokens to try to replace. ``int(len(doc) * aug_p)``
        token positions are sampled, clamped to the range [0, len(doc)].
    random_state : int, default 42
        Seed of the private generator that picks the positions; the same
        seed and text always select the same positions.

    Returns
    -------
    str
        The tokens joined by single spaces. A sampled token is replaced by the
        first entry of ``get_synonyms(word, tag)`` (a sorted list). A token is
        kept unchanged when it was not sampled, when its spaCy POS has no
        WordNet equivalent, or when WordNet offers no synonym.
    """
    # Local import: the notebook only imports `random` in a later cell.
    import random

    # A private generator keeps the selection reproducible without reseeding
    # the global `random` state that other cells rely on.
    rng = random.Random(random_state)

    # Assumed input is lemmatized
    doc = nlp(text)
    token_count = len(doc)
    sample_size = max(0, min(token_count, int(token_count * aug_p)))
    aug_idx = set(rng.sample(range(token_count), sample_size))

    newPost = []
    for position, token in enumerate(doc):
        word = token.text
        replacement = word
        if position in aug_idx:
            tag = spacyWordnetMapping(token.pos_)
            if tag != -1:
                synonyms = get_synonyms(word, tag)
                if synonyms:
                    replacement = synonyms[0]
        # Always emit a token so that no word disappears from the post.
        newPost.append(replacement)
    return " ".join(newPost)


In [52]:
# Data augmentation
import nlpaug.augmenter.word as naw
from langdetect import detect
from langdetect import DetectorFactory
from nltk.corpus import wordnet
import random


from langdetect.lang_detect_exception import LangDetectException

# Random Seed
random.seed(42)
np.random.seed(42)
DetectorFactory.seed = 1

# Share of words nlpaug may replace in each augmented post
aug_val = round(1-aug_similarity,1)
aug = naw.SynonymAug(aug_src='wordnet', aug_p=aug_val)
augmented = []
augmented_y = []

# Sample (without replacement) the positions of the posts to augment
numbers = range(len(pp_train))
picked_numbers = random.sample(numbers, int(len(pp_train)*aug_quantity))

for i in picked_numbers:
    post = pp_train[i]
    try:
        is_english = detect(post) == "en"
    except LangDetectException:
        # langdetect raises when the text has no usable features (for example
        # a post that is empty after preprocessing); skip such posts.
        is_english = False

    if is_english:
        augmented.append(aug.augment(post)[0])
        # pp_train is a positional list, so read the label by position too.
        augmented_y.append(y.iloc[i].item())


print(len(augmented))
print(len(augmented_y))

augmented_df_train = pp_train + augmented
augmented_df_y = y.tolist() + augmented_y

X_arr = np.array(augmented_df_train)
Y_arr = np.array(augmented_df_y)

# Reshuffling data --> due to addition of data augmentation
# (the augmented posts were appended at the end)
if (aug_quantity!=0):
    indices = np.arange(len(X_arr))
    np.random.shuffle(indices)
    X_shuffled = X_arr[indices]
    y_shuffled = Y_arr[indices]
else:
    # No augmentation requested: keep the original order and container types
    X_shuffled = pp_train
    y_shuffled = y.tolist()



0
0


<h2>Train</h2>
<h3>Feature Extraction</h3>

In [53]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer

# Unigram TF-IDF features (sublinear term frequency) for the sklearn models
tfidf_primary = TfidfVectorizer(
    ngram_range=(1, 1),
    sublinear_tf=True,
    stop_words=stop_words_selected,
)
X_uni_primary = tfidf_primary.fit_transform(X_shuffled)
# One column per learned feature
print(X_uni_primary.shape[1])

# Unigram counts on the raw posts, capped at 3000 features; this vectorizer
# supplies the vocabulary of the hand-written Naive Bayes model
naiveBayes = CountVectorizer(
    max_features=3000,
    ngram_range=(1, 1),
    stop_words="english",
)
X_naive_bayes = naiveBayes.fit_transform(X).toarray()
print(X_naive_bayes.shape[1])



9471
3000


<h3>Feature Reduction</h3>

In [54]:
# TRUNCATION: project the TF-IDF matrix onto fewer dimensions with truncated SVD
from sklearn.decomposition import TruncatedSVD
# Step 1: Fit initial SVD with many components (random_state fixed for repeatable results)
svd = TruncatedSVD(n_components=2000,random_state=42) 
# X_svd is the feature matrix later given to the LinearSVC grid search
X_svd = svd.fit_transform(X_uni_primary)
# len() of the projected matrix = number of rows (posts)
print(len(X_svd))


1400


<h3>Naive Bayes Model</h3>

In [55]:
from sklearn.model_selection import KFold
class NaiveBayes:
    """Hand-written Naive Bayes classifier for the four subreddit classes.

    Labels are integers: 0 = Boston, 1 = Canberra, 2 = Geneva; every other
    label value is counted as Ottawa (index 3 of the prediction).
    The vocabulary comes from ``feature_vectoriser.get_feature_names_out()``.
    For every class the model stores, per vocabulary word, the number of
    training posts of that class containing the word, plus two bookkeeping
    entries: ``"city_count"`` (posts of that class) and
    ``"city_probability"`` (class prior).
    """

    def __init__(self, x_all, y_all, feature_vectoriser):
        """Store the data set and tokenize the posts.

        x_all: iterable of raw post strings.
        y_all: labels, indexable by integer position.
        feature_vectoriser: object exposing ``get_feature_names_out()``.
        """
        self.x_all = self.clean_text_data(x_all) # Make lists of strings
        self.y_all = y_all
        self.feature_vectoriser = feature_vectoriser

        # Placeholder; nothing in this class writes to it afterwards.
        self.folds_features_probability = 0 # array of dict
        # Filled in by crossValidation().
        self.folds_accuracy = 0
        self.avg_accuracy = 0

    def calc_probability(self, x, y): # Train/Fit
        """Count, per class, in how many posts each vocabulary word occurs.

        x: list of token lists (one per post).
        y: labels aligned with ``x`` (any sequence supporting ``len`` and
           integer indexing).

        Returns the four class dictionaries in the order
        (Boston, Canberra, Geneva, Ottawa).
        Raises ZeroDivisionError when ``y`` is empty.
        """
        vocabulary = self.feature_vectoriser.get_feature_names_out()
        # One zero-initialised counter dictionary and one post counter per class.
        class_tables = [{word: 0 for word in vocabulary} for _ in range(4)]
        class_counts = [0, 0, 0, 0]

        for i in range(len(y)):
            label = y[i]
            # Exclusive chain: each post is counted for exactly one class.
            # (A sequence of independent `if`s followed by `else` would also
            # count every Boston and Canberra post as Ottawa.)
            if label == 0:
                class_index = 0
            elif label == 1:
                class_index = 1
            elif label == 2:
                class_index = 2
            else:
                class_index = 3
            class_counts[class_index] += 1
            self.add_probability(class_tables[class_index], x[i])

        # Store the number of posts of each class under "city_count" and the
        # class prior under "city_probability" in the same dictionary.
        total_posts = sum(class_counts)
        for table, count in zip(class_tables, class_counts):
            table["city_count"] = count
            table["city_probability"] = count / total_posts

        return class_tables[0], class_tables[1], class_tables[2], class_tables[3]

    def clean_text_data(self, x): # Helper function to make a list of lists of words
        """Lower-case each text, drop every character that is not a-z, 0-9 or
        whitespace, and split on whitespace.

        Returns a list of token lists (plain lists: the data is text, not numeric).
        """
        cleaned_data = [
            re.sub(r'[^a-z0-9\s]', '', text.lower()).split()
            for text in x
        ]
        # Show the first cleaned post as a sanity check (skipped for empty input).
        if cleaned_data:
            print("This is the cleaned data", cleaned_data[0])
        return cleaned_data

    def add_probability(self, city_dict, x): # Helper function to update counts given a dict and a list of words
        """Add 1 to ``city_dict[word]`` for each distinct word of ``x`` that is a key of ``city_dict``."""
        for word in set(x): # unique words: a post counts once per word
            if word in city_dict:
                city_dict[word] += 1

    def predict(self, features_probability_boston, features_probability_canberra, features_probability_geneva, features_probability_ottawa, x_i):
        """Return the index (0-3) of the most probable class for the token list ``x_i``.

        Scores are accumulated as sums of logarithms. Multiplying the raw
        probabilities underflows to 0.0 for every class on long posts, which
        made the prediction always 0 (Boston). The ranking of the classes is
        otherwise the same as with the plain product.
        """
        class_tables = (
            features_probability_boston,
            features_probability_canberra,
            features_probability_geneva,
            features_probability_ottawa,
        )

        # Start from the log prior; a class without training posts has prior 0.
        log_scores = []
        for table in class_tables:
            prior = table["city_probability"]
            log_scores.append(np.log(prior) if prior > 0 else -np.inf)

        for word in x_i:
            if word in features_probability_boston: # All have the same most common words
                for class_index, table in enumerate(class_tables):
                    # Laplace smoothing
                    smoothed = (table[word] + 1) / (table["city_count"] + 2)
                    log_scores[class_index] += np.log(smoothed)

        return np.argmax(np.array(log_scores))

    def _count_correct(self, class_tables, x, y):
        """Return how many posts of ``x`` are predicted with the label found in ``y``."""
        num_correct_labels = 0
        for i in range(len(y)):
            if self.predict(*class_tables, x[i]) == y[i]:
                num_correct_labels += 1
        return num_correct_labels

    def accu_eval(self, x, y):
        """Train on (x, y) and return the accuracy (1 - error) measured on the same data."""
        class_tables = self.calc_probability(x, y)
        return self._count_correct(class_tables, x, y) / len(y)

    def crossValidation(self, k):
        """Run shuffled k-fold cross-validation on the stored data.

        Prints the per-fold accuracies rounded to 2 decimals, stores them in
        ``folds_accuracy`` and returns their mean rounded to 4 decimals (also
        stored in ``avg_accuracy``).
        """
        kf = KFold(n_splits=k, shuffle=True, random_state=9)
        accuracies = []

        # Split into train and validation sets
        for train_indices, val_indices in kf.split(self.x_all):
            x_train = [self.x_all[i] for i in train_indices]
            x_val = [self.x_all[i] for i in val_indices]

            y_train = [self.y_all[i] for i in train_indices]
            y_val = [self.y_all[i] for i in val_indices]

            # Train on the training subset, score on the validation subset
            class_tables = self.calc_probability(x_train, np.array(y_train))
            num_correct_labels = self._count_correct(class_tables, x_val, y_val)
            accuracies.append(num_correct_labels / len(y_val))

        self.folds_accuracy = accuracies
        print(np.round(self.folds_accuracy,2).tolist())
        self.avg_accuracy = np.round(np.mean(accuracies),4)

        return self.avg_accuracy

In [56]:
# Build the hand-written Naive Bayes model from the raw posts and labels,
# taking its vocabulary from the `naiveBayes` CountVectorizer
customNaiveBayes = NaiveBayes(
    x_all=X.to_numpy().flatten(),
    y_all=y.to_numpy(),
    feature_vectoriser=naiveBayes,
)

# Cross-validate with the configured number of folds and show the mean accuracy
accuracy = customNaiveBayes.crossValidation(k=folds)
print(accuracy)

This is the cleaned data ['i', 'had', 'to', 'put', 'in', 'a', 'drain', 'wellfrench', 'drain', 'and', 'the', 'ground', 'about', '6', 'inches', 'down', 'was', 'all', 'mud', 'and', 'clay', 'i', 'was', 'ass', 'over', 'end', 'in', 'this', 'hole', 'scooping', 'clay', 'mud', 'and', 'was', 'joined', 'by', 'probably', 'ten', 'of', 'these', 'mud', 'daubers', 'for', 'a', 'couple', 'hours', 'they', 'never', 'bothered', 'me', 'at', 'all', 'in', 'their', 'own', 'little', 'way', 'they', 'were', 'kinda', 'helping', 'out', 'i', 'suppose', 'theyd', 'build', 'nests', 'in', 'the', 'garage', 'where', 'i', 'workout', 'and', 'aside', 'from', 'almost', 'smacking', 'into', 'each', 'other', 'they', 'never', 'bothered', 'me', 'theyd', 'just', 'go', 'back', 'and', 'forth', 'building', 'their', 'mud', 'tubes', 'and', 'filling', 'them', 'with', 'paralyzed', 'spiders', 'i', 'think', 'they', 'helped', 'with', 'garden', 'pests', 'and', 'we', 'had', 'so', 'many', 'spiders', 'i', 'didnt', 'mind', 'them', 'culling', 'tha

In [57]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

# Smoothing values to try for MultinomialNB, generated by np.arange(0.1, 1.1, 0.1)
param_grid_NB_1 = dict(alpha=np.arange(0.1, 1.1, 0.1).tolist())

# Grid-search MultinomialNB on the TF-IDF features
NB = hyperparamaterTunning(
    X=X_uni_primary,
    param=param_grid_NB_1,
    folds=folds,
    model=MultinomialNB(),
    Y=y_shuffled,
)

Fitting 10 folds for each of 10 candidates, totalling 100 fits

Best Parameters: {'alpha': 0.1}
Cross-validation Accuracies: [0.67, 0.66, 0.78, 0.69, 0.67, 0.67, 0.66, 0.74, 0.74, 0.74]
Mean Accuracy: 0.7014


<h3>SVM Models</h3>

In [58]:
from sklearn.svm import LinearSVC, SVC, NuSVC
from sklearn.decomposition import PCA


# Single-candidate grid: L2-penalised hinge loss with the configured C
param_grid_SVC_1 = [
    dict(
        penalty=["l2"],
        C=[C],
        loss=["hinge"],
        tol=[1e-4],
        max_iter=[5000],
    )
]


# Cross-validate LinearSVC on the SVD-reduced features
SVMModel_tunned_1b = hyperparamaterTunning(
    X=X_svd,
    param=param_grid_SVC_1,
    folds=folds,
    model=LinearSVC(fit_intercept=True),
    verbose_val=3,
    Y=y_shuffled,
)


Fitting 10 folds for each of 1 candidates, totalling 10 fits
[CV 1/10] END C=0.7, loss=hinge, max_iter=5000, penalty=l2, tol=0.0001;, score=0.679 total time=   0.4s
[CV 2/10] END C=0.7, loss=hinge, max_iter=5000, penalty=l2, tol=0.0001;, score=0.657 total time=   0.5s
[CV 3/10] END C=0.7, loss=hinge, max_iter=5000, penalty=l2, tol=0.0001;, score=0.764 total time=   0.5s
[CV 4/10] END C=0.7, loss=hinge, max_iter=5000, penalty=l2, tol=0.0001;, score=0.750 total time=   0.6s
[CV 5/10] END C=0.7, loss=hinge, max_iter=5000, penalty=l2, tol=0.0001;, score=0.736 total time=   0.4s
[CV 6/10] END C=0.7, loss=hinge, max_iter=5000, penalty=l2, tol=0.0001;, score=0.657 total time=   1.2s
[CV 7/10] END C=0.7, loss=hinge, max_iter=5000, penalty=l2, tol=0.0001;, score=0.686 total time=   1.6s
[CV 8/10] END C=0.7, loss=hinge, max_iter=5000, penalty=l2, tol=0.0001;, score=0.729 total time=   1.2s
[CV 9/10] END C=0.7, loss=hinge, max_iter=5000, penalty=l2, tol=0.0001;, score=0.714 total time=   1.3s
[CV