In [None]:
pip install contractions

In [None]:
# Mount Google Drive at /gdrive (Colab only) so files stored there can be read by path.
from google.colab import drive
drive.mount('/gdrive')

In [None]:
from google.colab import files

# Open Colab's upload dialog; the result maps each uploaded file name to its content.
uploaded = files.upload()

# Report the name and size of every uploaded file.
for fn, payload in uploaded.items():
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(payload)))

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from Gaussian_Naive_Bayes import GaussianNaiveBayes
from nltk.tokenize import word_tokenize
import re
import nltk
from contractions import contractions_dict
from nltk.corpus import stopwords
from spacy.lang.en.stop_words import STOP_WORDS
from itertools import filterfalse
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
import time
# Fetch the NLTK resources needed by word_tokenize, stopwords, pos_tag and WordNetLemmatizer.
# 'punkt_tab' and 'averaged_perceptron_tagger_eng' are the resource names that newer NLTK
# releases look up for tokenizing / tagging; the older names are kept for older releases.
# nltk.download reports a failed download and returns False instead of raising, so
# requesting a name that a given release does not need is harmless.
for resource in ('punkt', 'punkt_tab', 'stopwords',
                 'averaged_perceptron_tagger', 'averaged_perceptron_tagger_eng',
                 'wordnet'):
    nltk.download(resource)

In [None]:
# Load the e-mail dataset from the mounted Google Drive.
# Later cells read its 'text' and 'spam' columns.
data = pd.read_csv("/gdrive/MyDrive/Colab Notebooks/Email processing/emails.csv")

In [None]:
# Tokens matching this pattern are discarded: @mentions, #hashtags, URLs, anything
# containing a non-word character or a digit, HTML-like tags, underscores and
# non-ASCII characters. Compiled once because pre_processing runs on every e-mail.
_NOISE_TOKEN_RE = re.compile(
    r'^@[a-zA-Z0-9]|^#[a-zA-Z0-9]|\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*|\W+|\d+|<("[^"]*"|\'[^\']*\'|[^\'">])*>|_+|[^\u0000-\u007f]+'
)

# Second filter applied together with the stop-word check: single-character words,
# non-ASCII runs, underscores and non-word characters.
_SHORT_OR_SYMBOL_RE = re.compile(r'\b\w\b|[^\u0000-\u007f]+|_+|\W+')

# Leading headers to strip, most specific first ("Subject:" must come last).
_SUBJECT_PREFIXES = ("Subject: re :", "Subject: news :", "Subject:")

# Lazily filled cache so the stop-word set is built once, not once per e-mail.
_STOP_WORD_CACHE = {}


def _english_stop_words():
    """Return the union of the NLTK and spaCy English stop words as a frozenset."""
    if 'en' not in _STOP_WORD_CACHE:
        _STOP_WORD_CACHE['en'] = frozenset(stopwords.words('english')) | frozenset(STOP_WORDS)
    return _STOP_WORD_CACHE['en']


def _strip_subject_prefix(text):
    """Remove a leading "Subject:" / "Subject: re :" / "Subject: news :" header, if present."""
    for prefix in _SUBJECT_PREFIXES:
        if text.startswith(prefix):
            return text[len(prefix):]
    return text


def _wordnet_pos(tag):
    """Map a tag produced by pos_tag to a WordNet POS constant (noun when unrecognised)."""
    return {'J': wordnet.ADJ, 'V': wordnet.VERB, 'R': wordnet.ADV}.get(tag[:1], wordnet.NOUN)


def pre_processing(text):
    """Turn one raw e-mail string into a list of lemmatized, lower-case tokens.

    Steps: strip the leading subject header, tokenize with NLTK, lower-case,
    expand contractions found in ``contractions_dict``, drop noise tokens and
    stop words, POS-tag the survivors and lemmatize each with its POS.

    Parameters
    ----------
    text : str
        Raw e-mail text. Anything that is not a ``str`` (e.g. a missing value)
        yields an empty list.

    Returns
    -------
    list of str
        The lemmatized tokens, in their original order.
    """
    if not isinstance(text, str):
        return []

    # Only a header at the very start of the text is stripped; the same phrase
    # occurring later in the body must not cause leading characters to be cut off.
    text = _strip_subject_prefix(text)

    # Tokenize and normalize case.
    tokens = [token.lower() for token in word_tokenize(text)]

    # Contraction expansion (tokens without an entry are kept unchanged).
    tokens = [contractions_dict.get(token, token) for token in tokens]

    # Drop noise tokens, stop words and single-character tokens in one pass.
    # No re.split step is needed afterwards: every surviving token contains no
    # match of the noise pattern, so splitting on it would return the token itself.
    stop_words = _english_stop_words()
    tokens = [
        token for token in tokens
        if not _NOISE_TOKEN_RE.search(token)
        and token not in stop_words
        and not _SHORT_OR_SYMBOL_RE.search(token)
    ]

    # POS-tag, then lemmatize each word using the WordNet POS derived from its tag.
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(word=word, pos=_wordnet_pos(tag)) for word, tag in pos_tag(tokens)]

In [None]:
# Replace every raw e-mail in 'text' with the token list produced by pre_processing.
data['text'] = data['text'].apply(pre_processing)

In [None]:
# Display the DataFrame; 'text' now holds the token lists returned by pre_processing.
data

In [None]:
# Build the dense TF-IDF matrix (one row per e-mail, one column per vocabulary term)
# from the token lists stored in data['text'].

# Collect the vocabulary in place; set.update avoids creating a new set per e-mail.
vocab = set()
for list_of_tokens in data['text']:
  vocab.update(list_of_tokens)

# Sort the terms so the column order of the matrix is the same on every run
# (iteration order of a set of strings is not stable across interpreter sessions).
vocab_dict = {term: column for column, term in enumerate(sorted(vocab))}

# TfidfVectorizer expects whitespace-separated documents, so join the tokens back.
corpus = [" ".join(tokens) for tokens in data['text']]

vectorizer = TfidfVectorizer(vocabulary=vocab_dict)
tf_idf_matrix = vectorizer.fit_transform(corpus).toarray()

# Free the intermediates; only tf_idf_matrix is used by later cells.
del vocab, vocab_dict, corpus, vectorizer

print(tf_idf_matrix.shape)

In [None]:
class Email_classification(GaussianNaiveBayes):
  """PCA-reduced TF-IDF data split into training / cv / testing frames, on top of GaussianNaiveBayes.

  Attributes set by ``__init__``:
    tf_idf_matrix_reduced : PCA projection of the full TF-IDF matrix.
    reduced_data          : DataFrame of the projection plus a 'spam' label column.
    training, cv, testing : DataFrames with the feature columns followed by 'spam'.

  NOTE(review): PCA is fitted on all rows, including those that end up in ``cv`` and
  ``testing``, so the projection has seen the held-out data.
  """

  def __init__(self, tf_idf_matrix, labels, n_comp, dicriminant_analysis='qda', rda_p=np.nan,
               *, discriminant_analysis=None, random_state=None):
    """Reduce the features with PCA, build the three splits and initialise the base class.

    Parameters
    ----------
    tf_idf_matrix : array-like
        Feature matrix, one row per e-mail.
    labels : array-like
        One 0/1 spam label per row of ``tf_idf_matrix``, matched by position.
    n_comp : int
        Number of PCA components to keep.
    dicriminant_analysis : str, default 'qda'
        Forwarded as the first argument of ``GaussianNaiveBayes.__init__``
        (original, misspelled parameter name, kept for compatibility).
    rda_p : default np.nan
        Forwarded as the second argument of ``GaussianNaiveBayes.__init__``.
    discriminant_analysis : str, optional, keyword-only
        Correctly spelled alias; when given it overrides ``dicriminant_analysis``.
    random_state : optional, keyword-only
        Passed to ``train_test_split`` so the cv/testing split can be made reproducible.
    """
    if discriminant_analysis is not None:
      dicriminant_analysis = discriminant_analysis

    pca = PCA(n_components=n_comp)
    self.tf_idf_matrix_reduced = pca.fit_transform(tf_idf_matrix)

    self.reduced_data = pd.DataFrame(self.tf_idf_matrix_reduced)
    # Assign by position: a Series with a non-default index would otherwise be
    # aligned on index labels and silently produce NaN / shuffled labels.
    self.reduced_data['spam'] = np.asarray(labels)

    spam_data = self.reduced_data[self.reduced_data['spam'] == 1]
    not_spam_data = self.reduced_data[self.reduced_data['spam'] == 0]

    # Each class contributes at most 35% of all rows to training (the original
    # rule), but never more than 70% of its own rows. Without the second bound a
    # minority class smaller than the cap is consumed entirely by training, leaving
    # cv and testing without a single example of it.
    per_class_cap = int(self.reduced_data.shape[0]*0.7)//2
    n_spam_train = min(per_class_cap, int(len(spam_data)*0.7))
    n_not_spam_train = min(per_class_cap, int(len(not_spam_data)*0.7))

    self.training = pd.concat([spam_data.iloc[:n_spam_train,:], not_spam_data.iloc[:n_not_spam_train,:]], axis=0)

    remaining = pd.concat([spam_data.iloc[n_spam_train:,:], not_spam_data.iloc[n_not_spam_train:,:]], axis=0)

    # Two thirds of the held-out rows form the cv set, one third the testing set.
    X_cv, X_test, Y_cv, Y_test = train_test_split(remaining.iloc[:,:-1], remaining['spam'], test_size=1/3, random_state=random_state)

    # assign() returns a new frame, so no slice of `remaining` is modified in place.
    self.cv = X_cv.assign(spam=Y_cv)
    self.testing = X_test.assign(spam=Y_test)

    super().__init__(dicriminant_analysis, rda_p)

In [None]:
def evaluate(predicted, actual):
    """Compute binary classification metrics, treating label 1 as the positive class.

    Both inputs are converted to flat numpy arrays and compared by position, so
    lists, arrays, column vectors and pandas Series (whatever their index) all work.

    Parameters
    ----------
    predicted : array-like
        Predicted labels (0 or 1).
    actual : array-like
        True labels (0 or 1), same number of elements as ``predicted``.

    Returns
    -------
    tuple
        ``(accuracy, precision, recall, f1_score)``. A metric whose denominator
        is zero is reported as 0.

    Raises
    ------
    ValueError
        If the two inputs do not contain the same number of elements.
    """
    predicted = np.asarray(predicted).ravel()
    actual = np.asarray(actual).ravel()
    if predicted.shape != actual.shape:
        raise ValueError(
            "predicted and actual must have the same number of elements, got {} and {}".format(
                predicted.size, actual.size))

    TP = np.count_nonzero((predicted == 1) & (actual == 1))
    TN = np.count_nonzero((predicted == 0) & (actual == 0))
    FP = np.count_nonzero((predicted == 1) & (actual == 0))
    FN = np.count_nonzero((predicted == 0) & (actual == 1))

    total = TP + TN + FP + FN
    accuracy = (TP + TN)/total if total else 0
    precision = TP/(TP + FP) if (TP + FP) else 0
    recall = TP/(TP + FN) if (TP + FN) else 0

    if (precision + recall) == 0:
        f1_score = 0
    else:
        f1_score = (2 * precision * recall)/(precision + recall)

    return (accuracy, precision, recall, f1_score)

In [None]:
# Metric tuples of every evaluated configuration.
Results = {}

# Running record of the best configuration seen so far.
best = {
    'Accuracy': 0,
    'Precision': 0,
    'Recall': 0,
    'F1 Score': 0,
    'Best Obj': 'obj',
    'Parameter': (),
}

# We will evaluate various values of N Components in PCA under the assumption of Quadratic Discriminant Analysis

In [None]:
# Sweep the number of PCA components (2, 22, ..., 1222) with the constructor's
# default 'qda' setting, scoring each model on the cv split.
for n_comp in np.arange(2,1241,20):
    obj = Email_classification(tf_idf_matrix, data['spam'], n_comp)
    obj.fit(obj.training.iloc[:,:-1], obj.training['spam'])
    obj.predict(obj.cv.iloc[:,:-1])

    key = ('qda', n_comp)
    scores = evaluate(obj.predicted_labels, obj.cv['spam'])
    Results[key] = scores

    # Keep the model only when it beats the best cv accuracy seen so far.
    if scores[0] > best['Accuracy']:
        best.update({
            'Accuracy': scores[0],
            'Precision': scores[1],
            'Recall': scores[2],
            'F1 Score': scores[3],
            'Best Obj': obj,
            'Parameter': key,
        })
    else:
        del obj

In [None]:
# Best configuration recorded so far (after the QDA sweep).
best

# We will evaluate various values of N Components in PCA under the assumption of Linear Discriminant Analysis

In [None]:
# Sweep the number of PCA components with the 'lda' setting, scoring each model on
# the cv split and keeping the one with the best accuracy so far.
# NOTE(review): unlike the QDA and RDA sweeps, this range has no step of 20, so it
# fits 1239 models instead of 62 - confirm this is intended.
for n_comp in np.arange(2,1241):
    # 'lda' is passed positionally: the constructor's parameter is spelled
    # `dicriminant_analysis`, so the keyword `discriminant_analysis=` raised a TypeError.
    obj = Email_classification(tf_idf_matrix, data['spam'], n_comp, 'lda')
    obj.fit(obj.training.iloc[:,:-1], obj.training['spam'])
    obj.predict(obj.cv.iloc[:,:-1])
    Results[('lda', n_comp)] = evaluate(obj.predicted_labels, obj.cv['spam'])
    if Results[('lda', n_comp)][0] > best['Accuracy']:
        best['Accuracy'] = Results[('lda', n_comp)][0]
        best['Precision'] = Results[('lda', n_comp)][1]
        best['Recall'] = Results[('lda', n_comp)][2]
        best['F1 Score'] = Results[('lda', n_comp)][3]
        best['Best Obj'] = obj
        best['Parameter'] = ('lda', n_comp)
    else:
        del obj

In [None]:
# Best configuration recorded so far (after the QDA and LDA sweeps).
best

# We will evaluate various values of N Components in PCA under the assumption of Regularized Discriminant Analysis

In [None]:
# Grid search over the two rda_p values (alpha, gamma in 0.1 ... 0.9) and the number
# of PCA components (2, 22, ..., 1222), scoring each model on the cv split.
# NOTE(review): the model type passed here is 'lda' while the results are keyed
# 'rda'; presumably a non-NaN rda_p is what enables regularization - verify against
# GaussianNaiveBayes.
for alpha in np.arange(0.1,1,0.1):
    for gamma in np.arange(0.1,1,0.1):
        for n_comp in np.arange(2,1241,20):
            # 'lda' is passed positionally: the constructor's parameter is spelled
            # `dicriminant_analysis`, so the keyword `discriminant_analysis=` raised a TypeError.
            obj = Email_classification(tf_idf_matrix, data['spam'], n_comp, 'lda', rda_p=(alpha, gamma))
            obj.fit(obj.training.iloc[:,:-1], obj.training['spam'])
            obj.predict(obj.cv.iloc[:,:-1])
            # The label column of the cv frame is 'spam'; 'labels' does not exist (KeyError).
            Results[('rda', n_comp, alpha, gamma)] = evaluate(obj.predicted_labels, obj.cv['spam'])
            if Results[('rda', n_comp, alpha, gamma)][0] > best['Accuracy']:
                best['Accuracy'] = Results[('rda', n_comp, alpha, gamma)][0]
                best['Precision'] = Results[('rda', n_comp, alpha, gamma)][1]
                best['Recall'] = Results[('rda', n_comp, alpha, gamma)][2]
                best['F1 Score'] = Results[('rda', n_comp, alpha, gamma)][3]
                best['Best Obj'] = obj
                best['Parameter'] = ('rda', n_comp, alpha, gamma)
            else:
                del obj

In [None]:
# Best configuration recorded so far (after the QDA, LDA and RDA sweeps).
best

# From over ----- hyperparameter combinations, we find our best model

In [None]:
# Final record of the best configuration: its metrics, the fitted object and its parameters.
best

In [None]:
# Retrieve the best model; the dictionary key is 'Best Obj' (capital O) -
# the previous spelling 'Best obj' raised a KeyError.
best_obj = best['Best Obj']

In [None]:
# Predict on the held-out test features (every column except the trailing 'spam' label).
# The model was already fitted on its training split during the sweeps; calling fit()
# on the test frame would either fail (no labels argument) or retrain on test data.
best_obj.predict(best_obj.testing.iloc[:,:-1])

In [None]:
# Compare the model's stored predictions with the test labels; evaluate() returns
# the tuple (accuracy, precision, recall, f1_score).
print(evaluate(best_obj.predicted_labels, best_obj.testing['spam']))