In [1]:
# Import the following

import os
import glob
from sklearn.feature_extraction.text import CountVectorizer
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import pandas as pd
import numpy as np
import regex as re
import copy
import shutil
import math
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import SGDClassifier
import warnings
warnings.filterwarnings('ignore')
import sys

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [2]:
# Unzipping the datasets if dataset not already unzipped

from zipfile import ZipFile

r = [1,2,4]
for i in r:
  file_name = '/content/enron{}_train.zip'.format(i)
  with ZipFile(file_name, 'r') as z:
    z.extractall()
    print('Done')

for i in r:
  file_name = '/content/enron{}_test.zip'.format(i)
  with ZipFile(file_name, 'r') as z:
    z.extractall()
    print('Done')

# there is a abornmality for enron2 dataset path, hence resolving that
if os.path.exists('/content/enron2/') == False:
  os.mkdir('/content/enron2/')

shutil.move("/content/train/", "/content/enron2/")
shutil.move("/content/test/","/content/enron2/")

Done
Done
Done
Done
Done
Done


'/content/enron2/test'

In [3]:
# To get the files/data/emails from their respective paths

def get_data(dataset_num):
  '''
  This function takes the dataset number
  and returns the list of email or files
  '''
  files_ham = []
  files_spam = []
  test_files_ham = []
  test_files_spam = []
  path = os.getcwd()
  path_train = os.path.join(path, 'enron{}/train'.format(dataset_num))
  path_test = os.path.join(path, 'enron{}/test'.format(dataset_num))
  path_train_ham = os.path.join(path_train, 'ham')
  path_train_spam = os.path.join(path_train, 'spam')
  path_test_ham = os.path.join(path_test, 'ham')
  path_test_spam = os.path.join(path_test, 'spam')

  # list of paths
  files_list_ham = glob.glob(path_train_ham + '/' + '*.txt')
  files_list_spam = glob.glob(path_train_spam + '/' + '*txt')
  test_files_list_ham = glob.glob(path_test_ham + '/' + '*txt')
  test_files_list_spam = glob.glob(path_test_spam + '/' + '*txt')

  # Saving the files in a list
  for ham in files_list_ham:
    files_ham.append(open(ham, 'r', errors='ignore').read())
  for spam in files_list_spam:
    files_spam.append(open(spam, 'r', errors='replace').read()) 

  for ham in test_files_list_ham:
    test_files_ham.append(open(ham, 'r', errors='ignore').read())
  for spam in test_files_list_spam:
    test_files_spam.append(open(spam, 'r', errors='replace').read())

  size_of_ham = len(files_list_ham)
  size_of_spam = len(files_list_spam)
  size_of_total = len(files_list_ham) + len(files_list_spam)

  return files_ham, files_spam, size_of_ham, size_of_spam, size_of_total, test_files_ham, test_files_spam

In [4]:
def convert_bow(temp_ham, temp_spam, Voc_dict):
  '''
  This function takes the list of files in ham and spam, and also the vocabulary dictionary 
  and returns the bag of words representation for the dataset
  '''
  spam_bow = []
  ham_bow = []

  for file in temp_ham:
    temp_dict = copy.deepcopy(Voc_dict)
    file1 = re.findall('[a-zA-Z]+', file)
    for word in file1:
      word = word
      if word in Voc_dict:
        temp_dict[word] = temp_dict[word] + 1
    ham_bow.append(temp_dict)
  for file in temp_spam:
    temp_dict = copy.deepcopy(Voc_dict)
    file1 = re.findall('[a=zA-Z]+', file)
    for word in file1:
      word = word
      if word in Voc_dict:
        temp_dict[word] = temp_dict[word] + 1
    spam_bow.append(temp_dict)

  return ham_bow, spam_bow  

def getting_bow(files_ham, files_spam, size_of_ham, size_of_spam, size_of_total, test_files_ham, test_files_spam):
  '''
  This function takes the list of files in ham and spam
  and returns the bag of words representation for the dataset
  '''
  word_list = []
  ham_train_bow = []
  spam_train_bow = []
  ham_test_bow = []
  spam_test_bow = []

  # creating a Vocabulary
  for file in files_ham:
    file1 = re.findall("[a-zA-Z]+", file)
    for word in file1:
      word_list.append(word)
  for file in files_spam:
    file1 = re.findall("[a-zA-Z]+", file)
    for word in file1:
      word_list.append(word)
 
  Voc = list(set(word_list))

  # creating vocabulary dictionary
  Voc_dict = {}
  for word in Voc:
    Voc_dict[word] = 0 

  # converting data into Bag of Words
  ham_train_bow, spam_train_bow = convert_bow(files_ham, files_spam, Voc_dict)
  ham_test_bow, spam_test_bow = convert_bow(test_files_ham, test_files_spam, Voc_dict) 

  # converting BOW training data into DataFrame to np array
  ham_arr = pd.DataFrame(ham_train_bow).to_numpy()
  spam_arr = pd.DataFrame(spam_train_bow).to_numpy()

  # converting BOW testing data into DataFrame to np array
  test_ham_bow_arr = pd.DataFrame(ham_test_bow).to_numpy()
  test_spam_bow_arr = pd.DataFrame(spam_test_bow).to_numpy()

  train_bow_arr = np.append(ham_arr, spam_arr, axis=0)
  test_bow_arr = np.append(test_ham_bow_arr, test_spam_bow_arr, axis=0)

  return ham_arr, spam_arr, train_bow_arr, test_bow_arr, Voc_dict

In [5]:
def convert_bern(temp_ham, temp_spam, Voc_dict):
  '''
  This function takes the list of files in ham and spam, and also the vocabulary dictionary 
  and returns the bernoulli representation for the dataset
  '''
  ham_bern = []
  spam_bern = []

  for file in temp_ham:
    temp_dict = copy.deepcopy(Voc_dict)
    file1 = re.findall('[a-zA-Z]+', file)
    tr = 0
    for word in file1:
      word = word
      if word in Voc_dict:
        temp_dict[word] = 1
    ham_bern.append(temp_dict)  
  for file in temp_spam:
    temp_dict = copy.deepcopy(Voc_dict)
    file1 = re.findall('[a=zA-Z]+', file)
    for word in file1:
      word = word
      if word in Voc_dict:
        temp_dict[word] = 1
    spam_bern.append(temp_dict)

  return ham_bern, spam_bern  

def getting_bern(files_ham, files_spam, size_of_ham, size_of_spam, size_of_total, test_files_ham, test_files_spam):
  '''
  This function takes the list of files in ham and spam 
  and returns the bernoulli representation for the dataset
  '''
  word_list = []
  ham_train_bern = []
  spam_train_bern = []
  ham_test_bern = []
  spam_test_bern = []

  # creating a Vocabulary
  for file in files_ham:
    file1 = re.findall("[a-zA-Z]+", file)
    for word in file1:
      word_list.append(word)
  for file in files_spam:
    file1 = re.findall("[a-zA-Z]+", file)
    for word in file1:
      word_list.append(word)

  Voc = list(set(word_list))

  # creating vocabulary dictionary
  Voc_dict = {}
  for word in Voc:
    Voc_dict[word] = 0 

  # converting data into Bernoulli model
  ham_train_bern, spam_train_bern = convert_bern(files_ham, files_spam, Voc_dict)
  ham_test_bern, spam_test_bern = convert_bern(test_files_ham, test_files_spam, Voc_dict) 

  # converting Bernoulli training data into DataFrame to np array
  ham_arr = pd.DataFrame(ham_train_bern).to_numpy()
  spam_arr = pd.DataFrame(spam_train_bern).to_numpy()

  # converting Bern testing data into DataFrame to np array
  test_ham_bern_arr = pd.DataFrame(ham_test_bern).to_numpy()
  test_spam_bern_arr = pd.DataFrame(spam_test_bern).to_numpy()

  train_bern_arr = np.append(ham_arr, spam_arr, axis=0)
  test_bern_arr = np.append(test_ham_bern_arr, test_spam_bern_arr, axis=0)

  return ham_arr, spam_arr, train_bern_arr, test_bern_arr, Voc_dict

In [6]:
def mnb_train(ham_bow_arr, spam_bow_arr, size_of_ham, size_of_spam, size_of_total, Voc_dict):
  '''
  This function takes the bag of words representation of the data, and also the vocabulary dictionary 
  and returns the priors and conditional probability for the Multinomial NB
  '''
  prior = {}

  prior['ham'] = math.log(size_of_ham/float(size_of_total))
  prior['spam'] = math.log(size_of_spam/float(size_of_total))

  # summation of all the columns
  ham_bow_column_sum = np.sum(ham_bow_arr, axis=0)
  spam_bow_column_sum = np.sum(spam_bow_arr, axis=0)

  # total number of words
  ham_bow_total = np.sum(ham_bow_column_sum)
  spam_bow_total = np.sum(spam_bow_column_sum)

  # calculating conditional prob and storing them in log
  ham_cond_prob = np.log((ham_bow_column_sum+1)/(ham_bow_total+len(Voc_dict)))
  spam_cond_prob = np.log((spam_bow_column_sum+1)/(spam_bow_total+len(Voc_dict)))

  return prior, ham_cond_prob, spam_cond_prob

In [7]:
def mnb_test(prior, ham_cond_prob, spam_cond_prob, test_bow_arr, Y_test):
  '''
  This function takes the priors and conditional probability, along with the test data for the Multinomial NB
  and returns the necessary metrics such as accuracy
  '''

  # multiplying test_bow_arr with log conditional probability 
  # and then taking the sum over all features along with their respective prior
  # and finally exponenting the result
  Y_predict_ham = np.exp(np.sum(np.multiply(test_bow_arr, ham_cond_prob), axis=1) + (prior['ham']))
  Y_predict_spam = np.exp(np.sum(np.multiply(test_bow_arr, spam_cond_prob), axis=1) + (prior['spam']))

  Y_bow_predict = []
  for i,j in zip(Y_predict_ham, Y_predict_spam):
    if i>j:
      Y_bow_predict.append(1)
    else:
      Y_bow_predict.append(0)

  # calculating the necessary metrics
  accuracy_mnb = accuracy_score(Y_test, Y_bow_predict)
  scores_mnb = precision_recall_fscore_support(Y_test, Y_bow_predict, average="macro")

  return accuracy_mnb, scores_mnb

In [8]:
def mnb(dataset_num):
  '''
  This function takes the dataset number 
  and returns accuracy and other metrics after training and testing the Multinomial NB
  '''

  # Get training and testing dataset from the dataset number
  files_ham, files_spam, size_of_ham, size_of_spam, size_of_total, test_files_ham, test_files_spam = get_data(dataset_num)

  # Get the Training and Testing Bag of Words 
  ham_train_arr, spam_train_arr, train_bow_arr, test_bow_arr, Voc_dict = getting_bow(files_ham, files_spam, size_of_ham, size_of_spam, size_of_total, test_files_ham, test_files_spam)

  # Implement or Train the Multinoial NB and get the priors
  prior, ham_cond_prob, spam_cond_prob = mnb_train(ham_train_arr, spam_train_arr, size_of_ham, size_of_spam, size_of_total, Voc_dict)

  # array of output of testing data
  Y_test = []
  for i in range(len(test_files_ham)):
    Y_test.append(1)
  for i in range(len(test_files_spam)):
    Y_test.append(0)  

  # Testing the MNB implementation
  accuracy_mnb, scores_mnb = mnb_test(prior, ham_cond_prob, spam_cond_prob, test_bow_arr, Y_test)

  return accuracy_mnb, scores_mnb


In [9]:
def dnb_train(ham_bern_arr, spam_bern_arr, size_of_ham, size_of_spam, size_of_total, Voc_dict):
  '''
  This function takes the bernoulli representation of the data, and also the vocabulary dictionary 
  and returns the priors and conditional probability for the Discrete NB
  '''
  prior = {}

  prior['ham'] = math.log(size_of_ham/float(size_of_total))
  prior['spam'] = math.log(size_of_spam/float(size_of_total))

  # summation of all the columns
  ham_bern_column_sum = np.sum(ham_bern_arr, axis=0)
  spam_bern_column_sum = np.sum(spam_bern_arr, axis=0)

  # total number of words
  ham_bern_total = np.sum(ham_bern_column_sum)
  spam_bern_total = np.sum(spam_bern_column_sum)

  # calculating conditional prob and storing them in log
  ham_cond_prob = np.log((np.count_nonzero(ham_bern_arr, axis=0)+1)/(size_of_ham+2))
  spam_cond_prob = np.log((np.count_nonzero(spam_bern_arr, axis=0)+1)/(size_of_spam+2))

  return prior, ham_cond_prob, spam_cond_prob

In [10]:
def dnb_test(prior, ham_cond_prob, spam_cond_prob, test_bern_arr, Y_test):
  '''
  This function takes the priors and conditional probability, along with the test data for the Discrete NB
  and returns the necessary metrics such as accuracy
  '''
  
  # multiplying test_bern_arr with log conditional probability 
  # and then taking the sum over all features along with their respective prior
  # and finally exponenting the result

  Y_predict_ham = np.exp(np.sum(np.multiply(test_bern_arr, ham_cond_prob), axis=1) + (prior['ham']))
  Y_predict_spam = np.exp(np.sum(np.multiply(test_bern_arr, spam_cond_prob), axis=1) + (prior['spam']))

  Y_bern_predict = []
  for i,j in zip(Y_predict_ham, Y_predict_spam):
    if i>j:
      Y_bern_predict.append(1)
    else:
      Y_bern_predict.append(0)

  # calculating the necessary metrics
  accuracy_dnb = accuracy_score(Y_test, Y_bern_predict)
  scores_dnb = precision_recall_fscore_support(Y_test, Y_bern_predict, average="macro")

  return accuracy_dnb, scores_dnb

In [11]:
def dnb(dataset_num):
  '''
  This function takes the dataset number 
  and returns accuracy and other metrics after training and testing the Discrete NB
  '''

  # Get training and testing dataset from the dataset number
  files_ham, files_spam, size_of_ham, size_of_spam, size_of_total, test_files_ham, test_files_spam = get_data(dataset_num)

  # Get the Training and Testing Bernoulli model 
  ham_train_arr, spam_train_arr, train_bern_arr, test_bern_arr, Voc_dict = getting_bern(files_ham, files_spam, size_of_ham, size_of_spam, size_of_total, test_files_ham, test_files_spam)

  # Implement or Train the Discrete NB and get the priors
  prior, ham_cond_prob, spam_cond_prob = dnb_train(ham_train_arr, spam_train_arr, size_of_ham, size_of_spam, size_of_total, Voc_dict)

  # array of output of testing data
  Y_test = []
  for i in range(len(test_files_ham)):
    Y_test.append(1)
  for i in range(len(test_files_spam)):
    Y_test.append(0)  

  # Testing the DNB implementation
  accuracy_dnb, scores_dnb = dnb_test(prior, ham_cond_prob, spam_cond_prob, test_bern_arr, Y_test)

  return accuracy_dnb, scores_dnb

In [12]:
def lr_predict(x, w0, w):
  '''
  This function takes the weights and data
  and returns the sigmoid of the summation
  '''
  summation = w0 + np.sum(np.multiply(w, x), axis=1)
  pred_y = np.where(
            summation >= 0, # condition
            1 / (1 + np.exp(-summation)),
            np.exp(summation) / (1 + np.exp(summation))
            )
  return pred_y

In [13]:
def lr_train(x_train, y_train, lam, S):
  '''
  This function takes the training data, along with the lambda and maximum number of iterations
  and returns the weights after training 
  '''
  W0 = 0.0
  W = np.zeros(len(x_train[0]))  
  L = 0.01     
  CLL_log = []

  for epoch in range(S):
    y_predict_lr = lr_predict(x_train, W0, W)
    cll_w0 = np.sum(np.subtract(y_train, y_predict_lr))
    cll = np.transpose(x_train)@np.subtract(y_train, y_predict_lr)

    # update the weights, according to gradient ascent and L2 regularization
    W0 = W0 + L * (cll_w0 - (lam*W0))
    W = np.add(W, (np.multiply(L, (np.subtract(cll, np.multiply(lam, W))))))

    log_cll = np.sum(cll)/len(cll)
    CLL_log.append(log_cll)

    # Since it is a small dataset the convergence can be achieved very quickly
    if log_cll<0.1:
      break

  return W0, W, CLL_log

In [14]:
def lr_test(x, W0, W):
  '''
  This function takes the weights and data
  and returns the prediction of the test data using the sigmoid function lr_predict
  '''
  pred_y = lr_predict(x, W0, W)
  y_prediction = []
  for pred in pred_y:
    if pred>0.5:
      y_prediction.append(1)
    else:
      y_prediction.append(0)
  return y_prediction

def lr_validate(x_train, y_train, x_vali, y_vali):
  '''
  This function takes the training and validation data
  and returns the Best Lambda by training and validating on the respective data
  '''
  L = 0.01
  best_lambda = 0.2
  best_acc = 0
  for lam in [0.1, 0.3, 0.5]:
    w0, w, cll_log = lr_train(x_train, y_train, lam, 30)
    y_predict_mcap = lr_test(x_vali, w0, w)
    acc = accuracy_score(y_vali, y_predict_mcap)
    if acc > best_acc:
      best_acc = acc
      best_lambda = lam
  return best_lambda

In [15]:
def mcap_bow(dataset_num):
  '''
  This function takes the dataset number
  and returns accuracy and other metrics after training and testing 
  the LR model for Bag of Words representation
  '''
  # Get training and testing dataset from the dataset number
  files_ham, files_spam, size_of_ham, size_of_spam, size_of_total, test_files_ham, test_files_spam = get_data(dataset_num)

  # Get the Training and Testing Bag of Words 
  ham_train_arr, spam_train_arr, train_bow_arr, test_bow_arr, Voc_dict = getting_bow(files_ham, files_spam, size_of_ham, size_of_spam, size_of_total, test_files_ham, test_files_spam)

  # array of output of training data
  Y_train_arr = []
  for i in range(len(files_ham)):
    Y_train_arr.append(1)
  for i in range(len(files_spam)):
    Y_train_arr.append(0)

  # array of output of testing data
  Y_test = []
  for i in range(len(test_files_ham)):
    Y_test.append(1)
  for i in range(len(test_files_spam)):
    Y_test.append(0) 

  # create train and validation sets
  x_train, x_vali, y_train, y_vali = train_test_split(train_bow_arr, Y_train_arr, test_size=0.3, random_state=5)

  # We will now find the best lambda by training on train data and validating on validation data over different lambda values
  Best_lambda = lr_validate(x_train, y_train, x_vali, y_vali)

  # Now train the model using the best lambda and full training data
  W0_final, W_final, CLL_log_final = lr_train(train_bow_arr, Y_train_arr, Best_lambda, 80)

  # Testing the MCAP LR BOW implementation
  Y_mcap_bow_predict = lr_test(test_bow_arr, W0_final, W_final)

  # calculating the necessary metrics
  accuracy_mcap = accuracy_score(Y_test, Y_mcap_bow_predict)
  scores_mcap = precision_recall_fscore_support(Y_test, Y_mcap_bow_predict, average="macro")

  return accuracy_mcap, scores_mcap

In [16]:
def mcap_bern(dataset_num):
  '''
  This function takes the dataset number
  and returns accuracy and other metrics after training and testing 
  the LR model for Bernoulli representation
  '''
  # Get training and testing dataset from the dataset number
  files_ham, files_spam, size_of_ham, size_of_spam, size_of_total, test_files_ham, test_files_spam = get_data(dataset_num)

  # Get the Training and Testing Bernoulli Model
  ham_train_arr, spam_train_arr, train_bern_arr, test_bern_arr, Voc_dict = getting_bern(files_ham, files_spam, size_of_ham, size_of_spam, size_of_total, test_files_ham, test_files_spam)

  # array of output of training data
  Y_train_arr = []
  for i in range(len(files_ham)):
    Y_train_arr.append(1)
  for i in range(len(files_spam)):
    Y_train_arr.append(0)

  # array of output of testing data
  Y_test = []
  for i in range(len(test_files_ham)):
    Y_test.append(1)
  for i in range(len(test_files_spam)):
    Y_test.append(0) 

  # create train and validation sets
  x_train, x_vali, y_train, y_vali = train_test_split(train_bern_arr, Y_train_arr, test_size=0.3, random_state=5)

  # We will now find the best lambda by training on train data and validating on validation data over different lambda values
  Best_lambda = lr_validate(x_train, y_train, x_vali, y_vali)

  # Now train the model using the best lambda and full training data
  W0_final, W_final, CLL_log_final = lr_train(train_bern_arr, Y_train_arr, Best_lambda, 80)

  # Testing the MCAP LR Bernoulli implementation
  Y_mcap_bern_predict = lr_test(test_bern_arr, W0_final, W_final)

  # calculating the necessary metrics
  accuracy_mcap = accuracy_score(Y_test, Y_mcap_bern_predict)
  scores_mcap = precision_recall_fscore_support(Y_test, Y_mcap_bern_predict, average="macro")

  return accuracy_mcap, scores_mcap

In [17]:

def sgd_bow(dataset_num):
  '''
  This function takes the dataset number
  and returns accuracy and other metrics after training and testing 
  the SGDClassifier model for the Bag of Words representation
  '''
  # Get training and testing dataset from the dataset number
  files_ham, files_spam, size_of_ham, size_of_spam, size_of_total, test_files_ham, test_files_spam = get_data(dataset_num)

  # Get the Training and Testing Bag of Words 
  ham_train_arr, spam_train_arr, train_bow_arr, test_bow_arr, Voc_dict = getting_bow(files_ham, files_spam, size_of_ham, size_of_spam, size_of_total, test_files_ham, test_files_spam)

  # array of output of training data
  Y_train_arr = []
  for i in range(len(files_ham)):
    Y_train_arr.append(1)
  for i in range(len(files_spam)):
    Y_train_arr.append(0)

  # array of output of testing data
  Y_test = []
  for i in range(len(test_files_ham)):
    Y_test.append(1)
  for i in range(len(test_files_spam)):
    Y_test.append(0) 

  # create train and validation sets
  x_train, x_vali, y_train, y_vali = train_test_split(train_bow_arr, Y_train_arr, test_size=0.3, random_state=5)

  model = SGDClassifier()
  # Tuning the paramters using GridSearchCV
  parameters = {'alpha': (0.001, 0.01), 'loss': ['squared_hinge', 'hinge', 'log'], 'max_iter': [30, 50, 80]}
  grid_search = GridSearchCV(model, param_grid=parameters)    
  classifier = grid_search.fit(x_vali, y_vali)

  # Now we train the model on the basis of the tuned parameters
  cls = classifier.fit(x_train, y_train)

  # Testing the implementation of SGD BOW
  Y_sgd_bow_predict = cls.predict(test_bow_arr)
  
  # calculating the necessary metrics
  accuracy_sgd = accuracy_score(Y_test, Y_sgd_bow_predict)
  scores_sgd = precision_recall_fscore_support(Y_test, Y_sgd_bow_predict, average="macro")

  return accuracy_sgd, scores_sgd

In [18]:
def sgd_bern(dataset_num):
  '''
  This function takes the dataset number
  and returns accuracy and other metrics after training and testing 
  the SGDClassifier model for the Bernoulli representation
  '''
  # Get training and testing dataset from the dataset number
  files_ham, files_spam, size_of_ham, size_of_spam, size_of_total, test_files_ham, test_files_spam = get_data(dataset_num)

  # Get the Training and Testing Bernoulli
  ham_train_arr, spam_train_arr, train_bern_arr, test_bern_arr, Voc_dict = getting_bern(files_ham, files_spam, size_of_ham, size_of_spam, size_of_total, test_files_ham, test_files_spam)

  # array of output of training data
  Y_train_arr = []
  for i in range(len(files_ham)):
    Y_train_arr.append(1)
  for i in range(len(files_spam)):
    Y_train_arr.append(0)

  # array of output of testing data
  Y_test = []
  for i in range(len(test_files_ham)):
    Y_test.append(1)
  for i in range(len(test_files_spam)):
    Y_test.append(0) 

  # create train and validation sets
  x_train, x_vali, y_train, y_vali = train_test_split(train_bern_arr, Y_train_arr, test_size=0.3, random_state=5)

  model = SGDClassifier()
  # Tuning the paramters using GridSearchCV
  parameters = {'alpha': (0.001, 0.01), 'loss': ['squared_hinge', 'hinge', 'log'], 'max_iter': [30, 50, 80]}
  grid_search = GridSearchCV(model, param_grid=parameters)  
  classifier = grid_search.fit(x_vali, y_vali)

  # Now we train the model on the basis of the tuned parameters
  cls = classifier.fit(x_train, y_train)

  # Testing the implementation of SGD BOW
  Y_sgd_bern_predict = cls.predict(test_bern_arr)
  
  # calculating the necessary metrics
  accuracy_sgd = accuracy_score(Y_test, Y_sgd_bern_predict)
  scores_sgd = precision_recall_fscore_support(Y_test, Y_sgd_bern_predict, average="macro")

  return accuracy_sgd, scores_sgd

In [None]:
# Getting input from user for dataset name, algorithm name, and type of representation
'''
Enter the dataset number: This can take the numerical values such as '1', '2', and '4' which coreesponds 
	to the datasets named - enron1, enron2, enron4
Enter the algorithm name: This can take the following values:
	'mnb' for Multinomial Naive Bayes
	'dnb' for Discrete Naive Bayes
	'mcap' for MCAP Logistic Regression
	'sgd' for SGDClassifier
Enter the representation: This can take only 2 values:
	'bow' for Bag of Words representation
	'bern' for Bernoulli representation
Kindly input the 'bow' for MNB and 'bern' for DNB
'''

dataset_number = input('Enter the dataset number: ')
algo_name = input('Enter the algorithm name: ')
rep_type = input('Enter the type of representation: ')

if algo_name == 'mnb':
  rep_type = 'bow'
  Acc, Scores = mnb(dataset_number)

elif algo_name == 'dnb':
  rep_type = 'bern'
  Acc, Scores = dnb(dataset_number)

elif algo_name == 'mcap':
  if rep_type == 'bow':
    Acc, Scores = mcap_bow(dataset_number)
  elif rep_type == 'bern':
    Acc, Scores = mcap_bern(dataset_number)

elif algo_name == 'sgd':
  if rep_type == 'bow':
    Acc, Scores = sgd_bow(dataset_number)
  elif rep_type == 'bern':
    Acc, Scores = sgd_bern(dataset_number)

print('Following evaluations are for {} algorithm and {} representation type'.format(algo_name, rep_type), '\n', 'Model Accuracy :', Acc, '\n', 'Model Precision :', Scores[0], '\n', 'Model Recall :', Scores[1], '\n', 'Model F1 Score : ', Scores[2])

In [20]:

# Run this only if you want to run and get metrics for all Algorithms with each of the 3 datasets and 2 representation type

def show(Acc, Scores, algo_name, rep_type, i):
  '''
  This function takes accuracy and other metrics, along with the algorithm name and representation type
  and dataset name
  and outputs them in a neat fashion
  '''
  print('Following evaluations are for {} algorithm and {} representation type with dataset {}'.format(algo_name, rep_type, i), '\n', 'Model Accuracy :', Acc, '\n', 'Model Precision :', Scores[0], '\n', 'Model Recall :', Scores[1], '\n', 'Model F1 Score : ', Scores[2])

for algo_name in ['mnb', 'dnb', 'mcap', 'sgd']:
  if algo_name == 'mnb':
    rep_type = 'bow'
    for i in [1,2,4]:
      Acc, Scores = mnb(i)
      show(Acc, Scores, algo_name, rep_type, i)

  elif algo_name == 'dnb':
    rep_type = 'bern'
    for i in [1,2,4]:
      Acc, Scores = dnb(i)
      show(Acc, Scores, algo_name, rep_type, i)

  elif algo_name == 'mcap':
    rep_type = 'bow'
    for i in [1,2,4]:
      Acc, Scores = mcap_bow(i)
      show(Acc, Scores, algo_name, rep_type, i)
    rep_type = 'bern'
    for i in [1,2,4]:
      Acc, Scores = mcap_bern(i)
      show(Acc, Scores, algo_name, rep_type, i)

  elif algo_name == 'sgd':
    rep_type = 'bow'
    for i in [1,2,4]:
      Acc, Scores = sgd_bow(i)
      show(Acc, Scores, algo_name, rep_type, i)
    rep_type = 'bern'
    for i in [1,2,4]:
      Acc, Scores = sgd_bern(i)
      show(Acc, Scores, algo_name, rep_type, i)


Following evaluations are for mnb algorithm and bow representation type with dataset 1 
 Model Accuracy : 0.7280701754385965 
 Model Precision : 0.7608108906794855 
 Model Recall : 0.7894104015915004 
 Model F1 Score :  0.7255145631067961
Following evaluations are for mnb algorithm and bow representation type with dataset 2 
 Model Accuracy : 0.7364016736401674 
 Model Precision : 0.749543475207192 
 Model Recall : 0.8141467727674625 
 Model F1 Score :  0.725322436470438
Following evaluations are for mnb algorithm and bow representation type with dataset 4 
 Model Accuracy : 0.8876611418047882 
 Model Precision : 0.9325221238938053 
 Model Recall : 0.799342105263158 
 Model F1 Score :  0.8383052882855175
Following evaluations are for dnb algorithm and bern representation type with dataset 1 
 Model Accuracy : 0.8377192982456141 
 Model Precision : 0.8820698342969734 
 Model Recall : 0.7585860131604836 
 Model F1 Score :  0.7861054766734281
Following evaluations are for dnb algorithm an