<a href="https://colab.research.google.com/github/HuyenNguyenHelen/LING-5412/blob/main/Assignment1_NaiveBayes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Loading dataset

In [29]:
import numpy as np
import os
import tarfile
import re
import sklearn
from sklearn.metrics import accuracy_score
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import cross_validate
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer


In [26]:
# Unpack the Ling-Spam archive. The context manager closes the tar file even
# if extraction raises, which the manual open()/close() pair did not guarantee.
with tarfile.open('/content/lingspam_public.tar.gz') as my_tar:
  my_tar.extractall('/content/')
# Spam messages are the files whose names contain 'spmsg' (spmsg*.txt).
train_path = '/content/lingspam_public/lemm_stop/part1'   # for training
test_path = '/content/lingspam_public/lemm_stop/part10'   # for testing

In [27]:
def to_dict (path):
  """Read every file in ``path`` and group the file contents by class.

  A file is labelled spam (key ``1``) when its name contains ``'spmsg'``;
  every other file is labelled non-spam (key ``0``). The per-class counts are
  printed as a side effect.

  Args:
    path: directory containing one message per file.

  Returns:
    A tuple ``(data_dict, n_docs)`` where ``data_dict`` maps ``1`` and ``0``
    to lists of raw document strings and ``n_docs`` is the number of
    directory entries that were read.
  """
  # Key 1 is inserted before key 0; callers that iterate over the dict see
  # the classes in that order.
  data_dict = dict()
  data_dict[1] = []
  data_dict[0] = []
  file_names = os.listdir(path)
  for file in file_names:
    label = 1 if 'spmsg' in file else 0
    # `with` closes each handle right away instead of leaking one per file.
    with open(os.path.join(path, file), 'r') as doc:
      data_dict[label].append(doc.read())
  print ('number of spams: {}'.format(len(data_dict[1])))
  print ('number of not_spams: {}'.format(len(data_dict[0])))
  n_docs = len(file_names)
  return data_dict, n_docs

# Load the training split: `training` maps class label (1 = spam, 0 = non-spam)
# to a list of raw document strings; `n_docs_train` is the number of files read.
print('training set:')
training, n_docs_train = to_dict (train_path)
print('number of doc: {}'.format(n_docs_train))

# Load the held-out test split in the same format.
print('\ntesting set:')
testing, n_docs_test = to_dict (test_path)
print('number of doc: {}'.format(n_docs_test))


training set:
number of spams: 48
number of not_spams: 241
number of doc: 289

testing set:
number of spams: 49
number of not_spams: 242
number of doc: 291


In [35]:
def tokenizer (doc):
  """Lower-case ``doc`` and split it on runs of non-word characters.

  Args:
    doc: the document text as a string.

  Returns:
    A list of lower-cased tokens. Because ``re.split`` is used, an empty
    string is included at the start and/or end of the list when the text
    begins or ends with a non-word character (and ``['']`` is returned for an
    empty document).
  """
  # Raw string: "\W" inside a plain string literal is an invalid escape
  # sequence and triggers a warning on current Python versions.
  return re.split(r"\W+", doc.lower())
  
def count_words(data_dict):
  """Count how often each token occurs in each class.

  Args:
    data_dict: mapping with keys ``0`` and ``1``, each holding a list of raw
      document strings.

  Returns:
    A dict ``tf`` where ``tf[c]`` maps every token seen in class ``c`` to its
    total number of occurrences across that class's documents. The total
    token count per class is printed as a side effect.
  """
  tf = {}
  # One shared loop replaces the two copy-pasted per-class loops; the unused
  # concatenation of both document lists is gone.
  for label in (0, 1):
    counts = {}
    for doc in data_dict[label]:
      for token in tokenizer(doc):
        counts[token] = counts.get(token, 0) + 1
    tf[label] = counts
  print('sum of tf0: {}, sum of tf1 {}'. format(sum(tf[0].values()), sum(tf[1].values())))
  return tf

def to_bow (data_dict):
  """Flatten each class's documents into a single list of tokens.

  Args:
    data_dict: mapping with keys ``0`` and ``1``, each holding a list of raw
      document strings.

  Returns:
    A dict with keys ``0`` and ``1``; each value is the concatenation, in
    document order, of ``tokenizer(doc)`` for every document of that class.
  """
  return {
    label: [token for doc in data_dict[label] for token in tokenizer(doc)]
    for label in (0, 1)
  }

def logprior(data_dict, n_docs):
  """Compute the log prior probability of each class.

  Args:
    data_dict: mapping with keys ``1`` (spam) and ``0`` (non-spam), each
      holding a list of documents.
    n_docs: total number of documents used as the denominator.

  Returns:
    A dict mapping ``1`` and ``0`` to ``log(class_size / n_docs)``. The two
    class sizes are printed as a side effect.
  """
  class_sizes = {1: len(data_dict[1]), 0: len(data_dict[0])}
  print('length of spams: {}, nonspam: {}'.format(class_sizes[1], class_sizes[0]))
  return {label: np.log(size / n_docs) for label, size in class_sizes.items()}


def train_NB (training, alpha):
  """Train a multinomial Naive Bayes model with additive smoothing.

  Args:
    training: mapping with keys ``1`` (spam) and ``0`` (non-spam), each
      holding a list of raw document strings.
    alpha: smoothing constant added to every token count.

  Returns:
    A tuple ``(logpriors, loglikelihood, set_V)``:
      * ``logpriors[c]`` is the log prior of class ``c``;
      * ``loglikelihood[c][w]`` is
        ``log((count(w, c) + alpha) / (N_c + |V| * alpha))`` where ``N_c`` is
        the number of tokens in class ``c`` and ``|V|`` the vocabulary size;
      * ``set_V`` is the vocabulary (set of all tokens in ``training``).
    Diagnostic sizes and the priors are printed as a side effect.
  """
  # Tokenise the corpus once and reuse the result.
  bows = to_bow (training)
  set_V = set(bows[1] + bows[0])
  tf = count_words(training)
  # Take the document count from the argument, not from a notebook global,
  # so the priors are right for whatever dataset is passed in.
  n_docs = len(training[0]) + len(training[1])
  logpriors = logprior(training, n_docs)
  loglikelihood = {}
  loglikelihood[0], loglikelihood[1] = {}, {}
  for c in training.keys():
    bow_c = bows[c]
    print('.....length of bow {}:  {}.......'.format(c, len(bow_c)))
    # The denominator is the same for every word of the class.
    denominator = len(bow_c)+(len(set_V)*alpha)
    tf_c = tf[c]
    loglikelihood[c] = {
      w: np.log((tf_c.get(w, 0) + alpha)/denominator) for w in set_V
    }
  print('length of set V: {}'. format(len(set_V)))
  print('log prior class {}: {}'. format(0, logpriors[0]))
  print('log prior class {}: {}'. format(1, logpriors[1]))

  return logpriors, loglikelihood, set_V

# Fit the model with alpha = 1; `training_result` is the tuple
# (logpriors, loglikelihood, set_V) returned by train_NB.
training_result = train_NB (training, alpha = 1)


sum of tf0: 47459, sum of tf1 21015
length of spams: 48, nonspam: 241
.....length of bow 1:  21015.......
.....length of bow 0:  47459.......
length of set V: 11315
log prior class 0: -0.18162975462177716
log prior class 1: -1.7952256772045412


In [37]:
def test_NB (testing, model=None):
  """Classify documents with a trained Naive Bayes model.

  Args:
    testing: mapping with keys ``0`` and ``1``, each holding a list of raw
      document strings. Documents are scored in the order
      ``testing[0] + testing[1]``.
    model: optional ``(logpriors, loglikelihood, set_V)`` tuple as returned
      by ``train_NB``. When omitted, the notebook-level ``training_result``
      is used.

  Returns:
    A list with one prediction per document, ``1`` for spam and ``0`` for
    non-spam, in the same order as ``testing[0] + testing[1]``.
  """
  if model is None:
    model = training_result
  # Unpack the model once instead of once per document.
  logpriors, loglikelihood, set_V = model
  spam_loglik, nonspam_loglik = loglikelihood[1], loglikelihood[0]
  spam_logprior, nonspam_logprior = logpriors[1], logpriors[0]

  prediction = []
  for doc in testing[0] + testing[1]:
    spam_score = 0
    nonspam_score = 0
    for w in tokenizer(doc):
      # Words outside the vocabulary are ignored.
      if w not in set_V:
        continue
      if w in spam_loglik:
        spam_score += spam_loglik[w]
      if w in nonspam_loglik:
        nonspam_score += nonspam_loglik[w]

    spam_score += spam_logprior
    nonspam_score += nonspam_logprior

    # Ties go to the non-spam class.
    prediction.append(1 if spam_score > nonspam_score else 0)
  return prediction

# Flatten each split into a document list plus matching gold labels. The
# order (class 0 first, then class 1) matches the order in which test_NB
# scores documents, so predictions and labels line up.
X_train = training[0]+ training[1]
y_train = [0]*len(training[0]) + [1]*len(training[1])

X_test = testing[0]+ testing[1]
y_true = [0]*len(testing[0]) + [1]*len(testing[1])

# Predict each split once and reuse the result for both reports.
y_pred_train = test_NB (training)
y_pred = test_NB (testing)

print('accuracy on training set: {}'.format(accuracy_score(y_train, y_pred_train)))
print(classification_report(y_train, y_pred_train))

print('accuracy on test set: {}'.format(accuracy_score(y_true, y_pred)))
print(classification_report(y_true, y_pred))

accuracy on training set: 0.9896193771626297
              precision    recall  f1-score   support

           0       1.00      0.99      0.99       241
           1       0.94      1.00      0.97        48

    accuracy                           0.99       289
   macro avg       0.97      0.99      0.98       289
weighted avg       0.99      0.99      0.99       289

accuracy on test set: 0.9381443298969072
              precision    recall  f1-score   support

           0       0.99      0.94      0.96       242
           1       0.75      0.94      0.84        49

    accuracy                           0.94       291
   macro avg       0.87      0.94      0.90       291
weighted avg       0.95      0.94      0.94       291



In [None]:


# Rebuild the document lists and gold labels (class 0 first, then class 1) so
# this cell does not rely on variables left over from the previous cell.
X_train = training[0]+ training[1]
y_train= [0]*len(training[0]) + [1]*len(training[1])
X_test = testing[0]+ testing[1]
y_true = [0]*len(testing[0]) + [1]*len(testing[1])

# Creating a vectorizer model that convert a collection of text documents to a matrix of token counts
vectorizer = CountVectorizer(lowercase = False)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)
# Converting  sparse matrix to a dense matrix
X_train_vec = X_train_vec.toarray()
X_test_vec = X_test_vec.toarray()

nb_2 = MultinomialNB()
nb_2.fit(X_train_vec, y_train)
# Predict each split once and reuse the result for both reports.
y_pred_train_2 = nb_2.predict(X_train_vec)
y_pred_2 = nb_2.predict(X_test_vec)

print('accuracy on training set: {}'.format(accuracy_score(y_train, y_pred_train_2)))
print(classification_report(y_train, y_pred_train_2))

print('accuracy on test set: {}'.format(accuracy_score(y_true, y_pred_2)))
print(classification_report(y_true, y_pred_2))

accuracy on training set: 0.9965397923875432
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       241
           1       0.98      1.00      0.99        48

    accuracy                           1.00       289
   macro avg       0.99      1.00      0.99       289
weighted avg       1.00      1.00      1.00       289

accuracy on test set: 0.9896907216494846
              precision    recall  f1-score   support

           0       0.99      1.00      0.99       242
           1       1.00      0.94      0.97        49

    accuracy                           0.99       291
   macro avg       0.99      0.97      0.98       291
weighted avg       0.99      0.99      0.99       291

