In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Data Exploration

I have the languageID dataset in the directory: /content/drive/MyDrive/ECE:760/hw4/data. A brief look at the data shows all the characters are english characters, which makes the task straightforward. Given in the question, that the 26 characters of english(lower-cased) and space are the characters used in the documents. The three langauges are English, Spanish and Japanese. Each language has 20 docs named 0-19. The task is Bag of Characters Multinomial Bayes Model. The question says that the characters like \n are to be ignored, which means a round of additional preprocessing is required.

## Multinomial Naive Bayes Model for Language Classification: Building the model using the triaining set

In [72]:
## Importing necessary libraries
import glob
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
import random
from sklearn.metrics import confusion_matrix,accuracy_score

In [3]:
data_root_dir="/content/drive/MyDrive/ECE:760/hw4/data"

In [4]:
## Making the train set using the docs 0-9 from the three languages

#Read the filenames  
train_files=glob.glob('/content/drive/MyDrive/ECE:760/hw4/data/[ejs][0-9].txt')

#Create a mapping for the languageID('e' or 'j' or 's') to (0,1,2)
lang_to_num_map={'e':0,
                 'j':1,
                 's':2}
num_to_lang_map={0:'e',
                 1:'j',
                 2:'s'}

#Let's define the function to create the dataset (List of Tuples(List,Int)) consisting of first item that is a list(bag) of characters and the second item referring to the label(language)


def create_dataset(file_list):
  
  """
  Creates a dataset of character-level text data from a list of filenames.

  Args:
  - file_list: A list of filenames to read data from.

  Returns:
  - data: A list of tuples where each tuple contains a list of characters (char_bag) and a label.
    The label is an integer representing the language of the text(e->0,j->1,s->2)
  """

  data=[]

  for filename in file_list:
    with open(filename) as fin:
      lines=fin.readlines()
      char_bag=[]
      for line in lines:
        line=str(line).replace("\n","")
        if len(list(line))!=0:
          for char in list(line):
            char_bag.append(char)
      
      label=lang_to_num_map[filename.split('/')[-1][0]]
      data.append((char_bag,label))


  return data

train_set=create_dataset(sorted(train_files))

In [20]:
### Getting the class conditional probabilties of the characters 

#Let's define a function to get the 27 dimensional count vector for each of the docs in the dataset
def get_count_vector(dataset):
    """
  Get a 27-dimensional count vector for each document in the dataset.

  Args:
  - dataset: a list of tuples, where each tuple contains a bag of characters for a document
            and its corresponding label.

  Returns:
  - char_count_dataset: a 2D numpy array with shape (num_docs, 27), where each row corresponds to a
                        document's count vector for the 27 characters in the English alphabet and space.
                        The order of characters is ['a', 'b', 'c', ..., 'z', ' '].
  """
    char_count_dataset=np.zeros(shape=(len(dataset),27),dtype=np.float32)
    char_list=['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l',
  'm',  'n',  'o',  'p', 'q', 'r',  's',  't', 'u', 'v', 'w', 'x', 'y', 'z',' ']
    for doc_no,(doc_bag_of_chars,label) in enumerate(dataset):
      for idx,character in enumerate(char_list):
        char_count_dataset[doc_no][idx]+=doc_bag_of_chars.count(character)
    return char_count_dataset

char_count_train=get_count_vector(train_set)

smoothing_parameter=0.5

char_count_eng=np.sum(char_count_train[:10],axis=0)
class_conditional_prob_eng=(char_count_eng+smoothing_parameter)/(np.sum(char_count_eng)+27*smoothing_parameter)

char_count_jap=np.sum(char_count_train[10:20],axis=0)
class_conditional_prob_jap=(char_count_jap+smoothing_parameter)/(np.sum(char_count_jap)+27*smoothing_parameter)

char_count_spa=np.sum(char_count_train[20:],axis=0)
class_conditional_prob_spa=(char_count_spa+smoothing_parameter)/(np.sum(char_count_spa)+27*smoothing_parameter)

    


In [21]:
np.set_printoptions(suppress = True)
print(f"The class conditional probabaility for english is:{class_conditional_prob_eng}")
print(f"The class conditional probabaility for japanese is:{class_conditional_prob_jap}")
print(f"The class conditional probabaility for spanish is:{class_conditional_prob_spa}")

The class conditional probabaility for english is:[0.06016851 0.01113497 0.02151    0.02197258 0.10536924 0.01893276
 0.01747894 0.04721626 0.05541054 0.00142078 0.00373369 0.02897737
 0.02051875 0.05792169 0.0644639  0.01675202 0.0005617  0.05382455
 0.06618206 0.08012556 0.02666446 0.00928465 0.01549645 0.00115645
 0.01384437 0.00062779 0.17924996]
The class conditional probabaility for japanese is:[0.1317656  0.01086691 0.00548587 0.01722632 0.06020476 0.00387854
 0.01401167 0.03176212 0.09703344 0.0023411  0.05740941 0.00143261
 0.03979874 0.05671058 0.09116321 0.00087355 0.00010483 0.04280373
 0.04217478 0.05699011 0.07061742 0.00024459 0.01974213 0.00003494
 0.01415144 0.00772214 0.12344946]
The class conditional probabaility for spanish is:[0.10456045 0.00823286 0.03752583 0.03974592 0.11381086 0.00860288
 0.00718448 0.0045327  0.0498597  0.00662946 0.00027751 0.05294317
 0.02580864 0.05417656 0.07249237 0.0242669  0.00767784 0.05929512
 0.0657704  0.03561407 0.03370232 0.005889

In [22]:
## Getting the count vector for e10.txt
e10_data=create_dataset([f'{data_root_dir}/e10.txt'])
char_count_e10=get_count_vector(e10_data)

In [8]:
print(f"The count vector for the file e10.txt is {char_count_e10}")

The count vector for the file e10.txt is [[164.  32.  53.  57. 311.  55.  51. 140. 140.   3.   6.  85.  64. 139.
  182.  53.   3. 141. 186. 225.  65.  31.  47.   4.  38.   2. 498.]]


In [32]:
#Getting the likelihood estimate for e10
#Using the formula \log \hat p(x \mid y) = x_i\sum_{i=1}^d \log \theta_{i,y}\

def get_log_likelihood_estimate(count_vector,class_conditional_probabilties):
  """
Calculate the log-likelihood estimate for a given count vector and class conditional probabilities.

Args:
- count_vector: a 1D numpy array with length 27, representing the count of each character
                in a document.
- class_conditional_probabilities: a 1D numpy array with length 27, representing the class conditional
                                   probabilities of each character in a language model.

Returns:
- log_likelihood_estimate: a float representing the log-likelihood estimate for the given count vector
                           and class conditional probabilities.
"""
  log_likelihood_estimate=0
  for i in range(27):
    log_likelihood_estimate+=count_vector[i]*math.log(class_conditional_probabilties[i])

  return log_likelihood_estimate

e10_llhood_eng=get_log_likelihood_estimate(char_count_e10[0],class_conditional_prob_eng)
e10_llhood_jap=get_log_likelihood_estimate(char_count_e10[0],class_conditional_prob_jap)
e10_llhood_spa=get_log_likelihood_estimate(char_count_e10[0],class_conditional_prob_spa)

In [33]:
print(f"The likelihood estimate in the log space for english is:{e10_llhood_eng}")
print(f"The likelihood estimate in the log space for japanese is:{e10_llhood_jap}")
print(f"The likelihood estimate in the log space for spanish is:{e10_llhood_spa}")

The likelihood estimate in the log space for english is:-7841.865448746583
The likelihood estimate in the log space for japanese is:-8771.43306428609
The likelihood estimate in the log space for spanish is:-8467.282069010776


In [34]:
#The priors are three for all the languages. So the posterior estimate is log(1/3) + likelihood(log)estimate
e10_log_posterior_eng=math.log(1/3)+e10_llhood_eng
e10_log_posterior_jap=math.log(1/3)+e10_llhood_jap
e10_log_posterior_spa=math.log(1/3)+e10_llhood_spa

In [35]:
print(f"The posterior estimate in the log space for english is:{e10_log_posterior_eng}")
print(f"The posterior estimate in the log space for japanese is:{e10_log_posterior_jap}")
print(f"The posterior estimate in the log space for spanish is:{e10_log_posterior_spa}")

The posterior estimate in the log space for english is:-7842.964061035251
The posterior estimate in the log space for japanese is:-8772.531676574758
The posterior estimate in the log space for spanish is:-8468.380681299444


In [46]:
### Shuffling the character order in e10 to see if the estimates change or not(Should not change as Naive Bayes considers the features to be independent)
e10_data=create_dataset([f'{data_root_dir}/e10.txt'])
random.shuffle(e10_data[0][0])

In [48]:
print(e10_data[0][0])

['n', 'o', 'r', 's', ' ', ' ', 'a', 'n', ' ', 'e', ' ', 'h', 'n', ' ', 'n', 'a', 'v', 'i', 'c', 'a', 'p', 't', 't', 'v', 'e', 'e', 'w', 's', 'o', 'e', 'n', 'h', 'o', 'e', 's', 'l', ' ', 'c', 's', ' ', 'f', 'r', 't', 'h', 'l', 'r', 'a', ' ', 's', ' ', 'a', 'e', 'f', 'l', 'h', 'm', 'i', 'e', 'f', 'u', 'r', 'o', 't', 'e', ' ', 'r', 't', 'g', 'e', ' ', 'u', ' ', 'g', 's', 'h', 'm', 'u', 'f', 'l', 'a', 't', 't', 'p', 'n', 'e', 'i', 't', 'i', 't', 'r', 'b', ' ', 't', 'h', 'n', 't', 'a', 'i', 'e', ' ', 'r', 's', 'r', 'o', 'f', 'e', 't', 'o', 'p', 'n', 'i', 'd', 'r', 't', 'e', 'o', 'e', 'e', 'e', 'f', 'n', 'c', 'e', ' ', 's', 'r', 'e', 'm', ' ', 'e', 'm', 'p', ' ', ' ', ' ', 'o', ' ', ' ', ' ', 'i', 'h', 'o', 'c', 'u', 'l', 'i', 'o', 'i', 't', 'e', 'f', 'o', 's', 's', 'u', 'e', 'p', 'y', 'e', ' ', 'n', 'p', ' ', 'l', ' ', ' ', 'a', 'w', 'h', 'c', 'r', 's', 't', 'g', 'e', 't', 'n', 'i', 's', 'e', 'm', 'e', 'm', 'i', 't', 'o', 'e', 'r', ' ', ' ', 's', 's', 't', 'o', 'o', 'h', 'n', 'k', 'u', 'a',

In [49]:
get_count_vector(e10_data) #Same count vector

array([[164.,  32.,  53.,  57., 311.,  55.,  51., 140., 140.,   3.,   6.,
         85.,  64., 139., 182.,  53.,   3., 141., 186., 225.,  65.,  31.,
         47.,   4.,  38.,   2., 498.]], dtype=float32)

## Evaluating the test set using the built classifier model

In [50]:
test_files=sorted(glob.glob('/content/drive/MyDrive/ECE:760/hw4/data/[ejs][1][0-9].txt'))

In [52]:
#Let's first make the test dataset
test_data=create_dataset(test_files)
#Get the count vector for each of the files in the test set
count_vector_test=get_count_vector(test_data)

In [60]:
#Now let's estimate the log posteriors for all the docs for the three languages
log_posterior_estimate_test=np.zeros(shape=(30,3),dtype=np.float32)

for idx,cvector in enumerate(count_vector_test):
  log_posterior_estimate_test[idx][0]=get_log_likelihood_estimate(cvector,class_conditional_prob_eng) +math.log(1/3)
  log_posterior_estimate_test[idx][1]=get_log_likelihood_estimate(cvector,class_conditional_prob_jap) +math.log(1/3)
  log_posterior_estimate_test[idx][2]=get_log_likelihood_estimate(cvector,class_conditional_prob_spa) +math.log(1/3)

In [67]:
#Get the prediction and the confusion matrix 
prediction=np.argmax(log_posterior_estimate_test,axis=1)
true_values=np.array([tup[1] for tup in test_data])

In [75]:
print(f"The accuracy of the classifier on this test set is:{accuracy_score(true_values,prediction)*100}%")

The accuracy of the classifier on this test set is:100.0%


In [71]:
confusion_matrix(true_values, prediction)

array([[10,  0,  0],
       [ 0, 10,  0],
       [ 0,  0, 10]])