In [1]:
import math
import os
from typing import Optional

import numpy as np
import pandas as pd

## Constructing our Dataset

In [2]:
dir_path = './languageID/'
spanish_files = []
english_files = []
japanese_files = []

# Bucket every regular file in the corpus directory by the first letter of its
# name: 's' -> Spanish, 'e' -> English, 'j' -> Japanese. Anything else is skipped.
files_by_prefix = {'s': spanish_files, 'e': english_files, 'j': japanese_files}
for file_name in os.listdir(dir_path):
    file_path = os.path.join(dir_path, file_name)
    if os.path.isfile(file_path) and file_name[:1] in files_by_prefix:
        files_by_prefix[file_name[:1]].append(file_path)


def _document_number(file_path):
    """Return the integer N from a path whose file name is '<letter>N<extension>'."""
    # Work on the file name itself instead of a fixed character offset, so the
    # result does not depend on how dir_path is written or on the extension length.
    stem, _extension = os.path.splitext(os.path.basename(file_path))
    return int(stem[1:])


# Historical offset of the first digit inside a joined path. It is only correct
# while dir_path ends with a path separator, so the sorting below no longer
# relies on it; the name is kept in case another cell still reads it.
slash_index = len(dir_path) + 1

# os.listdir returns names in arbitrary order; sort each language numerically
# so that, for example, e2.txt comes before e10.txt.
spanish_files.sort(key=_document_number)
english_files.sort(key=_document_number)
japanese_files.sort(key=_document_number)

In [3]:
symbols = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
           'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', ' ']


def _count_symbols(file_path):
    """Count how often each entry of `symbols` occurs in one text file.

    Characters that are not listed in `symbols` (newlines, punctuation, ...)
    are ignored. The counts are returned as a list in the order of `symbols`.
    """
    symbol_counts = {s: 0 for s in symbols}
    with open(file_path, 'r') as file:
        for line in file:
            for char in line:
                if char in symbol_counts:
                    symbol_counts[char] += 1
    return list(symbol_counts.values())


def _labelled_rows(file_paths, label):
    """Build one row per file: the 27 symbol counts followed by the class label."""
    return [_count_symbols(file_path) + [label] for file_path in file_paths]


# Class labels stored in the last column: 0 = english, 1 = japanese, 2 = spanish.
english_data = _labelled_rows(english_files, 0)
japanese_data = _labelled_rows(japanese_files, 1)
spanish_data = _labelled_rows(spanish_files, 2)

In [4]:
# The first 10 documents of every language are used for training and the
# remaining ones for testing. Languages are stacked in the order english,
# japanese, spanish.
language_rows = (english_data, japanese_data, spanish_data)
training_data = [row for rows in language_rows for row in rows[:10]]
test_data = [row for rows in language_rows for row in rows[10:]]

# Every row of both matrices has 28 columns:
#       columns 0-25 hold the counts for a-z, column 26 the count for space,
#       column 27 is the label: 0, 1, or 2 for english, japanese, and spanish
training_data = np.array(training_data)
test_data = np.array(test_data)

In [5]:
# Display the assembled training matrix (27 symbol counts followed by the label column).
training_data

array([[ 67,   8,  28,  25, 142,  16,  12,  66,  61,   0,   9,  46,  25,
         75,  88,  17,   0,  72,  76, 100,  38,  13,  28,   0,  16,   1,
        236,   0],
       [ 69,   5,  10,  23, 108,  16,   8,  48,  51,   3,   1,  22,  25,
         43,  59,  12,   0,  57,  73,  87,  27,   6,  15,   0,  19,   1,
        181,   0],
       [ 40,   8,  15,  17,  92,  18,  15,  43,  44,   0,   5,  20,  13,
         41,  55,  19,   0,  44,  54,  60,  14,   6,  10,   1,   8,   2,
        139,   0],
       [138,  30,  48,  50, 225,  41,  24, 101, 123,   3,   5,  62,  54,
        134, 148,  32,   3, 122, 151, 189,  59,  24,  42,   1,  35,   0,
        391,   0],
       [ 41,   5,  16,  18, 102,  21,  24,  52,  51,   0,   2,  24,  20,
         52,  72,  25,   0,  56,  51,  66,  24,  10,  10,   3,  13,   1,
        145,   0],
       [210,  32,  63,  54, 313,  55,  68, 136, 178,   6,  10, 107,  60,
        188, 179,  46,   1, 156, 204, 256,  81,  31,  52,   4,  32,   1,
        557,   0],
       [ 4

## Naive Bayes Implementation

### Q3.2, Q3.3

In [6]:
# Column 27 holds the class label: 0 = english, 1 = japanese, 2 = spanish.
labels = training_data[:, 27]
e_mask = (labels == 0)
j_mask = (labels == 1)
s_mask = (labels == 2)


def _smoothed_symbol_probs(class_rows):
    """Return additively smoothed symbol probabilities for one class's rows."""
    # Total occurrences of each of the 27 symbols over all rows of the class.
    symbol_totals = class_rows[:, :27].sum(axis=0)
    # 0.5 is added to every symbol count; the denominator grows by 27 * 0.5
    # accordingly so that the 27 probabilities still sum to one.
    return (symbol_totals + 0.5) / (np.sum(symbol_totals) + (27/2))


# Pr(ci | y = e)
pce = _smoothed_symbol_probs(training_data[e_mask])

# Pr(ci | y = j)
pcj = _smoothed_symbol_probs(training_data[j_mask])

# Pr(ci | y = s)
pcs = _smoothed_symbol_probs(training_data[s_mask])

In [7]:
# Print the class-conditional symbol probabilities in the order english, japanese, spanish.
for class_conditional in (pce, pcj, pcs):
    print(class_conditional)

[0.06016851 0.01113497 0.02151    0.02197258 0.10536924 0.01893276
 0.01747894 0.04721626 0.05541054 0.00142078 0.00373369 0.02897737
 0.02051875 0.05792169 0.0644639  0.01675202 0.0005617  0.05382455
 0.06618206 0.08012556 0.02666446 0.00928465 0.01549645 0.00115645
 0.01384437 0.00062779 0.17924996]
[1.31765610e-01 1.08669066e-02 5.48586603e-03 1.72263182e-02
 6.02047591e-02 3.87854223e-03 1.40116706e-02 3.17621161e-02
 9.70334393e-02 2.34110207e-03 5.74094133e-02 1.43261470e-03
 3.97987351e-02 5.67105769e-02 9.11632132e-02 8.73545547e-04
 1.04825466e-04 4.28037318e-02 4.21747790e-02 5.69901115e-02
 7.06174220e-02 2.44592753e-04 1.97421294e-02 3.49418219e-05
 1.41514379e-02 7.72214263e-03 1.23449457e-01]
[1.04560451e-01 8.23286362e-03 3.75258241e-02 3.97459221e-02
 1.13810860e-01 8.60287996e-03 7.18448398e-03 4.53270019e-03
 4.98597021e-02 6.62945947e-03 2.77512257e-04 5.29431717e-02
 2.58086399e-02 5.41765595e-02 7.24923684e-02 2.42669051e-02
 7.67783910e-03 5.92951189e-02 6.5770404

In [8]:
# Keep the parameters in log space: a document's likelihood is a product of
# many small probabilities, which would underflow if multiplied directly.
lpce, lpcj, lpcs = np.log(pce), np.log(pcj), np.log(pcs)

### Q3.4

In [9]:
# Symbol counts of the first test row with the trailing label column removed
# (e10.txt, assuming the english files are numbered contiguously from 0).
x = test_data[0, :-1]
x

array([164,  32,  53,  57, 311,  55,  51, 140, 140,   3,   6,  85,  64,
       139, 182,  53,   3, 141, 186, 225,  65,  31,  47,   4,  38,   2,
       498])

### Q3.5

In [10]:
# log Pr(X | y)
def lpr_x_given_y(X: np.ndarray, y: str) -> Optional[float]:
    """Return the log-likelihood log Pr(X | y) of a count vector under one class.

    The value is the sum over i of X[i] * log_theta[i], where log_theta is the
    module-level table of per-symbol log-probabilities selected by `y`:
    `lpce` for 'e', `lpcj` for 'j' and `lpcs` for 's'.

    Args:
        X: symbol counts, positionally aligned with the log-probability table.
        y: class label, one of 'e', 'j' or 's'.

    Returns:
        The accumulated log-likelihood, or None when `y` is not a known label.
    """
    if y == 'e':
        log_theta = lpce
    elif y == 'j':
        log_theta = lpcj
    elif y == 's':
        log_theta = lpcs
    else:
        return None

    # Accumulate term by term, in index order, so results stay bit-for-bit
    # reproducible with previously recorded outputs.
    log_likelihood = 0
    for i, count in enumerate(X):
        log_likelihood += count * log_theta[i]
    return log_likelihood

In [11]:
# Log-likelihood of the selected test document under each language model.
lpr_e, lpr_j, lpr_s = (lpr_x_given_y(x, language) for language in ('e', 'j', 's'))

In [12]:
# Report the three log-likelihoods, one per line.
for caption, log_likelihood in (
    ('log probability given y=e: ', lpr_e),
    ('log probability given y=j: ', lpr_j),
    ('log probability given y=s: ', lpr_s),
):
    print(caption + str(log_likelihood))

log probability given y=e: -7841.865447060635
log probability given y=j: -8771.433079075032
log probability given y=s: -8467.282044010557


### Q3.7

In [13]:
# log Pr(X | y)
def con_lpr(X: np.ndarray, y: int) -> Optional[float]:
    """Return the class-conditional log-likelihood log Pr(X | y) for an integer label.

    The value is the sum over i of X[i] * log_theta[i], where log_theta is the
    module-level table of per-symbol log-probabilities selected by `y`:
    `lpce` for 0, `lpcj` for 1 and `lpcs` for 2.

    Args:
        X: symbol counts, positionally aligned with the log-probability table.
        y: integer class label, one of 0, 1 or 2.

    Returns:
        The accumulated log-likelihood (a float, not an int), or None when `y`
        is not a known label.
    """
    if y == 0:
        log_theta = lpce
    elif y == 1:
        log_theta = lpcj
    elif y == 2:
        log_theta = lpcs
    else:
        return None

    # Accumulate term by term, in index order, so results stay bit-for-bit
    # reproducible with previously recorded outputs.
    log_likelihood = 0
    for i, count in enumerate(X):
        log_likelihood += count * log_theta[i]
    return log_likelihood

def predict(X: np.array) -> int:
    """Return the label (0, 1 or 2) whose class-conditional log-likelihood is largest.

    Only the likelihood from `con_lpr` is compared; no class prior is added.
    When several labels share the top score, the smallest label is returned.
    """
    ranked = sorted(
        ((con_lpr(X, label), label) for label in (0, 1, 2)),
        key=lambda scored: scored[0],
        reverse=True,
    )
    return ranked[0][1]

In [14]:
# Confusion matrix: the row index is the predicted label, the column index the
# true label taken from the last entry of each test row.
conf_mat = np.zeros((3, 3))
for datum in test_data:
    X, y = datum[:-1], datum[-1]
    yhat = predict(X)
    conf_mat[yhat, y] += 1
print(conf_mat)

[[10.  0.  0.]
 [ 0. 10.  0.]
 [ 0.  0. 10.]]
