In [1]:
import numpy as np;
import matplotlib as plt;
import pandas as pd;

class Naive_Bayes:
    """Character-level naive Bayes text classifier.

    Every character of a text that appears in ``theta`` is counted as one
    feature occurrence.  Class-conditional feature probabilities and class
    priors are estimated with add-one smoothing and stored as base-10
    logarithms.  The class label of a file is the first character of its name.

    Attributes set by training / prediction:
        train       dict label -> array of log10 feature probabilities
        prior_cts   array of log10 class priors, in the order of ``prior``
        test_cts    list of feature-count arrays, one per test file
        likelihood  per test file, list of log10 p(x|y) for each class
        posterior   per test file, list of log10 p(x|y) + log10 p(y)
        pred_list   per test file, index of the best class
        pred_label  per test file, label of the best class
    """

    def __init__(self, theta, prior):
        """Store the vocabulary and the class labels.

        Args:
            theta: dict mapping each feature character to its column index
                (the values are used directly as list indices when counting).
            prior: dict mapping each class label to its index.
        """
        self.theta = theta
        self.theta_len = len(theta)

        self.prior = prior
        self.prior_len = len(prior)

        # Zero counts until Load_data replaces them with log10 distributions.
        self.train = {label: np.zeros(self.theta_len, dtype=int) for label in prior}
        self.prior_cts = np.zeros(self.prior_len, dtype=int)

    @staticmethod
    def _read_text(pathname, name):
        """Return the content of ``pathname + name + ".txt"``."""
        with open(pathname + name + ".txt", "r") as handle:
            return handle.read()

    #---------------------------------------------------------------------------------------
    # training
    # 1.
    def Load_data(self, pathname, filename):
        """Estimate the feature distributions and the class priors.

        Args:
            pathname: prefix prepended to every file name (string concatenation,
                so it must already end with a path separator).
            filename: list of file names without the ".txt" extension; the
                first character of each name is its class label.

        Raises:
            KeyError: if a file's label is not a key of ``prior``.
        """
        # Start from fresh counters so that calling Load_data again retrains
        # the model instead of adding counts onto stored log-probabilities.
        feature_cts = {label: np.zeros(self.theta_len, dtype=int) for label in self.prior}
        text_len = {label: 0 for label in self.prior}   # characters read per class
        file_cts = {label: 0 for label in self.prior}   # training files per class

        for name in filename:
            label = name[0]
            text = self._read_text(pathname, name)

            feature_cts[label] = feature_cts[label] + np.array(self.Counting_theta(text))
            text_len[label] += len(text)
            file_cts[label] += 1

        self.train = {
            label: self.getDistribution(feature_cts[label], text_len[label], self.theta_len)
            for label in feature_cts
        }

        # The prior is the smoothed share of training files carrying each label,
        # kept in the order of self.prior so it lines up with self.train.
        per_class = np.array([file_cts[label] for label in self.prior])
        self.prior_cts = self.getDistribution(per_class, len(filename), self.prior_len)

    # 2.
    def Counting_theta(self, text):
        """Return a list with the number of occurrences of each feature in ``text``.

        Characters that are not keys of ``theta`` are ignored.
        """
        cts = [0] * self.theta_len
        for ch in text:
            if ch in self.theta:
                cts[self.theta[ch]] += 1
        return cts

    # 3.
    def getDistribution(self, cts, N, V):
        """Turn a count array into a smoothed log10 probability array.

        Args:
            cts: numpy array of counts.
            N: total passed to the add-one smoothing denominator.
            V: number of distinct outcomes.
        """
        probs = self.Add_1_Smooth(cts, N, V)
        # The explicit normalisation makes the result sum to one whatever N is.
        probs = probs / np.sum(probs)
        return np.log10(probs)

    # 4.
    def Add_1_Smooth(self, Cws, N, V):
        """Add-one smoothing: ``(Cws + 1) / (N + V)``.

        Cws is the count (array) of each outcome, N the total number of
        observations and V the number of distinct outcomes.
        """
        return (Cws + 1) / (N + V)

    # test
    #---------------------------------------------------------------------------------------
    # 5
    def Load_test(self, pathname, filename):
        """Read the given files and store their feature counts in ``test_cts``.

        ``filename`` is a list, so several files are handled in one call.
        """
        self.test_cts = [
            np.array(self.Counting_theta(self._read_text(pathname, name)))
            for name in filename
        ]

    # 6
    def Classify(self, cts):
        """Score one count vector against every class.

        Returns:
            ``[likelihood, posterior]``: two lists with one log10 value per
            class, where posterior is the likelihood plus the log10 prior.
        """
        likelihood = []
        posterior = []
        # self.train iterates in the same label order as self.prior_cts.
        for r, label in enumerate(self.train):
            log_like = cts.dot(self.train[label])
            likelihood.append(log_like)
            posterior.append(log_like + self.prior_cts[r])
        return [likelihood, posterior]

    # 7
    def Pred(self, pathname, filename):
        """Classify every file in ``filename`` and store the results on the instance.

        Fills ``likelihood``, ``posterior``, ``pred_list`` and ``pred_label``
        with one entry per file, in the order of ``filename``.
        """
        labels = list(self.prior)
        self.Load_test(pathname, filename)

        self.likelihood = []   # log10 p(x|y) per class
        self.posterior = []    # log10 p(x|y) + log10 p(y) per class
        self.pred_list = []    # index of the chosen class
        self.pred_label = []   # label of the chosen class
        for cts in self.test_cts:
            like, post = self.Classify(cts)
            best = post.index(max(post))
            self.likelihood.append(like)
            self.posterior.append(post)
            self.pred_list.append(best)
            self.pred_label.append(labels[best])

    #---------------------------------------------------------------------------------------
    # 8
    def Bag_of_words(self, counts):
        """Print a dict pairing every feature with its entry in ``counts``.

        Features are paired by iteration order of ``theta``.
        """
        vector = {ch: counts[r] for r, ch in enumerate(self.theta)}
        print(vector)

    #---------------------------------------------------------------------------------------
    # 9
    def Confusion_matrix(self, filename):
        """Return the confusion matrix of the last ``Pred`` call as a DataFrame.

        Only usable when the true labels are known: they are read from the
        first character of each name.  ``filename`` must be in the same order
        as the list given to ``Pred``.  Rows are predicted labels, columns are
        true labels.
        """
        y_true = [self.prior[name[0]] for name in filename]

        rank = self.prior_len
        mat = np.zeros((rank, rank), dtype=int)
        for predicted, actual in zip(self.pred_list, y_true):
            mat[predicted, actual] += 1

        labels = list(self.prior)
        return pd.DataFrame(mat, index=labels, columns=labels)

    #---------------------------------------------------------------------------------------
    # 10
    def Text_shuffle(self, text):
        """Return ``text`` with its characters in random order."""
        chars = list(text)
        # np.random.shuffle permutes in place and returns None, so the list
        # itself (not the call result) has to be joined.
        np.random.shuffle(chars)
        return ''.join(chars)

    #---------------------------------------------------------------------------------------
    # 11
    def Weka_prep(self, pathname_read, filename, pathname_save):
        """Rewrite the texts as space-separated tokens, one output file per input.

        Every character becomes a token and a blank becomes the token
        ``space``.  The result of ``<label><rest>.txt`` is written to
        ``pathname_save + label + '/' + rest + ".txt"``; that directory is not
        created here.
        """
        def tokenise(text):
            return ' '.join('space' if ch == ' ' else ch for ch in text)

        for name in filename:
            label = name[0]
            text = tokenise(self._read_text(pathname_read, name))

            target = pathname_save + label + '/' + name[1:] + ".txt"
            with open(target, 'w') as handle:
                handle.write(text)

        print("Weka prepared!")
    

In [2]:
# Feature vocabulary: the 26 lowercase letters followed by the space character,
# each mapped to its column index (0..26) in the count vectors.
alphabet = {ch: idx for idx, ch in enumerate("abcdefghijklmnopqrstuvwxyz ")}

# Class labels (the first character of every file name) mapped to their index.
language = {label: idx for idx, label in enumerate("ejs")}

# Training files are <label>0 .. <label>9 for every label.
training_files = [label + str(n) for label in language for n in range(0, 10)]

pathname = 'languageID/languageID/languageID/'
## filename = pathname + such as e2 + .txt


In [3]:
# Build the classifier over the character vocabulary and the class labels,
# then estimate its distributions from the training files.
NB = Naive_Bayes(theta=alphabet, prior=language)
NB.Load_data(pathname=pathname, filename=training_files)

In [4]:
## Q 3.1:
# prior_cts holds log10 values; raise 10 to them to show plain probabilities.
prior_probabilities = np.power(10, NB.prior_cts)
print(prior_probabilities)

[0.33333333 0.33333333 0.33333333]


In [5]:
## Q 3.(2-3)
## print the thetas
for label in NB.train:
    print("Theta_%s:"%(label))
    # NB.train stores log10 probabilities; convert back and round for display.
    theta = np.round(np.power(10, NB.train[label]), 6)
    ans = {}
    for r, ch in enumerate(NB.theta):
        # Spell out the blank so it is readable in the printed dict.
        key = 'space' if ch == " " else ch
        ans[key] = theta[r]
    print(ans)

Theta_e:
{'a': 0.060148, 'b': 0.011158, 'c': 0.021524, 'd': 0.021986, 'e': 0.105308, 'f': 0.018949, 'g': 0.017496, 'h': 0.047207, 'i': 0.055394, 'j': 0.001453, 'k': 0.003763, 'l': 0.028985, 'm': 0.020533, 'n': 0.057903, 'o': 0.064439, 'p': 0.01677, 'q': 0.000594, 'r': 0.05381, 's': 0.066156, 't': 0.080087, 'u': 0.026674, 'v': 0.009309, 'w': 0.015516, 'x': 0.001188, 'y': 0.013865, 'z': 0.00066, 'space': 0.179123}
Theta_j:
{'a': 0.131676, 'b': 0.010892, 'c': 0.005516, 'd': 0.017245, 'e': 0.060183, 'f': 0.00391, 'g': 0.014033, 'h': 0.031767, 'i': 0.096977, 'j': 0.002374, 'k': 0.05739, 'l': 0.001466, 'm': 0.039796, 'n': 0.056692, 'o': 0.091112, 'p': 0.000908, 'q': 0.00014, 'r': 0.042798, 's': 0.04217, 't': 0.056971, 'u': 0.070586, 'v': 0.000279, 'w': 0.019758, 'x': 7e-05, 'y': 0.014173, 'z': 0.00775, 'space': 0.123368}
Theta_s:
{'a': 0.104504, 'b': 0.008257, 'c': 0.037525, 'd': 0.039744, 'e': 0.113747, 'f': 0.008627, 'g': 0.007209, 'h': 0.00456, 'i': 0.049849, 'j': 0.006655, 'k': 0.000308,

In [6]:
# Held-out files are <label>10 .. <label>19 for every label.
test_files = [label + str(n) for label in language for n in range(10, 20)]
print(test_files)
NB.Pred(pathname, test_files)
# The predictions are now stored in NB.pred_list (indices) and NB.pred_label (names).


['e10', 'e11', 'e12', 'e13', 'e14', 'e15', 'e16', 'e17', 'e18', 'e19', 'j10', 'j11', 'j12', 'j13', 'j14', 'j15', 'j16', 'j17', 'j18', 'j19', 's10', 's11', 's12', 's13', 's14', 's15', 's16', 's17', 's18', 's19']


In [7]:
## Q 3.4
# e10 is the first entry of test_files, so its counts sit at index 0.
e10_counts = NB.test_cts[0]
NB.Bag_of_words(e10_counts)

{'a': 164, 'b': 32, 'c': 53, 'd': 57, 'e': 311, 'f': 55, 'g': 51, 'h': 140, 'i': 140, 'j': 3, 'k': 6, 'l': 85, 'm': 64, 'n': 139, 'o': 182, 'p': 53, 'q': 3, 'r': 141, 's': 186, 't': 225, 'u': 65, 'v': 31, 'w': 47, 'x': 4, 'y': 38, 'z': 2, ' ': 498}


In [8]:
## Q 3.5
# Log10 likelihoods of e10 (first test file), one value per class.
ans = NB.likelihood[0]
print(ans)
def sci_exp(x):
    """Print ``10 ** x`` in scientific notation as ``<mantissa> e <exponent>``.

    Working on the exponent directly avoids computing ``10 ** x`` itself.
    The mantissa is rounded to three decimals and kept in ``[1, 10)``.

    Args:
        x: base-10 logarithm of the number to display.
    """
    import math  # local import: the notebook's import cell does not provide math

    intx = math.floor(x)              # exponent = floor(x), also for negative x
    coe = round(10 ** (x - intx), 3)  # mantissa from the fractional part
    if coe >= 10:
        # Rounding can lift a mantissa such as 9.9998 to 10.0; renormalise so
        # the output never reads "10.0 e k".
        coe /= 10
        intx += 1
    print(coe,'e',intx)
    
# Show every log10 likelihood in mantissa / exponent form.
for log_likelihood in ans:
    sci_exp(log_likelihood)

[-3405.6445560384063, -3804.210716450759, -3670.8233803708918]
2.267 e -3406
6.156 e -3805
1.502 e -3671


In [9]:
## Q 3.6
# Log10 posteriors of e10 (log10 likelihood plus log10 prior), one per class.
ans = NB.posterior[0]
print('post :\n', ans)
for log_posterior in ans:
    sci_exp(log_posterior)
print('pred_label:\n', NB.pred_label[0])

post :
 [-3406.121677293126, -3804.6878377054786, -3671.3005016256116]
7.557 e -3407
2.052 e -3805
5.006 e -3672
pred_label:
 e


In [10]:
## prepare weka files
## The export below is wrapped in a string literal, so none of it runs; the
## cell only evaluates to that string. Remove the triple quotes to build the
## list of files <label>0 .. <label>19 and pass it to NB.Weka_prep.
'''
weka_files = [];
for j in language:
    for i in range(0,20):
        weka_files += [j + str(i)];
save_path =  'Weka/'       
NB.Weka_prep(pathname, weka_files, save_path);
'''

"\nweka_files = [];\nfor j in language:\n    for i in range(0,20):\n        weka_files += [j + str(i)];\nsave_path =  'Weka/'       \nNB.Weka_prep(pathname, weka_files, save_path);\n"

In [11]:
# Rows are the predicted labels, columns the true labels of the test files.
confusion = NB.Confusion_matrix(test_files)
confusion

Unnamed: 0,e,j,s
e,10,0,0
j,0,10,0
s,0,0,10
