In [6]:
"""
In this project, we will use
Naïve Bayes to predict if a word is German or English.
"""

'\nIn this project, we will use\nNaïve Bayes to predict if a word is German or English.\n'

In [2]:
import numpy as np
import sys
import cvxpy 
from matplotlib import pyplot as plt

%matplotlib inline

In [1]:
%pip install cvxpy

--- Logging error ---
Traceback (most recent call last):
  File "/Applications/anaconda3/lib/python3.9/site-packages/pip/_internal/utils/logging.py", line 177, in emit
    self.console.print(renderable, overflow="ignore", crop=False, style=style)
  File "/Applications/anaconda3/lib/python3.9/site-packages/pip/_vendor/rich/console.py", line 1673, in print
    extend(render(renderable, render_options))
  File "/Applications/anaconda3/lib/python3.9/site-packages/pip/_vendor/rich/console.py", line 1305, in render
    for render_output in iter_render:
  File "/Applications/anaconda3/lib/python3.9/site-packages/pip/_internal/utils/logging.py", line 134, in __rich_console__
    for line in lines:
  File "/Applications/anaconda3/lib/python3.9/site-packages/pip/_vendor/rich/segment.py", line 249, in split_lines
    for segment in segments:
  File "/Applications/anaconda3/lib/python3.9/site-packages/pip/_vendor/rich/console.py", line 1283, in render
    renderable = rich_cast(renderable)
  File 

In [3]:
"""
Helper function to extract features where B = 26 (dimension) 
"""
def feature_extraction_letters(word, B):
    """
    Count how often each letter occurs in `word`.

    Input:
        word : string to featurize; language2features lowercases it first
        B    : number of features; index k counts the character
               chr(ord('a') + k), so B = 26 covers 'a'..'z'

    Output:
        v : vector of B counts (B,)

    Characters outside that range (digits, hyphens, umlauts, uppercase, ...)
    are skipped. Indexing v with their raw offset would either raise an
    IndexError or, for small negative offsets, wrap around and be counted
    as a letter at the end of the alphabet.
    """
    v = np.zeros(B)
    base = ord('a')
    for letter in word:
        idx = ord(letter) - base
        if 0 <= idx < B:
            v[idx] += 1
    return v

In [4]:
"""
Another helper function to read in the file
"""
def language2features(filename, B=26, LoadFile=True):
    """
    Convert a collection of words into letter-count feature vectors.

    Input:
        filename : path of a text file with one word per line (LoadFile=True),
                   or the newline-separated words themselves (LoadFile=False)
        B        : dimension of each feature vector
        LoadFile : whether `filename` is a path to read or the raw text

    Output:
        X : n feature vectors of dimension B, (nxB)
    """
    if LoadFile:
        with open(filename, 'r') as f:
            # Strip before filtering: every raw line still carries its
            # newline, so testing the unstripped length never drops blank
            # lines and each one would become an all-zero sample.
            stripped = (line.rstrip() for line in f)
            words = [w for w in stripped if len(w) > 0]
    else:
        # Raw text is split as-is: every line, blank ones included, yields a row.
        words = filename.split('\n')
    n = len(words)
    X = np.zeros((n, B))
    for i, word in enumerate(words):
        X[i, :] = feature_extraction_letters(word.lower(), B)
    return X

In [5]:
"""
Define the provided function genFeatures, which calls language2features to
turn the words of each file into feature vectors and load them into memory.
"""
def genFeatures(dimension, language2features, file_german, file_english):
    """
    Build a shuffled, labelled data set from a German and an English word file.

    Input:
        dimension         : dimensionality d of the feature vectors
        language2features : callable(filename, B=...) returning the feature
                            matrix for the words in `filename`
        file_german       : file holding the German words
        file_english      : file holding the English words

    Output:
        x: n feature vectors of dimensionality d [n,d]
        y: n labels (-1 = German, +1 = English)
    """
    german_feats = language2features(file_german, B=dimension)
    english_feats = language2features(file_english, B=dimension)

    # German rows first, then English, with the labels stacked in the same order
    features = np.concatenate([german_feats, english_feats])
    labels = np.concatenate([
        -np.ones(len(german_feats)),
        np.ones(len(english_feats)),
    ])

    # Draw one random ordering and apply it to rows and labels alike
    order = np.random.permutation(list(range(len(labels))))
    return features[order, :], labels[order]

In [6]:
"""
Call the following command to load features and labels of all German and English words
"""
# Training split (X, Y) and test split (xTe, yTe); genFeatures shuffles each one.
# 26 features: one count per letter offset from 'a'.
X,Y = genFeatures(26, language2features, "german_train.txt", "english_train.txt")
xTe, yTe = genFeatures(26, language2features, "german_test.txt", "english_test.txt")

In [7]:
"""
Estimate the class probability P(Y) in naivebayesPY. Should 
return the probability that a sample in the training set is positive 
or negative, independent of its features.
"""
def naivebayesPY(x,y):
    """
    function [pos,neg] = naivebayesPY(x,y);

    Estimate the class priors P(Y) from the label frequencies.

    Input:
        x : n input vectors of d dimensions (n,d) -- not used by the estimate
        y : n labels (-1 or +1) (n,)

    Output:
    pos: probability p(y=1)
    neg: probability p(y=-1)
    """
    # Fraction of labels equal to -1; pos is taken as the complement
    num_negative = sum(1 for label in y if label == -1)
    neg = num_negative / len(y)
    pos = 1 - neg

    return pos, neg


# Class priors estimated from the training labels
pos,neg = naivebayesPY(X,Y)

In [8]:
"""
Estimate the conditional probabilities P(X|Y) (Maximum Likelihood Estimate) 
without smoothing in naivebayesPXY_mle. We use a multinomial distribution as model. 
Return the probability vectors for all features given a class label. 
"""
def naivebayesPXY_mle(x,y):
    """
    function [posprob,negprob] = naivebayesPXY(x,y);

    Computation of P(X|Y) -- Maximum Likelihood Estimate (no smoothing)
    Input:
        x : n input vectors of d dimensions (n,d)
        y : n labels (-1 or +1) (n,)

    Output:
    posprob: probability vector of p(x=ith letter of alphabet|y=1) (d,)
    negprob: probability vector of p(x=ith letter of alphabet|y=-1) (d,)

    Each estimate is the per-letter count summed over all words of a class,
    divided by the total number of letters seen in that class.
    """
    # Rows labelled +1 form the positive class; every other row is negative
    is_pos = np.asarray(y) == 1

    pos_letters = x[is_pos].sum(axis=0)
    neg_letters = x[~is_pos].sum(axis=0)

    # Normalise so that each vector sums to one over the d letters
    posprob = pos_letters / pos_letters.sum()
    negprob = neg_letters / neg_letters.sum()

    return posprob, negprob
    

# Unsmoothed per-letter conditionals estimated on the training split
posprob_mle,negprob_mle = naivebayesPXY_mle(X,Y)

In [9]:
"""
Estimate the conditional probabilities P(X|Y) (Smoothing with Laplace estimate) 
in naivebayesPXY_smoothing. 
Used a multinomial distribution as model. 
Will return the probability vectors for all features given a class label
"""
def naivebayesPXY_smoothing(x,y):
    """
    function [posprob,negprob] = naivebayesPXY(x,y);

    Computation of P(X|Y) -- Smoothing with Laplace estimate
    Input:
        x : n input vectors of d dimensions (n,d)
        y : n labels (-1 or +1) (n,)

    Output:
    posprob: probability vector of p(x|y=1) (d,)
    negprob: probability vector of p(x|y=-1) (d,)

    Add-one smoothing: every letter count is increased by 1 and the class
    total by d, so no letter ends up with probability zero.
    """
    num_features = x.shape[1]

    # Rows labelled +1 form the positive class; every other row is negative
    is_pos = np.asarray(y) == 1

    pos_letters = x[is_pos].sum(axis=0)
    neg_letters = x[~is_pos].sum(axis=0)

    posprob = (pos_letters + 1) / (pos_letters.sum() + num_features)
    negprob = (neg_letters + 1) / (neg_letters.sum() + num_features)

    return posprob, negprob


# Laplace-smoothed per-letter conditionals estimated on the training split
posprob_smoothing,negprob_smoothing = naivebayesPXY_smoothing(X,Y)

In [10]:
"""
Solve for log ratio using Bayes Rule
"""
def naivebayes(x,y,xtest,naivebayesPXY):
    """
    function logratio = naivebayes(x,y);

    Computation of the log posterior ratio for one sample using Bayes Rule,
    under a multinomial model of the letter counts.

    Input:
    x : n input vectors of d dimensions (n,d)
    y : n labels (-1 or +1)(n,)
    xtest: input vector of d dimensions (d,) holding letter counts
    naivebayesPXY: input function for getting conditional probabilities (naivebayesPXY_smoothing OR naivebayesPXY_mle)

    Output:
    logratio: log (P(Y = 1|X=xtest)/P(Y=-1|X=xtest))

    The ratio is accumulated in log space as
        log P(Y=1) - log P(Y=-1) + sum_i xtest[i] * (log p_i(+1) - log p_i(-1))
    so that every letter contributes once per occurrence and long words do
    not underflow the way a raw product of probabilities would.
    """
    pos_prior, neg_prior = naivebayesPY(x, y)
    pos_cond, neg_cond = naivebayesPXY(x, y)

    counts = np.asarray(xtest, dtype=float)

    # Letters absent from xtest contribute nothing; leaving them out also
    # avoids 0 * log(0) = nan when an unsmoothed estimate is exactly zero.
    present = counts != 0
    log_likelihood_ratio = np.sum(
        counts[present] * (np.log(pos_cond[present]) - np.log(neg_cond[present]))
    )
    log_prior_ratio = np.log(pos_prior) - np.log(neg_prior)

    logratio = log_prior_ratio + log_likelihood_ratio

    return logratio
    

# Log-ratio of the first training sample under each conditional estimator
p_smoothing = naivebayes(X,Y,X[0,:], naivebayesPXY_smoothing)
p_mle = naivebayes(X,Y,X[0,:], naivebayesPXY_mle)

In [12]:
"""
NB as linear classifier
"""

def naivebayesCL(x,y,naivebayesPXY):
    """
    function [w,b]=naivebayesCL(x,y);

    Express the Naive Bayes model as a linear classifier.

    Input:
    x : n input vectors of d dimensions (n,d)
    y : n labels (-1 or +1)(n,)
    naivebayesPXY: input function for getting conditional probabilities (naivebayesPXY_smoothing OR naivebayesPXY_mle)

    Output:
    w : weight vector of d dimensions (d,)
    b : bias (scalar)
    """
    cond_pos, cond_neg = naivebayesPXY(x, y)

    # Weight i is the log-odds of feature i between the two classes
    w = np.log(cond_pos / cond_neg)

    # The bias is the log-odds of the class priors, taken from the label counts
    num_pos = np.sum(y == 1)
    num_neg = np.sum(y == -1)
    b = np.log(num_pos / num_neg)

    return w, b

# Linear-classifier parameters (w, b) for each conditional estimator, fit on the training split
w_smoothing,b_smoothing = naivebayesCL(X,Y, naivebayesPXY_smoothing)
w_mle,b_mle = naivebayesCL(X,Y, naivebayesPXY_mle)

In [14]:
"""
Classifier + test
"""
def classifyLinear(x,w,b=0):
    """
    function preds=classifyLinear(x,w,b);

    Make predictions with a linear classifier
    Input:
    x : n input vectors of d dimensions (n,d)
    w : weight vector of d dimensions (d,)
    b : bias (optional)

    Output:
    preds: predictions, +1 or -1 per input vector (NaN where the score is NaN)
    """
    scores = np.dot(x, w) + b
    preds = np.sign(scores)

    # np.sign maps a score of exactly 0 to 0, which is not a valid label and
    # would always be scored as an error; resolve such ties to +1 instead.
    preds = np.where(preds == 0, 1.0, preds)

    return preds

# Error rate = share of predictions that differ from the labels, shown as a percentage.
# Both (w, b) pairs come from the training split and are scored on train and test data.
print('Training error (Smoothing with Laplace estimate): %.2f%%' % (100 *(classifyLinear(X, w_smoothing, b_smoothing) != Y).mean()))
print('Training error (Maximum Likelihood Estimate): %.2f%%' % (100 *(classifyLinear(X, w_mle, b_mle) != Y).mean()))
print('Test error (Smoothing with Laplace estimate): %.2f%%' % (100 *(classifyLinear(xTe, w_smoothing, b_smoothing) != yTe).mean()))
print('Test error (Maximum Likelihood Estimate): %.2f%%' % (100 *(classifyLinear(xTe, w_mle, b_mle) != yTe).mean()))


Training error (Smoothing with Laplace estimate): 25.56%
Training error (Maximum Likelihood Estimate): 25.44%
Test error (Smoothing with Laplace estimate): 25.67%
Test error (Maximum Likelihood Estimate): 25.50%
