
<h1 style="font-family:verdana;font-size:300%;text-align:center;background-color:#f2f2f2;color:#0d0d0d">AMMI NLP - Part 1</h1>

<h1 style="font-family:verdana;font-size:150%;text-align:Center;color:#993333"> Lab 1: Introduction to text classification  </h1>

<h1 style="font-family:verdana;font-size:150%;text-align:left;color:blue">Section 1: Text Classification with Naive Bayes Classifier </h1>

##### In this part you'll implement Naive Bayes classifier to classify the text. You need to build a model that predicts the langauge of the text given the words of the text

In [1]:
import io, sys, math, re
from collections import defaultdict
import numpy as np

In [2]:
def load_data(filename):
    '''
    Parameters:
    filename (string): path to file to be read
    
    Return: 
    List of tuples (explained in first question)
    '''
    fin = io.open(filename, 'r', encoding='utf-8')
    data = []
    for line in fin:
        tokens = line.split()
        data.append((tokens[0], tokens[1:]))
    return data

In [3]:
data = load_data("train1.txt")
data[1]

('__label__deu', ['Tom', 'ist', 'an', 'Kunst', 'völlig', 'uninteressiert.'])


![for](images/1.png)

![for](images/2.png)

In [4]:
def count_words(data):
    '''
    Parameters:
    
    data is  list of [(label, words), (label, worlds), ......]
    list of tuples in the shape (string, [list of strings]) )
    
    Returns: 
    
    This function should return a dictionary containing the following:
    { 
    # label_counts (python dictionary): 
         {label:  no. of times the label appeared },
    # word_counts  (dictionary of dictionaries): 
         {label: {word: no. of times this word appeared with this label }},
    # label_total (int): 
        total number of labels. (size of train data),
    # word_total  (python dictionary) total number of words (from the entire corupus) of the particular label:
          {label: no.of words}
          
          }
    
    '''
    label_total = 0
    word_total = defaultdict(lambda: 0)
    label_counts = defaultdict(lambda: 0)
    word_counts = defaultdict(lambda: defaultdict(lambda: 0.0))
    
    for example in data:
        label, sentence = example
        #LABEL_COUNTS
        label_counts[label] += 1.0
           
        #word_counts
        for word in sentence:
            word_counts[label][word] += 1.0
            word_total[label] += 1.0
            
    #LABEL_TOTAL
    label_total = len(label_counts.keys())

    return {'label_counts': label_counts, 
            'word_counts': word_counts, 
            'label_total': label_total, 
            'word_total': word_total}

In [5]:
label_counts= count_words(data)['label_counts']
word_counts= count_words(data)['word_counts']
label_total= count_words(data)['label_total']
word_total= count_words(data)['word_total']

word_counts['__label__deu']['Ich']

140.0

In [6]:
import operator
def predict(sentence, mu, label_counts, word_counts, label_total, word_total):
    '''
     Parameters: 
        sentence (string): sentence to be classified
        mu (positive real number): Laplace Smoothing hyperparameter
        ** The other parameters introduced in the count_words function
    
    Returns:
    best_label (string): the label that has the highest score. 
    
    Implement the function to predict the best label for the given sentence using Naive Bayes algorithm 
    
    '''
    best_label = None
    best_score = float('-inf')

    label_outputs = defaultdict(lambda: 0)
    for label in word_counts.keys():
        score = 0.0
        
        sentence_count = len(word_counts[label].values())
            
        prod = 0
        for i in sentence:
            prod += np.log((word_counts[label][i]+mu)/(sentence_count+(mu*word_total[label])))
            
        lebel_prob = np.log(label_counts[label]/sum(list(label_counts.values())))
            
        
        label_outputs[label] = prod +  lebel_prob
        
            
    best_score = max(list(label_outputs.values()))
    sorted_x = sorted(label_outputs.items(), key=operator.itemgetter(1))
    best_label = sorted_x[-1][0]
    
    

    return best_label


In [7]:
for example in range(len(data)):
        label, sentence = data[example]
        print(' '.join(sentence),predict(sentence, 3, label_counts, word_counts, label_total, word_total))
        if example == 10:
            break
        

Ich würde alles tun, um dich zu beschützen. __label__deu
Tom ist an Kunst völlig uninteressiert. __label__deu
Végeztem Tomival. __label__hun
„Wird das in der Werkstatt gemacht?“ – „Nein, das muss an Ort und Stelle erledigt werden.“ __label__deu
У меня есть яблоко. __label__rus
Non possiamo lasciarle lì. __label__ita
Том считает, что школа — это пустая трата времени. __label__rus
My fathers don't speak Dutch. __label__hun
El niño no sabe cómo comportarse. __label__spa
Она думала, что он переночует у неё. __label__rus
Helikopter neden kentin üstünde uçuyor? __label__hun


In [8]:
def compute_accuracy(valid_data, mu, counts):
    '''
    Parameters:
    valid_data (list of tuples): returned value of load_data function 
    mu (positive real): Laplace smoothing hyper-parameter
    counts (dictionary of dictionaries): return value of count_words_function
    
    Returns: 
    accuracy (float): the accuracy of the Naive Bayes classifier
    '''
    accuracy = 0.0
    for label, sentence in valid_data:
        predicted_label = predict(sentence, mu, label_counts, word_counts, label_total, word_total)
        if predicted_label==label:
            accuracy += 1.0
         

    return accuracy/len(valid_data)

In [9]:
print("")
print("** Naive Bayes **")
print("")

mu = 1.0
train_data = load_data("train1.txt")
valid_data = load_data("valid1.txt")
counts = count_words(train_data)

print("Training accuracy: %.3f" % compute_accuracy(train_data, mu, counts))
print("")


** Naive Bayes **

Training accuracy: 0.994



In [10]:
print("")
print("** Naive Bayes **")
print("")

mu = 1.0
train_data = load_data("train1.txt")
valid_data = load_data("valid1.txt")
counts = count_words(valid_data)

print("Validation accuracy: %.3f" % compute_accuracy(valid_data, mu, counts))
print("")


** Naive Bayes **

Validation accuracy: 0.947



<h1 style="font-family:verdana;font-size:150%;text-align:left;color:black">***************************************************************</h1>


<h1 style="font-family:verdana;font-size:150%;text-align:left;color:blue">Section 2: Softmax Classification of Text  </h1>

##### In this part you'll implement a Softmax Classifier to classify the text (think of it as a 1 layer feedforward neural network). You need to build a model that predicts the langauge of the text given the words of the text

In [11]:
def build_dict(filename, threshold=1):
    '''
    Parameters:
    filename (string): path to the data file
    
    Returns:
    word_dic: dictionary maps words to number of times it appeard in the corpus
            dic {word: no of times word appears }
    label_dic: dictionary maps labels to integers
        dic {label: label_id}
    '''
    fin = io.open(filename, 'r', encoding='utf-8')
    word_dict, label_dict = {}, {}
    counts = defaultdict(lambda: 0)
    for line in fin:
        tokens = line.split()
        label = tokens[0]

        if not label in label_dict:
            label_dict[label] = len(label_dict)

        for w in tokens[1:]:
            counts[w] += 1
            
    for k, v in counts.items():
        if v > threshold:
            word_dict[k] = len(word_dict)
    return word_dict, label_dict

In [12]:
word_dict, label_dict = build_dict('train1.txt')

In [13]:
def load_data(filename, word_dict, label_dict):
    '''
    ## This function converts the text to a list of tuples of 
    [(label_id, word_representation),...]
    
    Parameters:
    filename (string): path to the file which contains the data
    word_dict (python dictionary): returned by build_dict() function above.
    label_dict (python dictionary): reutrned by build_dict() function above() 
    
    Returns:
    data (list of tuples): 
    The representation of the data in the form 
    [(y_0, x_0, .. (y_i, x_i), ... (y_n, x_n))]
    where y is the value of the class 
    x is the representation of the sentence as a word count vector 
    
    '''
    fin = io.open(filename, 'r', encoding='utf-8')
    data = []
    dim = len(word_dict)
    for line in fin:
        tokens = line.split()
        label = tokens[0]

        yi = label_dict[label]
        xi = np.zeros(dim)
        for word in tokens[1:]:
            if word in word_dict:
                wid = word_dict[word]
                xi[wid] += 1.0
        data.append((yi, xi))
    return data

In [14]:
d = load_data("train1.txt", word_dict, label_dict)

In [15]:
d[:5]

[(0, array([1., 1., 1., ..., 0., 0., 0.])),
 (0, array([0., 0., 0., ..., 0., 0., 0.])),
 (1, array([0., 0., 0., ..., 0., 0., 0.])),
 (0, array([0., 0., 0., ..., 0., 0., 0.])),
 (2, array([0., 0., 0., ..., 0., 0., 0.]))]

In [16]:
def softmax(x):
    '''
    This function should apply softmax to vector x
    
    Parameter:
    x (numpy array)
    Returns: 
    softmax(x) (numpy array)
    
    '''
    z = np.sum(np.exp(x-max(x)))
    
    return np.exp(x-max(x))/z

## (Hint) Derivatives:

<h1 style="font-family:verdana;font-size:150%;text-align:center;background-color:#f2f2f2;color:#993333; border:2px; border-style:solid; border-color:gray; padding: 1em"> 
   Let $x_i$ be the input vector $W$ is the weight vector $m=$no. of labels, $n=$vocab size
    $$ {\bf S} = W × x_i $$  $x_{i} \in R^{nx1} W \in R^{m×n}$
    $$  $$
    $${\bf O} = softmax(s) $$
    $${\bf L} = -log(O[y_{i}]) $$
    $$  $$
    $$\frac{\partial L}{\partial W} = \frac{\partial L}{\partial S} . \frac{\partial S}{\partial W} $$
    $$  $$
    $ \nabla L_{W} = (O-y_{true})$  x   $x_{i}^{T} $  
    $$ (O-y_{true}) \in R^{mx1}, x_{i}  \in R^{nx1}$$

</h1>

In [17]:
def sgd(w, data, niter):
    '''
    This function should perform the Stochastic Gradient Descent algorithm 
    
    Parameter:
    w (numpy array): weight vector
    data (list of tuples): [...(y_i, x_i)...] from above
    niter (int): number of iterations
    
    Retunrs:
    w (numpy array): weight vector after training
    '''
    nlabels, dim = w.shape
    for iter in range(niter):
        for label,x in data:
            
            one_hot_label = np.eye(nlabels)[label]
            
            s = np.dot(w,x)
            o = softmax(s)
            l = -np.log(o)

            d_l = (o - one_hot_label).reshape(-1,1)*x.reshape(-1,1).T
            
            w = w - 0.1*d_l
    return w

In [18]:
def predict(w, x):
    '''
    This function should compute and return the prediction. 
    Parameters:
    w (numpy array): trained weight vector
    x (numpy array): word count vector
    
    Returns: 
    prediction (int): index of the correct prediction (y_i)
    '''
    return np.argmax(np.dot(w,x))

In [19]:
def compute_accuracy(w, valid_data):
    '''
    This function should compute the accuracy of the classifier 
    Parameters:
    w (numpy array): trained weight vector
    valid_data (list of tuples): loaded validation data using load_data() function 
    
    Returns: 
    accuracy (float): accuracy of the classifier 
    '''
    accuracy = 0.0
    for label,x in valid_data:
        new_label = predict(w,x)
        if new_label==label:
            accuracy += 1.0

    return accuracy/len(valid_data)

In [20]:
print("")
print("** Logistic Regression **")
print("")

word_dict, label_dict = build_dict("train1.txt")
train_data = load_data("train1.txt", word_dict, label_dict)
valid_data = load_data("valid1.txt", word_dict, label_dict)

nlabels = len(label_dict)
dim = len(word_dict)
w = np.zeros([nlabels, dim])
w = sgd(w, train_data, 20)
print("")
print("Validation accuracy: %.3f" % compute_accuracy(w, valid_data))
print("")


** Logistic Regression **


Validation accuracy: 0.929

