# MMI_2024_NLP - Week 1

# Lab 1: Part 2

# (B) Logistic Regression Model

In this second part of the lab, we will implement a language identifier trained on the same data, but using Logistic Regression instead of Naive Bayes.

In [150]:
import io, sys, math
import numpy as np
from collections import defaultdict
from tqdm.notebook import tqdm,trange
from typing import Tuple, List, Dict
import random

This function is used to build the dictionary, or vocabulary, which is a mapping from strings (or words) to integers (or indices). This will allow us to build vector representations of documents.

In [151]:
def build_dict(filename:str, threshold:int=1)->Tuple[Dict[str, int], Dict[str, int]]:
    """
    Build the word vocabulary and the label dictionary from a data file.

    Each line is split on whitespace: the first token is the label and the
    remaining tokens are the words of the example. Blank lines are skipped.

    Input:
    - filename: the name of the data file (read as UTF-8).
    - threshold: a word is added to the vocabulary only if it appears
      strictly more than `threshold` times in the data (so with the default
      of 1, words seen exactly once are left out).
    Output:
    - word_dict: the vocabulary generated from the dataset, with words as keys
      and their indices as values.
    - label_dict: the dictionary of the labels, with labels as keys and their
      indices (in order of first appearance) as values.
    """
    word_dict, label_dict = {}, {}
    counts = defaultdict(int)

    # The context manager closes the file even if a line fails to parse.
    with open(filename, 'r', encoding='utf-8') as fin:
        for line in tqdm(fin):
            tokens = line.split()
            if not tokens:
                # A blank line carries neither a label nor words.
                continue

            label = tokens[0]
            if label not in label_dict:
                label_dict[label] = len(label_dict)

            for w in tokens[1:]:
                counts[w] += 1

    # Indices are assigned in the order words were first seen in the file.
    for word, count in counts.items():
        if count > threshold:
            word_dict[word] = len(word_dict)
    return word_dict, label_dict

This function is used to load the training dataset and build vector representations of the training examples. In particular, a document or sentence is represented as a bag of words. Each example corresponds to a sparse vector `x` of dimension `V`, where `V` is the size of the vocabulary. Element `j` of the vector `x` is the number of times word `j` appears in the document.

In [152]:
def load_data(filename:str, word_dict:Dict, label_dict:Dict)->List[Tuple[int, np.ndarray]]:
    """
    Load a data file as a list of (label index, bag-of-words vector) pairs.

    Each line is split on whitespace: the first token is the label and the
    remaining tokens are the words of the example. Blank lines are skipped.

    Input:
    - filename: the name of the data file (read as UTF-8).
    - word_dict: the vocabulary, mapping words to indices.
    - label_dict: the label dictionary, mapping labels to indices.
    Output:
    - data: a list of tuples (yi, xi), where yi is the index of the label and
      xi is a vector of length len(word_dict) whose element j is the number of
      times word j appears in the example. Words missing from word_dict are
      ignored.
    Raises:
    - KeyError: if a label in the file is not a key of label_dict.
    """
    data = []
    dim = len(word_dict) #The size of the vocabulary.

    # The context manager closes the file even if a line fails to parse.
    with open(filename, 'r', encoding='utf-8') as fin:
        for line in tqdm(fin):
            tokens = line.split() #Consider tokenization by space in this case.
            if not tokens:
                # A blank line carries neither a label nor words.
                continue

            yi = label_dict[tokens[0]]
            xi = np.zeros(dim)
            for word in tokens[1:]:
                wid = word_dict.get(word)
                if wid is not None:
                    xi[wid] += 1.0
            data.append((yi, xi))
    return data

First, let's implement the softmax function. Don't forget numerical stability!

In [153]:
import numpy as np

# def softmax(x: np.ndarray) -> np.ndarray:
#     ##########################################################################
#     #                      TODO: Implement this function                     #
#     ##########################################################################
#     # Subtract the maximum value from each row for numerical stability
#     x_max = np.max(x)
#     e_x = np.exp(x - x_max)
#     softmax_output = e_x / np.sum(e_x, axis=1, keepdims=True)
#     ##########################################################################
#     #                            END OF YOUR CODE                            #
#     ##########################################################################
#     return softmax_output
def softmax(x:np.ndarray)->np.ndarray:
    """
    Compute the softmax of `x` along its last axis in a numerically stable way.

    Input:
    - x: an array of scores; for a multi-dimensional array each slice along
      the last axis is normalised independently.
    Output:
    - an array of the same shape as `x` whose entries along the last axis are
      non-negative and sum to 1.
    """
    x = np.asarray(x)

    # Shift by the maximum of each slice (not the global maximum): otherwise a
    # slice whose scores are all far below the global maximum underflows to a
    # zero sum and produces inf/nan.
    c = np.max(x, axis=-1, keepdims=True)

    # log-sum-exp trick: log(sum(exp(x))) = c + log(sum(exp(x - c))).
    log_sum_exp = c + np.log(np.sum(np.exp(x - c), axis=-1, keepdims=True))

    return np.exp(x - log_sum_exp)

Now, let's implement the main training loop, by using stochastic gradient descent. The function will iterate over the examples of the training set. For each example, we will first compute the loss, before computing the gradient and performing the update.

In [154]:
import numpy as np
import random
from typing import List, Tuple
from tqdm import trange

def sgd(w: np.ndarray, data: List[Tuple[int, np.ndarray]], niter: int, lr: float = 0.001) -> np.ndarray:
    """
    Train the weights with stochastic gradient descent on the cross-entropy loss.

    After each epoch the average loss over the examples is printed.

    Input:
    - w: the weight matrix of shape (length of label dictionary, length of word dictionary).
      It is updated in place.
    - data: the dataset, a list of (label index, feature vector) tuples.
    - niter: number of epochs, or number of passes over the whole dataset.
    - lr: the learning rate.

    Output:
    - w: the weight matrix (the same array that was passed in).
    """
    if not data:
        # Nothing to learn from; also avoids dividing by zero for the average loss.
        return w

    # A dedicated generator seeded with 123 yields the same shuffles as seeding
    # the global one, without disturbing other users of the `random` module.
    rng = random.Random(123)
    # Shuffle a private copy so the caller's list keeps its order.
    examples = list(data)

    for _ in trange(niter):
        rng.shuffle(examples)

        total_loss = 0.0
        for y, x in examples:
            # Class probabilities for this example.
            probs = predict(w, x)

            # Cross-entropy loss of the true label.
            total_loss += -np.log(probs[y])

            # Gradient of the loss with respect to the scores: probs - one_hot(y).
            delta = probs.copy()
            delta[y] -= 1

            # The gradient with respect to w is the outer product of delta and
            # x, which is zero in every column where x is zero, so only the
            # columns of the features present in the example are updated.
            features = x.reshape(-1)
            active = np.flatnonzero(features)
            w[:, active] -= lr * np.outer(delta, features[active])

        print(total_loss / len(examples))

    return w

The next function computes, for example `x` and the trained classifier `w`, the probability of each label. The most probable label is the index of the largest probability (its `argmax`).

In [155]:
def predict(w: np.ndarray, x: np.ndarray) -> np.ndarray:
    """
    Compute the probability of each label for example `x` under weights `w`.

    Input:
    - w: the weight matrix, with one row per label.
    - x: the feature vector of the example.

    Output:
    - probs: a flat array with one probability per label. The most probable
      label is np.argmax(probs); the training loop uses the full vector, which
      is why the probabilities rather than the label are returned.
    """
    # One score per label.
    scores = np.dot(w, x)

    # Normalise the scores into probabilities; flatten so that a column-vector
    # `x` still yields a 1-D result.
    probs = softmax(scores).flatten()

    return probs

Finally, this function will compute the accuracy of a trained classifier `w` on a validation set.

In [156]:
def compute_accuracy(w: np.ndarray, valid_data: List[Tuple[int, np.ndarray]]) -> float:
    """
    Compute the accuracy of the classifier `w` on a validation set.

    Input:
    - w: the weight matrix of the trained classifier.
    - valid_data: a list of (label index, feature vector) tuples.

    Output:
    - accuracy: the fraction of examples whose most probable label equals the
      true label; 0.0 when `valid_data` is empty.
    """
    total_predictions = len(valid_data)
    if total_predictions == 0:
        # Avoid a division by zero on an empty validation set.
        return 0.0

    # predict returns the label probabilities; the predicted label is their argmax.
    correct_predictions = sum(
        1 for y, x in valid_data if np.argmax(predict(w, x)) == y
    )

    return correct_predictions / total_predictions

In [157]:
# Banner framed by blank lines in the cell output.
for banner_line in ("", "** Logistic Regression **", ""):
    print(banner_line)

# The vocabulary and label set are built from the training file only; the
# validation file is vectorised with those same dictionaries.
word_dict, label_dict = build_dict("train1.txt")
train_data = load_data("train1.txt", word_dict, label_dict)
valid_data = load_data("valid1.txt", word_dict, label_dict)
print(len(train_data))

# One weight row per label, one column per vocabulary word.
nlabels, dim = len(label_dict), len(word_dict)

# Train from all-zero weights for 20 epochs with the default learning rate.
w = sgd(np.zeros([nlabels, dim]), train_data, 20)

print("")
print("Validation accuracy: %.3f" % compute_accuracy(w, valid_data))
print("")


** Logistic Regression **



0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

10000


  5%|▌         | 1/20 [02:33<48:27, 153.05s/it]

2.170989652456757


 10%|█         | 2/20 [03:52<32:53, 109.62s/it]

1.9609525434155675


 15%|█▌        | 3/20 [05:17<27:50, 98.27s/it] 

1.814166025440323


 20%|██        | 4/20 [06:54<26:07, 97.96s/it]

1.7015617874079856


 25%|██▌       | 5/20 [08:39<25:08, 100.56s/it]

1.6095158060959445


 30%|███       | 6/20 [10:26<23:58, 102.76s/it]

1.5317479493586756


 35%|███▌      | 7/20 [12:07<22:05, 101.96s/it]

1.4646650935126033


 40%|████      | 8/20 [13:49<20:27, 102.28s/it]

1.4059572801703397


 45%|████▌     | 9/20 [15:32<18:44, 102.23s/it]

1.3540256235961516


 50%|█████     | 10/20 [17:16<17:09, 103.00s/it]

1.3076669791312625


 55%|█████▌    | 11/20 [18:37<14:25, 96.16s/it] 

1.2659639987722109


 60%|██████    | 12/20 [20:18<13:00, 97.52s/it]

1.228198333785034


 65%|██████▌   | 13/20 [21:56<11:23, 97.68s/it]

1.1937932970545346


 70%|███████   | 14/20 [23:42<10:02, 100.40s/it]

1.1622803308649672


 75%|███████▌  | 15/20 [25:08<07:59, 95.83s/it] 

1.1332783728795672


 80%|████████  | 16/20 [26:45<06:25, 96.38s/it]

1.1064699896891959


 85%|████████▌ | 17/20 [28:07<04:36, 92.03s/it]

1.0815957723845266


 90%|█████████ | 18/20 [29:35<03:01, 90.84s/it]

1.0584298512414623


 95%|█████████▌| 19/20 [31:06<01:30, 90.73s/it]

1.0367805974339883


100%|██████████| 20/20 [32:31<00:00, 97.57s/it]

1.0164917414673957






Validation accuracy: 0.877



# Recommended Reading:

- https://people.tamu.edu/~sji/classes/LR.pdf
