# MMI_2024_NLP - Week 1

# Lab 1: Part 1

# (A) Naive Bayes model

In this lab, we will implement a language identifier (LID).

Our first model will be based on Naive Bayes.

In [1]:
import io, sys, math, re
from collections import defaultdict
from typing import List, Tuple, Dict

The next function is used to load the data. Each line of the data consists of a label (corresponding to a language), followed by some text written in that language. Here is an example line:

```__label__de Zur Namensdeutung gibt es mehrere Varianten.```


In [2]:
def load_data(filename:str)->List[Tuple]:
    """Read a labelled corpus from ``filename``.

    Every non-blank line is split on whitespace; the first token is taken as
    the label (e.g. ``__label__de``) and the remaining tokens as the words of
    the example.

    Args:
        filename: Path of a UTF-8 encoded text file.

    Returns:
        A list of ``(label, words)`` tuples in file order, one per non-blank
        line. ``words`` is a list of strings and is empty when the line
        holds only a label.
    """
    data = []
    # The context manager closes the file even if reading fails part-way.
    with io.open(filename, 'r', encoding='utf-8') as fin:
        for line in fin:
            tokens = line.split()
            if not tokens:
                # Blank (or whitespace-only) line: there is no label to
                # read, so skip it instead of failing on tokens[0].
                continue
            data.append((tokens[0], tokens[1:]))
    return data

You can now try loading the first dataset, `train1.txt`, and look at what the examples look like.

In [4]:
# Parse the first training file and print its second example, a (label, words) tuple.
data = load_data("train1.txt")
print(data[1])

('__label__de', ['Tom', 'ist', 'an', 'Kunst', 'völlig', 'uninteressiert.'])


Next, we will start implementing the Naive Bayes method. This technique is based on word counts, and we thus need to start by implementing a function to count the words and labels of our training set.

`n_examples` is the total number of examples

`n_words_per_label` is the total number of words for a given label

`label_counts` is the number of times a given label appears in the training data

`word_counts` is the number of times a word appears with a given label

In [30]:
def count_words(data:List[Tuple])->Dict:
    """Collect the label and word statistics used by the Naive Bayes model.

    Args:
        data: Iterable of ``(label, words)`` pairs, where ``words`` is the
            list of tokens of one example.

    Returns:
        A dict with four entries:

        * ``'label_counts'``: label -> number of examples with that label.
        * ``'word_counts'``: label -> (word -> number of occurrences of the
          word in examples with that label). Counts are stored as floats.
        * ``'n_examples'``: total number of examples seen.
        * ``'n_words_per_label'``: label -> total number of tokens in the
          examples with that label.

        The three mappings are ``defaultdict`` instances, so indexing them
        with an unknown key inserts a zero entry.
    """
    n_examples = 0
    n_words_per_label = defaultdict(lambda: 0)
    label_counts = defaultdict(lambda: 0)
    word_counts = defaultdict(lambda: defaultdict(lambda: 0.0))

    for label, sentence in data:
        n_examples += 1
        label_counts[label] += 1
        n_words_per_label[label] += len(sentence)

        # word_counts[label] is only touched when there is a word to count,
        # so a label seen exclusively with empty sentences gets no entry.
        for word in sentence:
            word_counts[label][word] += 1

    return {'label_counts': label_counts,
            'word_counts': word_counts,
            'n_examples': n_examples,
            'n_words_per_label': n_words_per_label}

Next, using the word and label counts from the previous function, we can implement the prediction function.

Here, `mu` is a regularization parameter (Laplace smoothing), and `sentence` is the list of words corresponding to the test example.

In [31]:
import numpy as np
from typing import Dict, List, Tuple
def predict(sentence:List, mu:float, label_counts:Dict, word_counts:Dict, n_examples:int, n_words_per_label:Dict)->str:
    """Return the most probable label for ``sentence`` under Naive Bayes.

    For every label in ``word_counts`` the score is

        log P(label) + sum over words of log P(word | label)

    with ``P(label) = label_counts[label] / sum(label_counts.values())`` and
    the smoothed likelihood
    ``(count(word, label) + mu) / (n_words_per_label[label] + mu * V_label)``,
    where ``V_label`` is the number of distinct words recorded for the label.

    Args:
        sentence: List of tokens of the example to classify.
        mu: Smoothing constant added to every word count.
        label_counts: label -> number of training examples.
        word_counts: label -> (word -> count). Only read, never modified.
        n_examples: Unused; kept so the signature matches the counts dict.
            The prior is normalised by ``sum(label_counts.values())``.
        n_words_per_label: label -> total number of training tokens.

    Returns:
        The label with the highest score. Ties keep the label that comes
        first in ``word_counts``. ``None`` when ``word_counts`` is empty or
        no score is greater than ``-inf``.
    """
    best_label = None
    best_score = float('-inf')
    # The normaliser of the prior is identical for every label.
    total_label_count = sum(label_counts.values())

    for label, counts_for_label in word_counts.items():
        # Fixed for the whole sentence because the table is not mutated below.
        denominator = n_words_per_label[label] + mu * len(counts_for_label)

        # Work in log space to avoid underflow on long sentences.
        score = np.log(label_counts[label] / total_label_count)

        for word in sentence:
            # .get() rather than [] on purpose: indexing a defaultdict would
            # insert every unseen word, growing the vocabulary size (and so
            # the denominator) a little more with each prediction.
            word_likelihood = (counts_for_label.get(word, 0.0) + mu) / denominator
            score += np.log(word_likelihood)

        if score > best_score:
            best_score = score
            best_label = label

    return best_label

The next function will be used to evaluate the Naive Bayes model on a validation set. It computes the accuracy for a particular regularization parameter `mu`.

In [32]:
def compute_accuracy(valid_data:List[Tuple], mu:float, counts:Dict)->float:
    """Return the fraction of validation examples that are labelled correctly.

    Args:
        valid_data: Sequence of ``(label, words)`` pairs to evaluate on.
        mu: Smoothing constant forwarded to ``predict``.
        counts: Dict with the keys ``'label_counts'``, ``'word_counts'``,
            ``'n_examples'`` and ``'n_words_per_label'``, as returned by
            ``count_words``.

    Returns:
        ``correct / len(valid_data)`` as a float, or ``0.0`` when
        ``valid_data`` is empty.
    """
    total_predictions = len(valid_data)
    if total_predictions == 0:
        # Nothing to score: the ratio below is undefined, so report 0.0.
        return 0.0

    correct_predictions = 0
    for label, sentence in valid_data:
        predicted_label = predict(sentence, mu, counts['label_counts'], counts['word_counts'], counts['n_examples'], counts['n_words_per_label'])
        if predicted_label == label:
            correct_predictions += 1

    # A single division after the loop; the running value was never used.
    return correct_predictions / total_predictions

In [34]:
# Banner, surrounded by blank lines.
print("")
print("** Naive Bayes **")
print("")

# mu is the smoothing constant handed to predict() via compute_accuracy().
mu = 0.2
train_data = load_data("train1.txt")
valid_data = load_data("valid1.txt")
# The statistics are gathered from the training split only.
counts = count_words(train_data)

# Report the accuracy on the validation split with three decimals.
print("Validation accuracy: %.3f" % compute_accuracy(valid_data, mu, counts))
print("")


** Naive Bayes **

Validation accuracy: 0.953



# Now it is your turn: try to do the same with `train2.txt` and `valid2.txt`.


In [27]:
import io
import numpy as np
from collections import defaultdict
from typing import List, Tuple, Dict

class NaiveBaye_LID:
    """Naive Bayes language identifier based on per-label word counts.

    The class keeps no state: ``count_words`` returns the statistics and the
    caller passes them back to ``predict`` / ``compute_accuracy``.
    """

    def load_data(self, filename: str) -> List[Tuple]:
        """Read a labelled corpus from ``filename``.

        Every non-blank line is split on whitespace; the first token is the
        label and the remaining tokens are the words of the example.

        Args:
            filename: Path of a UTF-8 encoded text file.

        Returns:
            A list of ``(label, words)`` tuples in file order, one per
            non-blank line.
        """
        data = []
        # The context manager closes the file even if reading fails part-way.
        with io.open(filename, 'r', encoding='utf-8') as fin:
            for line in fin:
                tokens = line.split()
                if not tokens:
                    # Blank line: no label to read, skip it instead of
                    # failing on tokens[0].
                    continue
                data.append((tokens[0], tokens[1:]))
        return data

    def count_words(self, data: List[Tuple]) -> Dict:
        """Collect the label and word statistics of a training set.

        Args:
            data: Iterable of ``(label, words)`` pairs.

        Returns:
            A dict with the entries ``'label_counts'`` (label -> number of
            examples), ``'word_counts'`` (label -> word -> occurrences,
            stored as floats), ``'n_examples'`` (total number of examples)
            and ``'n_words_per_label'`` (label -> total number of tokens).
        """
        n_examples = 0
        n_words_per_label = defaultdict(lambda: 0)
        label_counts = defaultdict(lambda: 0)
        word_counts = defaultdict(lambda: defaultdict(lambda: 0.0))

        for label, sentence in data:
            n_examples += 1
            label_counts[label] += 1
            n_words_per_label[label] += len(sentence)

            # word_counts[label] is only created when there is a word to
            # count, so labels seen only with empty sentences get no entry.
            for word in sentence:
                word_counts[label][word] += 1

        return {'label_counts': label_counts,
                'word_counts': word_counts,
                'n_examples': n_examples,
                'n_words_per_label': n_words_per_label}

    def predict(self, sentence: List[str], mu: float, label_counts: Dict, word_counts: Dict, n_examples: int, n_words_per_label: Dict) -> str:
        """Return the most probable label for ``sentence``.

        The score of a label is ``log P(label)`` plus the sum over the words
        of ``log((count(word, label) + mu) / (n_words_per_label[label] +
        mu * V_label))``, where ``V_label`` is the number of distinct words
        recorded for that label.

        Args:
            sentence: List of tokens of the example to classify.
            mu: Smoothing constant added to every word count.
            label_counts: label -> number of training examples.
            word_counts: label -> (word -> count). Only read, never modified.
            n_examples: Unused; the prior is normalised by
                ``sum(label_counts.values())``.
            n_words_per_label: label -> total number of training tokens.

        Returns:
            The label with the highest score (ties keep the label that comes
            first in ``word_counts``), or ``None`` when ``word_counts`` is
            empty or no score is greater than ``-inf``.
        """
        best_label = None
        best_score = float('-inf')
        # The normaliser of the prior is identical for every label.
        total_label_count = sum(label_counts.values())

        for label, counts_for_label in word_counts.items():
            # Fixed for the whole sentence because the table is not mutated.
            denominator = n_words_per_label[label] + mu * len(counts_for_label)

            # Work in log space to avoid underflow on long sentences.
            score = np.log(label_counts[label] / total_label_count)

            for word in sentence:
                # .get() rather than [] on purpose: indexing a defaultdict
                # would insert every unseen word and silently grow the
                # vocabulary size used in the denominator.
                word_likelihood = (counts_for_label.get(word, 0.0) + mu) / denominator
                score += np.log(word_likelihood)

            if score > best_score:
                best_score = score
                best_label = label

        return best_label

    def compute_accuracy(self, valid_data: List[Tuple], mu: float, counts: Dict) -> float:
        """Return the fraction of ``valid_data`` that is labelled correctly.

        Args:
            valid_data: Sequence of ``(label, words)`` pairs.
            mu: Smoothing constant forwarded to ``predict``.
            counts: Statistics dict as returned by ``count_words``.

        Returns:
            ``correct / len(valid_data)``, or ``0.0`` for empty input.
        """
        total_predictions = len(valid_data)
        if total_predictions == 0:
            # Nothing to score: avoid dividing by zero.
            return 0.0

        correct_predictions = 0
        for label, sentence in valid_data:
            predicted_label = self.predict(sentence, mu, counts['label_counts'], counts['word_counts'], counts['n_examples'], counts['n_words_per_label'])
            if predicted_label == label:
                correct_predictions += 1

        return correct_predictions / total_predictions

    def train(self, train_path: str = "train2.txt", valid_path: str = "valid2.txt", mu: float = 1.0) -> None:
        """Fit the counts on one file and print the accuracy on another.

        Args:
            train_path: Corpus the word/label counts are computed from.
            valid_path: Corpus the accuracy is measured on.
            mu: Smoothing constant used for prediction.
        """
        print("")
        print("** Naive Bayes **")
        print("")

        train_data = self.load_data(train_path)
        valid_data = self.load_data(valid_path)
        counts = self.count_words(train_data)

        print("Validation accuracy: %.3f" % self.compute_accuracy(valid_data, mu, counts))
        print("")

# Run the train/validate experiment only when this file is executed as a script.
if __name__ == "__main__":
    NaiveBaye = NaiveBaye_LID()
    NaiveBaye.train()


** Naive Bayes **

Validation accuracy: 0.980

