# SNLP Assignment 7

Name 1: Entang Wang<br>
Student id 1: 7069521<br>
Email 1: enwa00001@stud.uni-saarland.de<br>

Name 2: Zichao Wei<br>
Student id 2: 7063941<br>
Email 2: ziwe00001@stud.uni-saarland.de<br>

Name 3: Xiao Wang<br>
Student id 3: 7039023<br>
Email 3: xiwa00004@stud.uni-saarland.de<br>

**Instructions:** Read each question carefully. <br/>
Make sure you appropriately comment your code wherever required. Your final submission should contain the completed Notebook. There is no need to submit the data files. <br/>
Upload the zipped folder on CMS. Please follow the naming convention of **Name1_studentID1_Name2_studentID2_Name3_studentID3.zip**. Make sure to click on "Turn-in" (or the equivalent on CMS) after you upload your submission, otherwise the assignment will not be considered as submitted. Only one member of the group should make the submisssion.

---

In [None]:
from tkinter import Label
! pip install nltk
! pip install numpy
! pip install scikit-learn
! pip install matplotlib
! pip install seaborn
! pip install pandas

In [2]:
import nltk
import math
import numpy as np
import pandas as pd
from typing import List, Dict, Tuple
from string import punctuation
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt

## Exercise 1: BoW and TF-IDF [4.5 points]

### Exercise 1.1

Write a function that takes a preprocessed tokenized text corpus and a word2id mapping and turns it into a Bag-of-Words matrix. You would need to generate the mapping yourself in the form of a dictionary (e.g. `{"the": 0, "of": 1, ...}`) [1 point]

In [3]:
def tokenize_corpus(corpus: List[str]) -> List[List[str]]:
    """
    Tokenize the given corpus.
    
    Args:
        corpus - List of sentences.
    Returns:
        tokenized_corpus - List of sentences, where each sentence is a list of tokens.
    """
    # Your code here!
    tokenized_corpus = []
    for sentence in corpus:
        tokens = sentence.split()
        tokenized_corpus.append(tokens)
    return tokenized_corpus

def preprocess_corpus(corpus: List[List[str]]) -> List[List[str]]:
    """
    Preprocess the given corpus by lowercasing and removing punctuation.
    
    Args:
        corpus: tokenized corpus.
    Returns:
        preprocessed_corpus - preprocessed corpus.
    """
    # Your code here!
    preprocessed_corpus = []
    for sentence in corpus:
        preprocessed_sentence = []
        for token in sentence:
            # Convert to lowercase
            token = token.lower()
            # Remove punctuation
            token = token.translate(str.maketrans('', '', punctuation))
            if token:
                preprocessed_sentence.append(token)
        preprocessed_corpus.append(preprocessed_sentence)
    return preprocessed_corpus

def token2id(corpus: List[List[str]]) -> Dict[str, int]:
    """
    Create a mapping from tokens to unique IDs.
    
    Args:
        corpus - List of sentences, where each sentence is a list of tokens.
    Returns:
        token2id - Dictionary mapping tokens to unique IDs.
    """
    # Your code here!
    # Collect all unique tokens from the corpus
    token_set = set()
    for sentence in corpus:
        for token in sentence:
            token_set.add(token)
    
    token_to_id = {}
    for i, token in enumerate(sorted(token_set)):  
        token_to_id[token] = i
    
    return token_to_id

def bow_matrix(corpus: List[List[str]], token2id: Dict[str, int]) -> np.ndarray:
    """
    Create a bag-of-words representation of the given corpus.
    
    Args:
        corpus - Tokenized corpus.
        token2id - Dictionary mapping tokens to unique IDs.
    Returns:
        bag_of_words - Bag-of-words matrix (document_size x vocab_size).
    """
    # Your code here!
    num_documents = len(corpus)
    vocab_size = len(token2id)
    bow_matrix = np.zeros((num_documents, vocab_size), dtype=int)
    
    for doc_idx, sentence in enumerate(corpus):
        for token in sentence:
            if token in token2id: 
                token_idx = token2id[token]
                bow_matrix[doc_idx, token_idx] += 1
    
    return bow_matrix

Now write a function that would include both unigrams and birgams in the BoW matrix. [1 point]

In [4]:
def token2id_with_bigrams(corpus: List[List[str]]) -> Dict[str, int]:
    """
    Create a mapping from tokens to unique IDs, with bigrams.
    
    Args:
        corpus - List of sentences, where each sentence is a list of tokens.
    Returns:
        token2id - Dictionary mapping tokens and bigrams to unique IDs.
    """
    # Your code here!
    token_set = set()
    # Collect all unique unigrams and bigrams from the corpus
    for sentence in corpus:
        # Add unigrams
        for token in sentence:
            token_set.add(token)
        
        # Add bigrams
        for i in range(len(sentence) - 1):
            bigram = f"{sentence[i]}_{sentence[i+1]}"
            token_set.add(bigram)
    
    token_to_id = {}
    for i, token in enumerate(sorted(token_set)):  # Sort for consistency
        token_to_id[token] = i
    
    return token_to_id

def bow_matrix_with_bigrams(corpus: List[List[str]], token2id: Dict[str, int]) -> np.ndarray:
    """
    Create a bag-of-words representation of the given corpus, with bigrams.
    
    Args:
        corpus - Tokenized corpus.
        token2id - Dictionary mapping tokens and bigrams to unique IDs.
    Returns:
        bag_of_words - Bag-of-words matrix (document_size x vocab_size).
    """
    # Your code here!
    num_documents = len(corpus)
    vocab_size = len(token2id)
    bow_matrix = np.zeros((num_documents, vocab_size), dtype=int)
    
    # Fill the matrix
    for doc_idx, sentence in enumerate(corpus):
        # Count unigrams
        for token in sentence:
            if token in token2id:  
                token_idx = token2id[token]
                bow_matrix[doc_idx, token_idx] += 1
        
        # Count bigrams
        for i in range(len(sentence) - 1):
            bigram = f"{sentence[i]}_{sentence[i+1]}"
            if bigram in token2id: 
                bigram_idx = token2id[bigram]
                bow_matrix[doc_idx, bigram_idx] += 1
    
    return bow_matrix

Take the Yahoo! Answers corpus and its corresponding labels.

In [6]:
df = pd.read_csv('data/yahoo.csv')
id2label = {
    0: "Society & Culture",
    1: "Science & Mathematics",
    2: "Health",
    3: "Education & Reference",
    4: "Computers & Internet",
    5: "Sports",
    6: "Business & Finance",
    7: "Entertainment & Music",
    8: "Family & Relationships",
    9: "Politics & Government"
}

df.head()

Unnamed: 0,label,title,question,best_answer,text
0,0,What's the big deal about Valetine's Day?,"I mean , seriously, what's so great about Vale...",Valentine's Day is a big deal because we live ...,What's the big deal about Valetine's Day? I me...
1,0,What are some Native American Manners?,,This question is way to general. Every region ...,What are some Native American Manners?
2,0,Why did the Pastor said every Deities like Bud...,Is Buddha statue consist of demon inside,Because he's ignorant!,Why did the Pastor said every Deities like Bud...
3,0,can ahuman forgive the mistakes for another ah...,why we don't go directly to our god ???\nif u ...,Yes and no. I can forgive people for things th...,can ahuman forgive the mistakes for another ah...
4,0,Who is Statira?,,Daughter of Darius III of Persia,Who is Statira?


Create and preprocess a text corpus, consisting of the question title and body (ignore answers). If the title/body is empty, add an empty string (so, if the question has the title "How to quit smoking?" and the body is empty, the resulting question would be "How to quit smoking?").

In [None]:
# Your code here!
def create_corpus(df):
    corpus = []
    for idx, row in df.iterrows():
        title = str(row['title']) if pd.notna(row['title']) else ""
        question = str(row['question']) if pd.notna(row['question']) else ""
        
        combined_text = f"{title} {question}".strip()
        if combined_text.lower() in ['nan nan', 'nan', '']:
            combined_text = ""
            
        corpus.append(combined_text)
    
    return corpus

corpus = create_corpus(df)

Create a BoW matrix out of the corpus.

Using the matrix, find the ids of questions which contain the word "heart" and print out the total count of each label assigned to them. Then look at the questions which contain the bigram "heart attack" and the corresponding labels.
Do the same with another word and bigram of your choosing. [0.5 points]

In [8]:
# creating the matrix
tokenized_corpus = tokenize_corpus(corpus)
preprocessed_corpus = preprocess_corpus(tokenized_corpus)
token2id_dict = token2id_with_bigrams(preprocessed_corpus)
bow = bow_matrix_with_bigrams(preprocessed_corpus, token2id_dict)

In [23]:
def analyze_term_labels(term, bow_matrix, token2id_dict, df, id2label):
    if term in token2id_dict:
        term_idx = token2id_dict[term]
        term_doc_indices = np.where(bow_matrix[:, term_idx] > 0)[0]
        term_labels = df['label'].iloc[term_doc_indices]
        term_label_counts = term_labels.value_counts().sort_index()
        
        print(f"Documents containing '{term}': {len(term_doc_indices)} total")
        for label_id, count in term_label_counts.items():
            print(f"  {id2label[label_id]}: {count} documents")
    else:
        print(f"'{term}' not found in vocabulary")
# counting the labels for "heart"
analyze_term_labels("heart", bow, token2id_dict, df, id2label)
# for "heart attack"
analyze_term_labels("heart_attack", bow, token2id_dict, df, id2label)
# for unigram and bigram of your choice
unigram = "good"
bigram = "good_day"
analyze_term_labels(unigram, bow, token2id_dict, df, id2label)
analyze_term_labels(bigram, bow, token2id_dict, df, id2label)


Documents containing 'heart': 3 total
  Health: 1 documents
  Family & Relationships: 2 documents
'heart_attack' not found in vocabulary
Documents containing 'good': 22 total
  Society & Culture: 3 documents
  Science & Mathematics: 1 documents
  Health: 3 documents
  Education & Reference: 4 documents
  Sports: 1 documents
  Business & Finance: 3 documents
  Entertainment & Music: 1 documents
  Family & Relationships: 5 documents
  Politics & Government: 1 documents
'good_day' not found in vocabulary


### Exercise 1.2: TF-IDF

TF-IDF is a metric that is calculated for each term $t$ in each document $d$ with the following formula:

$$\text{TF-IDF}_{t, d} = \text{TF}(t, d) \times \text{IDF}(t)$$

where $\text{TF}(t, d)$ is the *term frequency*, i.e. number of times term $t$ appears in document $d$ and $\text{IDF}(t)$ is the *inverse document frequency*, which is defined as follows:

$$\text{IDF}(t) = \log{\frac{N}{1 + \text{df}_{t}}}$$

where $N$ is the total number of documents and $\text{df}_{t}$ is the number of documents containing $t$.

Now compute a TF-IDF matrix of the corpus of the shape $\text{number of documents} \times \text{number of terms}$. [1 point]

In [24]:
def tf_idf(corpus: List[List[str]], token2id: Dict[str, int]) -> np.ndarray:
    """
    Compute the TF-IDF representation of the given corpus.
    
    Args:
        corpus - Tokenized corpus.
        token2id - Dictionary mapping tokens to unique IDs.
    Returns:
        tfidf_matrix - TF-IDF matrix (document_size x vocab_size).
    """
    # Your code here!
    num_documents = len(corpus)
    vocab_size = len(token2id)
    
    # Create Term Frequency matrix
    tf_matrix = np.zeros((num_documents, vocab_size), dtype=float)
    for doc_idx, document in enumerate(corpus):
        for token in document:
            if token in token2id:
                token_idx = token2id[token]
                tf_matrix[doc_idx, token_idx] += 1

    # Calculate Document Frequency for each term
    df = np.zeros(vocab_size)
    for token, token_idx in token2id.items():
        # Count how many documents contain this token
        documents_with_token = np.sum(tf_matrix[:, token_idx] > 0)
        df[token_idx] = documents_with_token
    
    # Calculate IDF (Inverse Document Frequency) for each term
    idf = np.log(num_documents / (1 + df))
    
    # Calculate TF-IDF matrix
    tfidf_matrix = tf_matrix * idf
    
    return tfidf_matrix

In your own words, what role could IDF play in TF-IDF? [0.5 points]

Answer: Some words like 'is', 'the', ... are the common words in a document, but they can't represent the topic the the document. Since they're universal in each document. With IDF term, we can add more values to the words that are unique in the document which is more relevant to the document topic.

### Exercise 1.3: Stop Words

Use nltk's stop words (`nltk.corpus.stopwords.words('english')`) to calculate the amount of stop words in the corpus (number of stop words divided by total number of words in the whole corpus). What does the result tell you about the data? [0.5 points]

In [None]:
def stopword_rate(corpus: List[List[str]], stopwords: List[str]) -> float:
    """
    Calculate the rate of stopwords in the corpus.
    
    Args:
        corpus - Tokenized corpus.
        stopwords - List of stopwords.
    Returns:
        stopword_rate - Rate of stopwords in the corpus.
    """
    # Your code here!
    stopwords_set = set(stopwords)
    total_words = 0
    stopword_count = 0
    
    for document in corpus:
        for word in document:
            total_words += 1
            if word in stopwords_set:
                stopword_count += 1
    
    if total_words == 0:
        return 0.0
    
    return stopword_count / total_words

In [30]:
stopword_rate(preprocessed_corpus, nltk.corpus.stopwords.words('english'))

0.49670218163368846

## Exercise 2: Classifiers [3.5 points]

### Exercise 2.1

1. What is the difference between classification and clustering? Describe with an example of a text dataset. [0.5 points]
2. Name one text classification task that you encounter in your day-to-day life. [0.25 points]
3. Provide an pair of examples for datasets that are suitable for:
    - Binary classification and multi-class classification. [0.25 points]
    - Flat classification and hierarchical classification. [0.25 points]
    - Single-category classification and multi-category classification [0.25 points]

### Answer:
1. Classification is a supervised task in which each text comes with a known label and the model learns to assign those predefined labels to unseen texts, e.g. IMDB movie reviews, while clustering is an unsupervised task that discovers its own groupings among texts that arrive with no labels, e.g. BBC news. 

2. E-mail spam filtering. The incoming emails are automatically classified as spam or inbox. These filters rely on supervised models trained on enormous corpora of previously labelled e-mail.

3. 
- Binary: IMDB Movie Reviews Multi-class: 20 Newsgroups
- Flat: 20 Newsgroups Hierarchical: RCV1-v2
- Single-category: IMDB Movie Reviews Multi-category: Reuters-21578
### Exercise 2.2

Load the dataset of tweets from the `data/` folder.

In [31]:
tweet_df = pd.read_csv('data/twitter.csv')
tweet_df.head()

Unnamed: 0,textID,text,selected_text,sentiment,Time of Tweet,Age of User,Country,Population (2020),Land Area (Km),Density (P/Km),label
0,32883c8788,wishes the rain would stop so my stupid headac...,wishes the rain would stop so my stupid headac...,negative,night,70-100,Croatia,4105267,55960.0,73,0
1,899ba63056,Sorry to disappoint. Not a big Nascar fan but...,Sorry,negative,noon,60-70,Colombia,50882891,1109500.0,46,0
2,d31f708485,playing singstar without my fave duetter,playing singstar without my fave duetter,negative,night,70-100,India,1380004385,2973190.0,464,0
3,cea2861940,spray tan = fail on legs and feet. I`ve been s...,spray tan = fail on legs and feet. I`ve been s...,negative,noon,60-70,Belgium,11589623,30280.0,383,0
4,52baecd545,: first impression is that it`s considerably ...,first impression is that it`s considerably sl...,negative,night,31-45,Australia,25499884,7682300.0,3,0


Use `LabelEncoder` to turn the `sentiment` columns into numerical labels.

In [32]:
from sklearn.preprocessing import LabelEncoder

# Your code here!
label_encoder = LabelEncoder()
tweet_df['sentiment'] = label_encoder.fit_transform(tweet_df['sentiment'])
tweet_df.head()

Unnamed: 0,textID,text,selected_text,sentiment,Time of Tweet,Age of User,Country,Population (2020),Land Area (Km),Density (P/Km),label
0,32883c8788,wishes the rain would stop so my stupid headac...,wishes the rain would stop so my stupid headac...,0,night,70-100,Croatia,4105267,55960.0,73,0
1,899ba63056,Sorry to disappoint. Not a big Nascar fan but...,Sorry,0,noon,60-70,Colombia,50882891,1109500.0,46,0
2,d31f708485,playing singstar without my fave duetter,playing singstar without my fave duetter,0,night,70-100,India,1380004385,2973190.0,464,0
3,cea2861940,spray tan = fail on legs and feet. I`ve been s...,spray tan = fail on legs and feet. I`ve been s...,0,noon,60-70,Belgium,11589623,30280.0,383,0
4,52baecd545,: first impression is that it`s considerably ...,first impression is that it`s considerably sl...,0,night,31-45,Australia,25499884,7682300.0,3,0


Split the dataset into train and test data using `train_test_split` by `scikit-learn` (80/20 split). Use the `sentiment` label as the class. 

In [33]:
# Your code here!
X_train_tweet, X_test_tweet, y_train_tweet, y_test_tweet = train_test_split(
    tweet_df['text'], tweet_df['sentiment'], test_size=0.2, random_state=42, stratify=tweet_df['sentiment']
)
print(X_train_tweet[:5])
print(y_train_tweet[:5])

41            I`m stuck with BOO!!!! Jeeeez shoot me now
165                  If I may suggest: http://tr.im/kXkw
246    Got the sniffles   I SO don`t want to get sick...
179     Um yeah ... role model for your peers you may...
192    Missed the UPS guy again! Ugh so sad  But i go...
Name: text, dtype: object
41     0
165    1
246    2
179    1
192    1
Name: sentiment, dtype: int64


Use the `CountVectorizer` of `scikit-learn` to create the following 5 bag-of-words matrixes, passing the parameter `ngram_range`. Don't worry about lowercasing and tokenizing, since the CountVectorizer has these functionalities already built-in. Checking out the documentation is highly recommended! [0.5 points]

- uni-gram
- bi-gram
- tri-gram
- uni-gram & bi-gram
- uni-gram & bi-gram & tri-gram

In [34]:
from sklearn.feature_extraction.text import CountVectorizer

# Your code here!
ngram_ranges = {
    "unigram": (1, 1),
    "bigram": (2, 2),
    "trigram": (3, 3),
    "unigram_bigram": (1, 2),
    "unigram_bigram_trigram": (1, 3)
}

vectorizers_tweet = {}
X_train_tweet_bow = {}
X_test_tweet_bow = {}

for name, ngram_range in ngram_ranges.items():
    print(f"Creating BoW for: {name} with ngram_range={ngram_range}")
    vectorizer = CountVectorizer(ngram_range=ngram_range)
    X_train_tweet_bow[name] = vectorizer.fit_transform(X_train_tweet)
    X_test_tweet_bow[name] = vectorizer.transform(X_test_tweet)

Creating BoW for: unigram with ngram_range=(1, 1)
Creating BoW for: bigram with ngram_range=(2, 2)
Creating BoW for: trigram with ngram_range=(3, 3)
Creating BoW for: unigram_bigram with ngram_range=(1, 2)
Creating BoW for: unigram_bigram_trigram with ngram_range=(1, 3)



Train a kNN classifier on the five matrices. Print out the resulting accuracies on the test data. [0.5 points]

In [35]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Your code here!
knn_accuracies_tweet = {}
k_neighbors = 5 # A common default value for k

for name in ngram_ranges.keys():
    knn = KNeighborsClassifier(n_neighbors=k_neighbors)
    knn.fit(X_train_tweet_bow[name], y_train_tweet)
    
    predictions = knn.predict(X_test_tweet_bow[name])
    accuracy = accuracy_score(y_test_tweet, predictions)
    knn_accuracies_tweet[name] = accuracy
    
print("kNN Accuracies (Twitter data):")
for name, acc in knn_accuracies_tweet.items():
    print(f"{name}: {acc:.4f}")

kNN Accuracies (Twitter data):
unigram: 0.3333
bigram: 0.3167
trigram: 0.3500
unigram_bigram: 0.2833
unigram_bigram_trigram: 0.3500


Train a Random Forest classifier on your matrices from Exercise 1.1 and 1.2. Print out the resulting accuracies. [0.5 points]

In [41]:
from sklearn.ensemble import RandomForestClassifier

# Your code here!
# TODO Fill in the matrices from Exercise 1.1 and 1.2
token2id_unigram = token2id(preprocessed_corpus)
bow_yahoo_unigram = bow_matrix(preprocessed_corpus, token2id_unigram)

token2id_unigram_bigram = token2id_with_bigrams(preprocessed_corpus)
bow_yahoo_unigram_bigram = bow_matrix_with_bigrams(preprocessed_corpus, token2id_unigram_bigram)

tfidf_yahoo_unigram = tf_idf(preprocessed_corpus, token2id_unigram)
tfidf_yahoo_unigram_bigram = tf_idf(preprocessed_corpus, token2id_unigram_bigram)

yahoo_data_matrices = {
    "Yahoo_BoW_Unigrams": bow_yahoo_unigram,
    "Yahoo_BoW_UniBiGrams": bow_yahoo_unigram_bigram,
    "Yahoo_TFIDF_Unigrams": tfidf_yahoo_unigram,
    "Yahoo_TFIDF_UniBiGrams": tfidf_yahoo_unigram_bigram
}

# TODO Make sure the labels are aligned with the matrices
yahoo_labels = df['label'].values

yahoo_rf_accuracies = {}

for name, matrix_data in yahoo_data_matrices.items():
    # Splitting Yahoo data
    X_train_yahoo, X_test_yahoo, y_train_yahoo, y_test_yahoo = train_test_split(
        matrix_data, yahoo_labels, test_size=0.2, random_state=42, stratify=yahoo_labels
    )
    
    # Initialize RandomForest Classifier
    rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
    rf_classifier.fit(X_train_yahoo, y_train_yahoo)
    
    predictions_yahoo = rf_classifier.predict(X_test_yahoo)
    accuracy_yahoo = accuracy_score(y_test_yahoo, predictions_yahoo)
    yahoo_rf_accuracies[name] = accuracy_yahoo
    
print("Random Forest Accuracies (Yahoo data):")
for name, acc in yahoo_rf_accuracies.items():
    print(f"{name}: {acc:.4f}")

Random Forest Accuracies (Yahoo data):
Yahoo_BoW_Unigrams: 0.3100
Yahoo_BoW_UniBiGrams: 0.2600
Yahoo_TFIDF_Unigrams: 0.3100
Yahoo_TFIDF_UniBiGrams: 0.3300


Train a Naive Bayes classifier on your matrices from Exercise 1.1 and 1.2. Print out the resulting accuracy. [0.5 points]

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Your code here!
yahoo_nb_accuracies = {}

for name, matrix_data in yahoo_data_matrices.items():
    # Splitting Yahoo data
    X_train_yahoo, X_test_yahoo, y_train_yahoo, y_test_yahoo = train_test_split(
        matrix_data, yahoo_labels_numeric, test_size=0.2, random_state=42, stratify=yahoo_labels_numeric
    )
    
    # Initialize Naive Bayes Classifier
    nb_classifier = MultinomialNB()
    nb_classifier.fit(X_train_yahoo, y_train_yahoo)
    
    predictions_yahoo_nb = nb_classifier.predict(X_test_yahoo)
    accuracy_yahoo_nb = accuracy_score(y_test_yahoo, predictions_yahoo_nb)
    yahoo_nb_accuracies[name] = accuracy_yahoo_nb
    
print("Naive Bayes Accuracies (Yahoo data):")
for name, acc in yahoo_nb_accuracies.items():
    print(f"{name}: {acc:.4f}")

## Exercise 3: Pointwise Mutual Information [2 points]

Write a function to calculate word frequencies for each class and store the counts of each word within each class. Then, implement a function to compute PMI for each word per class. We are interested in identifying the words most strongly correlated with each class. Print the top 20 features (words) for each class. Use the twitter dataset. [1 point]

In [24]:
from collections import defaultdict

# Calculate word frequencies for each class on train data
def calculate_word_frequencies(X: List[List[str]], y: List[int], num_labels: int = 3) -> Tuple[Dict[str, List[int]], List[int]]:
    """
    Calculate word frequencies for each class in the dataset.
    
    Args:
        X - List of tokenized sentences.
        y - List of labels corresponding to the sentences.
        num_labels - Number of unique labels in the dataset.
    
    Returns:
        word_counts - Dictionary mapping words to their frequency counts for each class.
        class_counts - List of counts for each class."""
    # Your code here!
    pass

def compute_pmi(word_counts: Dict[str, List[int]], class_counts: List[int]) -> Dict[Tuple[str, int], float]:
    """
    Compute Pointwise Mutual Information (PMI) for each word in each class.
    
    Args:
        word_counts - Dictionary mapping words to their frequency counts for each class.
        class_counts - List of counts for each class.
        
    Returns:
        pmi - Dictionary mapping (word, class_index) to PMI score."""
    # Your code here!
    pass

# Print top features (words) for each class based on PMI
def top_n_features(pmi: Dict[Tuple[str, int], float], label_mapping: Dict[int, str], top_n: int = 100) -> Dict[int, List[Tuple[str, float]]]:
    """
    Get the top N features (words) for each class based on PMI scores.
    
    Args:
        pmi - Dictionary mapping (word, class_index) to PMI score.
        label_mapping - Dictionary mapping class indices to class labels.
        top_n - Number of top features to return for each class.
        
    Returns:
        top_features - Dictionary mapping class indices to lists of top N features (words) and their PMI scores.
    """
    # Your code here!
    pass

# Tokenize, preprocess, and split the Twitter dataset

# Calculate word frequencies and PMI for the Twitter dataset

# Print top features for each class

Train a decision tree classifier on the top N best words from PMI (regardless of class). Use the `CountVectorizer` to force the train and test sets into this new vocabulary of the best N words. Plot your results, showing how the accuracy behaves with different sizes of the vocabulary.
You can use `N_values = np.linspace(10, num_unique_words, 20, dtype=int)` for some nice N values for your top features vocabulary. [1 point]

In [25]:
def top_n_features_overall(pmi: Dict[Tuple[str, int], float], N: int) -> List[str]:
    """Get top N features based on overall PMI scores.

    Args:
        pmi - Dictionary mapping (word, class_index) to PMI scores.
        N - Number of top features to return.
    Returns:
        top_features - List of top N features based on overall PMI scores.
    """
    # Your code here!
    pass

# Determine number of unique words

# Define intervals for N

# Train decision tree classifier for each value of N

# Plot results

## Bonus: Neural Classifier [2 points]

In this exercise, we are going to use a classifier based on DistilBERT, which is a smaller, faster, cheaper and lighter version of BERT.

A general recommendation here is to run this on [Google Colab](https://colab.research.google.com/) or [Kaggle](https://www.kaggle.com/), unless you have a relatively powerful computer and you know what you are doing.

You might have to install a few requirements by running `!pip install <package>` if you get errors.

In the following cell, you will find everything you need in order to train your classifier, but a couple of lines have been erased for you to fill in.

You will be working with the [FinancialPhraseBank](https://www.kaggle.com/datasets/ankurzing/sentiment-analysis-for-financial-news) dataset, which you can find in the data folder under `data/finance.csv`.

In [None]:
#
# Your code here!
#
import torch
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, Trainer, TrainingArguments
from transformers import DataCollatorWithPadding
from datasets import Dataset
from tqdm import tqdm as tqdm

# create 2 pandas dataframes train_df and test_df
# train_df holds the sentences of X_train and the labels y_train ("sentence": X_train, "label": y_train)
# test_df is analogous for the test set
train_df = _
test_df = _

# now convert the pandas DataFrames into HuggingFace Dataset objects
train_dataset = _
test_dataset = _

# import the distilbert-base-uncased tokenizer using .from_pretrained
tokenizer = _

# write your tokenize function that passes a sentence from the data to the tokenizer
def tokenize_function(data):
    return tokenizer( _ , truncation=True)

# now call the .map() function of your both huggingface datasets from above and pass the
# tokenize_function, but without parentheses. Also pass batched=True
train_dataset = _
test_dataset = _

# now call the .set_format() function of your datasets and set the format to type='torch' 
# and columns=['input_ids', 'attention_mask', 'label']
train_dataset.set_format(_, _)
test_dataset.set_format(_, _)

# load distilbert-base-uncased as a model for sequence classification and set the correct number of labels
model = _

# Simply use these predefined arguments for the trainer
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',
    save_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=1,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
)

# use this data collator 
data_collator = DataCollatorWithPadding(tokenizer)

# Define Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# Train the model
trainer.train()

eval_results = trainer.evaluate()

# Get predictions
preds_output = trainer.predict(test_dataset)
preds = preds_output.predictions.argmax(-1)

# Calculate accuracy
accuracy = accuracy_score(y_test, preds)

print(f"Evaluation results: {eval_results}")
print(f"Accuracy: {accuracy:.4f}")
print(f"Evaluation results: {eval_results}")