# Dependencies

## Install

In [1]:
!pip install ipython-autotime
!pip install contractions

Collecting ipython-autotime
  Downloading ipython_autotime-0.3.1-py2.py3-none-any.whl (6.8 kB)
Installing collected packages: ipython-autotime
Successfully installed ipython-autotime-0.3.1


## Imports

In [1]:
import os
import re
import unicodedata

import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

from collections import Counter

import nltk
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Fetch the NLTK resources one by one, suppressing the downloader's console output.
for resource in ('punkt', 'wordnet', 'stopwords', 'averaged_perceptron_tagger'):
    nltk.download(resource, quiet=True)

%load_ext autotime

time: 0 ns (started: 2023-09-21 21:48:04 -07:00)


# Config

In [2]:
class PathConfig:
    """Filesystem locations used throughout the notebook.

    Every path is derived from the parent of the current working directory,
    so the values depend on the directory the kernel was started in.
    Path components are always passed to ``os.path.join`` separately so the
    platform's separator is used consistently.
    """

    # Parent of the kernel's working directory; all other paths hang off it.
    HW2_DIR = os.path.dirname(os.getcwd())
    # Directory that receives generated artifacts.
    OUTPUT_DIR = os.path.join(HW2_DIR, "solution", "output")

    # Input data directories (train.json is read from DATA_PATH).
    DATA_PATH = os.path.join(HW2_DIR, "CSCI544_HW2", "data")
    VERIFICATION_DATA_PATH = os.path.join(HW2_DIR, "CSCI544_HW2", "verification")

    # Destination of the tab-separated vocabulary file.
    VOCAB_FILE_PATH = os.path.join(OUTPUT_DIR, "vocab.txt")

time: 0 ns (started: 2023-09-21 21:48:05 -07:00)


In [3]:
class VocabConfig:
    """Tunable constants for vocabulary construction."""

    # Frequency cut-off: the vocabulary cells compare word counts against this
    # value with `<=` when deciding which words to replace.
    THRESHOLD = 3
    # Placeholder that stands in for every word at or below THRESHOLD.
    UNKNOWN_TOKEN = "<unk>"

time: 15 ms (started: 2023-09-21 21:48:06 -07:00)


# Task 1: Vocabulary Creation

In [19]:
class VocabularyGenerator:
    """Build a frequency-ranked vocabulary from a column of tokenized sentences.

    Words whose total frequency is at or below ``threshold`` are collapsed into
    a single ``unknown_token`` entry whose frequency is the sum of the words it
    replaced.
    """

    def __init__(self, threshold: int, unknown_token: str = None, save: bool = True, path: str = None):
        """Initialize a VocabularyGenerator

        Args:
            threshold (int): Frequency threshold for rare words. Words with a
                frequency less than or equal to this value are treated as rare.
            unknown_token (str, optional): Token to replace rare words.
                Defaults to None, in which case VocabConfig.UNKNOWN_TOKEN is used.
            save (bool, optional): Flag to save the vocabulary. Default is True.
            path (str, optional): Path to save the vocabulary. Defaults to None,
                in which case PathConfig.VOCAB_FILE_PATH is used when `save` is True.

        Usage:
            vocab_generator = VocabularyGenerator(threshold=3, unknown_token="<unk>")
            vocab_df = vocab_generator.generate_vocabulary(data, "sentence")
        """
        self.threshold = threshold
        self.unknown_token = (
            unknown_token if unknown_token is not None else VocabConfig.UNKNOWN_TOKEN
        )
        self._save = save

        # Resolve the default location *before* storing it, so that saving
        # without an explicit path writes to the configured vocab file instead
        # of failing on a None path.
        if self._save and path is None:
            path = PathConfig.VOCAB_FILE_PATH

        self.path = path

    def _count_word_frequency(self, data, sentence_col_name):
        """Count how often each token occurs across all sentences.

        Args:
            data (pd.DataFrame): The DataFrame containing the dataset.
            sentence_col_name (str): The name of the column whose cells are
                iterables of tokens.

        Returns:
            list[tuple]: (word, frequency) pairs in first-seen order.
        """
        combined_counter = Counter()
        # Feed every sentence straight into one counter; no per-row Counter needed.
        for sentence in data[sentence_col_name]:
            combined_counter.update(sentence)

        return list(combined_counter.items())

    def generate_vocabulary(self, data: pd.DataFrame, sentence_col_name: str):
        """Generate a vocabulary from the provided dataset.

        Args:
            data (pd.DataFrame): The DataFrame containing the dataset.
            sentence_col_name (str): The name of the column containing sentences.

        Returns:
            pd.DataFrame: A DataFrame with columns 'Word', 'Frequency' and 'Index',
            sorted by descending frequency.

        This method takes a DataFrame with sentences and generates a vocabulary based on word frequencies.
        It replaces words with frequencies less than or equal to the specified threshold with the
        unknown token and sums their frequencies into that single entry.
        The resulting DataFrame is sorted by frequency and indexed.

        NOTE(review): the comparison is `<=`; if the intended rule is strictly
        "less than the threshold", adjust the comparison — confirm against the spec.

        If the 'save' flag is set, the vocabulary will be saved to the configured path.

        Usage:
            ```py
            vocab_generator = VocabularyGenerator(threshold=3, unknown_token="<unk>")
            vocab_df = vocab_generator.generate_vocabulary(data, sentence_col_name)
            ```
        """
        word_freq_list = self._count_word_frequency(data, sentence_col_name)

        # Create a DataFrame
        word_freq_df = pd.DataFrame(word_freq_list, columns=['Word', 'Frequency'])

        # Replace rare words (frequency <= threshold) with the unknown token.
        # Vectorised mask instead of a row-wise apply.
        is_rare = word_freq_df['Frequency'] <= self.threshold
        word_freq_df['Word'] = word_freq_df['Word'].mask(is_rare, self.unknown_token)

        # Group by 'Word' and aggregate by sum (merges all rare words into one row)
        word_freq_df = word_freq_df.groupby('Word', as_index=False)['Frequency'].agg('sum')

        # Sort the DataFrame by frequency
        word_freq_df = word_freq_df.sort_values(by='Frequency', ascending=False, ignore_index=True)

        # Add an index column reflecting the frequency rank
        word_freq_df['Index'] = range(len(word_freq_df))

        if self._save:
            self.save_vocab(word_freq_df, self.path)

        return word_freq_df

    def save_vocab(self, word_freq_df, path: str = None):
        """Write the vocabulary to a tab-separated file.

        Each line has the form ``word<TAB>index<TAB>frequency``.

        Args:
            word_freq_df (pd.DataFrame): Vocabulary with 'Word', 'Frequency'
                and 'Index' columns.
            path (str, optional): Destination file. Defaults to the path stored
                on the generator.

        Raises:
            ValueError: If no destination path is available.
        """
        if path is None:
            path = self.path
        if path is None:
            raise ValueError("No path given for saving the vocabulary.")

        # A bare filename has no directory component; only create one when present.
        directory = os.path.dirname(path)
        if directory:
            os.makedirs(directory, exist_ok=True)

        # Select columns by name so the output does not depend on column order.
        rows = word_freq_df[['Word', 'Frequency', 'Index']].itertuples(index=False, name=None)
        with open(path, 'w', encoding='utf-8') as file:
            for word, frequency, index in rows:
                file.write(f'{word}\t{index}\t{frequency}\n')


# Load the training split; its "sentence" column feeds the generator below.
df = pd.read_json(os.path.join(PathConfig.DATA_PATH, "train.json"))

# Take the threshold and unknown token from VocabConfig rather than repeating
# the literals here, so the config cell stays the single source of truth.
vocab_generator = VocabularyGenerator(
    threshold=VocabConfig.THRESHOLD,
    unknown_token=VocabConfig.UNKNOWN_TOKEN,
    save=False,
)
vocab_df = vocab_generator.generate_vocabulary(df, "sentence")
vocab_df.head(10)

Unnamed: 0,Word,Frequency,Index
0,",",46476,0
1,<unk>,42044,1
2,the,39533,2
3,.,37452,3
4,of,22104,4
5,to,21305,5
6,a,18469,6
7,and,15346,7
8,in,14609,8
9,'s,8872,9


time: 2.28 s (started: 2023-09-22 01:53:26 -07:00)


## Trials

In [17]:
# Read the training split, report its (rows, columns) and preview the first ten rows.
df = pd.read_json(
    os.path.join(PathConfig.DATA_PATH, "train.json")
)
print(df.shape)
df.head(n=10)

(38218, 3)


Unnamed: 0,index,sentence,labels
0,0,"[Pierre, Vinken, ,, 61, years, old, ,, will, j...","[NNP, NNP, ,, CD, NNS, JJ, ,, MD, VB, DT, NN, ..."
1,1,"[Mr., Vinken, is, chairman, of, Elsevier, N.V....","[NNP, NNP, VBZ, NN, IN, NNP, NNP, ,, DT, NNP, ..."
2,2,"[Rudolph, Agnew, ,, 55, years, old, and, forme...","[NNP, NNP, ,, CD, NNS, JJ, CC, JJ, NN, IN, NNP..."
3,3,"[A, form, of, asbestos, once, used, to, make, ...","[DT, NN, IN, NN, RB, VBN, TO, VB, NNP, NN, NNS..."
4,4,"[The, asbestos, fiber, ,, crocidolite, ,, is, ...","[DT, NN, NN, ,, NN, ,, VBZ, RB, JJ, IN, PRP, V..."
5,5,"[Lorillard, Inc., ,, the, unit, of, New, York-...","[NNP, NNP, ,, DT, NN, IN, JJ, JJ, NNP, NNP, WD..."
6,6,"[Although, preliminary, findings, were, report...","[IN, JJ, NNS, VBD, VBN, RBR, IN, DT, NN, IN, ,..."
7,7,"[A, Lorillard, spokewoman, said, ,, ``, This, ...","[DT, NNP, NN, VBD, ,, ``, DT, VBZ, DT, JJ, NN, .]"
8,8,"[We, 're, talking, about, years, ago, before, ...","[PRP, VBP, VBG, IN, NNS, IN, IN, NN, VBD, IN, ..."
9,9,"[There, is, no, asbestos, in, our, products, n...","[EX, VBZ, DT, NN, IN, PRP$, NNS, RB, ., '']"


time: 1.05 s (started: 2023-09-22 01:44:32 -07:00)


In [16]:
# Per-sentence token counts
word_freq = df['sentence'].apply(lambda sentence: Counter(sentence))

# Merge the per-sentence counters into one corpus-wide counter
combined_counter = Counter()
for counter in word_freq:
    combined_counter.update(counter)

# Convert combined_counter to a list of tuples (word, frequency)
word_freq_list = list(combined_counter.items())

# Create a DataFrame
word_freq_df = pd.DataFrame(word_freq_list, columns=['Word', 'Frequency'])

# Replace words whose frequency is at or below the threshold with the unknown token.
# Vectorised mask instead of a row-wise apply(axis=1).
word_freq_df['Word'] = word_freq_df['Word'].mask(
    word_freq_df['Frequency'] <= VocabConfig.THRESHOLD,
    VocabConfig.UNKNOWN_TOKEN,
)

# Group by 'Word' and aggregate by sum (merges all replaced words into one row)
word_freq_df = word_freq_df.groupby('Word', as_index=False)['Frequency'].agg('sum')

# Sort the DataFrame by frequency
word_freq_df = word_freq_df.sort_values(by='Frequency', ascending=False, ignore_index=True)

# Add an index column
word_freq_df['Index'] = range(len(word_freq_df))
word_freq_df.head(10)

Unnamed: 0,Word,Frequency,Index
0,",",46476,0
1,<unk>,42044,1
2,the,39533,2
3,.,37452,3
4,of,22104,4
5,to,21305,5
6,a,18469,6
7,and,15346,7
8,in,14609,8
9,'s,8872,9


time: 1.42 s (started: 2023-09-22 01:44:18 -07:00)


In [14]:
# Create the output directory if needed; exist_ok avoids the check-then-create race.
os.makedirs(os.path.dirname(PathConfig.VOCAB_FILE_PATH), exist_ok=True)

# Explicit UTF-8 so the file content does not depend on the platform's default encoding.
with open(PathConfig.VOCAB_FILE_PATH, 'w', encoding='utf-8') as file:
    # Write the vocabulary to the file, one "word<TAB>index<TAB>frequency" line per entry.
    # Columns are selected by name so the unpacking below does not rely on column order.
    vocabulary = word_freq_df[['Word', 'Frequency', 'Index']].to_records(index=False)
    for word, frequency, index in vocabulary:
        file.write(f'{word}\t{index}\t{frequency}\n')

time: 187 ms (started: 2023-09-22 01:34:23 -07:00)


# Task 2: Model Learning

# Task 3: Greedy Decoding with HMM

# Task 4: Viterbi Decoding with HMM 

# THE END