# Dependencies


## Install


In [8]:
%pip install ipython-autotime
%pip install contractions


Note: you may need to restart the kernel to use updated packages.
time: 3.38 s (started: 2023-09-24 17:59:45 -07:00)


## Imports


In [2]:
import itertools
import json
import os

import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

from collections import Counter
from typing import List

%load_ext autotime

time: 0 ns (started: 2023-09-26 13:29:17 -07:00)


# Config


In [501]:
class PathConfig:
    """Filesystem locations used by the notebook.

    Every path is derived from the parent of the current working directory,
    so the values depend on where the kernel was started.
    """

    # Parent directory of the kernel's working directory.
    HW2_DIR = os.path.dirname(os.getcwd())

    # Input directories.
    _assignment_dir = os.path.join(HW2_DIR, "CSCI544_HW2")
    DATA_PATH = os.path.join(_assignment_dir, "data")
    VERIFICATION_DATA_PATH = os.path.join(_assignment_dir, "verification")
    del _assignment_dir  # temporary only; not part of the configuration

    # Output directory and the artifacts written into it.
    OUTPUT_DIR = os.path.join(os.path.join(HW2_DIR, "solution"), "output")
    VITERBI_ALGO_OUTPUT_PATH = os.path.join(OUTPUT_DIR, "viterbi.json")
    GREEDY_ALGO_OUTPUT_PATH = os.path.join(OUTPUT_DIR, "greedy.json")
    HMM_MODEL_SAVE_PATH = os.path.join(OUTPUT_DIR, "hmm.json")
    VOCAB_FILE_PATH = os.path.join(OUTPUT_DIR, "vocab.txt")


class WSJDatasetConfig:
    """Column names and file locations of the WSJ train/dev/test splits."""

    # Column names as they appear in the loaded frames. The token column is
    # called "sentence" (singular); the test split has no "labels" column.
    cols = ["index", "sentence", "labels"]

    train_file_path = os.path.join(PathConfig.DATA_PATH, "train.json")
    dev_file_path = os.path.join(PathConfig.DATA_PATH, "dev.json")
    test_file_path = os.path.join(PathConfig.DATA_PATH, "test.json")


class VocabConfig:
    """Settings shared by vocabulary creation and the HMM/decoders."""

    # Token that stands in for rare words in the vocabulary and for words
    # that are not found in the vocabulary at training/decoding time.
    UNKNOWN_TOKEN = "<unk>"
    # Words whose training frequency is <= THRESHOLD are mapped to UNKNOWN_TOKEN.
    THRESHOLD = 2
    # Column names applied when the tab-separated vocabulary file is read back;
    # the order matches the order in which the columns are written.
    FILE_HEADER = ["word", "index", "frequency"]

    # Default location of the vocabulary file.
    VOCAB_FILE = PathConfig.VOCAB_FILE_PATH


class HMMConfig:
    """Settings for persisting the trained HMM."""

    # Path used by HMM.save_model() when the caller does not pass one.
    HMM_MODEL_SAVED = PathConfig.HMM_MODEL_SAVE_PATH

time: 15 ms (started: 2023-09-26 22:56:21 -07:00)


# Dataset Preparation


In [480]:
class WSJDataset:
    """One split of the WSJ POS-tagging data, loaded from a JSON file.

    The frame is expected to hold a ``sentence`` column (list of tokens per row)
    and, for labelled splits, a ``labels`` column (list of tags per row).
    """

    def __init__(self, path):
        """Remember the file location; nothing is read until ``read_data``.

        Args:
            path: Path of the JSON file holding the split.
        """
        self.path = path

        # Populated by read_data(); stays None until then.
        self.data: pd.DataFrame = None
        self.cols = WSJDatasetConfig.cols

    def read_data(self):
        """Load the JSON file at ``self.path`` into ``self.data``."""
        self.data = pd.read_json(self.path)

    def process_sentences(self):
        """Lower-case every token of the ``sentence`` column (in place)."""
        self.data["sentence"] = self.data["sentence"].apply(
            lambda sentence: [word.lower() for word in sentence],
        )

    def prepare_dataset(self):
        """Read the file, lower-case the tokens and return the resulting frame."""
        self.read_data()
        self.process_sentences()
        return self.data

    def get_sentences_with_pos_tags(self):
        """Return every sentence as a list of ``(word, tag)`` tuples.

        When the split has no ``labels`` column the tag of every pair is ``None``.

        Returns:
            list: One list of ``(word, tag)`` tuples per row, in row order.
            An empty frame yields an empty list.
        """
        sentences = self.data["sentence"]

        # Plain Python iteration instead of a row-wise DataFrame.apply: it is
        # faster and, unlike apply(axis=1), also works on an empty frame.
        if "labels" in self.data.columns:
            return [
                list(zip(words, tags)) for words, tags in zip(sentences, self.data["labels"])
            ]
        return [[(word, None) for word in words] for words in sentences]

time: 16 ms (started: 2023-09-26 22:49:44 -07:00)


In [481]:
# Training split: read the JSON file and lower-case every token.
train_dataset = WSJDataset(WSJDatasetConfig.train_file_path)
train_dataset.prepare_dataset()
df_train = train_dataset.data
print(df_train.shape)
df_train.head()

(38218, 3)


Unnamed: 0,index,sentence,labels
0,0,"[pierre, vinken, ,, 61, years, old, ,, will, j...","[NNP, NNP, ,, CD, NNS, JJ, ,, MD, VB, DT, NN, ..."
1,1,"[mr., vinken, is, chairman, of, elsevier, n.v....","[NNP, NNP, VBZ, NN, IN, NNP, NNP, ,, DT, NNP, ..."
2,2,"[rudolph, agnew, ,, 55, years, old, and, forme...","[NNP, NNP, ,, CD, NNS, JJ, CC, JJ, NN, IN, NNP..."
3,3,"[a, form, of, asbestos, once, used, to, make, ...","[DT, NN, IN, NN, RB, VBN, TO, VB, NNP, NN, NNS..."
4,4,"[the, asbestos, fiber, ,, crocidolite, ,, is, ...","[DT, NN, NN, ,, NN, ,, VBZ, RB, JJ, IN, PRP, V..."


time: 3.17 s (started: 2023-09-26 22:49:47 -07:00)


In [482]:
# Development split: read the JSON file and lower-case every token.
valid_dataset = WSJDataset(WSJDatasetConfig.dev_file_path)
valid_dataset.prepare_dataset()
df_valid = valid_dataset.data
print(df_valid.shape)
df_valid.head()

(5527, 3)


Unnamed: 0,index,sentence,labels
0,0,"[the, arizona, corporations, commission, autho...","[DT, NNP, NNP, NNP, VBD, DT, CD, NN, NN, NN, I..."
1,1,"[the, ruling, follows, a, host, of, problems, ...","[DT, NN, VBZ, DT, NN, IN, NNS, IN, NNP, NNP, ,..."
2,2,"[the, arizona, regulatory, ruling, calls, for,...","[DT, NNP, JJ, NN, VBZ, IN, $, CD, CD, IN, JJ, ..."
3,3,"[the, company, had, sought, increases, totalin...","[DT, NN, VBD, VBN, NNS, VBG, $, CD, CD, ,, CC,..."
4,4,"[the, decision, was, announced, after, trading...","[DT, NN, VBD, VBN, IN, NN, VBD, .]"


time: 140 ms (started: 2023-09-26 22:49:51 -07:00)


In [483]:
# Test split (no gold labels): read the JSON file and lower-case every token.
test_dataset = WSJDataset(WSJDatasetConfig.test_file_path)
test_dataset.prepare_dataset()
df_test = test_dataset.data
print(df_test.shape)
df_test.head()

(5462, 2)


Unnamed: 0,index,sentence
0,0,"[influential, members, of, the, house, ways, a..."
1,1,"[the, bill, ,, whose, backers, include, chairm..."
2,2,"[the, bill, intends, to, restrict, the, rtc, t..."
3,3,"[``, such, agency, `, self-help, ', borrowing,..."
4,4,"[the, complex, financing, plan, in, the, s&l, ..."


time: 125 ms (started: 2023-09-26 22:49:53 -07:00)


# Task 1: Vocabulary Creation


In [484]:
class VocabularyGenerator:
    """Builds a frequency-sorted vocabulary in which rare words share one token."""

    def __init__(
        self, threshold: int, unknown_token: str = None, save: bool = False, path: str = None
    ):
        """Initialize a VocabularyGenerator

        Args:
            threshold (int): Frequency threshold for rare words. Words whose
                frequency is less than or equal to this value are treated as rare.
            unknown_token (str, optional): Token to replace rare words. Defaults to
                None, in which case ``VocabConfig.UNKNOWN_TOKEN`` is used.
            save (bool, optional): Flag to save the vocabulary. Default is False.
            path (str, optional): Path to save the vocabulary. Defaults to None; when
                ``save`` is set and no path is given, ``VocabConfig.VOCAB_FILE`` is used.

        Usage:
            vocab_generator = VocabularyGenerator(threshold=3, unknown_token="<unk>")
            vocab_df = vocab_generator.generate_vocabulary(data, "sentence")
        """
        self.threshold = threshold
        self.unknown_token = (
            unknown_token if unknown_token is not None else VocabConfig.UNKNOWN_TOKEN
        )
        self._save = save

        if self._save and path is None:
            self.path = VocabConfig.VOCAB_FILE
        else:
            self.path = path

    def _count_word_frequency(self, data, sentence_col_name):
        """Return a frame with one row per distinct word and its ``frequency``."""
        return (
            data[sentence_col_name]
            .explode()
            .value_counts()
            .rename_axis("word")
            .reset_index(name="frequency")
        )

    def generate_vocabulary(self, data: pd.DataFrame, sentence_col_name: str):
        """Generate a vocabulary from the provided dataset.

        Args:
            data (pd.DataFrame): The DataFrame containing the dataset.
            sentence_col_name (str): The name of the column containing sentences
                (each cell is a list of tokens).

        Returns:
            pd.DataFrame: Columns ``word``, ``frequency`` and ``index``. The unknown
            token is always the first row (index 0); the remaining words follow in
            descending order of frequency.

        Words with a frequency less than or equal to the threshold are merged into the
        unknown token, whose frequency is the sum of the merged counts. If no word is
        rare, the unknown token is still present with frequency 0 so that consumers
        can always look it up.

        If the 'save' flag is set, the vocabulary will be saved to the specified path.

        Usage:
            ```py
            vocab_generator = VocabularyGenerator(threshold=3, unknown_token="<unk>")
            vocab_df = vocab_generator.generate_vocabulary(data, sentence_col_name)
            ```
        """
        word_freq_df = self._count_word_frequency(data, sentence_col_name)

        # Replace rare words (frequency <= threshold) with the unknown token.
        is_rare = word_freq_df["frequency"] <= self.threshold
        word_freq_df["word"] = word_freq_df["word"].mask(is_rare, self.unknown_token)

        # Merge the rows that now share the unknown token by summing their counts.
        word_freq_df = word_freq_df.groupby("word", as_index=False)["frequency"].agg("sum")

        # Sort the DataFrame by frequency
        word_freq_df = word_freq_df.sort_values(by="frequency", ascending=False, ignore_index=True)

        # Placing Special Tokens at the top of the DataFrame
        is_unknown = word_freq_df["word"] == self.unknown_token
        unk_df = word_freq_df.loc[is_unknown]
        word_freq_df = word_freq_df.loc[~is_unknown]

        if unk_df.empty:
            # No rare word in the data: keep a zero-count entry so that
            # lookups of the unknown token never fail downstream.
            unk_df = pd.DataFrame({"word": [self.unknown_token], "frequency": [0]})

        word_freq_df = pd.concat([unk_df, word_freq_df], ignore_index=True)

        # Add an index column
        word_freq_df["index"] = range(len(word_freq_df))

        if self._save:
            self.save_vocab(word_freq_df, self.path)

        return word_freq_df

    def save_vocab(self, word_freq_df, path):
        """Write the vocabulary to ``path`` as ``word<TAB>index<TAB>frequency`` lines.

        Args:
            word_freq_df (pd.DataFrame): Frame with ``word``, ``index`` and
                ``frequency`` columns.
            path (str): Destination file. Missing parent directories are created.
        """
        directory = os.path.dirname(path)
        if directory:  # a bare file name has no directory to create
            os.makedirs(directory, exist_ok=True)

        # Columns are addressed by name so the output does not depend on column
        # order; UTF-8 is written explicitly because the file is read back as UTF-8.
        with open(path, "w", encoding="utf-8") as file:
            for word, index, frequency in zip(
                word_freq_df["word"], word_freq_df["index"], word_freq_df["frequency"]
            ):
                file.write(f"{word}\t{index}\t{frequency}\n")

time: 0 ns (started: 2023-09-26 22:50:09 -07:00)


In [485]:
# Build the vocabulary from the training split and write it to the default vocab file.
vocab_generator = VocabularyGenerator(
    VocabConfig.THRESHOLD, VocabConfig.UNKNOWN_TOKEN, save=True
)
vocab_df = vocab_generator.generate_vocabulary(data=df_train, sentence_col_name="sentence")
vocab_df.head(10)

Unnamed: 0,word,frequency,index
0,<unk>,28581,0
1,",",46476,1
2,the,46144,2
3,.,37452,3
4,of,22176,4
5,to,21459,5
6,a,19338,6
7,in,16320,7
8,and,15875,8
9,'s,8886,9


time: 1 s (started: 2023-09-26 22:50:10 -07:00)


In [486]:
# Pick the single <unk> row explicitly: calling int() on a one-element Series
# is deprecated in recent pandas releases.
unk_occurrences = vocab_df.loc[
    vocab_df["word"] == VocabConfig.UNKNOWN_TOKEN, "frequency"
].iloc[0]

print("Selected threshold for unknown words: ", VocabConfig.THRESHOLD)
print("Vocabulary size: ", vocab_df.shape[0])
print(
    f"Total occurrences of the special token {VocabConfig.UNKNOWN_TOKEN}: ",
    int(unk_occurrences),
)

Selected threshold for unknown words:  2
Vocabulary size:  15568
Total occurrences of the special token <unk>:  28581
time: 16 ms (started: 2023-09-26 22:50:12 -07:00)


## Preparing Dataset


In [487]:
# Tag inventory of the training split, in order of first appearance.
pos_tag_array = df_train["labels"].explode().unique()
print("Number of unique POS tags =", len(pos_tag_array))
print("Unique Part-of-speech tags:\n", pos_tag_array)
unique_pos_tags = pos_tag_array.tolist()

Number of unique POS tags = 45
Unique Part-of-speech tags:
 ['NNP' ',' 'CD' 'NNS' 'JJ' 'MD' 'VB' 'DT' 'NN' 'IN' '.' 'VBZ' 'VBG' 'CC'
 'VBD' 'VBN' 'RB' 'TO' 'PRP' 'RBR' 'WDT' 'VBP' 'RP' 'PRP$' 'JJS' 'POS'
 '``' 'EX' "''" 'WP' ':' 'JJR' 'WRB' '$' 'NNPS' 'WP$' '-LRB-' '-RRB-'
 'PDT' 'RBS' 'FW' 'UH' 'SYM' 'LS' '#']
time: 188 ms (started: 2023-09-26 22:50:14 -07:00)


In [488]:
# Convert every split into lists of (word, tag) pairs; the test split gets tag None.
train_sentences_with_pos_tags, valid_sentences_with_pos_tags, test_sentences_with_pos_tags = (
    dataset.get_sentences_with_pos_tags()
    for dataset in (train_dataset, valid_dataset, test_dataset)
)

time: 1.62 s (started: 2023-09-26 22:50:16 -07:00)


In [489]:
# Sanity check: show the (word, tag) pairs of the first training sentence.
train_sentences_with_pos_tags[0]

[('pierre', 'NNP'),
 ('vinken', 'NNP'),
 (',', ','),
 ('61', 'CD'),
 ('years', 'NNS'),
 ('old', 'JJ'),
 (',', ','),
 ('will', 'MD'),
 ('join', 'VB'),
 ('the', 'DT'),
 ('board', 'NN'),
 ('as', 'IN'),
 ('a', 'DT'),
 ('nonexecutive', 'JJ'),
 ('director', 'NN'),
 ('nov.', 'NNP'),
 ('29', 'CD'),
 ('.', '.')]

time: 0 ns (started: 2023-09-26 22:50:18 -07:00)


# Task 2: Model Learning


In [490]:
class HMM:
    """First-order Hidden Markov Model for POS tagging.

    Priors, transitions and emissions are relative-frequency estimates taken
    from tagged sentences; probabilities that come out as exactly zero are
    replaced by a small constant.
    """

    def __init__(self, vocab_file: str, labels: List[str]):
        """Load the vocabulary and remember the tag inventory.

        Args:
            vocab_file (str): Tab-separated vocabulary file with the columns
                named in ``VocabConfig.FILE_HEADER`` and no header row.
            labels (List[str]): The distinct tags; their order defines the
                row/column order of all probability matrices.
        """
        self.vocab = self._read_vocab(vocab_file)
        self.labels = labels

        # Hidden Markov Model Parameters (allocated by _initialize_params)
        self.states = list()
        self.priors = None
        self.transitions = None
        self.emissions = None

        # Value substituted for zero probabilities. This is a floor rather than
        # additive (Laplace) smoothing: non-zero entries are left untouched.
        self.smoothing_constant = 1e-10

    def _read_vocab(self, vocab_file: str):
        """Read the vocabulary file into a DataFrame, keeping every word verbatim."""
        import csv

        return pd.read_csv(
            vocab_file,
            sep="\t",
            names=VocabConfig.FILE_HEADER,
            # Tokens such as '"' must not be interpreted as CSV quoting ...
            quoting=csv.QUOTE_NONE,
            # ... and words like "null", "nan" or "n/a" must stay strings, not NaN.
            keep_default_na=False,
            encoding="utf-8",
        )

    def _initialize_params(self):
        """Allocate zero-filled prior, transition and emission arrays."""
        self.states = list(self.labels)

        # N = Number of states i.e. number of distinct tags
        num_states = len(self.labels)
        # M = Number of observable symbols i.e. number of distinct words
        num_observations = len(self.vocab)

        # State transition probability matrix of size N * N
        self.transitions = np.zeros((num_states, num_states))

        # Observation emission probability matrix of size N * M
        self.emissions = np.zeros((num_states, num_observations))

        # Prior probability vector of size N
        self.priors = np.zeros(num_states)

    def _tag_to_index(self):
        """Map every tag to its row/column position."""
        return {tag: i for i, tag in enumerate(self.labels)}

    @staticmethod
    def _normalize_rows(counts: np.ndarray):
        """Divide each row by its sum; rows that sum to zero stay all-zero (no NaN)."""
        totals = counts.sum(axis=1, keepdims=True)
        return np.divide(counts, totals, out=np.zeros_like(counts), where=totals != 0)

    def _smoothen_propabilities(self, prob_mat: np.array, smoothing_constant: float):
        """Handle cases where the probabilities is 0"""
        return np.where(prob_mat == 0, smoothing_constant, prob_mat)

    def _compute_prior_params(self, train_data):
        """Estimate P(tag of the first word) from the sentences' first tags.

        Empty sentences are skipped and do not count towards the denominator.
        """
        tag_to_index = self._tag_to_index()
        num_sentences = 0

        for sentence in train_data:
            if not sentence:
                continue
            self.priors[tag_to_index[sentence[0][1]]] += 1
            num_sentences += 1

        if num_sentences:
            self.priors = self.priors / num_sentences
        self.priors = self._smoothen_propabilities(self.priors, self.smoothing_constant)

    def _compute_transition_params(self, train_data):
        """Estimate P(tag_i | tag_{i-1}) from consecutive tag pairs.

        Raises:
            KeyError: If a sentence contains a tag that is not in ``labels``.
        """
        tag_to_index = self._tag_to_index()

        for sentence in train_data:
            # Direct indexing on purpose: an unknown tag must fail loudly. A None
            # index would be treated by NumPy as a new axis and silently
            # increment an entire row.
            label_indices = [tag_to_index[label] for _, label in sentence]

            for prev_state, curr_state in zip(label_indices, label_indices[1:]):
                self.transitions[prev_state, curr_state] += 1

        self.transitions = self._normalize_rows(self.transitions)
        self.transitions = self._smoothen_propabilities(self.transitions, self.smoothing_constant)

    def _compute_emission_params(self, train_data):
        """Estimate P(word | tag); out-of-vocabulary words count as the unknown token."""
        word_to_index = dict(zip(self.vocab["word"], self.vocab["index"]))
        tag_to_index = self._tag_to_index()
        unknown_idx = word_to_index[VocabConfig.UNKNOWN_TOKEN]

        for sentence in train_data:
            for word, label in sentence:
                state_idx = tag_to_index[label]
                word_idx = word_to_index.get(word, unknown_idx)
                self.emissions[state_idx, word_idx] += 1

        self.emissions = self._normalize_rows(self.emissions)
        self.emissions = self._smoothen_propabilities(self.emissions, self.smoothing_constant)

    def fit(self, train_data: List[list]):
        """Estimate all parameters.

        Args:
            train_data: One list per sentence, each holding ``(word, tag)`` tuples.
        """
        self._initialize_params()
        self._compute_prior_params(train_data)
        self._compute_transition_params(train_data)
        self._compute_emission_params(train_data)

    @property
    def get_all_probability_matrices(self):
        """Tuple ``(priors, transitions, emissions)``."""
        return self.priors, self.transitions, self.emissions

    def save_model(self, file_path=None):
        """Write transition and emission probabilities to a JSON file.

        Keys have the form ``"(s1, s2)"`` for transitions and ``"(s, w)"`` for
        emissions. Priors are not written.

        Args:
            file_path: Destination file; defaults to ``HMMConfig.HMM_MODEL_SAVED``.
                Missing parent directories are created.
        """
        if file_path is None:
            file_path = HMMConfig.HMM_MODEL_SAVED

        directory = os.path.dirname(file_path)
        if directory:  # a bare file name has no directory to create
            os.makedirs(directory, exist_ok=True)

        # enumerate() supplies the matrix positions directly instead of
        # repeated list.index() scans.
        transition_prob = {
            f"({s1}, {s2})": float(self.transitions[i, j])
            for (i, s1), (j, s2) in itertools.product(enumerate(self.states), repeat=2)
        }

        words = self.vocab["word"].tolist()
        emission_prob = {
            f"({state}, {word})": float(prob)
            for state, state_row in zip(self.states, self.emissions)
            for word, prob in zip(words, state_row)
        }

        model_params = {"transition": transition_prob, "emission": emission_prob}

        with open(file_path, "w", encoding="utf-8") as json_file:
            json.dump(model_params, json_file, indent=4)

time: 0 ns (started: 2023-09-26 22:50:45 -07:00)


In [491]:
# Estimate the HMM parameters from the tagged training sentences.
model = HMM(labels=unique_pos_tags, vocab_file=VocabConfig.VOCAB_FILE)
model.fit(train_data=train_sentences_with_pos_tags)

time: 1.69 s (started: 2023-09-26 22:50:46 -07:00)


In [492]:
# p = priors, t = transitions, e = emissions (reused by the decoders below).
p, t, e = model.get_all_probability_matrices
print("Number of Transition Parameters =", t.size)
print("Number of Emission Parameters =", e.size)

Number of Transition Parameters = 2025
Number of Emission Parameters = 700560
time: 0 ns (started: 2023-09-26 22:50:48 -07:00)


In [498]:
# Persist transition and emission probabilities as JSON at the default model path.
model.save_model()

time: 9.64 s (started: 2023-09-26 22:51:04 -07:00)


# Task 3: Greedy Decoding with HMM


In [494]:
class GreedyDecoding:
    """Left-to-right greedy tagger: each word gets the locally best tag."""

    def __init__(
        self, prior_probs, transition_probs, emission_probs, states, vocab, unknown_token=None
    ):
        """Store the HMM parameters and build the lookup tables.

        Args:
            prior_probs: 1-D array of initial-state probabilities (one per state).
            transition_probs: 2-D array indexed ``[previous_state, current_state]``.
            emission_probs: 2-D array indexed ``[state, word_index]``.
            states: Sequence of tag names; position matches the array rows.
            vocab: Mapping/DataFrame with ``"word"`` and ``"index"`` columns.
            unknown_token: Token whose index is used for out-of-vocabulary words.
                Defaults to ``VocabConfig.UNKNOWN_TOKEN``.

        Raises:
            KeyError: If the unknown token is not part of ``vocab``.
        """
        self.priors = prior_probs
        self.transitions = transition_probs
        self.emissions = emission_probs
        self.states = states
        self.vocab = vocab

        self.tag_to_idx = {tag: idx for idx, tag in enumerate(states)}
        self.word_to_index = dict(zip(self.vocab["word"], self.vocab["index"]))

        # Resolve the fallback index once instead of once per decoded word.
        self.unknown_token = (
            unknown_token if unknown_token is not None else VocabConfig.UNKNOWN_TOKEN
        )
        self.unknown_idx = self.word_to_index[self.unknown_token]

        # Score of every (tag, word) pair at the first position: prior * emission.
        self.priors_emissions = prior_probs[:, np.newaxis] * emission_probs

    def _decode_single_sentence(self, sentence):
        """Tag one sentence.

        Args:
            sentence: Sequence of words.

        Returns:
            list: One tag per word; empty for an empty sentence.
        """
        predicted_tags = []
        prev_tag_idx = None

        for word in sentence:
            word_idx = self.word_to_index.get(word, self.unknown_idx)

            if prev_tag_idx is None:
                # First word: prior * emission.
                scores = self.priors_emissions[:, word_idx]
            else:
                # Later words: transition from the tag just chosen * emission.
                scores = self.transitions[prev_tag_idx] * self.emissions[:, word_idx]

            prev_tag_idx = int(np.argmax(scores))
            predicted_tags.append(self.states[prev_tag_idx])

        return predicted_tags

    def decode(self, sentences):
        """Tag several sentences.

        Args:
            sentences: Iterable of sentences, each a sequence of ``(word, tag)``
                pairs; the tag component is ignored.

        Returns:
            list: One list of predicted tags per sentence.
        """
        return [
            self._decode_single_sentence([word for word, _ in sentence])
            for sentence in sentences
        ]

time: 0 ns (started: 2023-09-26 22:50:53 -07:00)


In [495]:
def calculate_accuracy(predicted_sequences, true_sequences):
    """
    Calculate the token-level accuracy of predicted sequences against true sequences.

    Args:
        predicted_sequences (list): List of predicted tag sequences.
        true_sequences (list): List of true tag sequences, aligned with
            ``predicted_sequences``.

    Returns:
        float: Fraction of matching tags, in [0, 1] (not a percentage).
        ``0.0`` when there is nothing to compare.

    Note:
        Sequences at the same position are compared pairwise; if their lengths
        differ, the surplus tags of the longer one are ignored.
    """
    assert len(predicted_sequences) == len(true_sequences), "Lists must have the same length."

    total_count = 0
    correct_count = 0

    for true_tags, predicted_tags in zip(true_sequences, predicted_sequences):
        matches = [true_tag == predicted_tag for true_tag, predicted_tag in zip(true_tags, predicted_tags)]
        total_count += len(matches)
        correct_count += sum(matches)

    # Avoid ZeroDivisionError for empty input.
    if total_count == 0:
        return 0.0

    return correct_count / total_count

time: 0 ns (started: 2023-09-26 22:50:54 -07:00)


In [496]:
# Assuming you have the probability matrices and other data
# Build the greedy tagger from the fitted HMM parameters.
greedy_decoder = GreedyDecoding(
    prior_probs=p,
    transition_probs=t,
    emission_probs=e,
    states=model.states,
    vocab=model.vocab,
)

# Tag the development split.
predicted_dev_tags = greedy_decoder.decode(sentences=valid_sentences_with_pos_tags)

time: 1.41 s (started: 2023-09-26 22:50:55 -07:00)


In [497]:
# Token-level accuracy of the greedy tagger on the development split.
acc = calculate_accuracy(
    predicted_sequences=predicted_dev_tags,
    true_sequences=df_valid["labels"].tolist(),
)
print("Greedy Decoding Accuracy: ", round(acc, 4))

Greedy Decoding Accuracy:  0.9155
time: 47 ms (started: 2023-09-26 22:50:57 -07:00)


In [503]:
# Apply Greedy Decoding on Test data
# Tag the test split with the greedy decoder and store the predictions as JSON records.
predicted_test_tags = greedy_decoder.decode(sentences=test_sentences_with_pos_tags)

df_greedy_preds = df_test.copy(deep=True)
df_greedy_preds["labels"] = predicted_test_tags
df_greedy_preds.to_json(
    PathConfig.GREEDY_ALGO_OUTPUT_PATH,
    orient="records",
    indent=4,
)

df_greedy_preds.head()

Unnamed: 0,index,sentence,labels
0,0,"[influential, members, of, the, house, ways, a...","[JJ, NNS, IN, DT, NNP, NNS, CC, VBZ, NNP, VBD,..."
1,1,"[the, bill, ,, whose, backers, include, chairm...","[DT, NN, ,, WP$, NNS, VBP, NN, NNP, NNP, -LRB-..."
2,2,"[the, bill, intends, to, restrict, the, rtc, t...","[DT, NN, VBZ, TO, VB, DT, NNP, TO, NNP, NNS, R..."
3,3,"[``, such, agency, `, self-help, ', borrowing,...","[``, JJ, NN, ``, JJ, '', NN, VBZ, JJ, CC, JJ, ..."
4,4,"[the, complex, financing, plan, in, the, s&l, ...","[DT, JJ, NN, NN, IN, DT, NN, NN, NN, VBZ, VBG,..."


time: 859 ms (started: 2023-09-26 22:59:10 -07:00)


# Task 4: Viterbi Decoding with HMM


In [506]:
class ViterbiDecoding:
    """Viterbi tagger: finds the highest-scoring tag sequence in log space."""

    # Added to every probability before taking the logarithm so that log(0) never occurs.
    LOG_EPSILON = 1e-10

    def __init__(
        self, prior_probs, transition_probs, emission_probs, states, vocab, unknown_token=None
    ):
        """Store the HMM parameters and precompute the log-probability tables.

        Args:
            prior_probs: 1-D array of initial-state probabilities (one per state).
            transition_probs: 2-D array indexed ``[previous_state, current_state]``.
            emission_probs: 2-D array indexed ``[state, word_index]``.
            states: Sequence of tag names; position matches the array rows.
            vocab: Mapping/DataFrame with ``"word"`` and ``"index"`` columns.
            unknown_token: Token whose index is used for out-of-vocabulary words.
                Defaults to ``VocabConfig.UNKNOWN_TOKEN``.

        Raises:
            KeyError: If the unknown token is not part of ``vocab``.
        """
        self.priors = prior_probs
        self.transitions = transition_probs
        self.emissions = emission_probs
        self.states = states
        self.vocab = vocab

        self.tag_to_idx = {tag: idx for idx, tag in enumerate(states)}
        self.word_to_index = dict(zip(self.vocab["word"], self.vocab["index"]))

        # Resolve the fallback index once instead of once per decoded word.
        self.unknown_token = (
            unknown_token if unknown_token is not None else VocabConfig.UNKNOWN_TOKEN
        )
        self.unknown_idx = self.word_to_index[self.unknown_token]

        # Score of every (tag, word) pair at the first position: prior * emission.
        self.priors_emissions = prior_probs[:, np.newaxis] * emission_probs

        # The logarithms do not depend on the sentence, so they are taken once
        # here rather than for every token. They reflect the matrices as passed in.
        self._log_initial = np.log(self.priors_emissions + self.LOG_EPSILON)
        self._log_transitions = np.log(self.transitions + self.LOG_EPSILON)
        self._log_emissions = np.log(self.emissions + self.LOG_EPSILON)

    def _decode_single_sentence(self, sentence):
        """Return the most likely tag sequence for one sentence.

        Args:
            sentence: Sequence of words.

        Returns:
            list: One tag per word; empty for an empty sentence.
        """
        num_words = len(sentence)
        if num_words == 0:
            return []

        num_states = len(self.states)
        word_idx = [self.word_to_index.get(word, self.unknown_idx) for word in sentence]

        # V[t, j]: best log score of any tag sequence ending in state j at position t.
        V = np.zeros((num_words, num_states))
        # path[t, j]: state at position t-1 on that best sequence.
        path = np.zeros((num_words, num_states), dtype=int)

        V[0] = self._log_initial[:, word_idx[0]]

        for t in range(1, num_words):
            # scores[i, j] = V[t-1, i] + log P(j | i) + log P(word_t | j)
            scores = (
                V[t - 1, :, np.newaxis]
                + self._log_transitions
                + self._log_emissions[:, word_idx[t]]
            )
            V[t] = scores.max(axis=0)
            path[t] = scores.argmax(axis=0)

        # Backtrack from the best final state.
        best_states = [0] * num_words
        best_states[-1] = int(np.argmax(V[-1]))
        for t in range(num_words - 2, -1, -1):
            best_states[t] = int(path[t + 1, best_states[t + 1]])

        return [self.states[state_idx] for state_idx in best_states]

    def decode(self, sentences):
        """Tag several sentences.

        Args:
            sentences: Iterable of sentences, each a sequence of ``(word, tag)``
                pairs; the tag component is ignored.

        Returns:
            list: One list of predicted tags per sentence.
        """
        return [
            self._decode_single_sentence([word for word, _ in sentence])
            for sentence in sentences
        ]

time: 16 ms (started: 2023-09-27 00:01:18 -07:00)


In [507]:
# Assuming you have the probability matrices and other data
# Build the Viterbi decoder from the fitted HMM parameters
viterbi_decoder = ViterbiDecoding(p, t, e, model.states, model.vocab)

# Apply Viterbi Decoding on development data
predicted_dev_tags_viterbi = viterbi_decoder.decode(valid_sentences_with_pos_tags)

time: 9.03 s (started: 2023-09-27 00:01:19 -07:00)


In [508]:
# Token-level accuracy of the Viterbi tagger on the development split.
acc_v = calculate_accuracy(predicted_dev_tags_viterbi, df_valid.labels.tolist())
print("Viterbi Decoding Accuracy: ", round(acc_v, 4))

Greedy Decoding Accuracy:  0.9323
time: 32 ms (started: 2023-09-27 00:01:33 -07:00)


In [509]:
# Apply Greedy Decoding on Test data
# Apply Viterbi Decoding on Test data.
# The Viterbi decoder must be used here; calling the greedy decoder would
# write greedy predictions into the Viterbi output file.
predicted_test_tags_v = viterbi_decoder.decode(test_sentences_with_pos_tags)

df_viterbi_preds = df_test.copy(deep=True)
df_viterbi_preds["labels"] = predicted_test_tags_v

df_viterbi_preds.to_json(PathConfig.VITERBI_ALGO_OUTPUT_PATH, orient="records", indent=4)

df_viterbi_preds.head()

Unnamed: 0,index,sentence,labels
0,0,"[influential, members, of, the, house, ways, a...","[JJ, NNS, IN, DT, NNP, NNS, CC, VBZ, NNP, VBD,..."
1,1,"[the, bill, ,, whose, backers, include, chairm...","[DT, NN, ,, WP$, NNS, VBP, NN, NNP, NNP, -LRB-..."
2,2,"[the, bill, intends, to, restrict, the, rtc, t...","[DT, NN, VBZ, TO, VB, DT, NNP, TO, NNP, NNS, R..."
3,3,"[``, such, agency, `, self-help, ', borrowing,...","[``, JJ, NN, ``, JJ, '', NN, VBZ, JJ, CC, JJ, ..."
4,4,"[the, complex, financing, plan, in, the, s&l, ...","[DT, JJ, NN, NN, IN, DT, NN, NN, NN, VBZ, VBG,..."


time: 813 ms (started: 2023-09-27 00:01:37 -07:00)


# THE END
