In [None]:
import pandas as pd
import numpy as np

pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)

In [None]:
# statement, speaker_description, justification


def build_descriptive_text_vocab_ashley(input_text):
    vocab = set()
    vocab.add("<UNK>")
    for text in input_text:
        for word in text.split():
            word = remove_punctuation_ashley(word)
            if word:
                vocab.add(word)
    return {token: i for i, token in enumerate(vocab)}


def vectorize_descriptive_text_ashley(input_text, vocab):
    vectorized_text = np.zeros(len(vocab))
    for word in input_text.split():
        word = remove_punctuation_ashley(word)
        if word in vocab:
            vectorized_text[vocab[word]] += 1
        else:
            vectorized_text[vocab["<UNK>"]] += 1
    return vectorized_text


def remove_punctuation_ashley(word):
    punctuation = set([".", "(", ")", ",", ";", "?", "!", '"', ":", "'"])
    while word and word[0] in punctuation:
        word = word[1:]
    while word and word[-1] in punctuation:
        word = word[:-1]
    return word.lower()

In [None]:
train_data = pd.read_csv("data/train.csv")
V = build_descriptive_text_vocab_ashley(train_data["statement"])

test_statement = "chores chores stunt chores stunt chores ontario the"
test_vector = vectorize_descriptive_text_ashley(test_statement, V)
print(test_vector)

[0. 0. 0. ... 0. 0. 0.]


In [None]:
# one hot encoding of subjects and context


# for the subject and context columns we only need to add each row into the vocab list and see if there are any repetitions
def build_descriptive_text_vocab_nruta(input_text):
    vocab = set()
    input_text = input_text.str.lower()
    input_text.fillna("NONE", inplace=True)
    for word in input_text:
        vocab.add(word)
    vocab.add("<UNK>")
    return {token: i for i, token in enumerate(vocab)}


def vectorize_descriptive_text_nruta(input_text, vocab):
    vectorized_text = np.zeros(len(vocab))
    for word in input_text:
        if word in vocab:
            vectorized_text[vocab[word]] += 1
        else:
            vectorized_text[vocab["<UNK>"]] += 1
    return vectorized_text

In [None]:
speaker = build_descriptive_text_vocab_nruta(train_data["speaker"])

test = ["nruta", "joe biden"]

test_result = vectorize_descriptive_text_nruta(test, speaker)
test_result

array([0., 0., 0., ..., 0., 0., 0.])

In [6]:
build_descriptive_text_vocab_nruta(train_data["context"])

{'a stump speech': 0,
 'a letter posted on a blog': 1,
 'a direct mail letter': 2,
 'an interview on fox news\' "the neil cavuto show," about u.s. sen. john kerry\'s boat': 3,
 'the south democratic presidential forum': 4,
 'an interview with the incline': 5,
 'comments during the south carolina democratic presidential debate': 6,
 "during sen. marco rubio's speech to seminole county gop": 7,
 'comments announcing florida would reject federal dollars for a tampa-orlando high-speed rail line': 8,
 'an interview on cnn\'s "state of the union with candy crowley"': 9,
 'a letter to the president': 10,
 'a usa today column': 11,
 'his speech at the republican national convention': 12,
 'a speech at a trump rally': 13,
 'the financial times': 14,
 'georgia': 15,
 'house judiciary committee hearing on his bill allowing concealed carry on college campuses': 16,
 'her campaign web site': 17,
 'an interview with sarah palin': 18,
 'video': 19,
 'an immigration speech': 20,
 '"u.s. news & world r

In [None]:
def build_descriptive_text_vocab_subject_stateInfo_nakiyah(input_text):
    vocab = set()
    vocab.add("<UNK>")
    input_text = input_text.str.lower()
    input_text = input_text.fillna("<NONE>")
    input_text = input_text.astype(str)

    # Build vocabulary
    for text in input_text:
        for word in text.split(";"):
            word = word.strip()  # Remove extra spaces
            if word:
                vocab.add(word)

    return {token: i for i, token in enumerate(vocab)}


def vectorize_descriptive_text_subject_nakiyah(input_text, vocab):
    # Ensure the input is a string
    if isinstance(input_text, list):
        input_text = ";".join(input_text)  # Join list into a string
    vectorized_text = np.zeros(len(vocab))
    for word in input_text.split(";"):
        if word in vocab:
            vectorized_text[vocab[word]] += 1
        else:
            vectorized_text[vocab["<UNK>"]] += 1
    return vectorized_text


V_subject = build_descriptive_text_vocab_subject_stateInfo_nakiyah(
    train_data["subject"]
)
V_state = build_descriptive_text_vocab_subject_stateInfo_nakiyah(
    train_data["state_info"]
)

print(V_subject)
test = ["bankruptcy", "infrastructure", "well", "NakiyahDhariwala"]
check = vectorize_descriptive_text_subject_nakiyah(test, V_subject)
print(check)

{'food': 0, 'criminal justice': 1, 'homeland security': 2, 'new jersey': 3, 'katrina': 4, 'facebook fact-checks': 5, 'medicaid': 6, 'campaign finance': 7, 'children': 8, 'nuclear': 9, 'georgia': 10, 'ukraine': 11, 'tourism': 12, 'military': 13, 'homeless': 14, 'good enough to be true': 15, 'pensions': 16, 'gambling': 17, 'labor': 18, 'oregon': 19, 'iran': 20, 'bush administration': 21, 'foreign policy': 22, 'corrections and updates': 23, 'race and ethnicity': 24, 'taxes': 25, 'science': 26, 'ethics': 27, 'arizona': 28, 'ask politifact': 29, 'new hampshire 2012': 30, 'death penalty': 31, 'new york': 32, 'supreme court': 33, 'global news service': 34, 'nevada': 35, 'message machine 2010': 36, 'ad watch': 37, 'guns': 38, 'climate change': 39, 'kagan nomination': 40, 'civil rights': 41, 'wealth': 42, 'water': 43, 'north carolina': 44, 'congress': 45, 'census': 46, 'public safety': 47, 'debt': 48, 'gas prices': 49, 'elections': 50, 'jan. 6': 51, 'sotomayor nomination': 52, 'virginia': 53, '