In [1]:
import pandas as pd
import numpy as np

pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)

In [2]:
# statement, speaker_description, justification


def build_descriptive_text_vocab_ashley(input_text):
    vocab = set()
    vocab.add("<UNK>")
    for text in input_text:
        for word in text.split():
            word = remove_punctuation_ashley(word)
            if word:
                vocab.add(word)
    return {token: i for i, token in enumerate(vocab)}


def vectorize_descriptive_text_ashley(input_text, vocab):
    vectorized_text = np.zeros(len(vocab))
    for word in input_text.split():
        word = remove_punctuation_ashley(word)
        if word in vocab:
            vectorized_text[vocab[word]] += 1
        else:
            vectorized_text[vocab["<UNK>"]] += 1
    return vectorized_text


def remove_punctuation_ashley(word):
    punctuation = set([".", "(", ")", ",", ";", "?", "!", '"', ":", "'"])
    while word and word[0] in punctuation:
        word = word[1:]
    while word and word[-1] in punctuation:
        word = word[:-1]
    return word.lower()

In [3]:
train_data = pd.read_csv("data/train.csv")
V = build_descriptive_text_vocab_ashley(train_data["statement"])

test_statement = "chores chores stunt chores stunt chores ontario the"
test_vector = vectorize_descriptive_text_ashley(test_statement, V)
print(test_vector)

[0. 0. 0. ... 0. 0. 0.]


In [4]:
# one hot encoding of subjects and context


# for the subject and context columns we only need to add each row into the vocab list and see if there are any repetitions
def build_descriptive_text_vocab_nruta(input_text):
    vocab = set()
    input_text = input_text.str.lower()
    input_text.fillna("NONE", inplace=True)
    for word in input_text:
        vocab.add(word)
    vocab.add("<UNK>")
    return {token: i for i, token in enumerate(vocab)}


def vectorize_descriptive_text_nruta(input_text, vocab):
    vectorized_text = np.zeros(len(vocab))
    for word in input_text:
        if word in vocab:
            vectorized_text[vocab[word]] += 1
        else:
            vectorized_text[vocab["<UNK>"]] += 1
    return vectorized_text

In [5]:
speaker = build_descriptive_text_vocab_nruta(train_data["speaker"])

test = ["nruta", "joe biden"]

test_result = vectorize_descriptive_text_nruta(test, speaker)
test_result

array([0., 0., 0., ..., 0., 0., 0.])

In [6]:
build_descriptive_text_vocab_nruta(train_data["context"])

{'coral gables, fla': 0,
 'st. clair shores, mich': 1,
 'a broadcast of msnbc\'s "morning joe"': 2,
 'reno': 3,
 'a discussion on cbs\' "face the nation"': 4,
 'msnbc\'s the "ed show," aug. 25, 2011': 5,
 'a town hall on cnn': 6,
 'remarks to the press': 7,
 'milwaukee': 8,
 'an interview.': 9,
 'a discussion on fox news\' "hannity"': 10,
 'an interview on ny1': 11,
 'remarks at a party fundraiser': 12,
 'comments on "the rachel maddow show"': 13,
 'remarks after winning the south carolina republican presidential primary': 14,
 'debate comments': 15,
 'an interview on npr\'s "morning edition"': 16,
 'an appearance on the ingraham angle': 17,
 'a state senate budget hearing': 18,
 'a televised debate on fox 5': 19,
 'at a rally at the ohio statehouse': 20,
 'social media feeds': 21,
 'an appearance on nbc\'s "meet the press"': 22,
 'a tweet and campaign event': 23,
 'a debate on wbtv': 24,
 'a prepared speech': 25,
 'an interview on cbs’ “face the nation”': 26,
 'testimony before the st

In [7]:
def build_descriptive_text_vocab_subject_stateInfo_nakiyah(input_text):
    vocab = set()
    vocab.add("<UNK>")
    input_text = input_text.str.lower()
    input_text = input_text.fillna("<NONE>")
    input_text = input_text.astype(str)

    # Build vocabulary
    for text in input_text:
        for word in text.split(";"):
            word = word.strip()  # Remove extra spaces
            if word:
                vocab.add(word)

    return {token: i for i, token in enumerate(vocab)}


def vectorize_descriptive_text_subject_nakiyah(input_text, vocab):
    # Ensure the input is a string
    if isinstance(input_text, list):
        input_text = ";".join(input_text)  # Join list into a string
    vectorized_text = np.zeros(len(vocab))
    for word in input_text.split(";"):
        if word in vocab:
            vectorized_text[vocab[word]] += 1
        else:
            vectorized_text[vocab["<UNK>"]] += 1
    return vectorized_text


V_subject = build_descriptive_text_vocab_subject_stateInfo_nakiyah(
    train_data["subject"]
)
V_state = build_descriptive_text_vocab_subject_stateInfo_nakiyah(
    train_data["state_info"]
)

print(V_subject)
test = ["bankruptcy", "infrastructure", "well", "NakiyahDhariwala"]
check = vectorize_descriptive_text_subject_nakiyah(test, V_subject)
print(check)

{'bush administration': 0, 'gas prices': 1, 'science': 2, 'public service': 3, 'nbc': 4, 'death penalty': 5, "the 2018 california governor's race": 6, 'bankruptcy': 7, 'ask politifact': 8, 'this week - abc news': 9, 'privacy issues': 10, 'sexuality': 11, 'families': 12, 'immigration': 13, 'fake news': 14, 'urban': 15, 'florida amendments': 16, 'terrorism': 17, 'education': 18, 'nuclear': 19, 'labor': 20, 'homeless': 21, 'nevada': 22, 'deficit': 23, 'baseball': 24, 'medicaid': 25, 'census': 26, 'patriotism': 27, 'children': 28, 'party support': 29, 'financial regulation': 30, 'water': 31, 'georgia': 32, 'sports': 33, 'candidate biography': 34, 'states': 35, 'russia': 36, 'supreme court': 37, 'pennsylvania': 38, 'debates': 39, 'military': 40, 'drugs': 41, 'homeland security': 42, 'florida': 43, 'race and ethnicity': 44, 'negative campaigning': 45, 'pop culture': 46, 'health care': 47, 'state budget': 48, 'china': 49, 'welfare': 50, 'health check': 51, 'occupy wall street': 52, 'autism': 

### Converting this to a PyTorch file

In [8]:
from sklearn.preprocessing import LabelEncoder
import torch

In [None]:
def process_data(file_path):
    """
    Processes a dataset file to encode categorical variables and convert data into PyTorch tensors.

    Args:
        file_path (str): Path to the CSV file

    Returns:
        tuple: A tuple containing:
        - features_tensor (torch.Tensor) : Tensor of features
        - labels_tensor (torch.Tensor) : Tensor of labels.
        - label_encoders (dict): Dictionary of LabelEncoders for categorical columns
    """

    df = pd.read_csv(file_path)

    # identify numerical and categorical columns
    numerical_columns = [
        "id",
        "true_counts",
        "mostly_true_counts",
        "half_true_counts",
        "mostly_false_counts",
        "mostly_false_counts",
        "false_counts",
        "pants_on_fire_counts",
    ]

    categorical_columns = [
        col for col in df.columns if col not in numerical_columns + ["label"]
    ]
    # encode all categorical columns
    label_encoders = {}

    for col in categorical_columns:
        if df[col].dtype == "object":
            le = LabelEncoder()
            df[col] = le.fit_transform(df[col].astype(str))
            label_encoders[col] = le

    # separate features and labels
    features = df[numerical_columns + categorical_columns]
    labels = df["label"]

    # convert to PyTorch tensors
    features_tensor = torch.tensor(features.values, dtype=torch.float)
    labels_tensor = torch.tensor(labels.values, dtype=torch.long)

    # print label encoders for reference
    # print ("\nLabel Encoders (for decoding):")
    # for col, le in label_encoders.items():
    #   print(f"{col}: {le.classes_}")

    return features_tensor, labels_tensor, label_encoders


# Usage
train_features, train_labels, train_encoders = process_data("data/train.csv")
test_features, test_labels, test_encoders = process_data("data/test.csv")
valid_features, valid_labels, valid_encoders = process_data("data/valid.csv")

# print shapes for verification
print("Train Features Tensors Shape:", train_features.shape)
print("Train Labels Tensor Shape:", train_labels.shape)

print("Test Features Tensor Shape:", test_features.shape)
print("Test Labels Tensor Shape:", test_labels.shape)

print("Valid Features Tensor Shape:", valid_features.shape)
print("Valid Labels Tensor Shape:", valid_labels.shape)

Train Features Tensors Shape: torch.Size([18369, 16])
Train Labels Tensor Shape: torch.Size([18369])
Test Features Tensor Shape: torch.Size([2296, 16])
Test Labels Tensor Shape: torch.Size([2296])
Valid Features Tensor Shape: torch.Size([2297, 16])
Valid Labels Tensor Shape: torch.Size([2297])
