In [10]:
import pandas as pd
import numpy as np

pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)

In [20]:
# statement, speaker_description, justification


def build_descriptive_text_vocab_ashley(input_text):
    vocab = set()
    vocab.add("<UNK>")
    for text in input_text:
        for word in text.split():
            word = remove_punctuation_ashley(word)
            if word:
                vocab.add(word)
    return {token: i for i, token in enumerate(vocab)}


def vectorize_descriptive_text_ashley(input_text, vocab):
    vectorized_text = np.zeros(len(vocab))
    for word in input_text.split():
        word = remove_punctuation_ashley(word)
        if word in vocab:
            vectorized_text[vocab[word]] += 1
        else:
            vectorized_text[vocab["<UNK>"]] += 1
    return vectorized_text


def remove_punctuation_ashley(word):
    punctuation = set([".", "(", ")", ",", ";", "?", "!", '"', ":", "'"])
    while word and word[0] in punctuation:
        word = word[1:]
    while word and word[-1] in punctuation:
        word = word[:-1]
    return word.lower()

In [21]:
train_data = pd.read_csv("data/train.csv")
V = build_descriptive_text_vocab_ashley(train_data["statement"])

test_statement = "chores chores stunt chores stunt chores ontario the"
test_vector = vectorize_descriptive_text_ashley(test_statement, V)
print(test_vector)

[0. 0. 0. ... 0. 0. 0.]


In [22]:
# one hot encoding of subjects and context


# for the subject and context columns we only need to add each row into the vocab list and see if there are any repetitions
def build_descriptive_text_vocab_nruta(input_text):
    vocab = set()
    input_text = input_text.str.lower()
    for word in input_text:
        vocab.add(word)
    vocab.add("<UNK>")
    return {token: i for i, token in enumerate(vocab)}


def vectorize_descriptive_text_nruta(input_text, vocab):
    vectorized_text = np.zeros(len(vocab))
    for word in input_text:
        if word in vocab:
            vectorized_text[vocab[word]] += 1
        else:
            vectorized_text[vocab["<UNK>"]] += 1
    return vectorized_text

In [6]:
speaker = build_descriptive_text_vocab_nruta(train_data["speaker"])

test = ["nruta", "joe biden"]

test_result = vectorize_descriptive_text_nruta(test, speaker)
test_result

array([0., 0., 0., ..., 0., 0., 0.])

In [7]:
build_descriptive_text_vocab_nruta(train_data["context"])

{'south korea': 0,
 'interview on "face the nation."': 1,
 'a published piece on their web site': 2,
 'new york': 3,
 'print': 4,
 'a facebook live': 5,
 'orlando at presidency 5': 6,
 'his energy plan': 7,
 'remarks to a conference': 8,
 'a blog post on campaign website': 9,
 "remarks on fox's hannity": 10,
 'interview on meet the press with chuck todd': 11,
 'iowa on fox news': 12,
 'an appearance on wpro': 13,
 'a segment on "fox news sunday"': 14,
 "an interview with 'the economist'": 15,
 'a segment': 16,
 'a redstate.com column': 17,
 'a hillary for america ad': 18,
 'the new york sun': 19,
 'cape girardeau, miss': 20,
 'a news interview': 21,
 'an interview for npr\'s "all things considered"': 22,
 'a speech at the florida association of counties conference': 23,
 'an interview on msnbc\'s "jansing & co."': 24,
 'a protest sign': 25,
 'his state of the state speech': 26,
 "an interview on nbc's today show": 27,
 'the daily signal': 28,
 'a press conference with congressional lea

In [26]:
def build_descriptive_text_vocab_subject_stateInfo_nakiyah(input_text):
    vocab = set()
    vocab.add("<UNK>")
    input_text = input_text.str.lower()
    input_text = input_text.astype(str)

    # Build vocabulary
    for text in input_text:
        for word in text.split(";"):
            word = word.strip()  # Remove extra spaces
            if word:
                vocab.add(word)

    return {token: i for i, token in enumerate(vocab)}


def vectorize_descriptive_text_subject_nakiyah(input_text, vocab):
    # Ensure the input is a string
    if isinstance(input_text, list):
        input_text = ";".join(input_text)  # Join list into a string
    vectorized_text = np.zeros(len(vocab))
    for word in input_text.split(";"):
        if word in vocab:
            vectorized_text[vocab[word]] += 1
        else:
            vectorized_text[vocab["<UNK>"]] += 1
    return vectorized_text


V_subject = build_descriptive_text_vocab_subject_stateInfo_nakiyah(
    train_data["subject"]
)
V_state = build_descriptive_text_vocab_subject_stateInfo_nakiyah(
    train_data["state_info"]
)

print(V_subject)
test = ["bankruptcy", "infrastructure", "well", "NakiyahDhariwala"]
check = vectorize_descriptive_text_subject_nakiyah(test, V_subject)
print(check)

{'education': 0, 'supreme court': 1, 'new york': 2, 'government regulation': 3, 'message machine 2010': 4, 'marriage': 5, 'vermont': 6, 'economy': 7, 'congress': 8, 'regulation': 9, 'obama birth certificate': 10, 'drugs': 11, 'alcohol': 12, 'climate change': 13, 'this week - abc news': 14, 'occupy wall street': 15, 'negative campaigning': 16, 'fires': 17, 'autism': 18, 'welfare': 19, 'voter id laws': 20, 'state budget': 21, 'islam': 22, 'county government': 23, 'recreation': 24, 'corporations': 25, 'kagan nomination': 26, 'small business': 27, 'global news service': 28, 'trade': 29, 'stimulus': 30, 'sports': 31, 'oregon': 32, 'workers': 33, 'nuclear': 34, 'pop culture': 35, 'polls and public opinion': 36, 'florida': 37, 'foreign policy': 38, 'israel': 39, 'census': 40, 'disability': 41, 'agriculture': 42, 'county budget': 43, 'children': 44, 'medicaid': 45, 'jobs': 46, 'nbc': 47, "politifact's top promises": 48, 'social security': 49, 'russia': 50, 'transparency': 51, 'unions': 52, 'im

### Converting this to a PyTorch file

In [31]:
import torch
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as transform

In [28]:
def process_data(df):
    """
    Processes a dataset file to encode categorical variables and convert data into PyTorch tensors.

    Args:
        file_path (str): Path to the CSV file

    Returns:
        tuple: A tuple containing:
        - features_tensor (torch.Tensor) : Tensor of features
        - labels_tensor (torch.Tensor) : Tensor of labels.
        - label_encoders (dict): Dictionary of LabelEncoders for categorical columns
    """
    # drop useless data
    dropped_columns = ["id", "date"]
    df = df.drop(dropped_columns, axis=1)
    
    for col in ["statement", "justification", "speaker_description"]:
        df[col] = df[col].fillna("None").astype(str)
        V = build_descriptive_text_vocab_ashley(df[col])
        df[col] = df[col].apply(lambda x: vectorize_descriptive_text_ashley(x, V))
        
    for col in ["subject", "state_info"]:
        df[col] = df[col].fillna("None").astype(str)
        V = build_descriptive_text_vocab_subject_stateInfo_nakiyah(df[col])
        df[col] = df[col].apply(lambda x: vectorize_descriptive_text_subject_nakiyah(x, V))
        
    for col in ["speaker", "context"]:
        df[col] = df[col].fillna("None").astype(str)
        V = build_descriptive_text_vocab_nruta(df[col])
        df[col] = df[col].apply(lambda x: vectorize_descriptive_text_nruta(x, V))

    return df


# test
# df = pd.read_csv("data/train.csv")
# df = process_data(df)
# print(df.iloc[:10])

In [37]:
class SentimentDataset(Dataset):
    def __init__(self, path, transform=None):
        self.sentiment = pd.read_csv(path)
        self.sentiment = process_data(self.sentiment)
        self.transform = transform
        
        
    def __len__(self):
        return len(self.sentiment)
    
    def __getitem__(self, idx):
        data = self.sentiment.iloc[idx]
        label = data["label"]
        data = data.drop("label")
        
        feature_vectors = []
        for col in data.index:
            value = data[col]
            # Handle numeric columns or vectorized text
            if isinstance(value, (np.ndarray, list)):
                feature_vectors.append(np.array(value))
            else:
                feature_vectors.append(np.array([value], dtype=np.float32))

        # Combine all features into a single vector
        final_vector = np.concatenate(feature_vectors)

        if self.transform:
            final_vector = self.transform(final_vector)
            
        return torch.tensor(final_vector, dtype=torch.float32), torch.tensor(label, dtype=torch.float32)
        

In [38]:
t = transform.Compose([transform.ToTensor()])
train_dataset = SentimentDataset(path="data/train.csv")

dataloader = DataLoader(train_dataset, batch_size=2, shuffle=True)
iterator = iter(dataloader)
data, label = next(iterator)

print(data, label)

MemoryError: Unable to allocate 355. KiB for an array with shape (45438,) and data type float64