In [13]:
import pandas as pd
import numpy as np

pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)

In [14]:
# statement, speaker_description, justification


def build_descriptive_text_vocab_ashley(input_text):
    vocab = set()
    vocab.add("<UNK>")
    for text in input_text:
        for word in text.split():
            word = remove_punctuation_ashley(word)
            if word:
                vocab.add(word)
    return {token: i for i, token in enumerate(vocab)}


def vectorize_descriptive_text_ashley(input_text, vocab):
    vectorized_text = np.zeros(len(vocab))
    for word in input_text.split():
        word = remove_punctuation_ashley(word)
        if word in vocab:
            vectorized_text[vocab[word]] += 1
        else:
            vectorized_text[vocab["<UNK>"]] += 1
    return vectorized_text


def remove_punctuation_ashley(word):
    punctuation = set([".", "(", ")", ",", ";", "?", "!", '"', ":", "'"])
    while word and word[0] in punctuation:
        word = word[1:]
    while word and word[-1] in punctuation:
        word = word[:-1]
    return word.lower()

In [15]:
# one hot encoding of subjects and context


# for the subject and context columns we only need to add each row into the vocab list and see if there are any repetitions
def build_descriptive_text_vocab_nruta(input_text):
    vocab = set()
    input_text = input_text.str.lower()
    for word in input_text:
        vocab.add(word)
    vocab.add("<UNK>")
    return {token: i for i, token in enumerate(vocab)}


def vectorize_descriptive_text_nruta(input_text, vocab):
    vectorized_text = np.zeros(len(vocab))
    for word in input_text:
        if word in vocab:
            vectorized_text[vocab[word]] += 1
        else:
            vectorized_text[vocab["<UNK>"]] += 1
    return vectorized_text

In [16]:
def build_descriptive_text_vocab_subject_stateInfo_nakiyah(input_text):
    vocab = set()
    vocab.add("<UNK>")
    input_text = input_text.str.lower()
    input_text = input_text.astype(str)

    # Build vocabulary
    for text in input_text:
        for word in text.split(";"):
            word = word.strip()  # Remove extra spaces
            if word:
                vocab.add(word)

    return {token: i for i, token in enumerate(vocab)}


def vectorize_descriptive_text_subject_nakiyah(input_text, vocab):
    # Ensure the input is a string
    if isinstance(input_text, list):
        input_text = ";".join(input_text)  # Join list into a string
    vectorized_text = np.zeros(len(vocab))
    for word in input_text.split(";"):
        if word in vocab:
            vectorized_text[vocab[word]] += 1
        else:
            vectorized_text[vocab["<UNK>"]] += 1
    return vectorized_text

### Converting this to a PyTorch file

In [18]:
import torch
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as transform

In [None]:
def process_data(df):
    """
    Processes a dataset file to encode categorical variables and convert data into PyTorch tensors.

    Args:
        file_path (str): Path to the CSV file

    Returns:
        tuple: A tuple containing:
        - features_tensor (torch.Tensor) : Tensor of features
        - labels_tensor (torch.Tensor) : Tensor of labels.
        - label_encoders (dict): Dictionary of LabelEncoders for categorical columns
    """
    # drop useless data
    dropped_columns = ["id", "date"]
    df = df.drop(dropped_columns, axis=1)
    
    for col in ["statement", "justification", "speaker_description"]:
        df[col] = df[col].fillna("None").astype(str)
        V = build_descriptive_text_vocab_ashley(df[col])
        df[col] = df[col].apply(lambda x: vectorize_descriptive_text_ashley(x, V))
        
    for col in ["subject", "state_info"]:
        df[col] = df[col].fillna("None").astype(str)
        V = build_descriptive_text_vocab_subject_stateInfo_nakiyah(df[col])
        df[col] = df[col].apply(lambda x: vectorize_descriptive_text_subject_nakiyah(x, V))
        
    for col in ["speaker", "context"]:
        df[col] = df[col].fillna("None").astype(str)
        V = build_descriptive_text_vocab_nruta(df[col])
        df[col] = df[col].apply(lambda x: vectorize_descriptive_text_nruta(x, V))

    return df

In [25]:
class SentimentDataset(Dataset):
    def __init__(self, path, transform=None):
        self.sentiment = pd.read_csv(path)
        self.sentiment = process_data(self.sentiment)
        self.transform = transform
        
        
    def __len__(self):
        return len(self.sentiment)
    
    def __getitem__(self, idx):
        data = self.sentiment.iloc[idx]
        label = data["label"]
        data = data.drop("label")
        
        max_length = 0
        for col in data.index:
            value = data[col]
            if isinstance(value, (np.ndarray, list)):
                max_length = max(max_length, len(value))
        
        feature_matrix = []
        for col in data.index:
            value = data[col]
            
            # Handle numeric columns or vectorized text
            if isinstance(value, (np.ndarray, list)):
                if len(value) != max_length:
                    value = np.concatenate([np.array(value), np.zeros(max_length - len(value))])
            else:
                value = np.concatenate([np.array([value]), np.zeros(max_length - 1)])
            feature_matrix.append(np.array(value))
        feature_matrix = np.array(feature_matrix).T
        print(feature_matrix.shape)

        if self.transform:
            feature_matrix = self.transform(feature_matrix)
            
        return torch.tensor(feature_matrix, dtype=torch.float32), torch.tensor(label, dtype=torch.float32)
        

In [26]:
t = transform.Compose([transform.ToTensor()])
train_dataset = SentimentDataset(path="data/train.csv")

dataloader = DataLoader(train_dataset, batch_size=2, shuffle=True)
iterator = iter(dataloader)
data, label = next(iterator)

print(data, label)

(45438, 13)
(45438, 13)
tensor([[[  0.,   0.,   0.,  ...,   1.,   0.,   0.],
         [  0.,   0.,   0.,  ...,   0.,   0.,   0.],
         [  0.,   0.,   0.,  ...,   0.,   0.,   0.],
         ...,
         [  0.,   0.,   0.,  ...,   0.,   0.,   0.],
         [  0.,   0.,   0.,  ...,   0.,   0.,   0.],
         [  0.,   0.,   0.,  ...,   0.,   0.,   0.]],

        [[  0.,   0.,   0.,  ..., 570.,   0.,   0.],
         [  0.,   0.,   0.,  ...,   0.,   0.,   0.],
         [  0.,   0.,   0.,  ...,   0.,   0.,   0.],
         ...,
         [  0.,   0.,   0.,  ...,   0.,   0.,   0.],
         [  0.,   0.,   0.,  ...,   0.,   0.,   0.],
         [  0.,   0.,   0.,  ...,   0.,   0.,   0.]]]) tensor([4., 1.])
