In [6]:
import pandas as pd
import torch
from sklearn.preprocessing import LabelEncoder

In [None]:
import pandas as pd
import torch
from sklearn.preprocessing import LabelEncoder

def process_data(file_path, label_encoders=None):
    """
    Load a CSV dataset, integer-encode its categorical columns and build PyTorch tensors.

    Args:
        file_path (str): Path to the CSV file.
        label_encoders (dict, optional): Already-fitted encoders keyed by column
            name (for example the dict returned for the training split). Columns
            present in this dict are encoded with the given encoder's ``classes_``
            instead of fitting a new one, so the same string gets the same integer
            code in every split. Values the encoder has never seen are encoded
            as -1. Defaults to None, which fits a fresh encoder per column.

    Returns:
        tuple: A tuple containing:
            - features_tensor (torch.Tensor): Float tensor of features, numerical
              columns first, then the remaining columns in file order.
            - labels_tensor (torch.Tensor): Long tensor built from the "label" column.
            - label_encoders (dict): Encoders used for the categorical columns
              (the supplied ones plus any newly fitted), keyed by column name.

    Raises:
        KeyError: If the file lacks "label" or one of the numerical columns.
    """
    # Load the dataset
    df = pd.read_csv(file_path)

    # Identify numerical and categorical columns
    numerical_columns = [
        "id", "true_counts", "mostly_true_counts",
        "half_true_counts", "mostly_false_counts", "false_counts", "pants_on_fire_counts"
    ]
    required_columns = numerical_columns + ["label"]
    missing = [col for col in required_columns if col not in df.columns]
    if missing:
        # Fail early with the offending names rather than an opaque indexing error.
        raise KeyError(f"{file_path}: missing required column(s): {missing}")
    categorical_columns = [col for col in df.columns if col not in required_columns]

    # Work on a copy so the caller's dict is never mutated.
    encoders = dict(label_encoders) if label_encoders else {}
    for col in categorical_columns:
        if col in encoders:
            # Reuse the fitted vocabulary; the position in classes_ is the code
            # the encoder assigned at fit time. Unseen values become -1.
            code_of = {cls: code for code, cls in enumerate(encoders[col].classes_)}
            df[col] = [code_of.get(value, -1) for value in df[col].astype(str)]
        elif not pd.api.types.is_numeric_dtype(df[col]):
            # Encode every non-numeric column (not only dtype "object") so no
            # text column can reach the tensor conversion unencoded.
            le = LabelEncoder()
            df[col] = le.fit_transform(df[col].astype(str))
            encoders[col] = le

    # Separate features and labels
    features = df[numerical_columns + categorical_columns]
    labels = df["label"]

    # Convert to PyTorch tensors
    features_tensor = torch.tensor(features.values, dtype=torch.float)
    labels_tensor = torch.tensor(labels.values, dtype=torch.long)

    return features_tensor, labels_tensor, encoders

# Usage: run the preprocessing on the three LIAR2 splits, in train/test/valid order.
# NOTE(review): every call fits its own encoders on its own file, so the integer
# code for a given string is not guaranteed to match between splits.
split_paths = ("LIAR2/train.csv", "LIAR2/test.csv", "LIAR2/valid.csv")
(
    (train_features, train_labels, train_encoders),
    (test_features, test_labels, test_encoders),
    (valid_features, valid_labels, valid_encoders),
) = [process_data(path) for path in split_paths]

# Sanity check: report the shape of each tensor that was produced.
for caption, tensor in (
    ("Train Features Tensor Shape:", train_features),
    ("Train Labels Tensor Shape:", train_labels),
    ("Test Features Tensor Shape:", test_features),
    ("Test Labels Tensor Shape:", test_labels),
    ("Valid Features Tensor Shape:", valid_features),
    ("Valid Labels Tensor Shape:", valid_labels),
):
    print(caption, tensor.shape)



Label Encoders (for decoding):
statement: ["#Business formation is still on the rise in Ohio! So far in 2011 we've assisted w/ 44,443 new business filings."
 '#GradInsurance debacle is a DIRECT result of #ObamaCare. #MIZZOU has to break the law to do the right thing.'
 '#MichaelCohen: Convicted of lying to Congress re: Russia investigation … #ElliottAbrams: Convicted of lying to Congress re: arming Contra death squads'
 ...
 '… if the Agreement goes into effect, the millions of Mexican citizens who will become eligible for Social Security benefits may have a far stronger claim to benefits and protection against cuts than U.S. citizens!'
 '\ufeffSays JoAnne Kloppenburg\'s side had a "3-to-1 money advantage" in the Wisconsin Supreme Court campaign.'
 '\ufeff\ufeff"Since Mayor Kennedy O\'Brien took office Sayreville has issued 22,081 building permits! Now O\'Brien is holding secret meetings with big developers.']
date: ['April 1, 2008' 'April 1, 2009' 'April 1, 2010' ... 'September 9, 20