In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
from torch.utils.data import Dataset, random_split


In [2]:
import torch

class CustomDataset(Dataset):
    """Map-style dataset pairing rows of a feature DataFrame with labels.

    Args:
        x: pandas DataFrame of features; rows are accessed positionally
            through ``.iloc``. Every value must be castable to float32.
        y: Labels, one per row of ``x``. May be a pandas Series (accessed
            positionally, like ``x``) or any integer-indexable sequence
            such as a NumPy array or list.
    """

    def __init__(self, x, y):
        self.x = x
        self.y = y

    def __getitem__(self, index):
        """Return the ``(features, label)`` tuple for the sample at ``index``.

        Returns:
            features: float32 tensor built from row ``index`` of ``x``.
            label: int64 (``torch.long``) tensor holding the matching label.
        """
        # Go through NumPy so the float32 cast is explicit instead of relying
        # on torch to interpret a pandas Series.
        row = self.x.iloc[index].to_numpy(dtype="float32")
        features = torch.tensor(row, dtype=torch.float32)

        # Index labels by position, exactly like the features. Plain
        # ``series[index]`` is label-based and breaks (or silently picks the
        # wrong row) when the Series does not carry a default 0..n-1 index.
        if hasattr(self.y, "iloc"):
            raw_label = self.y.iloc[index]
        else:
            raw_label = self.y[index]
        label = torch.tensor(raw_label, dtype=torch.long)

        return features, label

    def __len__(self):
        """Return the total number of samples (rows of ``x``)."""
        return len(self.x)

In [3]:
def Preprocessing(data, label_column, encoded_columns):
    """Turn a raw DataFrame into encoded labels and a numeric feature frame.

    Steps:
      1. Label-encode ``data[label_column]``.
      2. One-hot encode ``data[encoded_columns]``.
      3. Drop the label column, every ``object``-dtype column and every
         column in ``encoded_columns`` from the remaining features, then
         mean-impute and standardize what is left.
      4. Concatenate the scaled numeric block with the one-hot block.

    Args:
        data: Input pandas DataFrame. It is not modified.
        label_column: Name of the target column.
        encoded_columns: List of column names to one-hot encode. May be empty.

    Returns:
        tuple: ``(labels, data_encoded)`` where ``labels`` is the array
        produced by ``LabelEncoder`` and ``data_encoded`` is a DataFrame with
        a fresh 0..n-1 index (the index of ``data`` is not preserved).

    Note:
        All transformers are fit on the full ``data`` passed in and are not
        returned, so the identical transformation cannot be re-applied to
        another frame from the outside.
    """
    # Encode the target first.
    labels = LabelEncoder().fit_transform(data[label_column])

    n_rows = len(data)
    categorical_columns = data.select_dtypes(include=['object']).columns.tolist()

    # --- one-hot block -----------------------------------------------------
    if len(encoded_columns) > 0:
        try:
            encoder = OneHotEncoder(sparse_output=False)
        except TypeError:
            # Older scikit-learn releases only know the `sparse` keyword.
            encoder = OneHotEncoder(sparse=False)
        one_hot_encoded = encoder.fit_transform(data[encoded_columns])
        one_hot_data = pd.DataFrame(
            one_hot_encoded,
            columns=encoder.get_feature_names_out(encoded_columns),
        )
    else:
        # Nothing to encode: use an empty, row-aligned placeholder instead of
        # fitting the encoder on zero columns.
        one_hot_data = pd.DataFrame(index=pd.RangeIndex(n_rows))

    # --- numeric block -----------------------------------------------------
    # The target must not leak into the features, and a column that has been
    # one-hot encoded must not also survive in its raw form.
    excluded = set(categorical_columns) | set(encoded_columns) | {label_column}
    numeric = data.drop(columns=[col for col in data.columns if col in excluded])

    if numeric.shape[1] > 0:
        numeric_columns = numeric.columns
        # Impute before scaling so the scaler statistics are computed on
        # complete columns.
        imputed = SimpleImputer().fit_transform(numeric)
        scaled = StandardScaler().fit_transform(imputed)
        numeric = pd.DataFrame(scaled, columns=numeric_columns)
    else:
        numeric = pd.DataFrame(index=pd.RangeIndex(n_rows))

    # Both blocks carry a default RangeIndex, so the column-wise concat lines
    # the rows up one-to-one.
    data_encoded = pd.concat([numeric, one_hot_data], axis=1)

    return labels, data_encoded

In [4]:
def data_to_dataset(x, y):
    """Wrap features ``x`` and labels ``y`` in a ``CustomDataset``."""
    dataset = CustomDataset(x, y)
    return dataset
