In [None]:
import pandas as pd
import os
import numpy as np
from util import show_images, dict_train_test_split
from sklearn.impute import KNNImputer
import matplotlib.pyplot as plt
import os
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, mean_squared_error, mean_absolute_error, r2_score

In [None]:
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer

# Read the tabular features from the data directory.
df = pd.read_csv(os.path.join('data', 'tabular.csv'))

# Names of the columns that are label-encoded below.
cols_cat = [
    "V9", "V12", "V19", "V20", "V21", "V23", "V24", "V29", "V31",
    "V36", "V37", "V46", "V47", "V51", "V52", "V54", "V55", "V58",
]

# Replace each listed column with integer codes. The single encoder is refit
# on every column in turn, so afterwards `le` only remembers the last column.
le = LabelEncoder()
df[cols_cat] = df[cols_cat].apply(le.fit_transform)
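# NOTE: everything below in this cell is commented out -- a disabled draft of
# the remaining preprocessing steps (imputation, scaling, outlier capping,
# PCA, de-duplication). Lines starting with `##` are the step descriptions.
## impute missing with mode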
#imputer = SimpleImputer(strategy='most_frequent')
#df = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)
#
## apply feature scaling
#scaler = StandardScaler()
#df = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)
#
## cap the outlier to 3 std
#for col in df.columns:
#    df[col] = df[col].apply(lambda x: 3 if x > 3 else x)
#    df[col] = df[col].apply(lambda x: -3 if x < -3 else x)
#
## use PCA to reduce dimension to 30
#from sklearn.decomposition import PCA
#pca = PCA(n_components=30)
#df_pca = pd.DataFrame(pca.fit_transform(df), columns=[f"V{i}" for i in range(1, 31)])
#
## drop duplicates
#df_pca = df_pca.drop_duplicates()
#df_pca


In [None]:
# print out v0

In [None]:

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
import torch.nn as nn
import torch
from torch.utils.data import Dataset, DataLoader

class CustomDataset(Dataset):
    """
    Dataset that yields aligned (tabular, image, label) triples.

    Parameters
    ----------
    tabular_data : sequence
        Per-sample tabular features; its length defines the dataset length.
    image_data : sequence
        Per-sample images, indexed in the same order as ``tabular_data``.
    labels : sequence
        Per-sample targets, indexed in the same order as ``tabular_data``.

    Raises
    ------
    ValueError
        If the three inputs do not all have the same length.
    """

    def __init__(self, tabular_data, image_data, labels):
        n_samples = len(tabular_data)
        # __len__ is driven by the tabular data alone, so a mismatch would
        # otherwise surface late as an IndexError or as silently ignored rows.
        if len(image_data) != n_samples or len(labels) != n_samples:
            raise ValueError(
                "tabular_data, image_data and labels must have the same length, "
                f"got {n_samples}, {len(image_data)} and {len(labels)}"
            )
        self.tabular_data = tabular_data
        self.image_data = image_data
        self.labels = labels

    def __len__(self):
        """Return the number of samples."""
        return len(self.tabular_data)

    def __getitem__(self, idx):
        """Return the ``(tabular, image, label)`` triple stored at ``idx``."""
        return self.tabular_data[idx], self.image_data[idx], self.labels[idx]


class Model(nn.Module):
    """
    Regression network that fuses image and tabular inputs.

    A one-layer CNN encodes each single-channel image, a one-layer MLP encodes
    the PCA-reduced tabular features, and a final MLP maps the concatenation of
    both encodings to one scalar per sample.

    The tabular preprocessing state (label encoders, imputer, scaler, PCA) is
    fitted once by ``fit`` and only re-applied by ``predict``, so training and
    prediction data are projected into the same feature space.
    """

    def __init__(self, n_components=15, epochs=10, batch_size=32, lr=0.001):
        """
        Build the network layers and store the training hyper-parameters.

        Parameters
        ----------
        n_components : int, default=15
            Number of PCA components kept from the tabular data; also the
            input width of the tabular branch.
        epochs : int, default=10
            Number of passes over the training data in ``fit``.
        batch_size : int, default=32
            Mini-batch size used by ``fit`` and ``predict``.
        lr : float, default=0.001
            Learning rate of the Adam optimizer.
        """
        super().__init__()
        self.cols_cat = ["V9", "V12", "V19", "V20", "V21", "V23", "V24", "V29", "V31", "V36", "V37", "V46", "V47", "V51", "V52", "V54", "V55", "V58"]
        self.n_components = n_components
        self.epochs = epochs
        self.batch_size = batch_size
        self.lr = lr

        # Preprocessing state; populated by preprocess_tabular() during fit().
        self.encoders = None
        self.imputer = None
        self.scaler = None
        self.pca = None

        # Image branch: padding=1 with a 3x3 kernel keeps the spatial size.
        self.cnn = nn.Sequential(
            nn.Conv2d(in_channels=1, out_channels=8, kernel_size=3, padding=1),
            nn.LeakyReLU(),
        )

        # Tabular branch.
        self.tabular = nn.Sequential(
            nn.Linear(n_components, 16),
            nn.ReLU(),
        )

        # The combined head is sized from a dummy 8x8 image, so forward()
        # only accepts images of that size.
        with torch.no_grad():
            self.cnn_output_size = self._get_cnn_output_size(torch.zeros((1, 1, 8, 8)))

        # Head applied to the concatenated image and tabular encodings.
        self.combined = nn.Sequential(
            nn.Linear(self.cnn_output_size + 16, 16),
            nn.ReLU(),
            nn.Linear(16, 1)
        )

    def _get_cnn_output_size(self, sample_input):
        """Return the per-sample length of the flattened CNN output for ``sample_input``."""
        return self.cnn(sample_input).view(sample_input.size(0), -1).shape[1]

    def _pca_columns(self):
        """Return the column names ``V1..Vn`` used for the PCA output frame."""
        return [f"V{i}" for i in range(1, self.n_components + 1)]

    @staticmethod
    def _encode_known(encoder, values):
        """
        Encode ``values`` with an already-fitted label encoder.

        Labels that the encoder did not see while fitting become NaN instead
        of raising, so that the fitted imputer can fill them in afterwards.

        Parameters
        ----------
        encoder : object
            Fitted encoder exposing ``classes_`` and ``transform``.
        values : array-like of shape (n_samples,)
            Raw labels of one column.

        Returns
        -------
        pandas Series of float
            Integer codes for known labels, NaN for unknown ones.
        """
        values = pd.Series(values)
        known = values.isin(encoder.classes_)
        codes = pd.Series(np.nan, index=values.index, dtype=float)
        if known.any():
            codes[known] = encoder.transform(values[known])
        return codes

    def forward(self, tabular, images):
        """
        Forward pass of the neural network.

        Parameters
        ----------
        tabular : torch.tensor of shape (n_samples, n_components)
            Preprocessed tabular data (float).
        images : torch.tensor of shape (n_samples, height, width)
            Images; a channel dimension is added here.

        Returns
        -------
        torch.tensor of shape (n_samples, 1)
            Predicted target values.
        """
        cnn_out = self.cnn(images.unsqueeze(1).float())
        cnn_out = cnn_out.view(images.size(0), -1)
        tabular_out = self.tabular(tabular)
        return self.combined(torch.cat([cnn_out, tabular_out], dim=1))

    def fit(self, X_dict, y):
        """
        Train the model using the input data.

        Parameters
        ----------
        X_dict : dictionary with the following entries:
            - tabular: pandas Dataframe of shape (n_samples, n_features)
            - images: ndarray of shape (n_samples, height, width)
            Training data. Neither entry is modified.
        y : array-like of shape (n_samples,)
            Target values, in the same row order as ``X_dict``.

        Returns
        -------
        self : object
            Returns an instance of the trained model.
        """
        tabular = self.preprocess_tabular(X_dict['tabular'])

        # preprocess_tabular() returns a positionally indexed frame with
        # duplicate rows removed, so its index holds the positions of the rows
        # that survived. Select images AND targets by position; label-based
        # lookup on y would break whenever y does not carry a 0..n-1 index.
        keep = tabular.index.to_numpy()
        images = self.preprocess_images(X_dict['images'])[keep]
        targets = np.asarray(y).reshape(-1)[keep]

        # convert to tensor
        tabular_t = torch.tensor(tabular.values).float()
        images_t = torch.as_tensor(images).float()
        targets_t = torch.as_tensor(targets).float()

        dataset = CustomDataset(tabular_t, images_t, targets_t)
        dataloader = DataLoader(dataset, batch_size=self.batch_size, shuffle=True)

        # Train the model
        self.train()
        criterion = nn.MSELoss()
        optimizer = torch.optim.Adam(self.parameters(), lr=self.lr)
        for epoch in range(self.epochs):
            loss_sum, n_seen = 0.0, 0
            for tabular_batch, image_batch, y_batch in dataloader:
                optimizer.zero_grad()
                # Drop the trailing size-1 dimension so prediction and target
                # are both (batch,); comparing (batch, 1) with (batch,) would
                # broadcast to (batch, batch) and optimise the wrong loss.
                output = self.forward(tabular_batch, image_batch).squeeze(1)
                loss = criterion(output, y_batch)
                loss.backward()
                torch.nn.utils.clip_grad_norm_(self.parameters(), max_norm=1.0)
                optimizer.step()
                loss_sum += loss.item() * len(y_batch)
                n_seen += len(y_batch)

            # Report the sample-weighted mean loss of the epoch.
            print(f"Epoch {epoch+1}, loss: {loss_sum / max(n_seen, 1):.4f}")

        return self

    def predict(self, X_dict):
        """
        Use the trained model to make predictions.

        Parameters
        ----------
        X_dict : dictionary with the following entries:
            - tabular: pandas Dataframe of shape (n_samples, n_features)
            - images: ndarray of shape (n_samples, height, width)
            Input data. Neither entry is modified.

        Returns
        -------
        pandas Dataframe of shape (n_samples, 1)
           Column ``y_pred`` with one prediction per element in X_dict, in the
           same row order as the input.
        """
        self.eval()
        tabular = self.preprocess_tabular_predict(X_dict['tabular'])
        images = self.preprocess_images(X_dict['images'])

        # convert to tensor
        tabular_t = torch.tensor(tabular.values).float()
        images_t = torch.as_tensor(images).float()

        # Labels are unused at prediction time; zeros only satisfy the dataset.
        dataset = CustomDataset(tabular_t, images_t, torch.zeros(tabular_t.shape[0]))

        # shuffle=False keeps predictions aligned with the input rows.
        dataloader = DataLoader(dataset, batch_size=self.batch_size, shuffle=False)

        outputs = []
        with torch.no_grad():
            for tabular_batch, image_batch, _ in dataloader:
                outputs.append(self.forward(tabular_batch, image_batch))

        y_pred = torch.cat(outputs) if outputs else torch.empty((0, 1))
        return pd.DataFrame(y_pred.detach().numpy(), columns=['y_pred'])

    def preprocess_tabular(self, X):
        """
        Fit the tabular preprocessing on training data and apply it.

        Steps: label-encode ``self.cols_cat``, impute missing values with the
        mode, standardise, cap values to [-3, 3], reduce to
        ``self.n_components`` PCA components, drop duplicate rows. The fitted
        transformers are stored on the instance for
        ``preprocess_tabular_predict``.

        Parameters
        ----------
        X : pandas Dataframe of shape (n_samples, n_features)
            Input data. Not modified.

        Returns
        -------
        pandas Dataframe of shape (n_kept, n_components)
            Preprocessed data. The index holds the row positions (within
            ``X``) of the rows that remain after duplicates are dropped.
        """
        X = X.copy()  # never write the encoded columns back into the caller's frame

        # encode categorical variables, keeping one fitted encoder per column
        self.encoders = {}
        for col in self.cols_cat:
            encoder = LabelEncoder()
            X[col] = encoder.fit_transform(X[col])
            self.encoders[col] = encoder

        # impute missing with mode
        self.imputer = SimpleImputer(strategy='most_frequent')
        df = pd.DataFrame(self.imputer.fit_transform(X), columns=X.columns)

        # apply feature scaling
        self.scaler = StandardScaler()
        df = pd.DataFrame(self.scaler.fit_transform(df), columns=df.columns)

        # cap the outlier to 3 std
        df = df.clip(lower=-3, upper=3)

        # use PCA to reduce dimension
        self.pca = PCA(n_components=self.n_components)
        df = pd.DataFrame(self.pca.fit_transform(df), columns=self._pca_columns())

        # drop duplicates
        return df.drop_duplicates()

    def preprocess_tabular_predict(self, X):
        """
        Apply the preprocessing fitted by ``preprocess_tabular`` to new data.

        Nothing is refitted here. Categorical labels that were not seen
        during fitting are treated as missing and imputed.

        Parameters
        ----------
        X : pandas Dataframe of shape (n_samples, n_features)
            Input data with the same columns as the training data. Not
            modified.

        Returns
        -------
        pandas Dataframe of shape (n_samples, n_components)
            Preprocessed data, one row per input row.

        Raises
        ------
        RuntimeError
            If the model has not been fitted yet.
        """
        if self.encoders is None or self.imputer is None or self.scaler is None or self.pca is None:
            raise RuntimeError("Model preprocessing is not fitted; call fit() before predict().")

        X = X.copy()  # never write the encoded columns back into the caller's frame

        # encode categorical variables with the encoders fitted on the training data
        for col in self.cols_cat:
            X[col] = self._encode_known(self.encoders[col], X[col])

        # impute, scale, cap and project with the fitted transformers
        df = pd.DataFrame(self.imputer.transform(X), columns=X.columns)
        df = pd.DataFrame(self.scaler.transform(df), columns=df.columns)
        df = df.clip(lower=-3, upper=3)
        return pd.DataFrame(self.pca.transform(df), columns=self._pca_columns())

    def preprocess_images(self, X):
        """
        Preprocess the images.

        Parameters
        ----------
        X : ndarray of shape (n_samples, height, width)
            Input data.

        Returns
        -------
        ndarray of shape (n_samples, height, width)
            Copy of ``X`` with NaN replaced by 0 (``np.nan_to_num`` defaults).
        """
        return np.nan_to_num(X)

In [None]:
# Read the tabular features and the image stack from the data directory.
data_dir = 'data'
df = pd.read_csv(os.path.join(data_dir, 'tabular.csv'))
images = np.load(os.path.join(data_dir, 'images.npy'))

# Every column except 'target' is used as a feature.
X_columns = list(df.columns[df.columns != 'target'])

# Bundle both input modalities; 'target' becomes y.
X_dict = dict(tabular=df[X_columns], images=images)
y = df['target']

In [None]:
# check if there are nan values in images
print(np.isnan(images).any())
# replace nan with 0
# np.nan_to_num returns a new array, so rebinding `images` alone would leave
# X_dict['images'] pointing at the uncleaned array; update both references.
images = np.nan_to_num(images)
X_dict['images'] = images
print(np.isnan(images).any())

In [None]:
# Seed the NumPy and PyTorch RNGs so that weight initialisation and batch
# shuffling repeat across runs.
# NOTE(review): whether this also fixes the split depends on how
# dict_train_test_split draws its randomness -- confirm in util.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# Split train and test
X_dict_train, y_train, X_dict_test, y_test = dict_train_test_split(X_dict, y, ratio=0.9)

# Train and predict
model = Model()
model.fit(X_dict_train, y_train)
y_pred = model.predict(X_dict_test)

# Evaluate model prediction
# Learn more: https://scikit-learn.org/stable/modules/classes.html#module-sklearn.metrics
print("MSE: {0:.2f}".format(mean_squared_error(y_test, y_pred)))

In [None]:
# Run the already-trained model on the test inputs once more.
y_pred = model.predict(X_dict_test)

# Score the predictions against the test targets.
# Learn more: https://scikit-learn.org/stable/modules/classes.html#module-sklearn.metrics
test_mse = mean_squared_error(y_test, y_pred)
print("MSE: {0:.2f}".format(test_mse))