In [None]:
import pandas as pd
import os
import numpy as np
from util import show_images, dict_train_test_split
from sklearn.impute import KNNImputer
import matplotlib.pyplot as plt
import os
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, mean_squared_error, mean_absolute_error, r2_score
%pip install memory-profiler

In [None]:
# Read the tabular features and the image stack from the data directory
df = pd.read_csv(os.path.join('data', 'tabular.csv'))
with open(os.path.join('data', 'images.npy'), 'rb') as f:
    images = np.load(f)

# Every column other than the label is treated as a feature
X_columns = list(filter(lambda name: name != 'target', df.columns))

# Keep the label on its own and bundle both input modalities together
y = df['target']
X_dict = dict(tabular=df[X_columns], images=images)

In [None]:
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
import torch.nn as nn
import torch
from sklearn.ensemble import RandomForestRegressor
from torch.utils.data import Dataset, DataLoader

%load_ext memory_profiler

class Model:
    """
    Random-forest regressor trained on PCA-compressed tabular features.

    The preprocessing pipeline (categorical encoding, mode imputation,
    outlier clipping, standardisation, PCA) is fitted once on the training
    data and then re-applied unchanged at prediction time, so training and
    prediction rows are always projected into the same feature space.

    The ``images`` entry of ``X_dict`` is accepted but not used.
    """

    def __init__(self, n_components=20, n_estimators=15, random_state=None):
        """
        Constructor for Model class.

        Parameters
        ----------
        n_components : int, default=20
            Number of principal components kept after PCA.
        n_estimators : int, default=15
            Number of trees in the random forest.
        random_state : int or None, default=None
            Seed forwarded to PCA and the random forest. ``None`` keeps the
            original non-deterministic behaviour.
        """
        # Columns of the tabular input that hold categorical values
        self.cols_cat = ["V9", "V12", "V19", "V20", "V21", "V23", "V24", "V29", "V31", "V36", "V37", "V46", "V47", "V51", "V52", "V54", "V55", "V58"]
        self.n_components = n_components
        self.random_state = random_state
        self.model = RandomForestRegressor(n_estimators=n_estimators, verbose=2, random_state=random_state)

        # State learnt during fit and reused during predict
        self._columns = None      # training column order
        self._categories = {}     # per-column category index used for encoding
        self._imputer = None
        self._num_cols = None     # non-categorical columns, in training order
        self._clip_lower = None   # per-column mean - 3 * std (training data)
        self._clip_upper = None   # per-column mean + 3 * std (training data)
        self._scaler = None
        self.pca = None

    def fit(self, X_dict, y):
        """
        Train the model using the input data.

        Parameters
        ----------
        X_dict : dictionary with the following entries:
            - tabular: pandas Dataframe of shape (n_samples, n_features)
            - images: ndarray of shape (n_samples, height, width) (unused)
            Training data.
        y : pandas Series/Dataframe or array-like of shape (n_samples,)
            Target values, row-aligned with ``X_dict['tabular']``.

        Returns
        -------
        self : object
            Returns an instance of the trained model.

        Raises
        ------
        ValueError
            If ``y`` does not have one entry per tabular row.
        """
        tabular, keep = self._fit_transform(X_dict['tabular'])

        if len(y) != len(keep):
            raise ValueError(
                "y has {0} rows but the tabular data has {1}".format(len(y), len(keep))
            )

        # Select targets by position, not by label: this stays correct
        # whatever index the caller's split left on X and y.
        y_kept = y.iloc[keep] if hasattr(y, 'iloc') else np.asarray(y)[keep]

        # train the model with random forest regressor
        self.model.fit(tabular, y_kept)

        return self

    def predict(self, X_dict):
        """
        Use the trained model to make predictions.

        Parameters
        ----------
        X_dict : dictionary with the following entries:
            - tabular: pandas Dataframe of shape (n_samples, n_features)
            - images: ndarray of shape (n_samples, height, width) (unused)
            Input data.

        Returns
        -------
        pandas Dataframe of shape (n_samples, 1)
           Predicted target values per element in X_dict.

        Raises
        ------
        AttributeError
            If the model has not been fitted yet.
        """
        tabular = self.preprocess_tabular_predict(X_dict['tabular'])

        # make predictions
        y_pred = self.model.predict(tabular)

        return pd.DataFrame(y_pred)

    def preprocess_tabular(self, X):
        """
        Fit the preprocessing pipeline on ``X`` and return the transformed
        training features with duplicate rows removed.

        The input frame is not modified and its index is preserved on the
        returned rows.

        Parameters
        ----------
        X : pandas Dataframe of shape (n_samples, n_features)
            Input data.

        Returns
        -------
        pandas Dataframe of shape (n_unique_samples, n_components)
            Preprocessed data with columns ``V1 .. V{n_components}``.
        """
        features, _ = self._fit_transform(X)
        return features

    def preprocess_tabular_predict(self, X):
        """
        Apply the already-fitted preprocessing pipeline to new data.

        Nothing is re-fitted here: encoders, imputer, clipping bounds, scaler
        and PCA all come from the preceding call to ``fit``.

        Parameters
        ----------
        X : pandas Dataframe of shape (n_samples, n_features)
            Input data.

        Returns
        -------
        pandas Dataframe of shape (n_samples, n_components)
            Preprocessed data with columns ``V1 .. V{n_components}``.

        Raises
        ------
        AttributeError
            If the model has not been fitted yet.
        """
        return self._transform(X, fit=False)

    def _fit_transform(self, X):
        """Fit the pipeline on X; return (de-duplicated features, positional keep mask)."""
        features = self._transform(X, fit=True)
        # Same rows drop_duplicates() would keep, but exposed as a mask so the
        # caller can filter the targets identically.
        keep = ~features.duplicated().to_numpy()
        return features.loc[keep], keep

    def _transform(self, X, fit):
        """Run encode -> impute -> clip -> scale -> PCA, fitting each step when ``fit`` is true."""
        if not fit and self.pca is None:
            raise AttributeError("Model is not fitted yet; call fit() before predict().")

        if fit:
            self._columns = list(X.columns)
        else:
            # Enforce the training column order so the fitted transformers line up
            X = X[self._columns]

        X = self._encode_categoricals(X, fit)

        # replace missing values with the (training) mode
        if fit:
            self._imputer = SimpleImputer(strategy='most_frequent')
            values = self._imputer.fit_transform(X)
        else:
            values = self._imputer.transform(X)
        df = pd.DataFrame(values, columns=X.columns, index=X.index)

        if fit:
            categorical = set(self.cols_cat)
            self._num_cols = [col for col in df.columns if col not in categorical]

        # cap outliers at 3 standard deviations, numerical variables only
        df = self._clip_outliers(df, fit)

        # apply feature scaling only to numerical variables
        if self._num_cols:
            if fit:
                self._scaler = StandardScaler()
                df[self._num_cols] = self._scaler.fit_transform(df[self._num_cols])
            else:
                df[self._num_cols] = self._scaler.transform(df[self._num_cols])

        # use PCA to reduce dimension to n_components
        if fit:
            self.pca = PCA(n_components=self.n_components, random_state=self.random_state)
            components = self.pca.fit_transform(df)
        else:
            components = self.pca.transform(df)

        names = [f"V{i}" for i in range(1, components.shape[1] + 1)]
        return pd.DataFrame(components, columns=names, index=X.index)

    def _encode_categoricals(self, X, fit):
        """Return a copy of X with categorical columns replaced by integer codes (-1 = missing/unseen)."""
        X = X.copy()  # never write into the caller's frame
        for col in self.cols_cat:
            if fit:
                # Sorted unique non-null values, learnt from the training data
                self._categories[col] = pd.Categorical(X[col]).categories
            X[col] = pd.Categorical(X[col], categories=self._categories[col]).codes
        return X

    def _clip_outliers(self, df, fit):
        """Clip numerical columns to training mean +/- 3 std; learns the bounds when ``fit`` is true."""
        if not self._num_cols:
            return df
        numeric = df[self._num_cols]
        if fit:
            center = numeric.mean()
            spread = numeric.std(ddof=0)
            self._clip_lower = center - 3 * spread
            self._clip_upper = center + 3 * spread
        df[self._num_cols] = numeric.clip(lower=self._clip_lower, upper=self._clip_upper, axis=1)
        return df

In [None]:
# Split train and test
X_dict_train, y_train, X_dict_test, y_test = dict_train_test_split(X_dict, y, ratio=0.9)

# Train and predict
model = Model()
%memit model.fit(X_dict_train, y_train)
%memit model.predict(X_dict_test)

# Evaluate model predition
# Learn more: https://scikit-learn.org/stable/modules/classes.html#module-sklearn.metrics
# print("MSE: {0:.2f}".format(mean_squared_error(y_test, y_pred)))