In [None]:
import pandas as pd
import os
import numpy as np
from util import show_images, dict_train_test_split
from sklearn.impute import KNNImputer
import matplotlib.pyplot as plt
import os
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, mean_squared_error, mean_absolute_error, r2_score

In [None]:
# Read the tabular CSV and the image array from the data/ directory
df = pd.read_csv(os.path.join('data', 'tabular.csv'))
with open(os.path.join('data', 'images.npy'), 'rb') as f:
    images = np.load(f)

# Every column other than 'target' is a model input
X_columns = df.columns[df.columns != 'target'].tolist()

# y is the 'target' column; X_dict bundles the tabular inputs with the images
y = df['target']
X_dict = dict(tabular=df[X_columns], images=images)

In [None]:
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
import torch.nn as nn
import torch
from sklearn.ensemble import RandomForestRegressor, HistGradientBoostingRegressor
from torch.utils.data import Dataset, DataLoader

class Model:
    """
    Regression model over the tabular features of the dataset.

    Wraps a ``HistGradientBoostingRegressor`` together with its preprocessing
    (categorical encoding, mode imputation, clipping, scaling and optional
    PCA).  Every preprocessing step is learned once on the training data and
    then re-applied unchanged at prediction time.  The ``images`` entry of
    ``X_dict`` is not used by this class.
    """

    def __init__(self):
        """
        Constructor for Model class.

        Parameters
        ----------
        self : object
            The instance of the object passed by Python.
        """
        # names of the columns that are treated as categorical
        self.cols_cat = ["V9", "V12", "V19", "V20", "V21", "V23", "V24", "V29", "V31", "V36", "V37", "V46", "V47", "V51", "V52", "V54", "V55", "V58"]
        # positions of the categorical columns; derived from the training
        # frame during fitting instead of from a notebook-level global
        self.cols_cat_index = []
        # gradient boosting regressor used for the actual predictions
        self.model = HistGradientBoostingRegressor(verbose=2, max_iter=300)

        # optional PCA step (disabled unless fit() receives n_pca_components)
        self.n_pca_components = None
        self.pca = None

        # preprocessing state learned on the training data
        self._feature_cols = None
        self._cat_cols = []
        self._num_cols = []
        self._encoder = None
        self._imputer = None
        self._scaler = None

    def fit(self, X_dict, y, n_pca_components=None, max_depth=None):
        """
        Train the model using the input data.

        Parameters
        ----------
        X_dict : dictionary with the following entries:
            - tabular: pandas Dataframe of shape (n_samples, n_features)
            - images: ndarray of shape (n_samples, height, width)
            Training data.
        y : pandas Dataframe of shape (n_samples,)
            Target values, in the same row order as ``X_dict['tabular']``.
        n_pca_components : int, optional
            When given, the preprocessed features are reduced with PCA to this
            many components before training.  ``None`` (default) disables PCA.
        max_depth : int, optional
            When given, overrides ``max_depth`` of the underlying regressor.
            ``None`` (default) leaves the regressor's setting untouched.

        Returns
        -------
        self : object
            Returns an instance of the trained model.

        Raises
        ------
        ValueError
            If ``y`` and the tabular data do not have the same number of rows.
        """
        tabular = X_dict['tabular']
        if len(tabular) != len(y):
            raise ValueError(
                "X_dict['tabular'] and y must have the same number of rows "
                "(got {0} and {1}).".format(len(tabular), len(y))
            )

        self.n_pca_components = n_pca_components
        if max_depth is not None:
            self.model.set_params(max_depth=max_depth)

        features = self._preprocess(tabular, fit=True)

        # Drop duplicated feature rows and keep the targets of the surviving
        # rows.  The selection is positional so it does not depend on the
        # index labels that X and y happen to carry.
        keep = ~features.duplicated().to_numpy()
        self.model.fit(features.loc[keep], np.asarray(y)[keep])

        return self

    def predict(self, X_dict):
        """
        Use the trained model to make predictions.

        Parameters
        ----------
        X_dict : dictionary with the following entries:
            - tabular: pandas Dataframe of shape (n_samples, n_features)
            - images: ndarray of shape (n_samples, height, width)
            Input data.

        Returns
        -------
        pandas Dataframe of shape (n_samples, 1)
           Predicted target values per element in X_dict, indexed like
           ``X_dict['tabular']``.

        Raises
        ------
        RuntimeError
            If the model has not been fitted yet.
        """
        tabular = X_dict['tabular']
        features = self.preprocess_tabular_predict(tabular)

        # make predictions
        y_pred = self.model.predict(features)

        # keep the caller's index so predictions line up with the inputs
        return pd.DataFrame(y_pred, index=tabular.index)

    def preprocess_tabular(self, X):
        """
        Learn the preprocessing from the training data and apply it.

        The input frame is not modified.

        Parameters
        ----------
        X : pandas Dataframe of shape (n_samples, n_features)
            Training data.

        Returns
        -------
        pandas Dataframe
            Preprocessed data with duplicated rows removed.  The surviving
            rows keep their original index labels.
        """
        return self._preprocess(X, fit=True).drop_duplicates()

    def preprocess_tabular_predict(self, X):
        """
        Apply the preprocessing learned during fitting to new data.

        Nothing is re-fitted here: the category codes, imputation values,
        scaling statistics and PCA projection are the ones learned from the
        training data.  The input frame is not modified.

        Parameters
        ----------
        X : pandas Dataframe of shape (n_samples, n_features)
            Input data containing the columns seen during fitting.

        Returns
        -------
        pandas Dataframe of shape (n_samples, n_features)
            Preprocessed data, indexed like ``X``.

        Raises
        ------
        RuntimeError
            If the preprocessing has not been fitted yet.
        """
        return self._preprocess(X, fit=False)

    def _preprocess(self, X, fit):
        """Run the preprocessing pipeline, learning its state when ``fit`` is true."""
        if fit:
            # work on a copy so the caller's frame is never written to
            X = X.copy()
            self._feature_cols = list(X.columns)
            self._cat_cols = [c for c in self._feature_cols if c in self.cols_cat]
            self._num_cols = [c for c in self._feature_cols if c not in self.cols_cat]
            self.cols_cat_index = [i for i, c in enumerate(self._feature_cols) if c in self.cols_cat]

            # categories first seen at prediction time are encoded as -1
            self._encoder = (
                OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
                if self._cat_cols else None
            )
            self._imputer = SimpleImputer(strategy='most_frequent')
            self._scaler = StandardScaler() if self._num_cols else None
            self.pca = (
                PCA(n_components=self.n_pca_components)
                if self.n_pca_components is not None else None
            )
        else:
            if self._feature_cols is None:
                raise RuntimeError("Model must be fitted before it can preprocess data for prediction.")
            # same columns, same order as during fitting
            X = X[self._feature_cols].copy()

        # encode categorical variables (values are compared as strings, so a
        # missing value becomes its own category, as in the original code)
        if self._encoder is not None:
            as_text = X[self._cat_cols].astype(str)
            codes = self._encoder.fit_transform(as_text) if fit else self._encoder.transform(as_text)
            for j, col in enumerate(self._cat_cols):
                X[col] = codes[:, j]

        # replace missing values with the mode learned on the training data
        filled = self._imputer.fit_transform(X) if fit else self._imputer.transform(X)
        df = pd.DataFrame(filled, columns=X.columns, index=X.index)

        if self._scaler is not None:
            # clamp numerical variables to [-3, 3], then standardise them
            # NOTE(review): the clamp is applied to raw values before scaling,
            # so it is a "3 std" cap only if the inputs are already roughly
            # standardised - confirm against the data.
            clipped = df[self._num_cols].clip(lower=-3, upper=3)
            df[self._num_cols] = self._scaler.fit_transform(clipped) if fit else self._scaler.transform(clipped)

        # optional dimensionality reduction
        if self.pca is not None:
            reduced = self.pca.fit_transform(df) if fit else self.pca.transform(df)
            df = pd.DataFrame(
                reduced,
                columns=[f"PC{i}" for i in range(1, reduced.shape[1] + 1)],
                index=df.index,
            )

        return df

In [None]:
# Read the dataset from disk again (same steps as the first loading cell)
df = pd.read_csv(os.path.join('data', 'tabular.csv'))
with open(os.path.join('data', 'images.npy'), 'rb') as f:
    images = np.load(f)

# Model inputs: all columns except the target
X_columns = list(filter(lambda name: name != 'target', df.columns))

# Inputs keyed by modality, plus the target series
X_dict = {}
X_dict['tabular'] = df[X_columns]
X_dict['images'] = images
y = df['target']

In [None]:
# Separate the data into a training part and a held-out test part
X_dict_train, y_train, X_dict_test, y_test = dict_train_test_split(X_dict, y, ratio=0.9)

# Fit a fresh model on the training part, then predict the held-out part
model = Model().fit(X_dict_train, y_train)
y_pred = model.predict(X_dict_test)

# Evaluate model prediction with the mean squared error on the held-out part
# Learn more: https://scikit-learn.org/stable/modules/classes.html#module-sklearn.metrics
print("MSE: {0:.2f}".format(mean_squared_error(y_true=y_test, y_pred=y_pred)))

In [None]:
# Print the first 10 predictions whose ground truth is below 50, followed by
# the matching ground-truth values.
# The mask is turned into a plain boolean array so rows are picked by position:
# y_pred and y_test describe the same rows in the same order, but they are not
# guaranteed to share index labels, and a label-aligned mask would then fail
# or select the wrong rows.
low_target_mask = (y_test < 50).to_numpy()

print("First 10 predictions with ground truth less than 50:")
print(y_pred[low_target_mask].head(10))
print("Ground truth:")
print(y_test[low_target_mask].head(10))

In [None]:
import time

################################
## Benchmarking PCA dimension ##
################################

# Function to benchmark model
def benchmark_model(X_dict, y, pca_dimensions):
    """
    Train and score one model per PCA dimension on a single shared split.

    Parameters
    ----------
    X_dict : dictionary
        Input data, forwarded to ``dict_train_test_split``.
    y : target values
        Forwarded to ``dict_train_test_split``.
    pca_dimensions : iterable of int
        Values passed to ``Model.fit`` as ``n_pca_components``, one run each.

    Returns
    -------
    tuple of (list, list)
        The MSE of every run and the wall-clock seconds every run took
        (fit, predict and scoring together), in the order of
        ``pca_dimensions``.
    """
    # One split, reused by every run so the scores are comparable
    X_dict_train, y_train, X_dict_test, y_test = dict_train_test_split(X_dict, y, ratio=0.1)

    mse_scores, times = [], []
    for dim in pca_dimensions:
        tic = time.time()

        # Fresh model per setting
        candidate = Model()
        candidate.fit(X_dict_train, y_train, n_pca_components=dim)
        predictions = candidate.predict(X_dict_test)

        # Score on the held-out part and record the elapsed time
        mse = mean_squared_error(y_test, predictions)
        mse_scores.append(mse)
        times.append(time.time() - tic)
        print(f"MSE with PCA dimension {dim}: {mse:.2f}")

    return mse_scores, times

# Benchmark PCA dimensions 10, 15, ..., 30
pca_dimensions = list(range(10, 31, 5))
mse_scores, times = benchmark_model(X_dict, y, pca_dimensions)

# (Plotting code omitted here; `mse_scores` and `times` hold the benchmark results.)


In [None]:
import time

############################
## Benchmarking max depth ##
############################

# Function to benchmark model
def benchmark_model(X_dict, y, pca_dimensions):
    """
    Train and score one model per maximum tree depth on a single shared split.

    Parameters
    ----------
    X_dict : dictionary
        Input data, forwarded to ``dict_train_test_split``.
    y : target values
        Forwarded to ``dict_train_test_split``.
    pca_dimensions : iterable of int
        The maximum depths to benchmark, one run each.  The parameter name is
        kept unchanged for existing callers; in this cell it carries depth
        values, not PCA dimensions.

    Returns
    -------
    tuple of (list, list)
        The MSE of every run and the wall-clock seconds every run took
        (fit, predict and scoring together), in the order of the given depths.
    """
    # Use the values that were passed in rather than a notebook-level global
    max_depth_values = pca_dimensions

    mse_scores = []
    times = []

    # Split train and test only once
    # NOTE(review): ratio=0.1 here vs ratio=0.9 in the main evaluation cell -
    # confirm which fraction dict_train_test_split assigns to training.
    X_dict_train, y_train, X_dict_test, y_test = dict_train_test_split(X_dict, y, ratio=0.1)

    for dep in max_depth_values:
        start_time = time.time()

        # Train and predict using the same train and test data.
        # The depth is set on the wrapped estimator so that fit() can be
        # called with just (X_dict, y).
        model = Model()
        model.model.set_params(max_depth=dep)
        model.fit(X_dict_train, y_train)
        y_pred = model.predict(X_dict_test)

        # Evaluate model prediction
        mse = mean_squared_error(y_test, y_pred)
        mse_scores.append(mse)
        times.append(time.time() - start_time)
        print(f"MSE with Max Depth {dep}: {mse:.2f}")

    return mse_scores, times

# Benchmark maximum depths 10, 30, ..., 170
max_depths = list(range(10, 171, 20))
mse_scores, times = benchmark_model(X_dict, y, max_depths)

# The results are plotted in the next cell.


In [None]:
import matplotlib.pyplot as plt

# Two panels side by side: error on the left, elapsed time on the right
fig, (ax_mse, ax_time) = plt.subplots(1, 2, figsize=(12, 5))

ax_mse.plot(max_depths, mse_scores, marker='o')
ax_mse.set_title("MSE vs Max Depths")
ax_mse.set_xlabel("Max Depths")
ax_mse.set_ylabel("Mean Squared Error")

ax_time.plot(max_depths, times, marker='o', color='orange')
ax_time.set_title("Training Time vs Max Depths")
ax_time.set_xlabel("Max Depths")
ax_time.set_ylabel("Time (seconds)")

plt.tight_layout()
plt.show()