## Import Libraries and read data

In [1]:
# Import standard libraries
import itertools
import warnings

# Import manipulation libraries
import numpy as np
import pandas as pd

# Import visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns

# ML/Preproc libraries
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler, StandardScaler, MinMaxScaler, OneHotEncoder, PowerTransformer
from sklearn.tree import DecisionTreeClassifier


In [2]:
# Ignore certain warnings when plotting
warnings.filterwarnings("ignore", "is_categorical_dtype")
warnings.filterwarnings("ignore", "use_inf_as_na")

In [3]:
# Load the three competition files: labelled training data (values +
# labels, kept in separate files) and the unlabelled test values.
train_values = pd.read_csv('train_values.csv')
train_labels = pd.read_csv('train_labels.csv')
test_values = pd.read_csv('test_values.csv')

In [4]:
# Join the training features with their labels on the shared building key
# so that everything lives in a single frame.
df = train_values.merge(train_labels, on='building_id')

## Data Cleaning

In [5]:
# Define anomaly detection visualization function
def anomaly_detection(data, columns):
    """Draw one figure per feature so anomalies can be inspected visually.

    The kind of plot is selected from the FIRST entry of ``columns``:

    * ``'geo_level_1_id'``               -> box plot of every listed column
    * ``'land_surface_condition'``       -> KDE plot of every one-hot dummy
      column generated from the listed columns
    * ``'has_superstructure_adobe_mud'`` -> count histogram of every listed
      column

    Any other first entry, or an empty ``columns``, draws nothing.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame that holds the columns to plot.
    columns : sequence of str
        Column names belonging to one feature group.

    Returns
    -------
    None
    """
    # Guard: indexing columns[0] on an empty sequence would raise IndexError.
    if len(columns) == 0:
        return

    group_key = columns[0]

    # Univariate columns
    if group_key == 'geo_level_1_id':
        for col in columns:
            plt.figure(figsize=(10, 6))
            sns.boxplot(data=data, x=col)
            plt.title(f'Feature: {col}')
    # Categorical columns: plot the dummy columns created by the encoding
    elif group_key == 'land_surface_condition':
        data_encoded = pd.get_dummies(data, columns=columns, dtype=int)
        # Only the columns that get_dummies added, not the pre-existing ones
        dummy_cols = [col for col in data_encoded.columns if col not in data.columns]
        for col in dummy_cols:
            plt.figure(figsize=(10, 6))
            # warn_singular=False: do not warn on dummies with zero variance
            sns.kdeplot(data_encoded[col], fill=True, warn_singular=False)
            plt.title(f'Feature: {col}')
    # Binary flag columns
    elif group_key == 'has_superstructure_adobe_mud':
        for col in columns:
            plt.figure(figsize=(10, 6))
            sns.histplot(data=data, x=col, stat='count')
            plt.title(f'Feature: {col}')

In [6]:
# Helper to turn a comma-separated string of column names into a list
def col_trans(x):
    """Split a comma- and/or whitespace-separated string into column names.

    Commas are treated as separators rather than simply deleted, so
    ``'a,b'`` yields ``['a', 'b']`` instead of the fused name ``'ab'``.

    Parameters
    ----------
    x : str
        Column names separated by commas and/or whitespace.

    Returns
    -------
    list of str
        The individual names; an empty list for an empty/blank string.
    """
    return x.replace(',', ' ').split()

In [None]:
# Assign every feature column to one group; each group is plotted and
# preprocessed differently further down.

# Geographic id columns
num_features = col_trans(
    'geo_level_1_id, '
    'geo_level_2_id, '
    'geo_level_3_id'
)

# String-valued categorical columns
cat_features = col_trans(
    'land_surface_condition, '
    'foundation_type, '
    'roof_type, '
    'ground_floor_type, '
    'other_floor_type, '
    'position, '
    'plan_configuration, '
    'legal_ownership_status'
)

# 0/1 flag columns
binary_features = col_trans(
    'has_superstructure_adobe_mud, '
    'has_superstructure_mud_mortar_stone, '
    'has_superstructure_stone_flag, '
    'has_superstructure_cement_mortar_stone, '
    'has_superstructure_mud_mortar_brick, '
    'has_superstructure_cement_mortar_brick, '
    'has_superstructure_timber, '
    'has_superstructure_bamboo, '
    'has_superstructure_rc_engineered, '
    'has_superstructure_other, '
    'has_secondary_use_hotel, '
    'has_secondary_use_rental, '
    'has_secondary_use_institution, '
    'has_secondary_use_school, '
    'has_secondary_use_industry, '
    'has_secondary_use_health_post, '
    'has_secondary_use_gov_office, '
    'has_secondary_use_use_police, '
    'has_secondary_use_other'
)

# Skewed numeric columns
left_skew_features = col_trans(
    'count_floors_pre_eq, '
    'age, '
    'area_percentage, '
    'height_percentage, '
    'count_families'
)

### Categorical Anomaly Detection

In [None]:
# Categorical columns anomaly detection (KDE of each one-hot dummy column)
anomaly_detection(df, cat_features)

### Univariate Anomaly Detection

In [None]:
# Box plots for the geo-id columns followed by the skewed numeric columns
anomaly_detection(data=df, columns=[*num_features, *left_skew_features])

### Binary Anomaly Detection

In [None]:
# Count histograms for the 0/1 flag columns
anomaly_detection(df, binary_features)

## Data Transformation

### Viewing distribution of Univariate data

In [10]:
# Function to view skewness of univariate data
def skew_vis(data, columns):
    """Plot a 150-bin histogram of each requested column, one figure each.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the columns to plot.
    columns : iterable of str
        Names of the columns whose distribution should be shown.
    """
    for column_name in columns:
        series = data[column_name]
        plt.figure(figsize=(10, 6))
        sns.histplot(data=series, bins=150)
        plt.title(f'Dist of Feature: {column_name}')


#### Regular data Skewness

In [None]:
# Histograms of the geo-id columns followed by the skewed numeric columns
skew_vis(df, num_features + left_skew_features)

## Model Selection

### Baseline Model

In [139]:
# Split the merged frame into the training features (X) and the target (y).
# y is kept as a one-column DataFrame.
y = df.loc[:, ['damage_grade']]
X = df.drop('damage_grade', axis=1)

In [141]:
# Building custom transformer for preprocessing
class CustomerTransformer(BaseEstimator, TransformerMixin):
    """Column-wise preprocessing step used by the baseline pipeline.

    Three column groups, taken from the module-level feature lists, are
    transformed independently and concatenated side by side:

    * ``left_skew_features`` -> Yeo-Johnson ``PowerTransformer``
    * ``num_features``       -> ``MinMaxScaler``
    * ``cat_features``       -> ``OneHotEncoder`` (dense output, unknown
      categories ignored)

    Input columns that belong to none of these groups are not part of the
    output.
    NOTE(review): that includes ``binary_features`` -- confirm they are
    meant to be left out of the model.

    Inheriting from ``BaseEstimator`` and ``TransformerMixin`` makes this a
    regular scikit-learn estimator (``get_params``/``set_params``, cloning by
    re-construction, ``fit_transform``) instead of relying on duck typing.

    NOTE(review): the class name looks like a typo for ``CustomTransformer``;
    it is kept because the pipeline below references it.
    """

    def __init__(self):
        # Copy the module-level lists so that later edits to them cannot
        # silently change the columns of an already-built transformer.
        self.minmax_features = list(num_features)
        self.ohe_features = list(cat_features)
        self.yj_columns = list(left_skew_features)
        self.minmax_scaler = MinMaxScaler()
        self.ohe_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
        self.yj_transformer = PowerTransformer(method='yeo-johnson')

    def fit(self, X, y=None):
        """Fit the three sub-transformers on their column groups of ``X``.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain every column of the three configured groups.
        y : ignored
            Present only for scikit-learn API compatibility.

        Returns
        -------
        CustomerTransformer
            ``self``, to allow chaining.
        """
        # Fit Yeo-Johnson transformer
        self.yj_transformer.fit(X[self.yj_columns])

        # Fit the MinMaxScaler on numerical columns
        self.minmax_scaler.fit(X[self.minmax_features])

        # Fit the OHE on categorical columns
        self.ohe_encoder.fit(X[self.ohe_features])

        return self

    def transform(self, X, y=None):
        """Transform ``X`` with the fitted sub-transformers.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain every column of the three configured groups.
        y : ignored
            Present only for scikit-learn API compatibility.

        Returns
        -------
        numpy.ndarray
            Horizontally stacked blocks in this order: Yeo-Johnson
            transformed columns, min-max scaled columns, one-hot columns.
        """
        # Apply Yeo-Johnson Transformation
        X_yj_scaled = self.yj_transformer.transform(X[self.yj_columns])

        # Apply MinMaxScaler Transformation
        X_minmax_scaled = self.minmax_scaler.transform(X[self.minmax_features])

        # Apply OneHotEncoder Transformation
        X_ohe_encoded = self.ohe_encoder.transform(X[self.ohe_features])

        return np.hstack((X_yj_scaled, X_minmax_scaled, X_ohe_encoded))

In [142]:
# Baseline model: custom preprocessing followed by a decision tree.
# random_state is pinned so that repeated runs (and the cross-validation
# below) build the same tree and report reproducible scores.
baseline_model_pipeline = Pipeline(steps=[
    ('preprocess', CustomerTransformer()),
    ('classifier', DecisionTreeClassifier(random_state=42)),
])

In [None]:
# Fit the preprocessing and the classifier on the full training set
baseline_model_pipeline.fit(X=X, y=y)

In [None]:
# Predict the damage grade for every row of the unlabelled test set
predictions = baseline_model_pipeline.predict(X=test_values)
print(f'Predictions: {predictions}')

### Model Evaluation

In [None]:
from sklearn.model_selection import GridSearchCV

# Parameters are addressed as '<pipeline step name>__<parameter>'.
param_grid = {
    'classifier__max_depth': [20]
}

# Score by accuracy explicitly: without `scoring`, a string passed to
# `refit` is only treated as "refit enabled" and does not select a metric.
# refit=True retrains the best configuration on all of X, y afterwards.
grid_search = GridSearchCV(
    baseline_model_pipeline,
    param_grid=param_grid,
    scoring='accuracy',
    refit=True,
    return_train_score=True,
    cv=5,
)
grid_search.fit(X, y)

In [None]:
# Report the outcome of the grid search fitted above: the parameter
# combination it selected and the score stored for it in `best_score_`.
print(f'Best Parameters: {grid_search.best_params_}')
print(f'Best CV Accuracy: {grid_search.best_score_}')