## Import Libraries and read data

In [36]:
# Import manipulation libraries
import pandas as pd
import numpy as np

# Import visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns

# Ignore Warning library
import warnings

# Preprocessing libraries
from sklearn.preprocessing import RobustScaler, StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline 
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Ignore certain warnings when plotting
warnings.filterwarnings("ignore", "is_categorical_dtype")
warnings.filterwarnings("ignore", "use_inf_as_na")

In [3]:
# Import data from csv
test_values = pd.read_csv('test_values.csv')
train_labels = pd.read_csv('train_labels.csv')
train_values = pd.read_csv('train_values.csv')

In [4]:
# View test data
test_values.head()

Unnamed: 0,building_id,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,land_surface_condition,foundation_type,...,has_secondary_use_agriculture,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other
0,300051,17,596,11307,3,20,7,6,t,r,...,0,0,0,0,0,0,0,0,0,0
1,99355,6,141,11987,2,25,13,5,t,r,...,1,0,0,0,0,0,0,0,0,0
2,890251,22,19,10044,2,5,4,5,t,r,...,0,0,0,0,0,0,0,0,0,0
3,745817,26,39,633,1,0,19,3,t,r,...,0,0,1,0,0,0,0,0,0,0
4,421793,17,289,7970,3,15,8,7,t,r,...,0,0,0,0,0,0,0,0,0,0


In [5]:
# View training labels
train_labels.head()

Unnamed: 0,building_id,damage_grade
0,802906,3
1,28830,2
2,94947,3
3,590882,2
4,201944,3


In [6]:
# View training values
train_values.head()

Unnamed: 0,building_id,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,land_surface_condition,foundation_type,...,has_secondary_use_agriculture,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other
0,802906,6,487,12198,2,30,6,5,t,r,...,0,0,0,0,0,0,0,0,0,0
1,28830,8,900,2812,2,10,8,7,o,r,...,0,0,0,0,0,0,0,0,0,0
2,94947,21,363,8973,2,10,5,5,t,r,...,0,0,0,0,0,0,0,0,0,0
3,590882,22,418,10694,2,10,6,5,t,r,...,0,0,0,0,0,0,0,0,0,0
4,201944,11,131,1488,3,30,8,9,t,r,...,0,0,0,0,0,0,0,0,0,0


In [7]:
# Merge training datasets to have 1 df
df = pd.merge(train_values, train_labels, on='building_id')

## Data Cleaning

In [8]:
df.head()

Unnamed: 0,building_id,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,land_surface_condition,foundation_type,...,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other,damage_grade
0,802906,6,487,12198,2,30,6,5,t,r,...,0,0,0,0,0,0,0,0,0,3
1,28830,8,900,2812,2,10,8,7,o,r,...,0,0,0,0,0,0,0,0,0,2
2,94947,21,363,8973,2,10,5,5,t,r,...,0,0,0,0,0,0,0,0,0,3
3,590882,22,418,10694,2,10,6,5,t,r,...,0,0,0,0,0,0,0,0,0,2
4,201944,11,131,1488,3,30,8,9,t,r,...,0,0,0,0,0,0,0,0,0,3


In [9]:
# Check for amount of duplicated data
df.duplicated().sum()

0

In [10]:
# Check the amount of null values in each columns
df.isnull().sum() / len(df) * 100

building_id                               0.0
geo_level_1_id                            0.0
geo_level_2_id                            0.0
geo_level_3_id                            0.0
count_floors_pre_eq                       0.0
age                                       0.0
area_percentage                           0.0
height_percentage                         0.0
land_surface_condition                    0.0
foundation_type                           0.0
roof_type                                 0.0
ground_floor_type                         0.0
other_floor_type                          0.0
position                                  0.0
plan_configuration                        0.0
has_superstructure_adobe_mud              0.0
has_superstructure_mud_mortar_stone       0.0
has_superstructure_stone_flag             0.0
has_superstructure_cement_mortar_stone    0.0
has_superstructure_mud_mortar_brick       0.0
has_superstructure_cement_mortar_brick    0.0
has_superstructure_timber         

In [11]:
# Define anomaly detection visualization function
def anomaly_detection(data, columns):
    # Univariate columns
    if columns[0] == 'geo_level_1_id':
        for col in columns:
            plt.figure(figsize=(10,6))
            sns.boxplot(data=data, x=col)
            plt.title(f'Feature: {col}');
    elif columns[0] == 'land_surface_condition':
        data_encoded = pd.get_dummies(data, columns=columns, dtype=int)
        for col in [col for col in data_encoded.columns if col not in data.columns]:
            plt.figure(figsize=(10,6))
            sns.kdeplot(data_encoded[col], fill=True, warn_singular=False)
            plt.title(f'Feature: {col}');
    elif columns[0] == 'has_superstructure_adobe_mud':
        for col in columns:
            plt.figure(figsize=(10,6))
            sns.histplot(data=data, x=col, stat='count')
            plt.title(f'Feature: {col}');

In [12]:
# Lambda function to deal with string issues for columns
col_trans = lambda x: x.replace(',','').split()

In [13]:
# Defining Univariate columns
uni_st = 'geo_level_1_id, geo_level_2_id, geo_level_3_id, count_floors_pre_eq, age, area_percentage, height_percentage, count_families'
uni_cols = col_trans(uni_st)

In [14]:
# Defining Categorical Columns
cat_st = 'land_surface_condition, foundation_type, roof_type, ground_floor_type, other_floor_type, position, plan_configuration, legal_ownership_status'
cat_cols = col_trans(cat_st)

In [15]:
# Defining binary columns
binary_st = 'has_superstructure_adobe_mud, has_superstructure_mud_mortar_stone, has_superstructure_stone_flag, has_superstructure_cement_mortar_stone, has_superstructure_mud_mortar_brick, has_superstructure_cement_mortar_brick, has_superstructure_timber, has_superstructure_bamboo, has_superstructure_rc_engineered, has_superstructure_other, has_secondary_use_hotel, has_secondary_use_rental, has_secondary_use_institution, has_secondary_use_school, has_secondary_use_industry, has_secondary_use_health_post, has_secondary_use_gov_office, has_secondary_use_use_police, has_secondary_use_other'
binary_cols = col_trans(binary_st)

### Categorical Anomaly Detection

In [None]:
# Univariant columns anomaly detection
anomaly_detection(data=df, columns=cat_cols)

### Univariate Anomaly Detection

In [None]:
anomaly_detection(data=df, columns=uni_cols)

### Binary Anomaly Detection

In [None]:
anomaly_detection(data=df, columns=binary_cols)

## Data Transformation

### Scaling Univariate data

In [16]:
# Function to view skewness of univariate data
def skew_vis(data, columns):
    for feat in columns:
        plt.figure(figsize=(10,6))
        sns.histplot(data=data[feat], bins=150)
        plt.title(f'Dist of Feature: {feat}')


In [17]:
# Robust scaling function
def robust_scaling(data, columns):
    rb_scaler = RobustScaler()
    rb_scaler.fit(data[columns])
    data[rb_scaler.get_feature_names_out()] = rb_scaler.transform(data[columns])
    return data

In [18]:
# Standard Scaling function
def standard_scaling(data, columns):
    st_scaler = StandardScaler()
    st_scaler.fit(data[columns])
    data[st_scaler.get_feature_names_out()] = st_scaler.transform(data[columns])
    return data

In [19]:
# MinMax Scaling function
def min_max_scaling(data, columns):
    mm_scaler = MinMaxScaler()
    mm_scaler.fit(data[columns])
    data[mm_scaler.get_feature_names_out()] = mm_scaler.transform(data[columns])
    return data

In [20]:
# Check minimum and maximum of each column 
for col in uni_cols:
    print(f'Maximum in column: {col}, {df[col].max()}')
    print(f'Minimum in column: {col}, {df[col].min()}')

Maximum in column: geo_level_1_id, 30
Minimum in column: geo_level_1_id, 0
Maximum in column: geo_level_2_id, 1427
Minimum in column: geo_level_2_id, 0
Maximum in column: geo_level_3_id, 12567
Minimum in column: geo_level_3_id, 0
Maximum in column: count_floors_pre_eq, 9
Minimum in column: count_floors_pre_eq, 1
Maximum in column: age, 995
Minimum in column: age, 0
Maximum in column: area_percentage, 100
Minimum in column: area_percentage, 1
Maximum in column: height_percentage, 32
Minimum in column: height_percentage, 2
Maximum in column: count_families, 9
Minimum in column: count_families, 0


In [21]:
df.head()

Unnamed: 0,building_id,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,land_surface_condition,foundation_type,...,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other,damage_grade
0,802906,6,487,12198,2,30,6,5,t,r,...,0,0,0,0,0,0,0,0,0,3
1,28830,8,900,2812,2,10,8,7,o,r,...,0,0,0,0,0,0,0,0,0,2
2,94947,21,363,8973,2,10,5,5,t,r,...,0,0,0,0,0,0,0,0,0,3
3,590882,22,418,10694,2,10,6,5,t,r,...,0,0,0,0,0,0,0,0,0,2
4,201944,11,131,1488,3,30,8,9,t,r,...,0,0,0,0,0,0,0,0,0,3


In [22]:
# Transforming univariate column into different scaled version 
df_rb_scaled = pd.DataFrame(robust_scaling(df, uni_cols), columns=uni_cols)
df_st_scaled = pd.DataFrame(standard_scaling(df, uni_cols), columns=uni_cols)
df_mm_scaled = pd.DataFrame(min_max_scaling(df, uni_cols), columns=uni_cols)

In [23]:
# Finalizing on MinMax Scaler due to no negative values in data
df_scaled = min_max_scaling(data=df, columns=uni_cols)

In [None]:
df_scaled.head()

#### Regular data Skewness

In [None]:
skew_vis(data=df, columns=uni_cols)

#### Standard Scaling Skewness

In [None]:
skew_vis(data=df_st_scaled, columns=uni_cols)

#### Robust Scaling Skewness

In [None]:
skew_vis(data=df_rb_scaled, columns=uni_cols) # Robust scaled skewness of data

#### MinMax Scaled Skewness

In [None]:
skew_vis(data=df_mm_scaled, columns=uni_cols)

### Encoding Categorical Columns

In [26]:
# Build OHE function to encode categorical features
def ohe_encoding(data, columns):
    ohe = OneHotEncoder(sparse_output=False)
    data[ohe.get_feature_names_out()] = ohe.fit_transform(data[columns])
    data.drop(columns=columns, inplace=True)
    return data

In [26]:
# Call function and encode scaled data
df_encoded = ohe_encoding(data=df_scaled, columns=cat_cols)

## Model Selection

In [22]:
# Split data into X and y to diffentiate training data and target data
X = df.drop(columns=['damage_grade'])
y = df[['damage_grade']]

In [23]:
X.head()

Unnamed: 0,building_id,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,land_surface_condition,foundation_type,...,has_secondary_use_agriculture,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other
0,802906,6,487,12198,2,30,6,5,t,r,...,0,0,0,0,0,0,0,0,0,0
1,28830,8,900,2812,2,10,8,7,o,r,...,0,0,0,0,0,0,0,0,0,0
2,94947,21,363,8973,2,10,5,5,t,r,...,0,0,0,0,0,0,0,0,0,0
3,590882,22,418,10694,2,10,6,5,t,r,...,0,0,0,0,0,0,0,0,0,0
4,201944,11,131,1488,3,30,8,9,t,r,...,0,0,0,0,0,0,0,0,0,0


In [32]:
# Building custom transformer for preprocessing
class CustomerTransformer:
    def __init__(self, numerical_features, categorical_features):
        self.numerical_features = numerical_features
        self.categorical_features = categorical_features
        self.scaler = MinMaxScaler()
        self.encoder = OneHotEncoder(sparse_output=False)

    def fit(self, X, y=None):
        # Fit the MinMaxScaler on numerical columns
        self.scaler.fit(X[self.numerical_features])

        # Fit the OHE on categorical columns
        self.encoder.fit(X[self.categorical_features])

        return self


    def transform(self, X):
        X_numerical_scaled = self.scaler.transform(X[self.numerical_features])

        X_categorical_encoded = self.encoder.transform(X[self.categorical_features])

        return np.hstack((X_numerical_scaled, X_categorical_encoded))

In [39]:
from sklearn.pipeline import Pipeline
baseline_model_pipeline = Pipeline(steps=[
    ('preprocess', CustomerTransformer(numerical_features=uni_cols, categorical_features=cat_cols)),
    ('classifier', DecisionTreeClassifier())
])

In [40]:
baseline_model_pipeline.fit(X, y)

In [42]:
predictions = baseline_model_pipeline.predict(test_values)
print(f'Predictions: {predictions}')

Predictions: [2 2 2 ... 2 2 2]


In [48]:
from sklearn.metrics import make_scorer, precision_score, recall_score, accuracy_score

accuracy = make_scorer(accuracy_score)
precision = make_scorer(precision_score)
recall = make_scorer(recall_score)

In [51]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'classifier__max_depth': [20]
}


grid_search = GridSearchCV(baseline_model_pipeline, param_grid=param_grid, refit='accuracy', return_train_score=True, cv=5)

grid_search.fit(X, y)

Traceback (most recent call last):
  File "c:\Users\lewis.trudeau\AppData\Local\anaconda3\Lib\site-packages\sklearn\metrics\_scorer.py", line 115, in __call__
    score = scorer._score(cached_call, estimator, *args, **kwargs)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\lewis.trudeau\AppData\Local\anaconda3\Lib\site-packages\sklearn\metrics\_scorer.py", line 282, in _score
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
                        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\lewis.trudeau\AppData\Local\anaconda3\Lib\site-packages\sklearn\metrics\_classification.py", line 1954, in precision_score
    p, _, _, _ = precision_recall_fscore_support(
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\lewis.trudeau\AppData\Local\anaconda3\Lib\site-packages\sklearn\metrics\_classification.py", line 1573, in precision_recall_fscore_support
    labels = _check_set_wise_labels(y_true,

In [53]:
print(f'Best Parameters: {grid_search.best_params_}')
print(f'Best CV Accuracy: {grid_search.best_score_:.2f}')

Best Parameters: {'classifier__max_depth': 20}
Best CV Accuracy: 0.69


In [57]:
cv_results = grid_search.cv_results_


In [None]:
print(f'Accuracy: {cv_results['mean_test_accuracy']}')
print(f'Precision: {cv_results['mean_test_precision']}')
print(f'Recall: {cv_results['mean_test_recall']}')

In [61]:
print('Accuracy:', cv_results['mean_test_precision'])

KeyError: 'mean_test_precision'