In [None]:
# Math and Data Reading
import pandas as pd
import numpy as np

# Visualization
import seaborn as sns
import matplotlib.pyplot as plt
from plotly.subplots import make_subplots
import plotly.graph_objects as go

# Modeling
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier
from sklearn.metrics import mean_squared_error
from sklearn.metrics import f1_score, accuracy_score

In [None]:
# Mount Google Drive into the Colab runtime; the CSVs loaded below are read
# from paths under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Read the feature table and the label table from the mounted Drive folder.
values_path = '/content/drive/MyDrive/train_values.csv'
labels_path = '/content/drive/MyDrive/train_labels.csv'

values = pd.read_csv(values_path)
labels = pd.read_csv(labels_path)

In [None]:
# Join features to labels on their shared `building_id` column. Merging on the
# column name (instead of on two Series objects) avoids the synthetic `key_0`
# column and the duplicated `building_id_x` / `building_id_y` columns.
merged_eq = pd.merge(left=values, right=labels, on='building_id')

# Building ids, kept separately in case they are needed later.
label_key = merged_eq['building_id']

# Modeling frame: every merged column except the id. Built as a new object
# (no `inplace=True`) so this cell can be re-run without raising KeyError.
df_eq = merged_eq.drop(columns=['building_id'])

# Modeling

In [None]:
# Model on the first N_SAMPLES rows only (presumably to keep the grid searches
# below fast -- raise this to train on more of the data).
N_SAMPLES = 10000
sample_eq = df_eq.iloc[:N_SAMPLES]

# Features are every column except the target; the target is `damage_grade`.
X = sample_eq.drop(columns=['damage_grade'])
y = sample_eq['damage_grade']

In [None]:
# Hold-out split, stratified on the target and seeded for repeatability.
# The split proportions are left at the library default.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    stratify=y,
    random_state=13,
)

In [None]:
# Columns to one-hot encode; every other column is passed through unchanged.
onehot_cols = [
    'land_surface_condition',
    'foundation_type',
    'roof_type',
    'ground_floor_type',
    'other_floor_type',
    'position',
    'plan_configuration',
    'legal_ownership_status',
]

ctr = make_column_transformer(
    (OneHotEncoder(drop='first', handle_unknown='ignore'), onehot_cols),
    remainder='passthrough',
    # Always return a dense array: the result is wrapped in a DataFrame below,
    # which does not work as intended on a scipy sparse matrix.
    sparse_threshold=0,
    verbose_feature_names_out=False,
)

# Fit on the training split only, then apply the fitted encoder to validation.
# The original row index is carried over so the encoded frames stay aligned
# with `y_train` / `y_val` by label, not just by position.
X_train_enc = pd.DataFrame(
    ctr.fit_transform(X_train),
    columns=ctr.get_feature_names_out(),
    index=X_train.index,
)
X_val_enc = pd.DataFrame(
    ctr.transform(X_val),
    columns=ctr.get_feature_names_out(),
    index=X_val.index,
)

In [None]:
# Standardize the three continuous columns; all other columns pass through.
# Note that the transformer emits the scaled columns first, then the remainder.
cts = make_column_transformer(
    (StandardScaler(), ['age', 'area_percentage', 'height_percentage']),
    remainder='passthrough',
    verbose_feature_names_out=False,
)

# Scaling statistics come from the training split only. The input frames' row
# index is propagated instead of being silently reset to 0..n-1.
X_train_ss = pd.DataFrame(
    cts.fit_transform(X_train_enc),
    columns=cts.get_feature_names_out(),
    index=X_train_enc.index,
)
X_val_ss = pd.DataFrame(
    cts.transform(X_val_enc),
    columns=cts.get_feature_names_out(),
    index=X_val_enc.index,
)

In [None]:
def modeling(mod, model, params):
    """Grid-search ``model`` over ``params`` and print train/validation scores.

    The search is fitted on the notebook-level ``X_train_ss`` / ``y_train`` and
    scored (accuracy and weighted F1) on that training data and on the held-out
    ``X_val_ss`` / ``y_val``. The lines printed as "test" are computed on the
    validation split.

    Parameters
    ----------
    mod : str
        Short label for the model. Kept for backward compatibility; not used.
    model : estimator
        Unfitted scikit-learn estimator to tune.
    params : dict
        Parameter grid handed to ``GridSearchCV``.

    Returns
    -------
    GridSearchCV
        The fitted search object, so callers can inspect ``best_params_`` or
        ``best_estimator_`` without refitting. Callers that ignore the return
        value are unaffected (a notebook cell ending in this call will now
        display the object's repr).
    """
    gs = GridSearchCV(model, params, n_jobs=-1)
    gs.fit(X_train_ss, y_train)

    # Predict once per split and reuse the result for both metrics.
    train_pred = gs.predict(X_train_ss)
    val_pred = gs.predict(X_val_ss)

    print('X train Accuracy: ', accuracy_score(y_train, train_pred))
    print('X test Accuracy: ', accuracy_score(y_val, val_pred))
    print('X train F1: ', f1_score(y_train, train_pred, average='weighted'))
    print('X test F1: ', f1_score(y_val, val_pred, average='weighted'))

    return gs

In [None]:
# Logistic-regression search space: stopping tolerance crossed with C.
lg_params = dict(
    tol=[.0001, .001, .01, .1],
    C=[.01, .1, 1, 10, 100],
)

# max_iter is raised well above the default so the solver has room to converge.
lg_estimator = LogisticRegression(max_iter=10_000)
modeling('lg', lg_estimator, lg_params)

In [None]:
# Standalone search whose fitted estimator is inspected in the following cells.
# Uses the same max_iter=10_000 as the logistic regression passed to
# `modeling` above; with the library default the solver can stop before
# converging, which would make the inspected coefficients unreliable.
gs = GridSearchCV(LogisticRegression(max_iter=10_000), lg_params, n_jobs=-1)

In [None]:
# Fit the search on the scaled training data and keep the result of `fit`
# under the name used by the coefficient cells below.
gs_lr = gs.fit(X=X_train_ss, y=y_train)

In [None]:
# Show the first row of the best estimator's coefficient matrix.
best_lr = gs_lr.best_estimator_
best_lr.coef_[0]

In [None]:
# One row per input feature, holding exp(coef) - 1 for the first row of the
# best estimator's coefficient matrix.
# NOTE(review): only row 0 of `coef_` is used, and `expm1` (not `exp`) is
# applied -- confirm both are intended for this target.
best_logreg = gs_lr.best_estimator_
coef_df = pd.DataFrame(
    data=np.expm1(best_logreg.coef_[0]),
    index=best_logreg.feature_names_in_,
    columns=['coef_val'],
)

In [None]:
# Rank features from largest to smallest transformed coefficient and display
# the ranked table. (A bare `coef_df` in the middle of a cell is never
# rendered, so only the final expression is kept.)
coef_df_sorted = coef_df.sort_values(by='coef_val', ascending=False)
coef_df_sorted

In [None]:
# Support-vector classifier search space: C, stopping tolerance, and whether
# classes are re-weighted.
svc_params = dict(
    C=[.01, .1, 1, 10, 100],
    tol=[.0001, .001, .01, .1],
    class_weight=['balanced', None],
)

svc_estimator = SVC()
modeling('svm', svc_estimator, svc_params)

In [None]:
# Gradient-boosting search space: number of boosting stages and the minimum
# number of samples needed to split a node.
# The former `'loss': ['deviance']` entry is omitted: that name was deprecated
# and later removed from scikit-learn, and it only ever selected the default
# loss, so dropping it keeps the same model on old versions and runs on new ones.
gbc_params = {
    'n_estimators': [5, 10, 50, 100, 250],
    'min_samples_split': [2, 5, 7]
}

# Seeded (same seed as the train/validation split) so reruns give the same scores.
modeling('gbc', GradientBoostingClassifier(random_state=13), gbc_params)

In [None]:
# Random-forest search space: number of trees and maximum tree depth.
rfr_params = {
    'n_estimators' : [10, 50, 100, 150],
    'max_depth' : [3, 4, 5]
}

# A random forest draws random bootstrap samples and feature subsets, so it is
# seeded (same seed as the train/validation split) to make the printed scores
# and the selected hyper-parameters reproducible across reruns.
modeling('rfr', RandomForestClassifier(random_state=13), rfr_params)

In [None]:
# Decision-tree search space: split criterion and maximum tree depth.
dtr_params = {
    'criterion' : ['gini', 'entropy'],
    'max_depth' : [3, 4, 5]
}

# Seeded so that ties between equally good splits are broken the same way on
# every run (same seed as the train/validation split).
modeling('dtr', DecisionTreeClassifier(random_state=13), dtr_params)

In [None]:
# k-nearest-neighbours search space: neighbourhood size and vote weighting.
knn_params = dict(
    n_neighbors=[2, 4, 6, 8],
    weights=['uniform', 'distance'],
)

knn_estimator = KNeighborsClassifier()
modeling('knn', knn_estimator, knn_params)