### Imports

In [None]:
# Colab-only step: mount Google Drive so the CSV files under
# '/content/drive/My Drive/' can be read by the cells below.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFECV
import matplotlib.pyplot as plt

import pandas as pd
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV, cross_validate, KFold, RandomizedSearchCV
import time
from sklearn.metrics import f1_score as score
from xgboost import XGBClassifier

from sklearn.preprocessing import LabelEncoder

from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
from sklearn.metrics import accuracy_score

%matplotlib inline

### Data Load and Processing

In [None]:
# Load the training features, indexed by building_id, and preview a few rows.
train_values = pd.read_csv(
    '/content/drive/My Drive/train_values.csv',
    index_col='building_id',
)
# NOTE: only the last expression of a cell is rendered, so this shape lookup
# is evaluated but not shown.
train_values.shape
train_values.head()

Unnamed: 0_level_0,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,land_surface_condition,foundation_type,roof_type,...,has_secondary_use_agriculture,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other
building_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
802906,6,487,12198,2,30,6,5,t,r,n,...,0,0,0,0,0,0,0,0,0,0
28830,8,900,2812,2,10,8,7,o,r,n,...,0,0,0,0,0,0,0,0,0,0
94947,21,363,8973,2,10,5,5,t,r,n,...,0,0,0,0,0,0,0,0,0,0
590882,22,418,10694,2,10,6,5,t,r,n,...,0,0,0,0,0,0,0,0,0,0
201944,11,131,1488,3,30,8,9,t,r,n,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# One-hot encode the categorical building attributes; every other column is
# passed through unchanged.
df_dc = pd.get_dummies(
    train_values,
    columns=[
        'land_surface_condition',
        'foundation_type',
        'roof_type',
        'ground_floor_type',
        'other_floor_type',
        'position',
        'plan_configuration',
        'legal_ownership_status',
    ],
)

In [None]:
# Engineered interaction features:
#   ha = area_percentage x height_percentage
#   hs = height_percentage squared
ha = train_values['area_percentage'].mul(train_values['height_percentage'])
hs = train_values['height_percentage'].mul(train_values['height_percentage'])

In [None]:
# Append the two engineered columns (aligned on the building_id index) to the
# one-hot encoded feature frame.
df_dc = df_dc.assign(HA=ha, HS=hs)

In [None]:
# Sanity check: (rows, columns) of the feature frame after one-hot encoding
# and after the HA / HS columns were added.
df_dc.shape

(260601, 70)

In [None]:
# Load the target labels, indexed by building_id like the features, and show
# their shape.
train_labels = pd.read_csv(
    '/content/drive/My Drive/train_labels.csv',
    index_col='building_id',
)
train_labels.shape

(260601, 1)

In [None]:
# First hold out 5% of all rows as the test split ...
X_train, X_test, y_train, y_test = train_test_split(
    df_dc,
    train_labels,
    test_size=0.05,
    random_state=1,
)

# ... then carve 1% of the remaining rows off as a small validation split.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.01,
    random_state=1,
)

In [None]:
# Sanity check: feature/label shapes of the train, test and validation splits.
X_train.shape, y_train.shape, X_test.shape, y_test.shape, X_val.shape, y_val.shape

((245094, 70), (245094, 1), (13031, 70), (13031, 1), (2476, 70), (2476, 1))

In [None]:
# Encode the labels as consecutive integers starting at 0, which is what the
# XGBoost multi-class objective used below expects.
#
# The encoder is fitted ONCE, on the training labels, and that same mapping is
# then applied to the test and validation labels. Re-fitting on every split
# (as was done before) only gives consistent codes if each split happens to
# contain every class; otherwise the same integer would mean different labels
# in different splits.
#
# np.ravel flattens the single-column label frames to 1-D, which avoids the
# sklearn `column_or_1d` DataConversionWarning.
le = LabelEncoder()
y_train = le.fit_transform(np.ravel(y_train))
y_test = le.transform(np.ravel(y_test))
y_val = le.transform(np.ravel(y_val))

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


### Model Training and Tuning

In [None]:
# Hyperopt search space. Entries built with hp.uniform are sampled per trial;
# plain values (max_depth, min_child_weight, n_estimators, seed) stay fixed.
space = dict(
    max_depth=10,
    gamma=hp.uniform('gamma', 1.35, 2.5),
    colsample_bytree=hp.uniform('colsample_bytree', 0.59, 0.7),
    min_child_weight=4,
    n_estimators=800,
    subsample=hp.uniform('subsample', 0.6, 0.9),
    learning_rate=hp.uniform('learning_rate', 0.1, 0.3),
    seed=0,
)

In [None]:
def objective(space):
    """Hyperopt objective: train an XGBoost classifier and score it on the validation split.

    Parameters
    ----------
    space : dict
        One sampled point of the search space. Reads ``n_estimators``,
        ``max_depth``, ``gamma``, ``min_child_weight``, ``subsample`` and
        ``colsample_bytree``; ``learning_rate`` and ``seed`` are used when
        present and fall back to the XGBoost defaults otherwise.

    Returns
    -------
    dict
        ``{'loss': -accuracy, 'status': STATUS_OK}``. The validation accuracy
        is negated because ``fmin`` minimises the loss.

    Notes
    -----
    Uses the notebook globals ``X_train``/``y_train`` for fitting and
    ``X_val``/``y_val`` for scoring. The held-out test split is deliberately
    not touched here so it stays unseen during tuning. Fitting on the full
    training split is considerably slower than on the small test split; lower
    ``max_evals`` or ``n_estimators`` if a full search is too expensive.
    """
    clf = XGBClassifier(
        n_estimators=space['n_estimators'],
        max_depth=space['max_depth'],
        gamma=space['gamma'],
        min_child_weight=space['min_child_weight'],
        subsample=space['subsample'],
        # Must stay a float: int() truncated every sampled value (0.59-0.7) to 0.
        colsample_bytree=space['colsample_bytree'],
        # Previously sampled by the search space but never handed to the model.
        learning_rate=space.get('learning_rate'),
        random_state=space.get('seed'),
        objective='multi:softmax',
        num_class=3,
    )

    # Train on the training split; monitor only the validation split.
    clf.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        verbose=False,
    )

    pred = clf.predict(X_val)
    accuracy = accuracy_score(y_val, pred)
    print("SCORE:", accuracy)
    return {'loss': -accuracy, 'status': STATUS_OK}

In [None]:
# Run the TPE search: 200 evaluations of `objective` over `space`, with the
# per-trial results recorded in `trials`.
trials = Trials()

best_hyperparams = fmin(
    fn=objective,
    space=space,
    algo=tpe.suggest,
    max_evals=200,
    trials=trials,
)

print("The best hyperparameters are : ", "\n")
print(best_hyperparams)

SCORE:
0.6987075928917609
SCORE:
0.6922455573505655
SCORE:
0.6991114701130856
SCORE:
0.7015347334410339
SCORE:
0.6954765751211631
SCORE:
0.6962843295638126
SCORE:
0.6983037156704361
SCORE:
0.6950726978998385
SCORE:
0.6962843295638126
SCORE:
0.691437802907916
SCORE:
0.6954765751211631
SCORE:
0.6849757673667205
SCORE:
0.6869951534733441
SCORE:
0.6954765751211631
SCORE:
0.6950726978998385
SCORE:
0.6910339256865913
SCORE:
0.697092084006462
SCORE:
0.688610662358643
SCORE:
0.6910339256865913
SCORE:
0.6938610662358643
SCORE:
0.6995153473344103
SCORE:
0.694264943457189
SCORE:
0.6926494345718901
SCORE:
0.6978998384491115
SCORE:
0.6958804523424879
SCORE:
0.6954765751211631
SCORE:
0.6974959612277868
SCORE:
0.6987075928917609
SCORE:
0.6958804523424879
SCORE:
0.6974959612277868
SCORE:
0.691437802907916
SCORE:
0.6950726978998385
SCORE:
0.6954765751211631
SCORE:
0.7007269789983845
SCORE:
0.6946688206785138
SCORE:
0.6983037156704361
SCORE:
0.6962843295638126
SCORE:
0.6954765751211631
SCORE:
0.69668820

### Final Model

In [None]:
# Final model: fixed structural settings plus the sampled hyperparameters
# chosen from the search above.
clf3 = XGBClassifier(
    colsample_bytree=0.6576260012112893,
    gamma=1.4742840552410175,
    learning_rate=0.13198329932243713,
    subsample=0.7693738298060033,
    max_depth=10,
    min_child_weight=4,
    n_estimators=800,
    objective='multi:softmax',
    num_class=3,
)

# y_train was already label-encoded in the data-processing section. It is not
# re-encoded here: re-fitting a fresh LabelEncoder on the encoded values would
# leave y_train unchanged but overwrite `le`, losing the mapping back to the
# original damage grades.
clf3.fit(X_train, y_train)

In [None]:
# Micro-averaged F1 on the training split ...
train_pred2 = clf3.predict(X_train)
training_accuracy2 = score(y_train, train_pred2, average='micro')

# ... and on the held-out test split.
test_pred2 = clf3.predict(X_test)
cross_validation_accuracy2 = score(y_test, test_pred2, average='micro')

print(
    f' training_accuracy: {training_accuracy2}\n'
    f' cross_validation_accuracy: {cross_validation_accuracy2}'
)

 training_accuracy: 0.8375235623883083
 cross_validation_accuracy: 0.7555828409178115


In [None]:

# Micro-F1 over the whole labelled data set (train, validation and test rows
# together), so this is an optimistic, mostly in-sample figure.
# `score` is the alias under which sklearn's f1_score is imported in this
# notebook; the bare name `f1_score` is never imported and raises NameError on
# a fresh kernel.
# The +1 shifts the model's class indices back onto the original label values
# (assumes the labels are consecutive integers starting at 1 -- confirm).
in_sample_preds2 = clf3.predict(df_dc)
score(train_labels, in_sample_preds2 + 1, average='micro')

0.8325639579280202

### Create Submission File

In [None]:
# Load the competition test features and one-hot encode the same categorical
# columns that were encoded for the training data.
test_values = pd.read_csv(
    '/content/drive/My Drive/test_values.csv',
    index_col='building_id',
)
test_values_dc = pd.get_dummies(
    test_values,
    columns=[
        'land_surface_condition',
        'foundation_type',
        'roof_type',
        'ground_floor_type',
        'other_floor_type',
        'position',
        'plan_configuration',
        'legal_ownership_status',
    ],
)


In [None]:
# Same engineered features as for the training frame:
#   HA = area_percentage x height_percentage, HS = height_percentage squared.
test_values_dc['HA'] = test_values['area_percentage'].mul(test_values['height_percentage'])
test_values_dc['HS'] = test_values['height_percentage'].mul(test_values['height_percentage'])

In [None]:
# Sanity check: confirm the column count matches the training feature frame
# before handing this frame to the fitted model.
test_values_dc.shape

(86868, 70)

In [None]:
# Predict with the final model on the competition test set. The outputs are
# the classifier's class indices, which is why 1 is added back before the
# submission is displayed and written below.
predictions2 = clf3.predict(test_values_dc)

In [None]:
# Build the submission frame using the column name(s) and building_id index
# of the provided template.
# NOTE(review): this assumes the rows of test_values are in the same order as
# the template's index -- confirm.
submission_format = pd.read_csv(
    '/content/drive/My Drive/submission_format.csv',
    index_col='building_id',
)
my_submission2 = pd.DataFrame(
    data=predictions2,
    index=submission_format.index,
    columns=submission_format.columns,
)
my_submission2.head()

Unnamed: 0_level_0,damage_grade
building_id,Unnamed: 1_level_1
300051,2
99355,1
890251,1
745817,0
421793,2


In [None]:
# Preview the submission with the class indices shifted up by one.
# The result is only displayed here; my_submission2 itself is not modified.
my_submission2+1

Unnamed: 0_level_0,damage_grade
building_id,Unnamed: 1_level_1
300051,3
99355,2
890251,2
745817,1
421793,3
...,...
310028,2
663567,2
1049160,2
442785,2


In [None]:
# Write the submission file, shifting the class indices up by one.
my_submission2.add(1).to_csv('submission3XGBoostRound31.csv')