In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from ipywidgets import widgets

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score 
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.decomposition import PCA, TruncatedSVD

# Get Data from Website

!wget https://s3.amazonaws.com/drivendata/data/57/public/train_values.csv -nc -P ./nepal/

!wget https://s3.amazonaws.com/drivendata/data/57/public/train_labels.csv -nc -P ./nepal/

!wget https://s3.amazonaws.com/drivendata/data/57/public/test_values.csv -nc -P ./nepal/


# Import Data

In [2]:
X = pd.read_csv("./nepal/train_values.csv",
                index_col='building_id',
                dtype={'geo_level_1_id': 'object',
                       'geo_level_2_id': 'object',
                       'geo_level_3_id': 'object'})

In [3]:
X.head()

Unnamed: 0_level_0,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,land_surface_condition,foundation_type,roof_type,...,has_secondary_use_agriculture,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other
building_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
802906,6,487,12198,2,30,6,5,t,r,n,...,0,0,0,0,0,0,0,0,0,0
28830,8,900,2812,2,10,8,7,o,r,n,...,0,0,0,0,0,0,0,0,0,0
94947,21,363,8973,2,10,5,5,t,r,n,...,0,0,0,0,0,0,0,0,0,0
590882,22,418,10694,2,10,6,5,t,r,n,...,0,0,0,0,0,0,0,0,0,0
201944,11,131,1488,3,30,8,9,t,r,n,...,0,0,0,0,0,0,0,0,0,0


In [4]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 260601 entries, 802906 to 747594
Data columns (total 38 columns):
geo_level_1_id                            260601 non-null object
geo_level_2_id                            260601 non-null object
geo_level_3_id                            260601 non-null object
count_floors_pre_eq                       260601 non-null int64
age                                       260601 non-null int64
area_percentage                           260601 non-null int64
height_percentage                         260601 non-null int64
land_surface_condition                    260601 non-null object
foundation_type                           260601 non-null object
roof_type                                 260601 non-null object
ground_floor_type                         260601 non-null object
other_floor_type                          260601 non-null object
position                                  260601 non-null object
plan_configuration                        2

In [5]:
y = pd.read_csv('nepal/train_labels.csv', index_col='building_id')['damage_grade']

# Solution 1: Model /w one feature 

In [6]:
def housing_plot(X,y):
    def plotter(column):
        valid_rows = X[column].notna()
        plt.plot(X.loc[valid_rows, column], y[valid_rows], '.', color='k')
        plt.ylabel('Damage Level')
        plt.yticks([1,2,3])
        
    return plotter

dropdown_values = sorted(X.columns)
widgets.interact(housing_plot(X,y), column=dropdown_values);

interactive(children=(Dropdown(description='column', options=('age', 'area_percentage', 'count_families', 'cou…

In [7]:
X_height = X[['height_percentage']]
X_height.head()

Unnamed: 0_level_0,height_percentage
building_id,Unnamed: 1_level_1
802906,5
28830,7
94947,5
590882,5
201944,9


In [8]:
X_train, X_test, y_train, y_test = train_test_split(X_height, y, test_size=0.2, random_state=42)

In [9]:
one_feat_model = LogisticRegression(solver='lbfgs', multi_class='auto')
one_feat_model.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='auto',
          n_jobs=None, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

# Compare in- and out-sample metrics (f1 score)

In [10]:
def get_metrics(model, X_train, X_test, y_train, y_test):
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    in_samp_score = f1_score(y_train, y_train_pred, average='micro')
    out_samp_score = f1_score(y_test, y_test_pred, average='micro')
    print('In-sample f1 score:', in_samp_score)
    print('Out-sample f1 score:', out_samp_score)

In [11]:
get_metrics(one_feat_model, X_train, X_test, y_train, y_test)

In-sample f1 score: 0.5699779355333845
Out-sample f1 score: 0.5660290477926364


# Create submission

In [12]:
def create_submission(model, X_cols=None):
    X = pd.read_csv("./nepal/test_values.csv", index_col='building_id')
    if X_cols != None:
        X = X[X_cols]
    y_pred = model.predict(X)
    submission = pd.DataFrame(y_pred, index=X.index,
                              columns=['damage_grade'])
    date_string = pd.Timestamp.utcnow().strftime(format='%Y-%m-%d_%H%M_')
    submission.to_csv(f'nepal/{date_string}submission_csv')

In [13]:
create_submission(one_feat_model, ['height_percentage'])

## Score: 0.56

# Solution 2: All Numerical Features - Logistic Regression

In [14]:
numerical_features = list(X.describe().columns)

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [16]:
num_feat_model = LogisticRegression(solver='lbfgs', max_iter=1000, multi_class='auto')
num_feat_model.fit(X_train[numerical_features], y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=1000, multi_class='auto',
          n_jobs=None, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

In [17]:
get_metrics(num_feat_model, X_train[numerical_features], X_test[numerical_features], y_train, y_test)

In-sample f1 score: 0.5762135456638526
Out-sample f1 score: 0.5758523435851193


# Solution 3: All Numerical Features - Gradient Boosting Classifier

In [20]:
xg_num_model = GradientBoostingClassifier()
xg_num_model.fit(X_train[numerical_features], y_train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              n_iter_no_change=None, presort='auto', random_state=None,
              subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False)

In [21]:
get_metrics(xg_num_model, X_train[numerical_features], X_test[numerical_features], y_train, y_test)

In-sample f1 score: 0.5924740982348426
Out-sample f1 score: 0.5902227509065444


In [None]:
create_submission(xg_num_model, numerical_features)

# Solution 4: All Numerical Features - Random Forest Classifier

In [22]:
rand_for_model = RandomForestClassifier()
rand_for_model.fit(X_train[numerical_features], y_train)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [23]:
get_metrics(rand_for_model, X_train[numerical_features], X_test[numerical_features], y_train, y_test)

In-sample f1 score: 0.7060005755947812
Out-sample f1 score: 0.5687918497342722


# Solution 4.5: All Numerical Features - Random Forest Classifier with GridSearchCV

In [None]:
param_grid = {'max_depth': range(4,11,2),
              'min_samples_leaf': range(250,301,50),
              'n_estimators': range(100,251,50),
              'class_weight': ['balanced', 'balanced_subsample', None]}

gs = GridSearchCV(RandomForestClassifier(), 
                        param_grid=param_grid,
                        cv=5,
                        n_jobs=12,
                        verbose=1)

In [None]:
random_forest_model = Pipeline([
    ('gs', gs)
])

In [None]:
random_forest_model.fit(X_train[numerical_features], y_train)

y_train_pred = random_forest_model.predict(X_train[numerical_features])

print('In-sample f1 score:')
f1_score(y_train, y_train_pred, average='micro')

In [None]:
y_test_pred = random_forest_model.predict(X_test[numerical_features])

print('Out-sample f1 score:')
f1_score(y_test, y_test_pred, average='micro')

# Solution 5: All Features - Logistic Regression

In [17]:
X.head()

Unnamed: 0_level_0,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,land_surface_condition,foundation_type,roof_type,...,has_secondary_use_agriculture,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other
building_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
802906,6,487,12198,2,30,6,5,t,r,n,...,0,0,0,0,0,0,0,0,0,0
28830,8,900,2812,2,10,8,7,o,r,n,...,0,0,0,0,0,0,0,0,0,0
94947,21,363,8973,2,10,5,5,t,r,n,...,0,0,0,0,0,0,0,0,0,0
590882,22,418,10694,2,10,6,5,t,r,n,...,0,0,0,0,0,0,0,0,0,0
201944,11,131,1488,3,30,8,9,t,r,n,...,0,0,0,0,0,0,0,0,0,0


In [25]:
categorical_variables = [col for col in X.columns
                         if X[col].dtype == 'object']
categorical_variables

['geo_level_1_id',
 'geo_level_2_id',
 'geo_level_3_id',
 'land_surface_condition',
 'foundation_type',
 'roof_type',
 'ground_floor_type',
 'other_floor_type',
 'position',
 'plan_configuration',
 'legal_ownership_status']

In [37]:
ct = ColumnTransformer(remainder='passthrough',
                       transformers=[('ohe', OneHotEncoder(handle_unknown='ignore'), categorical_variables)])

#param_grid= {#'max_depth': range(3,13,3),
#            'max_iter': range(1000, 3001, 500),
#            'C': range(1,12,2)}

#gs = GridSearchCV(LogisticRegression(solver='lbfgs', multi_class='auto'),
#                  param_grid=param_grid,
#                  cv=3,
#                  n_jobs=12,
#                  verbose=1)

all_feat_model = Pipeline([
    ('transformer', ct),
    ('classifier', LogisticRegression(solver='lbfgs', multi_class='auto', C=100, max_iter=3000))
])

all_feat_model.fit(X_train, y_train);



In [38]:
get_metrics(all_feat_model, X_train, X_test, y_train, y_test) 75,15

In-sample f1 score: 0.7516625032137252
Out-sample f1 score: 0.7515588726233188


In [22]:
def create_submission_for_all_features(model, X_cols=None):
    X = pd.read_csv("./nepal/test_values.csv",
                index_col='building_id',
                dtype={'geo_level_1_id': 'object',
                       'geo_level_2_id': 'object',
                       'geo_level_3_id': 'object'})
    if X_cols != None:
        X = X[X_cols]
    y_pred = model.predict(X)
    submission = pd.DataFrame(y_pred, index=X.index,
                              columns=['damage_grade'])
    date_string = pd.Timestamp.utcnow().strftime(format='%Y-%m-%d_%H%M_')
    submission.to_csv(f'nepal/{date_string}submission_csv')

In [23]:
create_submission_for_all_features(all_feat_model)

# Solution 6: All Features - ExtraTreesClassifier with GridSearchCV

In [29]:
ct = ColumnTransformer(remainder='passthrough',
                       transformers=[('ohe', OneHotEncoder(handle_unknown='ignore'), categorical_variables)])

param_grid= {#'max_depth': range(3,13,3),
            'min_samples_split': np.power(2, np.arange(1,6)),
            'min_samples_leaf': np.power(2, np.arange(1,6))}

gs = GridSearchCV(ExtraTreesClassifier(),
                  param_grid=param_grid,
                  cv=3,
                  n_jobs=6,
                  verbose=1)

all_feat_extra_trees_model = Pipeline([
    ('transformer', ct),
    ('classifier', gs)
])

all_feat_extra_trees_model.fit(X_train, y_train)

Fitting 3 folds for each of 25 candidates, totalling 75 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:  3.0min
[Parallel(n_jobs=6)]: Done  75 out of  75 | elapsed:  4.2min finished


Pipeline(memory=None,
     steps=[('transformer', ColumnTransformer(n_jobs=None, remainder='passthrough', sparse_threshold=0.3,
         transformer_weights=None,
         transformers=[('ohe', OneHotEncoder(categorical_features=None, categories=None,
       dtype=<class 'numpy.float64'>, handle_unknown='ignore',
       n_val...   pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=1))])

In [30]:
get_metrics(all_feat_extra_trees_model, X_train, X_test, y_train, y_test)

In-sample f1 score: 0.7472515349194169
Out-sample f1 score: 0.7235471307150668
