In [66]:
%matplotlib inline

from pathlib import Path

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from lightgbm import LGBMClassifier

from sklearn.preprocessing import StandardScaler

from sklearn.pipeline import make_pipeline

from sklearn.model_selection import GridSearchCV

In [67]:
import warnings
warnings.filterwarnings('ignore')

In [68]:
#train_values = pd.read_csv(DATA_DIR / 'train_values.csv', index_col='building_id')
#train_labels = pd.read_csv(DATA_DIR / 'train_labels.csv', index_col='building_id')

df_train = pd.read_csv('train_values_short1.csv', index_col='building_id')
df_train_labels = pd.read_csv('train_labels.csv', index_col='building_id')

In [69]:
import os
cwd = os.getcwd()

In [70]:
train_values_subset = df_train

In [72]:
pipe = make_pipeline(StandardScaler(), 
                     LGBMClassifier(random_state=2021))  

In [76]:
from sklearn.compose import ColumnTransformer

In [78]:
LGBMClassifier().get_params().keys()

dict_keys(['boosting_type', 'class_weight', 'colsample_bytree', 'importance_type', 'learning_rate', 'max_depth', 'min_child_samples', 'min_child_weight', 'min_split_gain', 'n_estimators', 'n_jobs', 'num_leaves', 'objective', 'random_state', 'reg_alpha', 'reg_lambda', 'silent', 'subsample', 'subsample_for_bin', 'subsample_freq'])

In [87]:
param_grid = {
    #'lgbmclassifier__learning_rate': [0.1], 
    #'lgbmclassifier__n_estimators': [30],
    'lgbmclassifier__num_leaves': [10, 20, 30, 40, 50], 
    'lgbmclassifier__num_iterations': [10, 20, 30, 40, 50, 100], 
    #'lgbmclassifier__boosting_type': ['gbdt'], 
    #'lgbmclassifier__max_depth': [10, 30, 60],
    'lgbmclassifier__objective': ['regression'], 
    #'lgbmclassifier__seed': [500],
    #'lgbmclassifier__colsample_bytree': [0.65, 0.75, 0.8], 
    #'lgbmclassifier__subsample': [0.7, 0.75], 
    #'lgbmclassifier__reg_alpha': [1, 2, 6],
    #'lgbmclassifier__reg_lambda': [1, 2, 6]
             }

gs = GridSearchCV(pipe, param_grid, cv=10)

In [88]:
gs.fit(train_values_subset, df_train_labels.values.ravel())

GridSearchCV(cv=10,
             estimator=Pipeline(steps=[('standardscaler', StandardScaler()),
                                       ('lgbmclassifier',
                                        LGBMClassifier(random_state=2021))]),
             param_grid={'lgbmclassifier__num_iterations': [10, 20, 30, 40, 50,
                                                            100],
                         'lgbmclassifier__num_leaves': [10, 20, 30, 40, 50],
                         'lgbmclassifier__objective': ['regression']})

In [89]:
from sklearn.metrics import f1_score

in_sample_preds = gs.predict(train_values_subset)
f1_score(df_train_labels, in_sample_preds, average='micro')

0.7657376602545654

In [90]:
gs.best_score_

0.758074607885396

In [91]:
gs.best_params_

{'lgbmclassifier__num_iterations': 100,
 'lgbmclassifier__num_leaves': 50,
 'lgbmclassifier__objective': 'regression'}

In [92]:
param_grid = {
    #'lgbmclassifier__learning_rate': [0.1], 
    #'lgbmclassifier__n_estimators': [30],
    'lgbmclassifier__num_leaves': [40, 50, 60], 
    'lgbmclassifier__num_iterations': [90, 100, 110], 
    #'lgbmclassifier__boosting_type': ['gbdt'], 
    #'lgbmclassifier__max_depth': [10, 30, 60],
    'lgbmclassifier__objective': ['regression'], 
    #'lgbmclassifier__seed': [500],
    #'lgbmclassifier__colsample_bytree': [0.65, 0.75, 0.8], 
    #'lgbmclassifier__subsample': [0.7, 0.75], 
    #'lgbmclassifier__reg_alpha': [1, 2, 6],
    #'lgbmclassifier__reg_lambda': [1, 2, 6]
             }

gs = GridSearchCV(pipe, param_grid, cv=10)

In [93]:
gs.fit(train_values_subset, df_train_labels.values.ravel())

GridSearchCV(cv=10,
             estimator=Pipeline(steps=[('standardscaler', StandardScaler()),
                                       ('lgbmclassifier',
                                        LGBMClassifier(random_state=2021))]),
             param_grid={'lgbmclassifier__num_iterations': [90, 100, 110],
                         'lgbmclassifier__num_leaves': [40, 50, 60],
                         'lgbmclassifier__objective': ['regression']})

In [94]:
in_sample_preds = gs.predict(train_values_subset)
f1_score(df_train_labels, in_sample_preds, average='micro')

0.7686117858335155

In [95]:
gs.best_score_

0.75819356001739

In [96]:
gs.best_params_

{'lgbmclassifier__num_iterations': 110,
 'lgbmclassifier__num_leaves': 60,
 'lgbmclassifier__objective': 'regression'}

In [97]:
param_grid = {
    #'lgbmclassifier__learning_rate': [0.1], 
    #'lgbmclassifier__n_estimators': [30],
    'lgbmclassifier__num_leaves': [50, 60, 70], 
    'lgbmclassifier__num_iterations': [100, 110, 120], 
    #'lgbmclassifier__boosting_type': ['gbdt'], 
    #'lgbmclassifier__max_depth': [10, 30, 60],
    'lgbmclassifier__objective': ['regression'], 
    #'lgbmclassifier__seed': [500],
    #'lgbmclassifier__colsample_bytree': [0.65, 0.75, 0.8], 
    #'lgbmclassifier__subsample': [0.7, 0.75], 
    #'lgbmclassifier__reg_alpha': [1, 2, 6],
    #'lgbmclassifier__reg_lambda': [1, 2, 6]
             }

gs = GridSearchCV(pipe, param_grid, cv=10)

In [98]:
gs.fit(train_values_subset, df_train_labels.values.ravel())

GridSearchCV(cv=10,
             estimator=Pipeline(steps=[('standardscaler', StandardScaler()),
                                       ('lgbmclassifier',
                                        LGBMClassifier(random_state=2021))]),
             param_grid={'lgbmclassifier__num_iterations': [100, 110, 120],
                         'lgbmclassifier__num_leaves': [50, 60, 70],
                         'lgbmclassifier__objective': ['regression']})

In [99]:
in_sample_preds = gs.predict(train_values_subset)
f1_score(df_train_labels, in_sample_preds, average='micro')

0.7706148479860015

In [100]:
gs.best_score_

0.758377752703285

In [101]:
gs.best_params_

{'lgbmclassifier__num_iterations': 110,
 'lgbmclassifier__num_leaves': 70,
 'lgbmclassifier__objective': 'regression'}

In [63]:
test_values = pd.read_csv('test_values.csv', index_col='building_id')

In [64]:
test_values_subset = test_values

In [65]:
predictions = gs.predict(test_values_subset)


ValueError: could not convert string to float: 't'

In [89]:
submission_format = pd.read_csv('submission_format.csv', index_col='building_id')


In [90]:
my_submission = pd.DataFrame(data=predictions,
                             columns=submission_format.columns,
                             index=submission_format.index)

In [91]:
my_submission.head()

Unnamed: 0_level_0,damage_grade
building_id,Unnamed: 1_level_1
300051,3
99355,2
890251,2
745817,1
421793,3


In [92]:
my_submission.to_csv('submissionLGBM_00.csv')