In [1]:
%matplotlib inline

from pathlib import Path

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from lightgbm import LGBMClassifier

from sklearn.preprocessing import StandardScaler

from sklearn.pipeline import make_pipeline

from sklearn.model_selection import GridSearchCV

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
#train_values = pd.read_csv(DATA_DIR / 'train_values.csv', index_col='building_id')
#train_labels = pd.read_csv(DATA_DIR / 'train_labels.csv', index_col='building_id')

df_train = pd.read_csv('train_values_short1.csv', index_col='building_id')
df_train_labels = pd.read_csv('train_labels.csv', index_col='building_id')

In [7]:
pipe = make_pipeline(StandardScaler(), 
                     LGBMClassifier(random_state=2021, num_iterations= 110,
                                   num_leaves= 70, objective= 'regression'))  

In [8]:
from sklearn.model_selection import cross_val_score

In [11]:
cross_val_score(pipe, df_train, df_train_labels, scoring='f1_micro', cv=15).mean()

0.7582549971264656

In [12]:
# Pruebo hiperparametros que funcionaron bien para otros modelos
pipe1 = make_pipeline(StandardScaler(), 
                     LGBMClassifier(random_state=2021, num_iterations= 273,
                                   num_leaves= 70, objective= 'regression'))  

In [13]:
cross_val_score(pipe1, df_train, df_train_labels, scoring='f1_micro', cv=15).mean()

0.7590378119930284

In [4]:
superestructures = ['has_superstructure_adobe_mud', 'has_superstructure_mud_mortar_stone',
       'has_superstructure_stone_flag',
       'has_superstructure_cement_mortar_stone',
       'has_superstructure_mud_mortar_brick',
       'has_superstructure_cement_mortar_brick', 'has_superstructure_timber',
       'has_superstructure_bamboo', 'has_superstructure_rc_non_engineered',
       'has_superstructure_rc_engineered', 'has_superstructure_other']

In [5]:
train_values = pd.read_csv('train_values.csv', index_col='building_id')
ambos_encodings = df_train.merge(train_values[superestructures],
                                               left_index=True, right_index=True)

In [10]:
ambos_encodings.to_csv('train_values_lightGBM.csv', index=True)

In [12]:
test_values = pd.read_csv('test_values.csv', index_col='building_id')
test_values1 = pd.read_csv('Archivos auxiliares KNN/test_values_short1.csv', index_col='building_id')
ambos_encodings_t = test_values1.merge(test_values[superestructures],
                                               left_index=True, right_index=True)

In [14]:
ambos_encodings_t['superestructure']

building_id
300051     2.374896
99355      2.374896
890251     2.374896
745817     1.629329
421793     2.374896
             ...   
310028     2.257787
663567     2.588448
1049160    2.257787
442785     2.313821
501372     1.629329
Name: superestructure, Length: 86868, dtype: float64

In [18]:
cross_val_score(pipe1, ambos_encodings, df_train_labels, scoring='f1_micro', cv=15).mean()

0.7598973570751862

In [7]:
pipe2 = make_pipeline(StandardScaler(), 
                     LGBMClassifier(random_state=2021, num_iterations= 273,
                                   num_leaves= 70, objective= 'regression',
                                   colsample_bytree=0.77)) 

In [20]:
cross_val_score(pipe2, ambos_encodings, df_train_labels, scoring='f1_micro', cv=15).mean()

0.7602503864798534

In [21]:
cross_val_score(pipe2, df_train, df_train_labels, scoring='f1_micro', cv=15).mean()

0.7595021106132224

El pipe2 anda mejor, tanto si se usan ambos encodigns como si usa uno solo

### Caluclar probas del set de test

In [9]:
pipe2.fit(ambos_encodings, df_train_labels)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('lgbmclassifier',
                 LGBMClassifier(colsample_bytree=0.77, num_iterations=273,
                                num_leaves=70, objective='regression',
                                random_state=2021))])

In [15]:
probas = pipe2.predict_proba(ambos_encodings_t)

In [17]:
probas_df = pd.DataFrame(data=probas,
                             columns=[1,2,3],
                             index=test_values.index)

In [18]:
probas_df.head()

Unnamed: 0_level_0,1,2,3
building_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
300051,0.005105,0.200623,0.794272
99355,2.4e-05,0.999803,0.000173
890251,0.005219,0.01002,0.98476
745817,0.624295,0.375544,0.000161
421793,0.000255,0.193208,0.806537


In [19]:
probas_df.to_csv('predict_probas/ligthGBM.csv', index=True)