# LightGBM
### Modelo LightGBM utilizando apenas as variáveis categóricas da base de dados

In [2]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

## Dataframes

In [3]:
df_public = pd.read_csv('dados/public.csv')
df_metar = pd.read_csv('dados/metar.csv', low_memory=False)

## data_ref

In [4]:
df_public['data_ref'] = df_public['hora_ref'].apply(lambda x: x[:7])

## Merge

In [5]:
df_public_metar = df_public.merge(df_metar, how='left', left_on=['metar', 'data_ref'], right_on=['meta', 'date'])

## Datasets de Treino e Teste

In [6]:
df_numeric = df_public_metar.select_dtypes(include=np.number)

In [7]:
# Dataframe para envio na competição
df_test_final = df_numeric[df_numeric.espera.isnull()].drop('espera',axis=1)

# Dataframes de treino e teste
df_train_inicial = df_numeric[~df_numeric.espera.isnull()]
X_train_inicial = df_train_inicial.drop('espera', axis=1)
y_train_inicial = df_train_inicial['espera']

X_train, X_test, y_train, y_test = train_test_split(
    X_train_inicial, y_train_inicial, test_size=0.20, random_state=42)

In [8]:
train_data_lgb = lgb.Dataset(X_train, label=y_train)
validation_data_lgb = lgb.Dataset(X_test, label=y_test)

## Parâmetros

In [9]:
param = {'num_leaves': 100, 'objective': 'binary', 'metric': 'f1'}

## Treino

In [10]:
num_round = 10
bst = lgb.train(param, train_data_lgb, num_round, valid_sets=[validation_data_lgb])

[LightGBM] [Info] Number of positive: 2966, number of negative: 166377
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002706 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4
[LightGBM] [Info] Number of data points in the train set: 169343, number of used features: 2
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.017515 -> initscore=-4.027042
[LightGBM] [Info] Start training from score -4.027042


In [11]:
y_pred = bst.predict(X_test)

In [12]:
print(f1_score(y_test.to_list(), np.int8(y_pred > 0.95).tolist(), average='macro'))
print(f1_score(y_test.to_list(), np.int8(y_pred > 0.95).tolist(), average='micro'))
print(f1_score(y_test.to_list(), np.int8(y_pred > 0.95).tolist(), average='weighted'))
print(f1_score(y_test.to_list(), np.int8(y_pred > 0.95).tolist(), average=None))

0.4957118353344768
0.9829931972789115
0.9745627238888693
[0.99142367 0.        ]


## Teste e Submissão

In [13]:
sub = df_public_metar[df_public_metar.espera.isnull()].drop('espera',axis=1).flightid

In [14]:
sub = pd.DataFrame(sub)

In [16]:
y_pred = bst.predict(df_test_final)

In [17]:
sub['espera'] = np.int8(y_pred > 0.02)

In [18]:
(y_pred > 0.02).any()

True

In [19]:
sub.to_csv('submission.csv',index=False)