In [108]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import xgboost as xgb
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
import optuna
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier

pd.set_option('display.max_rows', 150)
pd.set_option('display.max_columns', 600)

In [109]:
df = pd.read_csv('train.csv')

In [110]:
df

Unnamed: 0,id,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,0,15674932,Okwudilichukwu,668,France,Male,33.0,3,0.00,2,1.0,0.0,181449.97,0
1,1,15749177,Okwudiliolisa,627,France,Male,33.0,1,0.00,2,1.0,1.0,49503.50,0
2,2,15694510,Hsueh,678,France,Male,40.0,10,0.00,2,1.0,0.0,184866.69,0
3,3,15741417,Kao,581,France,Male,34.0,2,148882.54,1,1.0,1.0,84560.88,0
4,4,15766172,Chiemenam,716,Spain,Male,33.0,5,0.00,2,1.0,1.0,15068.83,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
165029,165029,15667085,Meng,667,Spain,Female,33.0,2,0.00,1,1.0,1.0,131834.75,0
165030,165030,15665521,Okechukwu,792,France,Male,35.0,3,0.00,1,0.0,0.0,131834.45,0
165031,165031,15664752,Hsia,565,France,Male,31.0,5,0.00,1,1.0,1.0,127429.56,0
165032,165032,15689614,Hsiung,554,Spain,Female,30.0,7,161533.00,1,0.0,1.0,71173.03,0


In [111]:
# Taxa de gastos (balance / tenure)
# df['Gastos_Anuais'] = df['Balance'] / df['Tenure']

# Proporção de produtos por CreditScore
df['Prop_Produtos_Score'] = df['NumOfProducts'] / df['CreditScore']
# Faixa etária (exemplos de intervalos)
def get_faixa_etaria(age):
  if age <= 30:
    return '20-30'
  elif age <= 40:
    return '31-40'
  elif age <= 50:
    return '41-50'
  elif age <= 60:
    return '51-60'
  else:
    return '60+'

df['Faixa_Etaria'] = df['Age'].apply(get_faixa_etaria)

# Nível de saldo (exemplos de intervalos)
def get_nivel_saldo(balance):
  if balance == 0:
    return 'Zero'
  elif balance <= 100000:
    return 'Baixo'
  elif balance <= 150000:
    return 'Medio'
  else:
    return 'Alto'

df['Nivel_Saldo'] = df['Balance'].apply(get_nivel_saldo)

# CreditScore
def get_credit_bin(age):
  if age <= 500:
    return '-500'
  elif age <= 600:
    return '501-600'
  elif age <= 700:
    return '601-700'
  elif age <= 800:
    return '701-800'
  else:
    return '800+'

df['Faixa_Score'] = df['CreditScore'].apply(get_credit_bin)

df['Age'] = df['Age'].astype(int)

In [112]:
print(df.isnull().sum())

id                     0
CustomerId             0
Surname                0
CreditScore            0
Geography              0
Gender                 0
Age                    0
Tenure                 0
Balance                0
NumOfProducts          0
HasCrCard              0
IsActiveMember         0
EstimatedSalary        0
Exited                 0
Prop_Produtos_Score    0
Faixa_Etaria           0
Nivel_Saldo            0
Faixa_Score            0
dtype: int64


In [113]:

# Geography (One-Hot Encoding)
df_encoded = pd.get_dummies(df, columns=['Geography', 'Gender', 'Faixa_Etaria', 'Nivel_Saldo', 'Faixa_Score'])

In [114]:
df_encoded.columns

Index(['id', 'CustomerId', 'Surname', 'CreditScore', 'Age', 'Tenure',
       'Balance', 'NumOfProducts', 'HasCrCard', 'IsActiveMember',
       'EstimatedSalary', 'Exited', 'Prop_Produtos_Score', 'Geography_France',
       'Geography_Germany', 'Geography_Spain', 'Gender_Female', 'Gender_Male',
       'Faixa_Etaria_20-30', 'Faixa_Etaria_31-40', 'Faixa_Etaria_41-50',
       'Faixa_Etaria_51-60', 'Faixa_Etaria_60+', 'Nivel_Saldo_Alto',
       'Nivel_Saldo_Baixo', 'Nivel_Saldo_Medio', 'Nivel_Saldo_Zero',
       'Faixa_Score_-500', 'Faixa_Score_501-600', 'Faixa_Score_601-700',
       'Faixa_Score_701-800', 'Faixa_Score_800+'],
      dtype='object')

In [115]:
X = df_encoded.drop(['id', 'CustomerId', 'Surname', 'Exited'], axis=1)
y = df_encoded['Exited']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [116]:
model1 = LGBMClassifier()
model2 = xgb.XGBClassifier()
model3 = RandomForestClassifier()

model1.fit(X_train, y_train)
model2.fit(X_train, y_train)
model3.fit(X_train, y_train)

y_pred1 = model1.predict(X_test)
y_pred2 = model2.predict(X_test)
y_pred3 = model3.predict(X_test)

accuracy1 = accuracy_score(y_test, y_pred1)
accuracy2 = accuracy_score(y_test, y_pred2)
accuracy3 = accuracy_score(y_test, y_pred3)

print(f"LightGBM Accuracy: {accuracy1}")
print(f"XGBoost Accuracy: {accuracy2}")
print(f"Random Forest Accuracy: {accuracy3}")

[LightGBM] [Info] Number of positive: 27966, number of negative: 104061
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001869 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1147
[LightGBM] [Info] Number of data points in the train set: 132027, number of used features: 28
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.211820 -> initscore=-1.313988
[LightGBM] [Info] Start training from score -1.313988
LightGBM Accuracy: 0.8669979095343412
XGBoost Accuracy: 0.8662707910443239
Random Forest Accuracy: 0.8592116823704062


In [140]:
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
import optuna

def objective(trial):
    params = {
        "objective": "regression",
        "metric": "rmse",
        "n_estimators": 1000,
        "verbosity": -1,
        "bagging_freq": 1,
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.1, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 2, 2**10),
        "subsample": trial.suggest_float("subsample", 0.05, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.05, 1.0),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 100),
    }

    model = LGBMClassifier(**params)
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    rmse = mean_squared_error(y_test, predictions, squared=False)
    return rmse

In [141]:
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=30)

[I 2024-04-24 14:35:49,001] A new study created in memory with name: no-name-c1f3e93f-13e4-4d39-9a40-18bf1ad6772b
[I 2024-04-24 14:35:59,972] Trial 0 finished with value: 0.3807880585718829 and parameters: {'learning_rate': 0.0052198534300252055, 'num_leaves': 694, 'subsample': 0.7437514387130519, 'colsample_bytree': 0.12785545926469488, 'min_data_in_leaf': 87}. Best is trial 0 with value: 0.3807880585718829.
[I 2024-04-24 14:36:10,111] Trial 1 finished with value: 0.36539996268393493 and parameters: {'learning_rate': 0.015575825717170321, 'num_leaves': 226, 'subsample': 0.9246056402934179, 'colsample_bytree': 0.3498803751848725, 'min_data_in_leaf': 52}. Best is trial 1 with value: 0.36539996268393493.
[I 2024-04-24 14:36:29,525] Trial 2 finished with value: 0.3794330784518125 and parameters: {'learning_rate': 0.0010203622378617703, 'num_leaves': 788, 'subsample': 0.3230590737700477, 'colsample_bytree': 0.9161170814112916, 'min_data_in_leaf': 78}. Best is trial 1 with value: 0.36539996

In [None]:
best_trial = study.best_trial

best_params = best_trial.params
print(f"Melhores hiperparâmetros: {best_params}")