In [36]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, f1_score, precision_score, recall_score, accuracy_score, make_scorer
from sklearn.feature_selection import SelectKBest, f_classif
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
import joblib
import os

In [37]:
x_train_full = pd.read_csv("data/processed/x_train.csv")
y_train_full = pd.read_csv("data/processed/y_train.csv")
y = y_train_full.values.ravel()

In [38]:
selector = SelectKBest(score_func=f_classif, k=12)
selector.fit(x_train_full, y)
top_features = x_train_full.columns[selector.get_support()]
print(top_features)

Index(['LIMIT_BAL', 'PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6',
       'sum_all_pay', 'bill_to_limit_ratio', 'pay_to_bill_ratio',
       'avg_pay_delay', 'count_delays'],
      dtype='object')


In [39]:
x = x_train_full[top_features]

In [40]:
x_train, x_test, y_train, y_test = train_test_split(x, y, stratify=y, test_size=0.2, random_state=42)

In [41]:
smote = SMOTE(random_state=42)
x_res, y_res = smote.fit_resample(x_train, y_train)

In [42]:
scaler = StandardScaler()
x_train = scaler.fit_transform(x_res)
x_test = scaler.transform(x_test)

In [43]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [44]:
f1 = make_scorer(f1_score, pos_label=0)

In [45]:
rf_param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2', None],
    'random_state': [42]
}

rf_grid = GridSearchCV(
    RandomForestClassifier(n_jobs=-1), rf_param_grid,
    scoring=f1, cv=cv, n_jobs=-1, verbose=2
)
rf_grid.fit(x_train, y_res)
rf_best = rf_grid.best_estimator_
print("\nMelhores parâmetros RF:", rf_grid.best_params_)

Fitting 5 folds for each of 243 candidates, totalling 1215 fits
[CV] END max_depth=5, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100, random_state=42; total time=   2.0s
[CV] END max_depth=5, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=100, random_state=42; total time=   2.1s
[CV] END max_depth=5, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100, random_state=42; total time=   2.3s
[CV] END max_depth=5, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100, random_state=42; total time=   2.4s
[CV] END max_depth=5, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100, random_state=42; total time=   2.6s
[CV] END max_depth=5, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100, random_state=42; total time=   3.5s[CV] END max_depth=5, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=200, random_state=42; tota

In [46]:
joblib.dump(rf_best, "models/rf.pkl")

['models/rf.pkl']

In [47]:
lr_param_grid = {
    'C': [0.01, 0.1, 1, 10],
    'solver': ['lbfgs','liblinear'],
    'max_iter': [1000],
    'random_state': [42]
}

lr_grid = GridSearchCV(
    LogisticRegression(), lr_param_grid,
    scoring=f1, cv=cv, n_jobs=-1, verbose=2
)
lr_grid.fit(x_train, y_res)
lr_best = lr_grid.best_estimator_
print("\nMelhores parâmetros LR:", lr_grid.best_params_)

Fitting 5 folds for each of 8 candidates, totalling 40 fits
[CV] END C=0.01, max_iter=1000, random_state=42, solver=lbfgs; total time=   0.1s
[CV] END C=0.01, max_iter=1000, random_state=42, solver=lbfgs; total time=   0.1s
[CV] END C=0.01, max_iter=1000, random_state=42, solver=lbfgs; total time=   0.1s
[CV] END C=0.01, max_iter=1000, random_state=42, solver=lbfgs; total time=   0.1s
[CV] END C=0.01, max_iter=1000, random_state=42, solver=lbfgs; total time=   0.1s
[CV] END C=0.1, max_iter=1000, random_state=42, solver=lbfgs; total time=   0.1s[CV] END C=0.1, max_iter=1000, random_state=42, solver=lbfgs; total time=   0.1s

[CV] END C=0.1, max_iter=1000, random_state=42, solver=lbfgs; total time=   0.1s
[CV] END C=0.1, max_iter=1000, random_state=42, solver=lbfgs; total time=   0.1s
[CV] END C=0.1, max_iter=1000, random_state=42, solver=lbfgs; total time=   0.2s
[CV] END C=0.01, max_iter=1000, random_state=42, solver=liblinear; total time=   0.2s
[CV] END ..C=1, max_iter=1000, random_s

In [48]:
joblib.dump(lr_best, "models/lr.pkl")

['models/lr.pkl']