In [2]:
import pandas as pd

file_path = '../data/loan.csv'
loan_data = pd.read_csv(file_path)

loan_data.info()



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB


In [3]:
from sklearn.model_selection import train_test_split

# Separar o conjunto em variáveis preditoras e variável alvo
X = loan_data.drop(columns=['Loan_ID', 'Loan_Status'])  # Removendo a variável alvo e identificador
y = loan_data['Loan_Status']

# Realizar o train_test_split 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Verificar tamanhos dos conjuntos
X_train.shape, X_test.shape, y_train.shape, y_test.shape


((491, 11), (123, 11), (491,), (123,))

In [4]:
from sklearn.utils import resample

# Concatenar os dados de treino para realizar o downsampling
train_data = pd.concat([X_train, y_train], axis=1)

# Separar as classes majoritária e minoritária
majority_class = train_data[train_data['Loan_Status'] == 'Y']
minority_class = train_data[train_data['Loan_Status'] == 'N']

# Realizar o downsampling da classe majoritária
majority_downsampled = resample(
    majority_class,
    replace=False,  # Não realizar substituição
    n_samples=len(minority_class),  # Tornar ambas as classes balanceadas
    random_state=42
)

# Combinar a classe minoritária com a classe majoritária reduzida
balanced_train_data = pd.concat([majority_downsampled, minority_class])

# Separar novamente em variáveis preditoras e alvo
X_balanced = balanced_train_data.drop(columns=['Loan_Status'])
y_balanced = balanced_train_data['Loan_Status']

# Realizar novo train_test_split para treino e validação
X_train_balanced, X_val, y_train_balanced, y_val = train_test_split(
    X_balanced, y_balanced, test_size=0.2, random_state=42, stratify=y_balanced
)

# Verificar tamanhos dos conjuntos balanceados
X_train_balanced.shape, X_val.shape, y_train_balanced.shape, y_val.shape


((246, 11), (62, 11), (246,), (62,))

In [5]:
# Remover as colunas especificadas nos conjuntos de treino, validação e teste
columns_to_drop = ['Loan_ID', 'CoapplicantIncome', 'Loan_Amount_Term', 'Credit_History', 'Property_Area']

X_train_balanced = X_train_balanced.drop(columns=columns_to_drop, errors='ignore')
X_val = X_val.drop(columns=columns_to_drop, errors='ignore')
X_test = X_test.drop(columns=columns_to_drop, errors='ignore')

# Verificar a nova estrutura dos conjuntos
X_train_balanced.columns, X_val.columns, X_test.columns


(Index(['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed',
        'ApplicantIncome', 'LoanAmount'],
       dtype='object'),
 Index(['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed',
        'ApplicantIncome', 'LoanAmount'],
       dtype='object'),
 Index(['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed',
        'ApplicantIncome', 'LoanAmount'],
       dtype='object'))

In [6]:
# Preencher os valores ausentes com os valores majoritários de cada coluna
columns_to_fill = ['Dependents', 'Self_Employed', 'Married', 'Gender']

# Iterar sobre as colunas e preencher os valores ausentes
for col in columns_to_fill:
    most_frequent_value = X_train_balanced[col].mode()[0]
    X_train_balanced[col].fillna(most_frequent_value, inplace=True)
    X_val[col].fillna(most_frequent_value, inplace=True)
    X_test[col].fillna(most_frequent_value, inplace=True)

# Verificar se ainda há valores ausentes nos conjuntos de dados
missing_values_train = X_train_balanced.isnull().sum()
missing_values_val = X_val.isnull().sum()
missing_values_test = X_test.isnull().sum()

missing_values_train, missing_values_val, missing_values_test


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_train_balanced[col].fillna(most_frequent_value, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_val[col].fillna(most_frequent_value, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on whic

(Gender              0
 Married             0
 Dependents          0
 Education           0
 Self_Employed       0
 ApplicantIncome     0
 LoanAmount         14
 dtype: int64,
 Gender             0
 Married            0
 Dependents         0
 Education          0
 Self_Employed      0
 ApplicantIncome    0
 LoanAmount         1
 dtype: int64,
 Gender             0
 Married            0
 Dependents         0
 Education          0
 Self_Employed      0
 ApplicantIncome    0
 LoanAmount         2
 dtype: int64)

In [7]:
# Preencher os valores ausentes na coluna LoanAmount com o valor predominante (moda)
mode_loan_amount = X_train_balanced['LoanAmount'].mode()[0]

X_train_balanced['LoanAmount'].fillna(mode_loan_amount, inplace=True)
X_val['LoanAmount'].fillna(mode_loan_amount, inplace=True)
X_test['LoanAmount'].fillna(mode_loan_amount, inplace=True)

# Verificar novamente se há valores ausentes nos conjuntos de dados
missing_values_train = X_train_balanced.isnull().sum()
missing_values_val = X_val.isnull().sum()
missing_values_test = X_test.isnull().sum()

missing_values_train, missing_values_val, missing_values_test


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_train_balanced['LoanAmount'].fillna(mode_loan_amount, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_val['LoanAmount'].fillna(mode_loan_amount, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate ob

(Gender             0
 Married            0
 Dependents         0
 Education          0
 Self_Employed      0
 ApplicantIncome    0
 LoanAmount         0
 dtype: int64,
 Gender             0
 Married            0
 Dependents         0
 Education          0
 Self_Employed      0
 ApplicantIncome    0
 LoanAmount         0
 dtype: int64,
 Gender             0
 Married            0
 Dependents         0
 Education          0
 Self_Employed      0
 ApplicantIncome    0
 LoanAmount         0
 dtype: int64)

In [8]:
from sklearn.preprocessing import LabelEncoder

# Instanciar o LabelEncoder
label_encoder = LabelEncoder()

# Listar as colunas categóricas a serem transformadas
categorical_columns = ['Gender', 'Married', 'Education', 'Self_Employed']
target_column = 'Loan_Status'

# Aplicar o LabelEncoder nas colunas categóricas do conjunto de treino, validação e teste
for col in categorical_columns:
    X_train_balanced[col] = label_encoder.fit_transform(X_train_balanced[col])
    X_val[col] = label_encoder.transform(X_val[col])
    X_test[col] = label_encoder.transform(X_test[col])

# Aplicar o LabelEncoder na variável alvo
y_train_balanced = label_encoder.fit_transform(y_train_balanced)
y_val = label_encoder.transform(y_val)
y_test = label_encoder.transform(y_test)

# Verificar as primeiras linhas para confirmar a transformação
X_train_balanced.head(), y_train_balanced[:5]


(     Gender  Married Dependents  Education  Self_Employed  ApplicantIncome  \
 276       1        1          0          0              0             3993   
 413       1        1          0          1              0             2253   
 78        1        1         3+          0              0             3167   
 441       1        1          0          0              0             7901   
 218       1        1          2          0              0             5000   
 
      LoanAmount  
 276       207.0  
 413       110.0  
 78        180.0  
 441       180.0  
 218        72.0  ,
 array([1, 1, 0, 1, 0]))

In [9]:
# Ajustar a coluna Dependents para que o valor '3+' seja substituído por 3
X_train_balanced['Dependents'] = X_train_balanced['Dependents'].replace('3+', 3).astype(int)
X_val['Dependents'] = X_val['Dependents'].replace('3+', 3).astype(int)
X_test['Dependents'] = X_test['Dependents'].replace('3+', 3).astype(int)

# Verificar os valores únicos para garantir que a transformação foi realizada corretamente
unique_dependents_train = X_train_balanced['Dependents'].unique()
unique_dependents_val = X_val['Dependents'].unique()
unique_dependents_test = X_test['Dependents'].unique()

unique_dependents_train, unique_dependents_val, unique_dependents_test


(array([0, 3, 2, 1]), array([0, 1, 3, 2]), array([0, 1, 3, 2]))

In [10]:
from sklearn.preprocessing import StandardScaler

# Instanciar o StandardScaler
scaler = StandardScaler()

# Selecionar as colunas numéricas
numeric_columns = ['ApplicantIncome', 'LoanAmount', 'Dependents']

# Aplicar a padronização nos conjuntos de treino, validação e teste
X_train_balanced[numeric_columns] = scaler.fit_transform(X_train_balanced[numeric_columns])
X_val[numeric_columns] = scaler.transform(X_val[numeric_columns])
X_test[numeric_columns] = scaler.transform(X_test[numeric_columns])

# Verificar as primeiras linhas para confirmar a padronização
X_train_balanced.head()


Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,LoanAmount
276,1,1,-0.780256,0,0,-0.272211,0.690484
413,1,1,-0.780256,1,0,-0.51497,-0.474085
78,1,1,2.218852,0,0,-0.387451,0.366325
441,1,1,-0.780256,0,0,0.27302,0.366325
218,1,1,1.21915,0,0,-0.131718,-0.930308


In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, cross_val_predict
from sklearn.metrics import classification_report, roc_auc_score, roc_curve

# Definir os classificadores
classifiers = {
    "Logistic Regression": LogisticRegression(),
    "Random Forest": RandomForestClassifier(),
    "SVM": SVC(probability=True),
    "KNN": KNeighborsClassifier(),
    "Gradient Boosting": GradientBoostingClassifier(),
}

# Definir os parâmetros para GridSearch
param_grids = {
    "Logistic Regression": {'classifier__C': [0.1, 1, 10]},
    "Random Forest": {'classifier__n_estimators': [50, 100], 'classifier__max_depth': [None, 10]},
    "SVM": {'classifier__C': [0.1, 1, 10], 'classifier__kernel': ['linear', 'rbf']},
    "KNN": {'classifier__n_neighbors': [3, 5, 7]},
    "Gradient Boosting": {'classifier__n_estimators': [50, 100], 'classifier__learning_rate': [0.01, 0.1]},
}

# Preparar resultados
results = {}

# Executar cada classificador com Grid Search e validação cruzada
for name, clf in classifiers.items():
    print(f"Treinando {name}...")
    pipeline = Pipeline([('classifier', clf)])
    grid_search = GridSearchCV(pipeline, param_grids[name], cv=5, scoring='roc_auc', verbose=0)
    grid_search.fit(X_train_balanced, y_train_balanced)
    
    # Obter melhores predições e métricas
    best_model = grid_search.best_estimator_
    y_pred = cross_val_predict(best_model, X_val, y_val, cv=5, method="predict")
    y_proba = cross_val_predict(best_model, X_val, y_val, cv=5, method="predict_proba")[:, 1]
    
    # Classification report
    report = classification_report(y_val, y_pred, output_dict=True)
    
    # AUC
    auc_score = roc_auc_score(y_val, y_proba)
    
    # Guardar resultados
    results[name] = {
        "Best Params": grid_search.best_params_,
        "Classification Report": report,
        "AUC": auc_score,
    }

Treinando Logistic Regression...
Treinando Random Forest...
Treinando SVM...
Treinando KNN...
Treinando Gradient Boosting...


In [12]:
from sklearn.metrics import precision_score, recall_score, f1_score

# Dicionário para armazenar as métricas
metrics_results = {}

# Para cada modelo, calcular as métricas
for name, clf in classifiers.items():
    # Treinar o modelo no conjunto balanceado
    pipeline = Pipeline([('classifier', clf)])
    grid_search = GridSearchCV(pipeline, param_grids[name], cv=5, scoring='roc_auc', verbose=0)
    grid_search.fit(X_train_balanced, y_train_balanced)
    
    # Previsões no conjunto de validação
    y_pred = grid_search.best_estimator_.predict(X_val)
    
    # Calcular as métricas
    precision = precision_score(y_val, y_pred)
    recall = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)
    auc_score = roc_auc_score(y_val, grid_search.best_estimator_.predict_proba(X_val)[:, 1])
    
    # Armazenar os resultados
    metrics_results[name] = {
        "AUC": auc_score,
        "Precision": precision,
        "Recall": recall,
        "F1-Score": f1
    }

# Apresentar as métricas em formato de DataFrame
import pandas as pd
metrics_df = pd.DataFrame(metrics_results).T

# Exibir as métricas
print(metrics_df)


                          AUC  Precision    Recall  F1-Score
Logistic Regression  0.625390   0.636364  0.677419  0.656250
Random Forest        0.480749   0.566667  0.548387  0.557377
SVM                  0.380853   0.541667  0.838710  0.658228
KNN                  0.469823   0.451613  0.451613  0.451613
Gradient Boosting    0.452133   0.466667  0.451613  0.459016


In [13]:
import joblib
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

# Configurar e treinar o modelo Logistic Regression
final_model = LogisticRegression(C=1)  # Ajuste o valor de C para o encontrado na validação
final_pipeline = Pipeline([('classifier', final_model)])

# Treinar o pipeline com os dados de treino balanceados
final_pipeline.fit(X_train_balanced, y_train_balanced)

# Salvar o pipeline final
output_path = '../data/final_logistic_regression_pipeline.pkl'
joblib.dump(final_pipeline, output_path)

print(f"Pipeline salvo em: {output_path}")


Pipeline salvo em: ../data/final_logistic_regression_pipeline.pkl
