In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Location of the loan dataset. When the EMPRESTIMO_CSV environment variable is set
# it overrides the machine-specific default below, so the notebook can run on other
# machines without editing this cell.
CAMINHO_CSV = os.environ.get(
    "EMPRESTIMO_CSV", r"C:\Users\mayk3\Downloads\emprestamo.csv"
)
df = pd.read_csv(CAMINHO_CSV)

# Normalise the headers: trim surrounding whitespace, turn spaces into underscores
# and lowercase everything. regex=False keeps the replacement literal regardless of
# the pandas version's default for the `regex` argument.
df.columns = (
    df.columns
    .str.strip()
    .str.replace(' ', '_', regex=False)
    .str.lower()
)

print("Colunas padronizadas:")
print(df.columns)
print("\nPrimeiras 5 linhas:")
print(df.head())

In [None]:
# Columns that receive their most frequent value, and columns that receive their
# median. credit_history is handled with the mode as well.
cols_categoricas_faltantes = ['gender', 'married', 'dependents', 'self_employed']
cols_numericas_faltantes = ['loanamount', 'loan_amount_term']

# Compute every fill value first. Each value depends only on its own column, so
# computing them up front gives the same result as filling column by column.
moda_credito = df['credit_history'].mode()[0]
valores_de_preenchimento = {'credit_history': moda_credito}
valores_de_preenchimento.update(
    (nome, df[nome].mode()[0]) for nome in cols_categoricas_faltantes
)
valores_de_preenchimento.update(
    (nome, df[nome].median()) for nome in cols_numericas_faltantes
)

# Apply the fills one column at a time.
for col, valor in valores_de_preenchimento.items():
    df[col] = df[col].fillna(valor)

# Report how many missing values remain across the whole frame.
total_nan = df.isnull().sum().sum()
print("Total de NaN após imputação (deve ser 0):", total_nan)

In [None]:
# Combined household income of applicant and co-applicant.
df['total_income'] = df['applicantincome'] + df['coapplicantincome']

# Loan size relative to income; the +1 keeps the denominator non-zero when the
# total income is 0.
df['loan_to_income_ratio'] = df['loanamount'] / (df['total_income'] + 1)

# Binary encodings for the target and the two-level categorical columns.
mapeamentos_binarios = {
    'loan_status': {'Y': 1, 'N': 0},
    'married': {'Yes': 1, 'No': 0},
    'gender': {'Male': 1, 'Female': 0},
    'self_employed': {'Yes': 1, 'No': 0},
    'education': {'Graduate': 1, 'Not Graduate': 0},
}
for coluna, mapa in mapeamentos_binarios.items():
    codificada = df[coluna].map(mapa)
    # Series.map yields NaN for any value missing from the mapping (an unexpected
    # category, a leftover NaN, or already-encoded data when this cell is re-run).
    # Fail fast here instead of letting NaN flow silently into the models.
    nao_mapeados = codificada.isna()
    if nao_mapeados.any():
        inesperados = df.loc[nao_mapeados, coluna].unique().tolist()
        raise ValueError(
            f"Valores inesperados em '{coluna}': {inesperados}. "
            "Recarregue os dados e execute as células novamente."
        )
    df[coluna] = codificada

# '3+' is collapsed to 3 so the column can be stored as an integer count.
df['dependents'] = df['dependents'].replace('3+', '3').astype(int)

# One-hot encode property_area, dropping the first level to avoid a redundant column.
df = pd.get_dummies(df, columns=['property_area'], drop_first=True, dtype=int)

print("Tipos de dados após codificação (Todas as colunas exceto loan_id devem ser numéricas):")
print(df.dtypes.value_counts())

In [None]:
# Target is the encoded approval flag. The feature matrix drops the target, the
# row identifier and the two raw income columns.
y = df['loan_status']
X = df.drop(columns=[
    'loan_status', 'loan_id', 'applicantincome', 'coapplicantincome'
])

# Stratified 80/20 split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Take explicit copies before assigning columns: the frames returned by the split
# are selections of X, and writing into them can raise SettingWithCopyWarning
# depending on the pandas version.
X_train = X_train.copy()
X_test = X_test.copy()

# Continuous features that get standardised; the remaining columns are left as-is.
features_to_scale = [
    'total_income', 'loanamount', 'loan_amount_term',
    'loan_to_income_ratio'
]

# Fit the scaler on the training split only and reuse its statistics on the test
# split, so no information from the test rows influences the transformation.
scaler = StandardScaler()
X_train[features_to_scale] = scaler.fit_transform(X_train[features_to_scale])
X_test[features_to_scale] = scaler.transform(X_test[features_to_scale])

print("Tamanho do conjunto de Treino:", X_train.shape)
print("Tamanho do conjunto de Teste:", X_test.shape)

In [None]:
# Train the logistic-regression model on the training split.
lr_model = LogisticRegression(random_state=42)
lr_model.fit(X_train, y_train)

# Hard predictions plus the probability of the second class (column index 1),
# which feeds the ROC AUC computation.
lr_y_pred = lr_model.predict(X_test)
lr_y_proba = lr_model.predict_proba(X_test)[:, 1]

# Evaluate on the held-out split.
lr_relatorio = classification_report(y_test, lr_y_pred)
lr_matriz = confusion_matrix(y_test, lr_y_pred)
lr_auc = roc_auc_score(y_test, lr_y_proba)

print("\n--- Desempenho da Regressão Logística ---")
print(lr_relatorio)
print("Matriz de Confusão:\n", lr_matriz)
print(f"AUC-ROC: {lr_auc:.4f}")

# One coefficient per feature (first row of coef_), sorted from the largest to
# the smallest value.
coeficientes = lr_model.coef_[0]
feature_importance = pd.DataFrame(
    {'Feature': X_train.columns, 'Coeficiente': coeficientes}
)
feature_importance = feature_importance.sort_values(
    by='Coeficiente', ascending=False
)

print("\n--- Fatores de Risco (Coeficientes) ---")
print(feature_importance)

# Horizontal bar chart of the coefficients.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=feature_importance, x='Coeficiente', y='Feature', ax=ax)
ax.set_title('Impacto das Variáveis na Aprovação (Regressão Logística)')
plt.show()

In [None]:
# Train a 100-tree random forest with a fixed seed on the training split.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Hard predictions plus the probability of the second class (column index 1).
rf_y_pred = rf_model.predict(X_test)
rf_y_proba = rf_model.predict_proba(X_test)[:, 1]

# Evaluate on the held-out split.
rf_relatorio = classification_report(y_test, rf_y_pred)
rf_matriz = confusion_matrix(y_test, rf_y_pred)
rf_auc = roc_auc_score(y_test, rf_y_proba)

print("\n--- Desempenho do Random Forest ---")
print(rf_relatorio)
print("Matriz de Confusão:\n", rf_matriz)
print(f"AUC-ROC: {rf_auc:.4f}")

# Importance score per feature as reported by the fitted forest, highest first.
rf_feature_importance = pd.DataFrame(
    {'Feature': X_train.columns, 'Importance': rf_model.feature_importances_}
)
rf_feature_importance = rf_feature_importance.sort_values(
    by='Importance', ascending=False
)

print("\n--- Importância das Features (Random Forest) ---")
print(rf_feature_importance)