In [1018]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score
import numpy as np




In [1019]:
# Load the credit-risk dataset; the relative path is resolved against the
# notebook's working directory.
df = pd.read_csv('credit_risk_dataset.csv')

# Bare last expression: the notebook displays the DataFrame as the cell output.
df

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,22,59000,RENT,123.0,PERSONAL,D,35000,16.02,1,0.59,Y,3
1,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0,0.10,N,2
2,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,1,0.57,N,3
3,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,1,0.53,N,2
4,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,1,0.55,Y,4
...,...,...,...,...,...,...,...,...,...,...,...,...
32576,57,53000,MORTGAGE,1.0,PERSONAL,C,5800,13.16,0,0.11,N,30
32577,54,120000,MORTGAGE,4.0,PERSONAL,A,17625,7.49,0,0.15,N,19
32578,65,76000,RENT,3.0,HOMEIMPROVEMENT,B,35000,10.99,1,0.46,N,28
32579,56,150000,MORTGAGE,5.0,PERSONAL,B,15000,11.48,0,0.10,N,26


In [1020]:
# Summarize the frame: column names, non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32581 entries, 0 to 32580
Data columns (total 12 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   person_age                  32581 non-null  int64  
 1   person_income               32581 non-null  int64  
 2   person_home_ownership       32581 non-null  object 
 3   person_emp_length           31686 non-null  float64
 4   loan_intent                 32581 non-null  object 
 5   loan_grade                  32581 non-null  object 
 6   loan_amnt                   32581 non-null  int64  
 7   loan_int_rate               29465 non-null  float64
 8   loan_status                 32581 non-null  int64  
 9   loan_percent_income         32581 non-null  float64
 10  cb_person_default_on_file   32581 non-null  object 
 11  cb_person_cred_hist_length  32581 non-null  int64  
dtypes: float64(3), int64(5), object(4)
memory usage: 3.0+ MB


In [1021]:
# Count the missing values in each column and print the resulting Series.
missing_per_column = df.isna().sum()
print(missing_per_column)

person_age                       0
person_income                    0
person_home_ownership            0
person_emp_length              895
loan_intent                      0
loan_grade                       0
loan_amnt                        0
loan_int_rate                 3116
loan_status                      0
loan_percent_income              0
cb_person_default_on_file        0
cb_person_cred_hist_length       0
dtype: int64


In [1022]:
# Summarize the frame: column names, non-null counts, dtypes and memory usage.
# NOTE(review): this repeats the earlier df.info() call, and no statement
# between the two modifies df, so the output is the same.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32581 entries, 0 to 32580
Data columns (total 12 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   person_age                  32581 non-null  int64  
 1   person_income               32581 non-null  int64  
 2   person_home_ownership       32581 non-null  object 
 3   person_emp_length           31686 non-null  float64
 4   loan_intent                 32581 non-null  object 
 5   loan_grade                  32581 non-null  object 
 6   loan_amnt                   32581 non-null  int64  
 7   loan_int_rate               29465 non-null  float64
 8   loan_status                 32581 non-null  int64  
 9   loan_percent_income         32581 non-null  float64
 10  cb_person_default_on_file   32581 non-null  object 
 11  cb_person_cred_hist_length  32581 non-null  int64  
dtypes: float64(3), int64(5), object(4)
memory usage: 3.0+ MB


In [1023]:
# Separate the target column (loan_status) from the predictor columns.
y = df['loan_status']
x = df.drop(columns=['loan_status'])

# Numeric predictors. A candidate name is kept only if it is present in
# df.columns; a name absent from df is silently left out of the list.
numeric_feactures = [
    name
    for name in (
        'person_age',
        'person_income',
        'person_emp_length',
        'loan_amnt',
        'loan_int_rate',
        'loan_percent_income',
        'cb_person_cred_hist_length',
    )
    if name in df.columns
]

# Categorical predictors, filtered against df.columns in the same way.
categorical_features = [
    name
    for name in (
        'person_home_ownership',
        'loan_intent',
        'loan_grade',
        'cb_person_default_on_file',
    )
    if name in df.columns
]

print("Numeric Features:", numeric_feactures)
print("Categorical Features:", categorical_features)

Numeric Features: ['person_age', 'person_income', 'person_emp_length', 'loan_amnt', 'loan_int_rate', 'loan_percent_income', 'cb_person_cred_hist_length']
Categorical Features: ['person_home_ownership', 'loan_intent', 'loan_grade', 'cb_person_default_on_file']


In [1024]:
# Numeric columns: fill missing values with the column mean, then standardize.
numeric_transformers = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical columns: replace missing values with the literal 'missing', then
# one-hot encode. handle_unknown='ignore' makes the encoder tolerate categories
# at transform time that it did not see during fit.
categorical_transformers = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

In [1025]:
# Send each column group through its own sub-pipeline. Columns that belong to
# neither group are discarded (remainder='drop').
preprecessor = ColumnTransformer(
    [
        ('num', numeric_transformers, numeric_feactures),
        ('cat', categorical_transformers, categorical_features),
    ],
    remainder='drop',
)

In [1026]:
# Full model: preprocessing followed by a logistic-regression classifier with
# the liblinear solver and a fixed random_state.
model_pipeline = Pipeline([
    ('preprocessor', preprecessor),
    ('classifier', LogisticRegression(random_state=42, solver='liblinear')),
])

In [1027]:
# Confirmation message for the cells that assembled the pipeline.
print("Pipeline created successfully.")
# Bare last expression: the notebook displays the pipeline object as the cell output.
model_pipeline

Pipeline created successfully.


In [1028]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split repeatable. No `stratify` argument is passed to train_test_split.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=16516,
)

In [1029]:
# Report the (rows, columns) shape of the training and test feature frames.
print(f"Dimenciones de los conjuntos de entrenamiento y prueba: {X_train.shape} y {X_test.shape}")


Dimenciones de los conjuntos de entrenamiento y prueba: (26064, 11) y (6517, 11)


In [1030]:
# Fit the preprocessing steps and the classifier on the training split only.
model_pipeline.fit(X_train, y_train)
print("Model trained successfully.")

Model trained successfully.


In [1031]:
# Class predictions of the model for the test rows.
y_pred = model_pipeline.predict(X_test)

# predict_proba returns a 2-D array; column index 1 is kept as the probability
# of the positive class.
test_probabilities = model_pipeline.predict_proba(X_test)
y_pred_proba = test_probabilities[:, 1]

In [1032]:
# Precision/recall/F1 table for the test split, printed under a heading line.
report = classification_report(y_test, y_pred)
print("Reporte de clasificación:", report, sep="\n")

Reporte de clasificación:
              precision    recall  f1-score   support

           0       0.89      0.95      0.92      5115
           1       0.75      0.56      0.64      1402

    accuracy                           0.86      6517
   macro avg       0.82      0.75      0.78      6517
weighted avg       0.86      0.86      0.86      6517



In [1033]:
# Shapes of both splits, followed by a peek at the model output.
print(f"Training set size: {X_train.shape}")
print(f"Test set size: {X_test.shape}")
# Show the first 10 predictions.
print(f"Predictions: {y_pred[:10]}")
# Show the first 10 probabilities.
print(f"Predicted probabilities: {y_pred_proba[:10]}")

Training set size: (26064, 11)
Test set size: (6517, 11)
Predictions: [1 0 0 0 1 0 0 0 0 0]
Predicted probabilities: [0.62392906 0.11348466 0.02023416 0.02294506 0.85628727 0.04456989
 0.03677733 0.09289317 0.01266854 0.07505301]


In [1034]:
# Compare the rounded positive-class probability with the true label for the
# first rows of the test split (the probabilities are rounded to 0 or 1).
print("\nPrimeras 10 Predicciones de Probabilidad de Impago (Clase 1) vs. Valores Reales:")
# Slicing plus zip replaces indexing over a hard-coded range(10): at most 10
# rows are shown, and a test split with fewer than 10 rows no longer raises
# IndexError.
for proba, actual in zip(y_pred_proba[:10], y_test.iloc[:10]):
    print(f"Predicted: {round(proba)}, Actual: {actual}")


Primeras 10 Predicciones de Probabilidad de Impago (Clase 1) vs. Valores Reales:
Predicted: 1, Actual: 1
Predicted: 0, Actual: 0
Predicted: 0, Actual: 0
Predicted: 0, Actual: 0
Predicted: 1, Actual: 0
Predicted: 0, Actual: 0
Predicted: 0, Actual: 0
Predicted: 0, Actual: 0
Predicted: 0, Actual: 0
Predicted: 0, Actual: 1
