In [11]:
import numpy as np
import pandas as pd
import sklearn

from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures, OneHotEncoder, OrdinalEncoder, RobustScaler, MinMaxScaler, StandardScaler

# Models
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier


In [13]:
# Read "credit.csv" (path relative to the notebook's working directory) into the
# working DataFrame `df`; the following cells add and remove columns on it in place.
df = pd.read_csv("credit.csv")

In [14]:
columna_tipo_prestamos = 'Type_of_Loan'

# Treat the "No Data" placeholder as an explicit "No Loan" category.
df[columna_tipo_prestamos] = df[columna_tipo_prestamos].replace("No Data", "No Loan")

# Remove the "and" connector inside the comma-separated loan list ("A, and B" -> "A,  B").
df[columna_tipo_prestamos] = df[columna_tipo_prestamos].replace(r"\band\b", "", regex=True)


def _split_loan_types(raw_value):
    """Split a comma-separated loan string into a list of stripped loan names.

    Non-string values (e.g. NaN) produce an empty list, and empty tokens are
    skipped so they cannot become a blank-named indicator column.
    """
    if not isinstance(raw_value, str):
        return []
    return [token.strip() for token in raw_value.split(",") if token.strip()]


loan_type_lists = df[columna_tipo_prestamos].apply(_split_loan_types)

# One binary indicator column per distinct loan type.
mlb = sklearn.preprocessing.MultiLabelBinarizer()
encoded_loans = mlb.fit_transform(loan_type_lists)

# Reuse df's index so the column-wise concat below aligns row by row even if
# df does not carry a default RangeIndex.
df_encoded = pd.DataFrame(encoded_loans, columns=mlb.classes_, index=df.index)

# Swap the raw text column for its indicator columns (appended at the end).
df = pd.concat([df.drop(columns=[columna_tipo_prestamos]), df_encoded], axis=1)

# Map the 'Credit_Score' labels to integer class codes and move the column to
# the end of the frame (pop + re-insert).
credit_score_codes = {'Good': 0, 'Poor': 1, 'Standard': 2}
raw_credit_score = df.pop('Credit_Score')
credit_score_column = raw_credit_score.map(credit_score_codes)

# Fail loudly on labels the mapping does not know instead of silently turning
# them into NaN (this also catches re-running the cell on already-mapped data).
unknown_labels = raw_credit_score[credit_score_column.isna() & raw_credit_score.notna()].unique()
if len(unknown_labels) > 0:
    raise ValueError(f"Unmapped Credit_Score labels: {list(unknown_labels)}")

df['Credit_Score'] = credit_score_column

In [15]:
# Derived features built from the raw columns, intended to help the credit-score prediction.

# Denominators are patched before dividing: zeros in income and in the number
# of bank accounts are replaced by 1e-6, while the loan count and the credit
# history age are shifted by +1.
# NOTE(review): dividing by 1e-6 yields very large ratios for zero-denominator
# rows, which may dominate the scalers applied later -- confirm this is intended.
income_nonzero = df["Annual_Income"].replace(0, 1e-6)
bank_accounts_nonzero = df["Num_Bank_Accounts"].replace(0, 1e-6)
loans_plus_one = df["Num_of_Loan"] + 1
history_age_plus_one = df["Credit_History_Age"] + 1

df["Debt_to_Income_Ratio"] = df["Outstanding_Debt"].div(income_nonzero)
df["Loan_to_Income_Ratio"] = df["Total_EMI_per_month"].mul(12).div(income_nonzero)
df["Credit_Card_to_Bank_Account_Ratio"] = df["Num_Credit_Card"].div(bank_accounts_nonzero)
df["Credit_Age_per_Loan"] = df["Credit_History_Age"].div(loans_plus_one)
df["Credit_Inquiries_per_Year"] = df["Num_Credit_Inquiries"].div(history_age_plus_one)
df["Delayed_Payment_Ratio"] = df["Num_of_Delayed_Payment"].div(loans_plus_one)

# Target encoding: each occupation is replaced by the mean Credit_Score code of
# its rows, after which the original text column is removed.
# NOTE(review): the means use every row of df, i.e. they are computed before the
# train/test split done later in the notebook, so this feature carries target
# information from rows that end up in the test set. Credit_Score is also a
# nominal coding (Good=0, Poor=1, Standard=2), so its mean is not an ordinal
# score -- consider fitting the encoding on the training rows only.
occupation_target_mean = df["Credit_Score"].groupby(df["Occupation"]).mean()
df["Occupation_encoded"] = df["Occupation"].map(occupation_target_mean)
del df["Occupation"]

# Category order for "Payment_Behaviour": payment size (Small < Medium < Large)
# is the major key and spending level (Low < High) the minor key.
payment_order = [
    f"{spending}_spent_{payment_size}_value_payments"
    for payment_size in ("Small", "Medium", "Large")
    for spending in ("Low", "High")
]

In [16]:
# Binary loan-type indicator columns pass through unscaled as steps x9..x18.
loan_flag_columns = [
    'Auto Loan',
    'Credit-Builder Loan',
    'Debt Consolidation Loan',
    'Home Equity Loan',
    'Mortgage Loan',
    'No Loan',
    'Not Specified',
    'Payday Loan',
    'Personal Loan',
    'Student Loan',
]
loan_flag_steps = [
    (f"x{step_id}", "passthrough", [loan_column])
    for step_id, loan_column in enumerate(loan_flag_columns, start=9)
]

# Ordered categorical columns are encoded with an explicit category order.
credit_mix_encoder = OrdinalEncoder(categories=[["Bad", "Standard", "Good"]])
min_payment_encoder = OrdinalEncoder(categories=[["No", "NM", "Yes"]])

# Step ids x4, x20, x22, x29, x30 and x31 are unused: Monthly_Inhand_Salary,
# Num_of_Delayed_Payment, Num_Credit_Inquiries, Amount_invested_monthly,
# Payment_Behaviour and Monthly_Balance are currently excluded from the model.
feature_steps = [
    ("x1", MinMaxScaler(), ["Age"]),
    ("x2", MinMaxScaler(), ["Occupation_encoded"]),
    ("x3", RobustScaler(), ["Annual_Income"]),
    ("x5", MinMaxScaler(), ["Num_Bank_Accounts"]),
    ("x6", MinMaxScaler(), ["Num_Credit_Card"]),
    ("x7", MinMaxScaler(), ["Interest_Rate"]),
    ("x8", MinMaxScaler(), ["Num_of_Loan"]),
    *loan_flag_steps,
    ("x19", RobustScaler(), ["Delay_from_due_date"]),
    ("x21", RobustScaler(), ["Changed_Credit_Limit"]),  # author's note: improves by 2%
    ("x23", credit_mix_encoder, ['Credit_Mix']),  # author's note: improves by 2%
    ("x24", RobustScaler(), ['Outstanding_Debt']),  # author's note: improves by 1%
    ("x25", RobustScaler(), ['Credit_Utilization_Ratio']),
    ("x26", RobustScaler(), ['Credit_History_Age']),
    ("x27", min_payment_encoder, ['Payment_of_Min_Amount']),
    ("x28", MinMaxScaler(), ['Total_EMI_per_month']),
    # Derived features (each carried the author's note "74.11", presumably an
    # accuracy figure -- TODO confirm).
    ("x32", RobustScaler(), ["Debt_to_Income_Ratio"]),
    ("x33", RobustScaler(), ["Loan_to_Income_Ratio"]),
    ("x34", MinMaxScaler(), ["Credit_Card_to_Bank_Account_Ratio"]),
    ("x35", MinMaxScaler(), ["Credit_Age_per_Loan"]),
    ("x36", MinMaxScaler(), ["Credit_Inquiries_per_Year"]),
    ("x37", MinMaxScaler(), ["Delayed_Payment_Ratio"]),
]

# The target rides along untouched as the last output column.
target_step = ("y", "passthrough", ["Credit_Score"])

column_transformer = ColumnTransformer(transformers=[*feature_steps, target_step])

# NOTE(review): the transformer is fitted on the whole frame, before the
# train/test split performed later in the notebook, so scaler statistics include
# rows that land in the test set -- consider fitting inside the model pipelines.
transformed_data = column_transformer.fit_transform(df)

# Report the shape of the transformed matrix.
print("Forma después de transformación:", transformed_data.shape)

# Rebuild a DataFrame using the "<step>__<column>" names generated by the transformer.
column_names = column_transformer.get_feature_names_out()
df_transformed = pd.DataFrame(data=transformed_data, columns=column_names)

print("Transformación completada")
print(df_transformed.columns)
print(df_transformed.shape)

Forma después de transformación: (80000, 32)
Transformación completada
Index(['x1__Age', 'x2__Occupation_encoded', 'x3__Annual_Income',
       'x5__Num_Bank_Accounts', 'x6__Num_Credit_Card', 'x7__Interest_Rate',
       'x8__Num_of_Loan', 'x9__Auto Loan', 'x10__Credit-Builder Loan',
       'x11__Debt Consolidation Loan', 'x12__Home Equity Loan',
       'x13__Mortgage Loan', 'x14__No Loan', 'x15__Not Specified',
       'x16__Payday Loan', 'x17__Personal Loan', 'x18__Student Loan',
       'x19__Delay_from_due_date', 'x21__Changed_Credit_Limit',
       'x23__Credit_Mix', 'x24__Outstanding_Debt',
       'x25__Credit_Utilization_Ratio', 'x26__Credit_History_Age',
       'x27__Payment_of_Min_Amount', 'x28__Total_EMI_per_month',
       'x32__Debt_to_Income_Ratio', 'x33__Loan_to_Income_Ratio',
       'x34__Credit_Card_to_Bank_Account_Ratio', 'x35__Credit_Age_per_Loan',
       'x36__Credit_Inquiries_per_Year', 'x37__Delayed_Payment_Ratio',
       'y__Credit_Score'],
      dtype='object')
(80000,

In [17]:
# Split the transformed frame by position: every column except the last one
# forms the predictors X, and the last column is the target y.
target_position = df_transformed.shape[1] - 1
X = df_transformed.iloc[:, :target_position]  # independent variables
y = df_transformed.iloc[:, target_position]   # dependent variable (target)

Buscamos los mejores parámetros para Regresión logística

In [36]:
# Hold out 30% of the rows as a separate test set; the searches below only see the training part.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Pipeline 1: PCA + logistic regression
# #####################################
pipe_pca_rl = Pipeline([
    ('pca', PCA(random_state=42)),
    ('clf', LogisticRegression(max_iter=500, class_weight='balanced')),
])
param_grid_pca_rl = {
    'pca__n_components': [10, 15, 20],
    'clf__penalty': ['l2'],
    'clf__C': [0.01, 0.1, 1, 10],
    'clf__solver': ['lbfgs', 'liblinear'],
}
grid_pca_rl = GridSearchCV(pipe_pca_rl, param_grid_pca_rl, cv=5, scoring='accuracy', n_jobs=-1, verbose=2)
grid_pca_rl.fit(X_train, y_train)

# Pipeline 2: SelectKBest + logistic regression
# #############################################
pipe_kbest_rl = Pipeline([
    ('select', SelectKBest(score_func=f_classif)),
    ('clf', LogisticRegression(max_iter=500, class_weight='balanced')),
])
param_grid_kbest_rl = {
    'select__k': [8, 10, 15, 20],
    'clf__penalty': ['l2'],
    'clf__C': [0.01, 0.1, 1, 10],
    'clf__solver': ['lbfgs', 'liblinear'],
}
grid_kbest_rl = GridSearchCV(pipe_kbest_rl, param_grid_kbest_rl, cv=5, scoring='accuracy', n_jobs=-1, verbose=2)
grid_kbest_rl.fit(X_train, y_train)

# Report results. The scores are read from the searches fitted in this cell
# (grid_pca_rl / grid_kbest_rl); the previous names grid_pca / grid_kbest are
# not defined anywhere in this notebook.
print("PCA + Logistic Regression - Best Params:", grid_pca_rl.best_params_)
print("Accuracy CV:", grid_pca_rl.best_score_)

print("SelectKBest + Logistic Regression - Best Params:", grid_kbest_rl.best_params_)
print("Accuracy CV:", grid_kbest_rl.best_score_)



Fitting 5 folds for each of 24 candidates, totalling 120 fits
Fitting 5 folds for each of 32 candidates, totalling 160 fits
PCA + Logistic Regression - Best Params: {'clf__C': 0.01, 'clf__penalty': 'l2', 'clf__solver': 'liblinear', 'pca__n_components': 20}
Accuracy CV: 0.635375
SelectKBest + Logistic Regression - Best Params: {'clf__C': 0.1, 'clf__penalty': 'l2', 'clf__solver': 'liblinear', 'select__k': 20}
Accuracy CV: 0.6530178571428571


Buscamos los mejores parámetros para KNN

In [37]:
# Both searches use RandomizedSearchCV (imported from sklearn.model_selection in
# the import cell): 8 sampled candidates, 3-fold CV, accuracy scoring.

# Pipeline 1: PCA + KNN
# #####################
pipeline_pca = Pipeline([
    ('pca', PCA(random_state=42)),
    ('knn', KNeighborsClassifier()),
])

param_dist_pca = {
    'pca__n_components': [10, 15, 20],
    'knn__n_neighbors': np.arange(5, 11),  # 5..10 neighbours
    'knn__weights': ['uniform', 'distance'],
    'knn__p': [1, 2],
}

search_pca = RandomizedSearchCV(
    pipeline_pca,
    param_distributions=param_dist_pca,
    n_iter=8,
    cv=3,
    scoring='accuracy',
    n_jobs=-1,
    verbose=2,
    random_state=42,
)

search_pca.fit(X_train, y_train)

print("Mejor PCA + KNN:", search_pca.best_params_)
print("Accuracy en CV (PCA + KNN):", search_pca.best_score_)

# Pipeline 2: SelectKBest + KNN
# #############################
# Unlike pipeline 1, this one standardises the features before selection.
pipeline_kbest = Pipeline([
    ('scaler', StandardScaler()),
    ('select', SelectKBest(score_func=f_classif)),
    ('knn', KNeighborsClassifier()),
])

param_dist_kbest = {
    'select__k': [8, 10, 15, 20],
    'knn__n_neighbors': np.arange(5, 11),  # 5..10 neighbours
    'knn__weights': ['uniform', 'distance'],
    'knn__p': [1, 2],
}

search_kbest = RandomizedSearchCV(
    pipeline_kbest,
    param_distributions=param_dist_kbest,
    n_iter=8,
    cv=3,
    scoring='accuracy',
    n_jobs=-1,
    verbose=2,
    random_state=42,
)

search_kbest.fit(X_train, y_train)

print("Mejor SelectKBest + KNN:", search_kbest.best_params_)
print("Accuracy en CV (SelectKBest + KNN):", search_kbest.best_score_)


Fitting 3 folds for each of 8 candidates, totalling 24 fits
Mejor PCA + KNN: {'pca__n_components': 15, 'knn__weights': 'distance', 'knn__p': 1, 'knn__n_neighbors': np.int64(7)}
Accuracy en CV (PCA + KNN): 0.7181428391056363
Fitting 3 folds for each of 8 candidates, totalling 24 fits
Mejor SelectKBest + KNN: {'select__k': 10, 'knn__weights': 'distance', 'knn__p': 1, 'knn__n_neighbors': np.int64(9)}
Accuracy en CV (SelectKBest + KNN): 0.7691784446847127


Buscando los mejores parámetros para Random Forest

In [38]:
# Forest hyper-parameters explored by both searches, and the shared search settings.
rf_clf_grid = {
    'clf__n_estimators': [30, 140],
    'clf__max_depth': [None, 10, 25],
    'clf__min_samples_split': [2, 10],
}
rf_search_options = dict(cv=3, scoring='accuracy', n_jobs=-1, verbose=2)

# Pipeline 1: PCA + Random Forest
# ###############################
pipe_pca_rf = Pipeline(steps=[
    ('pca', PCA(random_state=42)),
    ('clf', RandomForestClassifier(random_state=42, class_weight='balanced')),
])
param_grid_pca_rf = {'pca__n_components': [10, 15, 25], **rf_clf_grid}

grid_pca_rf = GridSearchCV(pipe_pca_rf, param_grid_pca_rf, **rf_search_options)
grid_pca_rf.fit(X_train, y_train)

# Pipeline 2: SelectKBest + Random Forest
# #######################################
pipe_kbest_rf = Pipeline(steps=[
    ('select', SelectKBest(score_func=f_classif)),
    ('clf', RandomForestClassifier(random_state=42, class_weight='balanced')),
])
param_grid_kbest_rf = {'select__k': [8, 10], **rf_clf_grid}

grid_kbest_rf = GridSearchCV(pipe_kbest_rf, param_grid_kbest_rf, **rf_search_options)
grid_kbest_rf.fit(X_train, y_train)

# Results: best hyper-parameters and mean cross-validated accuracy of each search.
print("PCA + Random Forest - Best Params:", grid_pca_rf.best_params_)
print("Accuracy CV:", grid_pca_rf.best_score_)

print("SelectKBest + Random Forest - Best Params:", grid_kbest_rf.best_params_)
print("Accuracy CV:", grid_kbest_rf.best_score_)


Fitting 3 folds for each of 36 candidates, totalling 108 fits
Fitting 3 folds for each of 24 candidates, totalling 72 fits
PCA + Random Forest - Best Params: {'clf__max_depth': None, 'clf__min_samples_split': 10, 'clf__n_estimators': 140, 'pca__n_components': 25}
Accuracy CV: 0.7681428636915347
SelectKBest + Random Forest - Best Params: {'clf__max_depth': 25, 'clf__min_samples_split': 2, 'clf__n_estimators': 140, 'select__k': 10}
Accuracy CV: 0.7847856551550575


Buscamos los mejores parámetros para el modelo de árbol de decisión

In [34]:
# Pipeline 1: PCA + Decision Tree
# ###############################
pipe_pca_dt = Pipeline([
    ('pca', PCA(random_state=42)),
    ('clf', DecisionTreeClassifier(random_state=42, class_weight='balanced')),
])

param_grid_pca_dt = {
    'pca__n_components': [10, 15, 25],
    'clf__max_depth': [None, 10, 25],
    'clf__min_samples_split': [2, 10],
    'clf__min_samples_leaf': [1, 5],
}

grid_pca_dt = GridSearchCV(pipe_pca_dt, param_grid_pca_dt, cv=3, scoring='accuracy', n_jobs=-1, verbose=2)
grid_pca_dt.fit(X_train, y_train)

# Pipeline 2: SelectKBest + Decision Tree
# #######################################
pipe_kbest_dt = Pipeline([
    ('select', SelectKBest(score_func=f_classif)),
    ('clf', DecisionTreeClassifier(random_state=42, class_weight='balanced')),
])

# The comma after the 'clf__min_samples_split' entry was missing, which made
# the whole cell a SyntaxError.
param_grid_kbest_dt = {
    'select__k': [8, 10],
    'clf__max_depth': [None, 10, 25],
    'clf__min_samples_split': [2, 10],
    'clf__min_samples_leaf': [1, 5],
}

grid_kbest_dt = GridSearchCV(pipe_kbest_dt, param_grid_kbest_dt, cv=3, scoring='accuracy', n_jobs=-1, verbose=2)
grid_kbest_dt.fit(X_train, y_train)

# Results: best hyper-parameters and mean cross-validated accuracy of each search.
print("PCA + Decision Tree - Best Params:", grid_pca_dt.best_params_)
print("Accuracy CV:", grid_pca_dt.best_score_)

print("SelectKBest + Decision Tree - Best Params:", grid_kbest_dt.best_params_)
print("Accuracy CV:", grid_kbest_dt.best_score_)


Fitting 3 folds for each of 12 candidates, totalling 36 fits
Fitting 3 folds for each of 36 candidates, totalling 108 fits
Fitting 3 folds for each of 24 candidates, totalling 72 fits
Decision Tree Puro - Best Params: {'clf__max_depth': 25, 'clf__min_samples_leaf': 1, 'clf__min_samples_split': 2}
Accuracy CV: 0.7340713834392907
PCA + Decision Tree - Best Params: {'clf__max_depth': 25, 'clf__min_samples_leaf': 1, 'clf__min_samples_split': 2, 'pca__n_components': 25}
Accuracy CV: 0.6870179835398961
SelectKBest + Decision Tree - Best Params: {'clf__max_depth': 25, 'clf__min_samples_leaf': 1, 'clf__min_samples_split': 2, 'select__k': 10}
Accuracy CV: 0.7558570645750415
Evaluación FINAL sobre Test Set (nunca visto):

Decision Tree Puro:
               precision    recall  f1-score   support

         0.0       0.67      0.75      0.71      4259
         1.0       0.75      0.79      0.77      7013
         2.0       0.80      0.74      0.77     12728

    accuracy                           