In [13]:
import sys
import sklearn
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Bibliotecas para crear el transformador de clusters
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.cluster import KMeans

 
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import StandardScaler


from sklearn.compose import make_column_selector, ColumnTransformer
from sklearn.pipeline import make_pipeline 
from sklearn.preprocessing import OneHotEncoder

In [14]:
prestamos = pd.read_csv('loan_data.csv')

In [15]:
prestamos["ingresos_cat"] = pd.cut(prestamos["ApplicantIncome"],
                                 bins = [0, 2600, 3333, 4288, 6000, np.inf],
                                   labels = [1, 2, 3, 4, 5])

In [16]:
#creamos los datasets estratificados
from sklearn.model_selection import train_test_split
strat_train_sett, strat_test_sett = train_test_split(prestamos, 
                                                   test_size=0.2, 
                                                   stratify=prestamos["ingresos_cat"], 
                                                   random_state=42)
 
print('Tamaño del train_set',strat_train_sett.shape)
print('Tamaño del test_set',strat_test_sett.shape)

Tamaño del train_set (304, 14)
Tamaño del test_set (77, 14)


In [17]:
# Separar las variables predictivas de la variable objetivo
prestamostrain = strat_train_sett.drop("Loan_Status", axis=1)
prestamostrain_labels = strat_train_sett["Loan_Status"].copy()

In [18]:
#Estos miden la capacidad de pago del solicitante y co-solicitante en relación al préstamo
def income_to_loan(X):
    total_income = X[:, [0]] + X[:, [1]]
    return total_income / (X[:, [2]] + 1e-5)

def income_to_loan_name(transformer, feature_names_in):
    return ["income_to_loan_ratio"]
#Estos sirven para analizar la dependencia del ingreso conjunto
def applicant_to_coapplicant(X):
    return X[:, [0]] / (X[:, [1]] + 1e-5)

def applicant_to_coapplicant_name(transformer, feature_names_in):
    return ["applicant_to_coapplicant_ratio"]
#Estos se usan para ver qué porcentaje del ingreso representa el préstamo
def loan_to_income(X):
    total_income = X[:, [0]] + X[:, [1]]
    return X[:, [2]] / (total_income + 1e-5)

def loan_to_income_name(transformer, feature_names_in):
    return ["loan_to_income_ratio"]
#Estos se usan para ver la carga mensual o anual del solicitante
def loan_to_term(X):
    return X[:, [0]] / (X[:, [1]] + 1e-5)

def loan_to_term_name(transformer, feature_names_in):
    return ["loan_to_term_ratio"]

In [19]:
income_to_loan_pipeline = make_pipeline(
    SimpleImputer(strategy="median"),
    FunctionTransformer(income_to_loan, feature_names_out=income_to_loan_name),
    StandardScaler()
)

applicant_to_coapplicant_pipeline = make_pipeline(
    SimpleImputer(strategy="median"),
    FunctionTransformer(applicant_to_coapplicant, feature_names_out=applicant_to_coapplicant_name),
    StandardScaler()
)

loan_to_income_pipeline = make_pipeline(
    SimpleImputer(strategy="median"),
    FunctionTransformer(loan_to_income, feature_names_out=loan_to_income_name),
    StandardScaler()
)

loan_to_term_pipeline = make_pipeline(
    SimpleImputer(strategy="median"),
    FunctionTransformer(loan_to_term, feature_names_out=loan_to_term_name),
    StandardScaler()
)
# Pipeline para variables categóricas
cat_pipeline = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(handle_unknown="ignore")
)

# Pipeline para variables numéricas restantes
default_num_pipeline = make_pipeline(
    SimpleImputer(strategy="median"),
    StandardScaler()
)


In [20]:
preprocessing = ColumnTransformer([
    ("income_to_loan", income_to_loan_pipeline, ["ApplicantIncome", "CoapplicantIncome", "LoanAmount"]),
    ("applicant_to_coapplicant", applicant_to_coapplicant_pipeline, ["ApplicantIncome", "CoapplicantIncome"]),
    ("loan_to_income", loan_to_income_pipeline, ["ApplicantIncome", "CoapplicantIncome", "LoanAmount"]),
    ("loan_to_term", loan_to_term_pipeline, ["LoanAmount", "Loan_Amount_Term"]),
    ("cat", cat_pipeline, make_column_selector(dtype_include=object))
], remainder=default_num_pipeline)

In [21]:
# Aplica el preprocesamiento automátizado
X = preprocessing.fit_transform(prestamostrain)
print(type(X))
print(X.shape)

<class 'scipy.sparse._csr.csr_matrix'>
(304, 325)


In [22]:
X_dispersa = X.toarray()
print(type(X_dispersa))
print(X_dispersa.shape)

<class 'numpy.ndarray'>
(304, 325)


In [23]:
prestamostrainProc = pd.DataFrame(X_dispersa,
                               columns=preprocessing.get_feature_names_out(),
                               index=prestamostrain.index)
prestamostrainProc.head(2)

Unnamed: 0,income_to_loan__income_to_loan_ratio,applicant_to_coapplicant__applicant_to_coapplicant_ratio,loan_to_income__loan_to_income_ratio,loan_to_term__loan_to_term_ratio,cat__Loan_ID_LP001003,cat__Loan_ID_LP001005,cat__Loan_ID_LP001006,cat__Loan_ID_LP001008,cat__Loan_ID_LP001024,cat__Loan_ID_LP001029,...,cat__Dependents_3+,cat__Education_Graduate,cat__Education_Not Graduate,cat__Self_Employed_No,cat__Self_Employed_Yes,cat__Property_Area_Rural,cat__Property_Area_Semiurban,cat__Property_Area_Urban,remainder__Credit_History,remainder__ingresos_cat
74,-0.35271,1.461968,0.356319,0.034177,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.400381,1.177683
44,-0.232557,-0.825365,0.024008,0.074821,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.400381,0.367693


In [24]:
labels_pret = prestamostrain_labels.map({'Y': 1, 'N': 0})

In [25]:
from sklearn import set_config
 
set_config(display='diagram')

In [26]:
# Seleccionar una Regresión Lineal
from sklearn.linear_model import LinearRegression
lin_reg = LinearRegression()
 
# Aplicar el pipeline de preprocesamiento 
X = preprocessing.fit_transform(prestamostrain)
 
y = labels_pret
 
# Entrenar la regresión lineal
lin_reg.fit(X, y)

In [27]:
# Forma 2
# Seleccionar una Regresión Lineal
from sklearn.linear_model import LinearRegression
 
# Definir un pipeline para preprocesar los datos y luego entrenar
lin_reg = make_pipeline(preprocessing, LinearRegression())
 
y = labels_pret
 
# Entrenar la regresión lineal
lin_reg.fit(prestamostrain, y)