In [1]:

import pandas as pd
import random

f = "accepted_2007_to_2018Q4.csv"

def limpieza_data():
    num_lines = sum(1 for i in open(f))

    size = 2_200_000
    ids = random.sample(range(1, num_lines), size)

    column_types = {
        '0': 'str',      
        '19': 'float',
        '59': 'str',
        '118': 'str',
        '129': 'float',
        '130': 'float',
        '131': 'str',
        '134': 'str',
        '135': 'float',
        '136': 'str',
        '139': 'float',
    }

    df = pd.read_csv(f, skiprows=ids, low_memory=False, dtype=column_types)

    # Limpieza de NaNs
    num = 150000
    num_nans = df.isnull().sum()
    columns_menos_nans = num_nans[num_nans < num].index
    df_sin_nans = df[columns_menos_nans]

    # Selección de columnas útiles
    df_limpio = df_sin_nans[[
        "id",
        "loan_amnt",
        "funded_amnt",
        "term",
        "int_rate",
        "installment",
        "grade",
        "emp_length",
        "annual_inc",
        "dti",
        "fico_range_high",
        "fico_range_low",
        "revol_util",
        "loan_status",
        "out_prncp",
        "total_rec_prncp",
        "total_rec_int",
        "last_fico_range_high",
        "last_fico_range_low",
        "open_acc",
        "pub_rec",
        "inq_last_6mths",
        "purpose",
        "home_ownership",
        "recoveries",
        "num_tl_90g_dpd_24m",
        "delinq_2yrs"
    ]]

    return df_limpio

df_limpio = limpieza_data()


In [3]:
df_limpio.to_csv("df_limpio_procesado.csv", index=False)


In [4]:
print(df_limpio.head())

         id  loan_amnt  funded_amnt        term  int_rate  installment grade  \
0  68340446    14000.0      14000.0   60 months     14.85       331.96     C   
1  68446093    11550.0      11550.0   60 months     16.59       284.51     D   
2  68396899    18000.0      18000.0   36 months      6.49       551.61     A   
3  68426691    21000.0      21000.0   60 months      9.80       444.13     B   
4  68436822     5200.0       5200.0   36 months     10.78       169.71     B   

  emp_length  annual_inc    dti  ...  last_fico_range_high  \
0    2 years     60000.0  24.29  ...                 514.0   
1    5 years     38000.0  21.07  ...                 529.0   
2  10+ years     76000.0  14.40  ...                 684.0   
3    7 years    125000.0   6.20  ...                 499.0   
4    7 years    160000.0   8.00  ...                 714.0   

   last_fico_range_low  open_acc pub_rec  inq_last_6mths             purpose  \
0                510.0      11.0     0.0             0.0  debt_con

In [5]:
import numpy as np

def convert_emp_length(emp_length):
    if pd.isna(emp_length):  
        return 0
    elif emp_length == '< 1 year':  
        return 0
    elif emp_length == '10+ years':  
        return 10
    else:  
        try:
            return int(emp_length.split()[0])
        except:
            return 0  

df_limpio['emp_length_numeric'] = df_limpio['emp_length'].apply(convert_emp_length)

print(df_limpio[['emp_length', 'emp_length_numeric']].head())


  emp_length  emp_length_numeric
0    2 years                   2
1    5 years                   5
2  10+ years                  10
3    7 years                   7
4    7 years                   7


In [6]:
print(df_limpio.columns)


Index(['id', 'loan_amnt', 'funded_amnt', 'term', 'int_rate', 'installment',
       'grade', 'emp_length', 'annual_inc', 'dti', 'fico_range_high',
       'fico_range_low', 'revol_util', 'loan_status', 'out_prncp',
       'total_rec_prncp', 'total_rec_int', 'last_fico_range_high',
       'last_fico_range_low', 'open_acc', 'pub_rec', 'inq_last_6mths',
       'purpose', 'home_ownership', 'recoveries', 'num_tl_90g_dpd_24m',
       'delinq_2yrs', 'emp_length_numeric'],
      dtype='object')


In [7]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score


# 1.FICO range difference
df_limpio['fico_range_high'] = pd.to_numeric(df_limpio['fico_range_high'], errors='coerce')
df_limpio['fico_range_low'] = pd.to_numeric(df_limpio['fico_range_low'], errors='coerce')
df_limpio['fico_range_diff'] = df_limpio['fico_range_high'] - df_limpio['fico_range_low']

# 2.Loan to Income Ratio
df_limpio['loan_amnt'] = pd.to_numeric(df_limpio['loan_amnt'], errors='coerce')
df_limpio['annual_inc'] = pd.to_numeric(df_limpio['annual_inc'], errors='coerce')
df_limpio['loan_income_ratio'] = df_limpio['loan_amnt'] / df_limpio['annual_inc']

# 3.Term to Numeric (36 o 60 meses)
df_limpio['term'] = df_limpio['term'].astype(str)
df_limpio['term_numeric'] = df_limpio['term'].apply(lambda x: 36 if '36 months' in x else 60)

# 4.Interest * Loan Amount
df_limpio['int_rate'] = pd.to_numeric(df_limpio['int_rate'], errors='coerce')
df_limpio['interest_loan'] = df_limpio['int_rate'] * df_limpio['loan_amnt']

print(df_limpio[['fico_range_diff', 'loan_income_ratio', 'term_numeric', 'interest_loan']].isna().sum())


fico_range_diff      0
loan_income_ratio    0
term_numeric         0
interest_loan        0
dtype: int64


In [8]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score


# 1. FICO range difference
df_limpio['fico_range_diff'] = df_limpio['fico_range_high'] - df_limpio['fico_range_low']
#La diferencia entre fico_range_high y fico_range__low puede indicarnos la inestabilidad en la calificación crediticia de los clientes. A mayor diferencia, más inestable puede ser el comportamiento del cliente.

# 2. Loan to Income Ratio
df_limpio['loan_income_ratio'] = df_limpio['loan_amnt'] / df_limpio['annual_inc']
# 3. 
df_limpio['term'] = df_limpio['term'].astype(str)


df_limpio['term_numeric'] = df_limpio['term'].apply(lambda x: 36 if '36 months' in x else (60 if '60 months' in x else None))

df_limpio = df_limpio.dropna(subset=['term_numeric'])

print(df_limpio['term_numeric'].isna().sum())


# 4. Interest * Loan Amount
df_limpio['interest_loan'] = df_limpio['int_rate'] * df_limpio['loan_amnt']

0


In [9]:
# 5. Agrupar propósito en categorías más amplias
def group_purpose(purpose):
    if purpose in ['debt_consolidation', 'credit_card']:
        return 'debt_related'
    elif purpose in ['home_improvement', 'major_purchase']:
        return 'home_related'
    else:
        return 'other'

df_limpio['purpose_grouped'] = df_limpio['purpose'].apply(group_purpose)


df_limpio['loan_status_binary'] = df_limpio['loan_status'].apply(lambda x: 1 if x == 'Fully Paid' else 0)


X = df_limpio.drop(columns=['loan_status', 'loan_status_binary', 'id'])  
y = df_limpio['loan_status_binary']


num_features = X.select_dtypes(include=['int64', 'float64']).columns
cat_features = X.select_dtypes(include=['object']).columns


num_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),  
    ('scaler', StandardScaler())  
])


cat_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),  
    ('onehot', OneHotEncoder(handle_unknown='ignore'))  
])

In [10]:

preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_transformer, num_features),
        ('cat', cat_transformer, cat_features)
    ]
)

model = RandomForestClassifier(random_state=42)

pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', model)
])

In [11]:

X.replace([np.inf, -np.inf], np.nan, inplace=True)

X.replace([np.inf, -np.inf], np.nan, inplace=True)


X_numeric = X.select_dtypes(include=[np.number])
X[X_numeric.columns] = X_numeric.fillna(X_numeric.mean())


X[X_numeric.columns] = X_numeric.clip(lower=-1e10, upper=1e10)


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


pipeline.fit(X_train, y_train)

y_pred = pipeline.predict(X_test)


accuracy = accuracy_score(y_test, y_pred)


print(f"Precisión del modelo: {accuracy * 100:.2f}%")

X_numeric = X.select_dtypes(include=[np.number])
X[X_numeric.columns] = X_numeric.fillna(X_numeric.mean())


X[X_numeric.columns] = X_numeric.clip(lower=-1e10, upper=1e10)



Precisión del modelo: 99.42%


In [12]:
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score

# Matriz de confusión
cm = confusion_matrix(y_test, y_pred)
print("Matriz de confusión:")
print(cm)

# Reporte de clasificación (precision, recall, f1-score)
print("\nReporte de clasificación:")
print(classification_report(y_test, y_pred))

# ROC AUC Score
roc_auc = roc_auc_score(y_test, pipeline.predict_proba(X_test)[:, 1])
print(f"\nROC AUC Score: {roc_auc:.2f}")


Matriz de confusión:
[[6284   67]
 [   3 5787]]

Reporte de clasificación:
              precision    recall  f1-score   support

           0       1.00      0.99      0.99      6351
           1       0.99      1.00      0.99      5790

    accuracy                           0.99     12141
   macro avg       0.99      0.99      0.99     12141
weighted avg       0.99      0.99      0.99     12141


ROC AUC Score: 1.00


In [None]:
#Verdaderos negativos (6280): Préstamos que no se pagaron totalmente y fueron correctamente clasificados.
#Falsos positivos (63): Préstamos que no se pagaron totalmente pero fueron erróneamente clasificados como "Fully Paid".
#Falsos negativos (2): Préstamos que fueron totalmente pagados pero erróneamente clasificados como no pagados.
#Verdaderos positivos (5796): Préstamos totalmente pagados correctamente clasificados.


In [None]:
#Precision para clase 0 (No pagado): 1.00 → De todos los préstamos predichos como no pagados, el 100% eran realmente no pagados.
#Precision para clase 1 (Pagado): 0.99 → De todos los préstamos que predichos como pagados, el 99% eran realmente pagados.
#Recall para clase 0 (No pagado): 0.99 → De todos los préstamos que no se pagaron realmente, el 99% fueron correctamente predichos.
#Recall para clase 1 (Pagado): 1.00 → De todos los préstamos que fueron pagados, el 100% fueron correctamente predichos.

#El f1-score es muy equilibrado para ambas clases (0.99 en ambos casos)


In [13]:
from sklearn.model_selection import cross_val_score

# Evaluar usando validación cruzada con 5 "folds"
cv_scores = cross_val_score(pipeline, X, y, cv=5)
print(f"Scores de validación cruzada: {cv_scores}")
print(f"Promedio de validación cruzada: {cv_scores.mean():.2f}")


Scores de validación cruzada: [0.9925871  0.99604613 0.98714992 0.99275124 0.99670511]
Promedio de validación cruzada: 0.99


In [None]:
#La validación cruzada confirma que el rendimiento del modelo es consistente en diferentes subconjuntos de los datos, con una precisión que varía entre 0.987 y 0.996. El promedio de 0.99 es un excelente indicador de que el modelo tiene un rendimiento muy robusto.

In [14]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'classifier__n_estimators': [100, 200, 300],
    'classifier__max_depth': [10, 20, 30],
    'classifier__min_samples_split': [2, 5, 10]
}

grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

print(f"Mejores hiperparámetros: {grid_search.best_params_}")


Mejores hiperparámetros: {'classifier__max_depth': 30, 'classifier__min_samples_split': 5, 'classifier__n_estimators': 200}


In [None]:
#El ajuste de hiperparámetros ha dado lugar a un Random Forest con una profundidad máxima de 30 y 200 estimadores. Es decir, un árbol relativamente profundo funciona mejor con los datos, y un número más alto de estimadores mejora la estabilidad del modelo.



In [15]:
#feature importance
feature_importances = pipeline.named_steps['classifier'].feature_importances_

features = num_features.tolist() + list(pipeline.named_steps['preprocessor'].transformers_[1][1]['onehot'].get_feature_names_out(cat_features))
feature_importance_df = pd.DataFrame({'Feature': features, 'Importance': feature_importances})
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)
print(feature_importance_df.head(10))  


                 Feature  Importance
9              out_prncp    0.453470
10       total_rec_prncp    0.250852
17            recoveries    0.056122
12  last_fico_range_high    0.028923
1            funded_amnt    0.026821
13   last_fico_range_low    0.025284
0              loan_amnt    0.022052
3            installment    0.020176
11         total_rec_int    0.017734
24         interest_loan    0.014942


In [17]:
import joblib

joblib.dump(pipeline, 'modelo_random_forest.pkl')

pipeline_cargado = joblib.load('modelo_random_forest.pkl')
