In [1]:
import warnings
warnings.filterwarnings('ignore')

### 1. Librerias

In [2]:
# Importar librerías necesarias
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn import preprocessing

## Modelos de Machine Learning
import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier

## Métricas de los modelos
from sklearn.metrics import confusion_matrix
from sklearn.metrics import auc
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report

## Selección de Variables
from sklearn.model_selection import train_test_split

# visualización
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
# Valor de semilla que se utilizará siempre que sea necesario
seed = 16
np.random.seed(seed)

### 2. Leer dataset

In [5]:
DATASET_PATH = 'gs://bucket2025nahumfg/inputs/trabajo_final_modelo/CreditScoring.csv'
dataset = pd.read_csv(DATASET_PATH)
dataset.head(100)

Unnamed: 0,ID,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
0,1,1,0.766127,45,2,0.802982,9120.0,13,0,6,0,2.0
1,2,0,0.957151,40,0,0.121876,2600.0,4,0,0,0,1.0
2,3,0,0.658180,38,1,0.085113,3042.0,2,1,0,0,0.0
3,4,0,0.233810,30,0,0.036050,3300.0,5,0,0,0,0.0
4,5,0,0.907239,49,1,0.024926,63588.0,7,0,1,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
95,96,0,0.245353,37,0,0.288417,6500.0,11,1,1,1,0.0
96,97,0,0.542243,48,2,10.000000,,2,0,0,0,
97,98,0,0.010531,57,0,0.280665,5714.0,6,0,1,0,0.0
98,99,0,0.363200,32,0,0.480524,2900.0,4,0,1,0,0.0


In [6]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150000 entries, 0 to 149999
Data columns (total 12 columns):
 #   Column                                Non-Null Count   Dtype  
---  ------                                --------------   -----  
 0   ID                                    150000 non-null  int64  
 1   SeriousDlqin2yrs                      150000 non-null  int64  
 2   RevolvingUtilizationOfUnsecuredLines  150000 non-null  float64
 3   age                                   150000 non-null  int64  
 4   NumberOfTime30-59DaysPastDueNotWorse  150000 non-null  int64  
 5   DebtRatio                             150000 non-null  float64
 6   MonthlyIncome                         120269 non-null  float64
 7   NumberOfOpenCreditLinesAndLoans       150000 non-null  int64  
 8   NumberOfTimes90DaysLate               150000 non-null  int64  
 9   NumberRealEstateLoansOrLines          150000 non-null  int64  
 10  NumberOfTime60-89DaysPastDueNotWorse  150000 non-null  int64  
 11  

In [7]:
dataset.ID.nunique()

150000

### 3. Resumen de análisis de datos

In [8]:
dataset.describe()

Unnamed: 0,ID,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
count,150000.0,150000.0,150000.0,150000.0,150000.0,150000.0,120269.0,150000.0,150000.0,150000.0,150000.0,146076.0
mean,75000.5,0.06684,6.048438,52.295207,0.421033,353.005076,6670.221,8.45276,0.265973,1.01824,0.240387,0.757222
std,43301.414527,0.249746,249.755371,14.771866,4.192781,2037.818523,14384.67,5.145951,4.169304,1.129771,4.155179,1.115086
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,37500.75,0.0,0.029867,41.0,0.0,0.175074,3400.0,5.0,0.0,0.0,0.0,0.0
50%,75000.5,0.0,0.154181,52.0,0.0,0.366508,5400.0,8.0,0.0,1.0,0.0,0.0
75%,112500.25,0.0,0.559046,63.0,0.0,0.868254,8249.0,11.0,0.0,2.0,0.0,1.0
max,150000.0,1.0,50708.0,109.0,98.0,329664.0,3008750.0,58.0,98.0,54.0,98.0,20.0


In [9]:
print('Original dataset: ')
print(dataset['SeriousDlqin2yrs'].value_counts())

print('')
print('Normalize dataset: ')
print(dataset['SeriousDlqin2yrs'].value_counts(normalize=True))

Original dataset: 
SeriousDlqin2yrs
0    139974
1     10026
Name: count, dtype: int64

Normalize dataset: 
SeriousDlqin2yrs
0    0.93316
1    0.06684
Name: proportion, dtype: float64


### 4. Imputación de datos

In [10]:
# Imputation of the empty data
dataset['MonthlyIncome'] = dataset['MonthlyIncome'].fillna(dataset['MonthlyIncome'].median())
dataset['NumberOfDependents'] = dataset['NumberOfDependents'].fillna(dataset['NumberOfDependents'].median())

In [11]:
dataset.isnull().sum()

ID                                      0
SeriousDlqin2yrs                        0
RevolvingUtilizationOfUnsecuredLines    0
age                                     0
NumberOfTime30-59DaysPastDueNotWorse    0
DebtRatio                               0
MonthlyIncome                           0
NumberOfOpenCreditLinesAndLoans         0
NumberOfTimes90DaysLate                 0
NumberRealEstateLoansOrLines            0
NumberOfTime60-89DaysPastDueNotWorse    0
NumberOfDependents                      0
dtype: int64

In [12]:
dataset.drop('ID',axis=1).corr(method= 'pearson')

Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
SeriousDlqin2yrs,1.0,-0.001802,-0.115386,0.125587,-0.007602,-0.017151,-0.029669,0.117175,-0.007038,0.102261,0.046869
RevolvingUtilizationOfUnsecuredLines,-0.001802,1.0,-0.005898,-0.001314,0.003961,0.006513,-0.011281,-0.001061,0.006235,-0.001048,0.001193
age,-0.115386,-0.005898,1.0,-0.062995,0.024188,0.027581,0.147705,-0.061005,0.03315,-0.057159,-0.215693
NumberOfTime30-59DaysPastDueNotWorse,0.125587,-0.001314,-0.062995,1.0,-0.006542,-0.00837,-0.055312,0.983603,-0.030565,0.987005,-0.00459
DebtRatio,-0.007602,0.003961,0.024188,-0.006542,1.0,-0.018006,0.049565,-0.00832,0.120046,-0.007533,-0.044476
MonthlyIncome,-0.017151,0.006513,0.027581,-0.00837,-0.018006,1.0,0.086949,-0.0105,0.116273,-0.009252,0.066314
NumberOfOpenCreditLinesAndLoans,-0.029669,-0.011281,0.147705,-0.055312,0.049565,0.086949,1.0,-0.079984,0.433959,-0.071077,0.074026
NumberOfTimes90DaysLate,0.117175,-0.001061,-0.061005,0.983603,-0.00832,-0.0105,-0.079984,1.0,-0.045205,0.992796,-0.011962
NumberRealEstateLoansOrLines,-0.007038,0.006235,0.03315,-0.030565,0.120046,0.116273,0.433959,-0.045205,1.0,-0.039722,0.129399
NumberOfTime60-89DaysPastDueNotWorse,0.102261,-0.001048,-0.057159,0.987005,-0.007533,-0.009252,-0.071077,0.992796,-0.039722,1.0,-0.012678


### 5. Split dataset

In [13]:
X,y = dataset.drop(['ID', 'SeriousDlqin2yrs'], axis=1),dataset[["SeriousDlqin2yrs"]]
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.25, random_state=1,stratify=y)

### 6. Modelos

In [18]:
model_RL = LogisticRegression()
model_RL.fit(X_train, y_train)


# Separate the features and the target variable
predict_train_rl = model_RL.predict_proba(X_train)[:,1]
predict_test_rl = model_RL.predict_proba(X_test)[:,1]


# Print scores on both  predict_proba(best_X_train)[:,1]
print("auc on training in LogisticRegression data : {:.3f}"
      .format(roc_auc_score(y_train, predict_train_rl) ))
print("auc on testing in LogisticRegression  data : {:.3f}"
      .format(roc_auc_score(y_test, predict_test_rl) ))

auc on training in LogisticRegression data : 0.681
auc on testing in LogisticRegression  data : 0.679


In [19]:
model_RL.predict(X_train)

array([0, 0, 0, ..., 0, 0, 0])

In [20]:
predict_train_rl

array([0.15366781, 0.0129492 , 0.10270039, ..., 0.09315861, 0.14100428,
       0.00532631])

In [21]:
clasificacion_predict_train_logit= np.where(predict_train_rl<0.5, 0, 1)
clasificacion_predict_test_logit= np.where(predict_test_rl<0.5, 0, 1)

print("accuracy on training in LogisticRegression data : {:.3f}".format(accuracy_score(y_train, clasificacion_predict_train_logit)))
print("accuracy on testing in LogisticRegression  data : {:.3f}".format(accuracy_score(y_test,clasificacion_predict_test_logit)))

accuracy on training in LogisticRegression data : 0.933
accuracy on testing in LogisticRegression  data : 0.933


### 7. Desplegar

In [24]:
import joblib
from google.cloud import storage

BUCKET_NAME="bucket2025nahumfg"

### 7.1. Guardar los pesos del modelo

In [25]:
FILE_NAME = "model.joblib"
joblib.dump(model_RL, FILE_NAME)

# Upload the saved model file to GCS

BLOB_PATH = "credit_scoring/model/"
BLOB_NAME = BLOB_PATH + FILE_NAME

bucket = storage.Client().bucket(BUCKET_NAME)
blob = bucket.blob(BLOB_NAME)
blob.upload_from_filename("model.joblib")

### 7.2. Model Registry

In [28]:
REGION = "us-central1"
MODEL_DISPLAY_NAME = "credit-scoring-model"
ARTIFACT_GCS_PATH = f'gs://{BUCKET_NAME}/{BLOB_PATH}'


! gcloud beta ai models upload \
 --region=$REGION \
 --display-name=$MODEL_DISPLAY_NAME \
 --container-image-uri="us-docker.pkg.dev/vertex-ai/prediction/sklearn-cpu.0-24:latest" \
 --artifact-uri=$ARTIFACT_GCS_PATH


Using endpoint [https://us-central1-aiplatform.googleapis.com/]
Waiting for operation [1440533852144533504]...done.                            


In [29]:
! gcloud beta ai models list \
 --region $REGION \
 --filter=display_name=$MODEL_DISPLAY_NAME


Using endpoint [https://us-central1-aiplatform.googleapis.com/]
MODEL_ID             DISPLAY_NAME
4796699740521627648  credit-scoring-model


In [31]:
MODEL_ID = "4796699740521627648"

### 7.3. Online Prediction - create

In [32]:
ENDPOINT_DISPLAY_NAME = "credit-scoring-ep1"

In [33]:
! gcloud beta ai endpoints create \
 --region=$REGION \
 --display-name=$ENDPOINT_DISPLAY_NAME

Using endpoint [https://us-central1-aiplatform.googleapis.com/]
Waiting for operation [1165251324921511936]...done.                            
Created Vertex AI endpoint: projects/330930586045/locations/us-central1/endpoints/8408986863905275904.


In [34]:
! gcloud beta ai endpoints list \
 --region=$REGION \
 --filter=display_name=$ENDPOINT_DISPLAY_NAME

Using endpoint [https://us-central1-aiplatform.googleapis.com/]
ENDPOINT_ID          DISPLAY_NAME
8408986863905275904  credit-scoring-ep1


In [35]:
ENDPOINT_ID = "8408986863905275904"

### 7.3. Online Prediction - deploy

In [36]:
DEPLOYED_MODEL_NAME = "credit-scoring-model-v1"
MACHINE_TYPE = "n1-standard-2"

In [None]:
! gcloud beta ai endpoints deploy-model $ENDPOINT_ID\
 --region=$REGION \
 --model=$MODEL_ID \
 --display-name=$DEPLOYED_MODEL_NAME \
 --machine-type=$MACHINE_TYPE \
 --traffic-split=0=100

Using endpoint [https://us-central1-aiplatform.googleapis.com/]
Waiting for operation [3815619705629048832]...⠛                                