In [44]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

from feature_engine.imputation import AddMissingIndicator, MeanMedianImputer, CategoricalImputer
from feature_engine.encoding import RareLabelEncoder, OrdinalEncoder
from feature_engine.transformation import YeoJohnsonTransformer
from feature_engine.outliers import Winsorizer
from sklearn.tree  import DecisionTreeClassifier

In [45]:
dataTrain = pd.read_csv("train.csv")


  dataTrain = pd.read_csv("train.csv")


In [46]:
# Algunas columnas tiene como signo adicional '_' en su valor numerico por lo que se debe limpiar las columnas 
# previo a convertirlas en numerico

# PASO 2: REEMPLAZAR '-' CON LOS VALORES RESPECTIVOS

# En este caso 'numero_' debe ser 'numero' por tanto se hace el cambio '_' = ''
dataTrain['Annual_Income'] = dataTrain['Annual_Income'].str.replace('_', '')
dataTrain['Outstanding_Debt'] = dataTrain['Outstanding_Debt'].str.replace('_', '')
dataTrain['Age'] = dataTrain['Age'].str.replace('_', '')
dataTrain['Num_of_Loan'] = dataTrain['Num_of_Loan'].str.replace('_', '')

# En este caso '_' debe ser '0' porque significa sin cambio en limite de credito, por tanto se reemplaza '_' = '0'
dataTrain['Changed_Credit_Limit'] = dataTrain['Changed_Credit_Limit'].str.replace('_', '0')

# En este caso solo hay un numero con problema: ' __10000__' por tanto se sustituye con 10000
dataTrain['Amount_invested_monthly'] = dataTrain['Amount_invested_monthly'].str.replace('__10000__', '10000')
# En este caso solo hay un numero con problema: _-3333333_ por tanto se sustituye con -3333333
dataTrain['Monthly_Balance'] = dataTrain['Monthly_Balance'].str.replace('__-333333333333333333333333333__', '33333333333333333')

#PASO 3: CONVERSION A NUMERICO 

#TIPO FLOAT

dataTrain['Annual_Income'] = dataTrain['Annual_Income'].astype(float)
dataTrain['Changed_Credit_Limit'] = dataTrain['Changed_Credit_Limit'].astype(float)
dataTrain['Outstanding_Debt'] = dataTrain['Outstanding_Debt'].astype(float)
dataTrain['Amount_invested_monthly'] = dataTrain['Amount_invested_monthly'].astype(float)
dataTrain['Monthly_Balance'] = dataTrain['Monthly_Balance'].astype(float)

#TIPO INT

dataTrain['Age'] = dataTrain['Age'].astype(int)
dataTrain['Num_of_Loan'] = dataTrain['Num_of_Loan'].astype(int)

#### 2. Balanceo en Target

In [47]:
df_standard = dataTrain[dataTrain['Credit_Score'] == 'Standard']
df_poor = dataTrain[dataTrain['Credit_Score'] == 'Poor']
df_good = dataTrain[dataTrain['Credit_Score'] == 'Good']
cantidadstandard = 2*df_good.shape[0]
df_standard = df_standard.sample(n=cantidadstandard, random_state=2023)
dataTrain = pd.concat([df_standard, df_poor, df_good])


#### 3. Complete Case Analysis para Payment_Behaviour

In [48]:
#CCA
dataTrain = dataTrain[dataTrain['Payment_Behaviour'].str.contains('!@9#%8') == False]

#### 4. Train Test Split para Entrenamiento y Prueba

In [49]:
#codificación de variable target.
creditMapp = {'Credit_Score': {"Poor":1,"Standard": 2, "Good": 3}}
dataTrain = dataTrain.replace(creditMapp)

In [50]:
X_train, X_test, y_train, y_test = train_test_split(
    dataTrain.drop(['ID', 'Credit_Score','Changed_Credit_Limit','Credit_Utilization_Ratio','Total_EMI_per_month',
                    'Amount_invested_monthly',], axis=1),
    dataTrain['Credit_Score'],
    test_size=0.3,
    random_state=2022)

#### 3. Configuración del Pipeline

In [51]:


#imputación de variables categoricas con indicador de Faltante (Missing)
CATEGORICAL_VARS_WITH_NA_MISSING = ['Credit_Mix','Occupation']

#Imputación de variables numéricas

NUMERICAL_VARS_WITH_NA_MEAN = ['Num_Credit_Inquiries','Monthly_Balance']


#Variables para transfomraicón Yeo

NUMERICAL_YEON_VARS =['Annual_Income',
 'Interest_Rate',
 'Outstanding_Debt',
 'Num_Bank_Accounts',
 'Num_Credit_Card',
 'Num_Credit_Inquiries',
 'Delay_from_due_date',
 'Monthly_Balance']


#Variables para codificación por frecuencia
CATEGORICAL_VARS = ['SSN','Occupation','Payment_of_Min_Amount','Payment_Behaviour','Month','Credit_Mix']


#Variables a utilzar en el entrenamiento
FEATURES = [
'Annual_Income',
'Interest_Rate',
'Outstanding_Debt',
'Num_Bank_Accounts',
'Num_Credit_Card',
'Num_Credit_Inquiries',
'Delay_from_due_date',
'Monthly_Balance',
'Month',
'SSN',
'Occupation',
'Credit_Mix',
'Payment_of_Min_Amount',
'Payment_Behaviour'
]

In [52]:
X_train = X_train[FEATURES]

In [53]:
def SinNaNiNegativos(data,colist):
    for c in colist:
        minimo=data[c].min()
        if minimo<=0:
            data[c]=data[c]+abs(minimo)+1
        else:
            data[c]=data[c]

In [54]:
SinNaNiNegativos(X_train, NUMERICAL_YEON_VARS )
SinNaNiNegativos(X_test, NUMERICAL_YEON_VARS )

#### 4. Construcción del Pipeline

In [55]:
CreditScore_pipeline_v112022 = Pipeline([
    
    #=========== IMPUTACIONES ===============
    
    #1. Imputacion de variables categóricas con indicador de faltante
    ('missing_imputation',
        CategoricalImputer(imputation_method='missing', variables=CATEGORICAL_VARS_WITH_NA_MISSING)
    ),
    
 
        #3. Indicador faltane en variables numericas para imputación
    ('missing_indicator_numeric',
        AddMissingIndicator(variables=NUMERICAL_VARS_WITH_NA_MEAN)
    ),

    
    #4. Imputación de variables numéricas
    ('mean_imputation',
         MeanMedianImputer(imputation_method='mean', variables=NUMERICAL_VARS_WITH_NA_MEAN)
    ),
    
    #============= CODIFICACIÓN DE VARIABLES CATEGORICAS NOMINALES ==================
    ('rare_label_encoder',
        RareLabelEncoder(n_categories=1, tol=0.01, variables=CATEGORICAL_VARS)
    ),
    
    ('categorical_encoder',
        OrdinalEncoder(encoding_method='ordered', variables=CATEGORICAL_VARS)
    ),

    #=============== TRATAMIENTO DE OUTLIERS ============
    ('Tratamiento_outliers',
        Winsorizer(capping_method='iqr', variables=NUMERICAL_YEON_VARS)),
        
    #=============== TRANSFORMACIÓN DE VARIABLES CONTINUAS ============
    
    ('yeo_transformer',
        YeoJohnsonTransformer(variables=NUMERICAL_YEON_VARS)
    ),

     #=============== SCALER ============
    ('scaler',
        StandardScaler(),
    ),
         #=============== MODELO ============
    ('modelo',
       DecisionTreeClassifier(max_depth= 10, max_features=14)
    )
])

In [56]:
CreditScore_pipeline_v112022.fit(X_train, y_train)

Pipeline(steps=[('missing_imputation',
                 CategoricalImputer(variables=['Credit_Mix', 'Occupation'])),
                ('missing_indicator_numeric',
                 AddMissingIndicator(variables=['Num_Credit_Inquiries',
                                                'Monthly_Balance'])),
                ('mean_imputation',
                 MeanMedianImputer(imputation_method='mean',
                                   variables=['Num_Credit_Inquiries',
                                              'Monthly_Balance'])),
                ('rare_label_encoder',
                 RareLabelE...
                                       'Num_Credit_Inquiries',
                                       'Delay_from_due_date',
                                       'Monthly_Balance'])),
                ('yeo_transformer',
                 YeoJohnsonTransformer(variables=['Annual_Income',
                                                  'Interest_Rate',
                                  

In [57]:
X_test = X_test[FEATURES]

In [58]:
preds = CreditScore_pipeline_v112022.predict(X_test)

In [59]:
rmseTest = np.sqrt(mean_squared_error(np.exp(y_test), np.exp(preds)))
np.min(np.exp(y_train)), np.max(np.exp(y_train))
rmseTest / (np.max(np.exp(y_train)) - np.min(np.exp(y_train)))


0.3441765823120803

In [74]:
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix

cm = confusion_matrix(y_test, preds)
cm_df = pd.DataFrame(cm, index = ["Poor","Standard","Good"], columns = ["Poor","Standard","Good"])
cm_df

Unnamed: 0,Poor,Standard,Good
Poor,5989,1249,741
Standard,1944,6342,1734
Good,226,1092,3546


In [76]:
#caso32991
TP32991=cm_df.iloc[0,0]
FN32991=cm_df.iloc[0,1]+cm_df.iloc[0,2]
FP32991=cm_df.iloc[1,0]+cm_df.iloc[2,0]
TN32991=cm_df.iloc[1,1]+cm_df.iloc[1,2]+cm_df.iloc[2,1]+cm_df.iloc[2,2]
#caso16445
TP16445=cm_df.iloc[1,1]
FN16445=cm_df.iloc[1,0]+cm_df.iloc[1,2]
FP16445=cm_df.iloc[0,1]+cm_df.iloc[2,1]
TN16445=cm_df.iloc[0,0]+cm_df.iloc[0,2]+cm_df.iloc[2,0]+cm_df.iloc[2,2]
#caso26773
TP26773=cm_df.iloc[2,2]
FN26773=cm_df.iloc[2,0]+cm_df.iloc[2,1]
FP26773=cm_df.iloc[0,2]+cm_df.iloc[1,2]
TN26773=cm_df.iloc[0,0]+cm_df.iloc[0,1]+cm_df.iloc[1,0]+cm_df.iloc[1,1]
#Sensibilidad para cada clase y general
Sen32991=TP32991/(TP32991+FN32991)
Sen16445=TP16445/(TP16445+FN16445)
Sen26773=TP26773/(TP26773+FN26773)
SenPromedio=(Sen32991+Sen16445+Sen26773)/3
#Specifity
Sp32991=TN32991/(TN32991+FP32991)
Sp16445=TN16445/(TN16445+FP16445)
Sp26773=TN26773/(TN26773+FP26773)
SpPromedio=(Sp32991+Sp16445+Sp26773)/3

0.8448066451455994

In [78]:
d = y_test.unique()
class_name = list(d.flatten())
class_name

[1, 3, 2]

In [87]:
from sklearn.metrics import roc_auc_score

y_proba = CreditScore_pipeline_v112022.predict_proba(X_test)
classes = CreditScore_pipeline_v112022.classes_


In [88]:
roc_auc_score(y_test, y_proba, labels = classes, multi_class = 'ovr', average = 'macro')

0.8501292183884953

In [96]:
len(X_test.columns)
len(X_train.columns)

14