In [1]:
# Basics
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Preprocesamiento
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder

# Modelos clasificación
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB

# Herramientas de selección de modelos y evaluación
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Para manejar desequilibrios en las clases (opcional)
from imblearn.over_sampling import SMOTE

# Visualization setup
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Set seaborn's global plot style to "whitegrid".
sns.set(style="whitegrid")




## ÍNDICE

### 1 - Problema de negocio y approach técnico

You are working as a data scientist in a global finance company. Over the years, the company has collected basic bank details and gathered a lot of credit-related information. The management wants to build an intelligent system to segregate the people into credit score brackets to reduce the manual efforts.

Task
Given a person’s credit-related information, build a machine learning model that can classify the credit score.

Dada la información de crédito de una persona, crearemos un modelo de machine learning que pueda clasificar la puntuación de crédito.

El problema y/o objetivo planteado apunta a la evaluación de la solvencia crediticia. La solvencia financiera define la capacidad de una persona, física o jurídica, para cumplir con todas sus obligaciones. El puntaje de crédito es un número que resume el comportamiento crediticio de una persona: cuán probable es que realice los pagos de un préstamo y con qué puntualidad los realiza, es decir, su capacidad de afrontar deudas presentes o futuras.

Para lograr nuestra meta utilizaremos modelos supervisados de clasificación, donde evaluaremos los resultados para resolver el problema planteado y lograr el objetivo.

### 2 - Carga de datos y pre-análisis

In [15]:
# Show every column when a wide frame is rendered.
pd.set_option('display.max_columns', None)

# Read the train and test CSVs the same way: the first column becomes the index.
train_df, test_df = (
    pd.read_csv(csv_path, index_col=0, low_memory=False)
    for csv_path in ('./data/train.csv', './data/test.csv')
)

# NOTE(review): this preview includes the Name and SSN columns — consider
# clearing or redacting the output before sharing the notebook.
train_df.head()

Unnamed: 0_level_0,Customer_ID,Month,Name,Age,SSN,Occupation,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,Num_Credit_Card,Interest_Rate,Num_of_Loan,Type_of_Loan,Delay_from_due_date,Num_of_Delayed_Payment,Changed_Credit_Limit,Num_Credit_Inquiries,Credit_Mix,Outstanding_Debt,Credit_Utilization_Ratio,Credit_History_Age,Payment_of_Min_Amount,Total_EMI_per_month,Amount_invested_monthly,Payment_Behaviour,Monthly_Balance,Credit_Score
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1
0x1602,CUS_0xd40,January,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,1824.843333,3,4,3,4,"Auto Loan, Credit-Builder Loan, Personal Loan,...",3,7.0,11.27,4.0,_,809.98,26.82262,22 Years and 1 Months,No,49.574949,80.41529543900253,High_spent_Small_value_payments,312.49408867943663,Good
0x1603,CUS_0xd40,February,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,,3,4,3,4,"Auto Loan, Credit-Builder Loan, Personal Loan,...",-1,,11.27,4.0,Good,809.98,31.94496,,No,49.574949,118.28022162236736,Low_spent_Large_value_payments,284.62916249607184,Good
0x1604,CUS_0xd40,March,Aaron Maashoh,-500,821-00-0265,Scientist,19114.12,,3,4,3,4,"Auto Loan, Credit-Builder Loan, Personal Loan,...",3,7.0,_,4.0,Good,809.98,28.609352,22 Years and 3 Months,No,49.574949,81.699521264648,Low_spent_Medium_value_payments,331.2098628537912,Good
0x1605,CUS_0xd40,April,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,,3,4,3,4,"Auto Loan, Credit-Builder Loan, Personal Loan,...",5,4.0,6.27,4.0,Good,809.98,31.377862,22 Years and 4 Months,No,49.574949,199.4580743910713,Low_spent_Small_value_payments,223.45130972736783,Good
0x1606,CUS_0xd40,May,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,1824.843333,3,4,3,4,"Auto Loan, Credit-Builder Loan, Personal Loan,...",6,,11.27,4.0,Good,809.98,24.797347,22 Years and 5 Months,No,49.574949,41.420153086217326,High_spent_Medium_value_payments,341.48923103222177,Good


In [13]:
# Summarize each column's non-null count and dtype to spot missing values
# and columns that were parsed with an unexpected type.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 100000 entries, 0x1602 to 0x25fed
Data columns (total 27 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   Customer_ID               100000 non-null  object 
 1   Month                     100000 non-null  object 
 2   Name                      90015 non-null   object 
 3   Age                       100000 non-null  object 
 4   SSN                       100000 non-null  object 
 5   Occupation                100000 non-null  object 
 6   Annual_Income             100000 non-null  object 
 7   Monthly_Inhand_Salary     84998 non-null   float64
 8   Num_Bank_Accounts         100000 non-null  int64  
 9   Num_Credit_Card           100000 non-null  int64  
 10  Interest_Rate             100000 non-null  int64  
 11  Num_of_Loan               100000 non-null  object 
 12  Type_of_Loan              88592 non-null   object 
 13  Delay_from_due_date       100000 non-null  

#### Análisis previo de variables


| Nombre de Variable         | Descripción                                  | Tipo de Variable | Importancia | Comentarios |
|----------------------------|----------------------------------------------|------------------|-------------|-------------|
| Customer_ID                | Identificador único por cliente.             | object           | 1           |  No interesa           |
| Month                      | Mes del registro.                            | object           | 2           | No interesa            |
| Name                       | Nombre del cliente.                          | object           | 1           |  No interesa           |
| Age                        | Edad en años.                                | object           | 4           |             |
| SSN                        | Número de Seguridad Social.                  | object           | 1           | No interesa            |
| Occupation                 | Profesión.                                   | object           | 5           |             |
| Annual_Income              | Ingresos anuales.                            | object           | 5           |             |
| Monthly_Inhand_Salary      | Sueldo neto mensual.                         | float64          | 5           |             |
| Num_Bank_Accounts          | Número de cuentas bancarias.                 | int64            | 3           |             |
| Num_Credit_Card            | Número de tarjetas de crédito.               | int64            | 4           |             |
| Interest_Rate              | Tasa de interés de préstamos.                | int64            | 3           |             |
| Num_of_Loan                | Número de préstamos adquiridos.              | object           | 4           |             |
| Type_of_Loan               | Tipos de préstamo.                           | object           | 4           |             |
| Delay_from_due_date        | Días de atraso promedio en pagos.            | int64            | 3           |             |
| Num_of_Delayed_Payment     | Promedio de pagos atrasados.                 | object           | 3           |             |
| Changed_Credit_Limit       | Porcentaje de cambio en límite de crédito.   | object           | 3           |             |
| Num_Credit_Inquiries       | Número de consultas de crédito.              | float64          | 2           |             |
| Credit_Mix                 | Clasificación del crédito (Bad, Standard, Good). | object      | 4           |             |
| Outstanding_Debt           | Deuda total pendiente.                       | object           | 4           |             |
| Credit_Utilization_Ratio   | Ratio de utilización de crédito.             | float64          | 5           |             |
| Credit_History_Age         | Antigüedad de la historia de crédito.        | object           | 4           |             |
| Payment_of_Min_Amount      | Solo pago del mínimo realizado.              | object           | 3           |             |
| Total_EMI_per_month        | Cuota mensual equivalente (EMI).             | float64          | 5           |             |
| Amount_invested_monthly    | Inversión mensual.                           | object           | 4           |             |
| Payment_Behaviour          | Hábitos de pago.                             | object           | 4           |             |
| Monthly_Balance            | Balance mensual disponible.                  | object           | 5           |             |
| Credit_Score               | Calificación de crédito (Poor, Standard, Good). | object     | 5           |   Target          |


#### Observaciones generales

- Nulos a manejar.
- Símbolos extraños a manejar
- Valores negativos --> entender el contexto y tratarlos si es necesario
- Selección de features --> unas cuantas no son interesantes
- DType incorrecto
- Outliers en las variables numéricas --> ¿qué hacer?
- IMPORTANTE --> cuento con 2 ficheros --> tienen las mismas features, pero 'test' no tiene target. 
Se podría hacer predict con todos esos datos nuevos, pero sin target no puedo evaluar los resultados --> ¿qué hacer?