In [2]:
import pandas as pd
import numpy as np

df = pd.read_csv("credit_score_clean.csv")
df.head()


Unnamed: 0,Age,Occupation,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,Num_Credit_Card,Interest_Rate,Num_of_Loan,Type_of_Loan,Delay_from_due_date,...,Credit_Mix,Outstanding_Debt,Credit_Utilization_Ratio,Credit_History_Age,Payment_of_Min_Amount,Total_EMI_per_month,Amount_invested_monthly,Payment_Behaviour,Monthly_Balance,Credit_Score
0,23,Scientist,19114.12,1824.843333,3,4,3,4,"Auto Loan, Credit-Builder Loan, Personal Loan,...",3,...,Standard,809.98,26.82262,22 Years and 1 Months,No,49.574949,80.415295,High_spent_Small_value_payments,312.49408867943663,Good
1,23,Scientist,19114.12,3093.745,3,4,3,4,"Auto Loan, Credit-Builder Loan, Personal Loan,...",-1,...,Good,809.98,31.94496,15 Years and 11 Months,No,49.574949,118.280222,Low_spent_Large_value_payments,284.62916249607184,Good
2,-500,Scientist,19114.12,3093.745,3,4,3,4,"Auto Loan, Credit-Builder Loan, Personal Loan,...",3,...,Good,809.98,28.609352,22 Years and 3 Months,No,49.574949,81.699521,Low_spent_Medium_value_payments,331.2098628537912,Good
3,23,Scientist,19114.12,3093.745,3,4,3,4,"Auto Loan, Credit-Builder Loan, Personal Loan,...",5,...,Good,809.98,31.377862,22 Years and 4 Months,No,49.574949,199.458074,Low_spent_Small_value_payments,223.45130972736783,Good
4,23,Scientist,19114.12,1824.843333,3,4,3,4,"Auto Loan, Credit-Builder Loan, Personal Loan,...",6,...,Good,809.98,24.797347,22 Years and 5 Months,No,49.574949,41.420153,High_spent_Medium_value_payments,341.48923103222177,Good


In [3]:
df.shape, df.dtypes


((100000, 23),
 Age                          object
 Occupation                   object
 Annual_Income                object
 Monthly_Inhand_Salary       float64
 Num_Bank_Accounts             int64
 Num_Credit_Card               int64
 Interest_Rate                 int64
 Num_of_Loan                  object
 Type_of_Loan                 object
 Delay_from_due_date           int64
 Num_of_Delayed_Payment       object
 Changed_Credit_Limit        float64
 Num_Credit_Inquiries        float64
 Credit_Mix                   object
 Outstanding_Debt             object
 Credit_Utilization_Ratio    float64
 Credit_History_Age           object
 Payment_of_Min_Amount        object
 Total_EMI_per_month         float64
 Amount_invested_monthly     float64
 Payment_Behaviour            object
 Monthly_Balance              object
 Credit_Score                 object
 dtype: object)

2.1. Arreglar Age (edades irreales)

Idea:

Convertir Age a numérico.

Todo lo que sea < 18 o > 100 lo marcamos como dato inválido (NaN).

Luego rellenamos esos NaN con la mediana de las edades válidas.

In [4]:
# Asegurar que Age sea numérica
df["Age"] = pd.to_numeric(df["Age"], errors="coerce")

# Marcar como NaN las edades fuera de un rango razonable
mask_age = (df["Age"] < 18) | (df["Age"] > 100)
df.loc[mask_age, "Age"] = np.nan

# Imputar Age con la mediana de las edades válidas
med_age = df["Age"].median()
df["Age"] = df["Age"].fillna(med_age)

df["Age"].describe()


count    100000.000000
mean         34.370670
std           9.440642
min          18.000000
25%          27.000000
50%          34.000000
75%          41.000000
max         100.000000
Name: Age, dtype: float64

2.2. Recortar valores extremos en variables de dinero

Vamos a recortar por percentiles 1% y 99% para evitar que montos absurdamente grandes/pequeños revienten el modelo.

In [5]:
money_cols = [
    "Annual_Income",
    "Outstanding_Debt",
    "Amount_invested_monthly",
    "Monthly_Balance",
    "Total_EMI_per_month",
    "Credit_Utilization_Ratio"
]

for col in money_cols:
    # Convertir a numérico por si quedó algo extraño
    df[col] = pd.to_numeric(df[col], errors="coerce")
    
    # Calcular percentiles 1 y 99
    q1 = df[col].quantile(0.01)
    q99 = df[col].quantile(0.99)
    
    # Recortar valores fuera de ese rango
    df[col] = df[col].clip(lower=q1, upper=q99)
    
    # Re-imputar posibles NaN nuevos con la mediana
    med = df[col].median()
    df[col] = df[col].fillna(med)


2.3. Asegurar numéricas en otras columnas clave

Algunas columnas numéricas venían como texto (object), por ejemplo: Num_of_Loan, Changed_Credit_Limit, etc. Las pasamos a numérico y rellenamos.

In [6]:
extra_num_cols = [
    "Num_of_Loan",
    "Changed_Credit_Limit",
    "Num_Credit_Inquiries",
    "Num_of_Delayed_Payment"
]

for col in extra_num_cols:
    df[col] = pd.to_numeric(df[col], errors="coerce")
    med = df[col].median()
    df[col] = df[col].fillna(med)


In [7]:
df[extra_num_cols].describe()


Unnamed: 0,Num_of_Loan,Changed_Credit_Limit,Num_Credit_Inquiries,Num_of_Delayed_Payment
count,100000.0,100000.0,100000.0,100000.0
mean,2.79085,10.343671,27.32678,29.75055
std,60.987279,6.725301,191.293766,215.648683
min,-100.0,-6.49,0.0,-3.0
25%,2.0,5.42,3.0,9.0
50%,3.0,9.25,6.0,15.0
75%,5.0,14.66,9.0,19.0
max,1496.0,36.97,2597.0,4397.0


Chequeo final de nulos

Solo para asegurarte que no quedaron NaN colgados después de todo esto:

In [7]:
df.isnull().sum().sort_values(ascending=False).head(10)


Age                         0
Num_Credit_Inquiries        0
Monthly_Balance             0
Payment_Behaviour           0
Amount_invested_monthly     0
Total_EMI_per_month         0
Payment_of_Min_Amount       0
Credit_History_Age          0
Credit_Utilization_Ratio    0
Outstanding_Debt            0
dtype: int64

Separar X (features) y y (target)

In [8]:
# Variable objetivo
y = df["Credit_Score"]

# Variables de entrada (todas menos Credit_Score)
X = df.drop(columns=["Credit_Score"])

X.shape, y.shape



((100000, 22), (100000,))

Codificar variables categóricas (get_dummies)

Ahora convertimos las columnas categóricas de X a dummies (0/1) para que los modelos las puedan usar.

In [9]:
cat_cols = X.select_dtypes(include=["object"]).columns
cat_cols


Index(['Occupation', 'Type_of_Loan', 'Credit_Mix', 'Credit_History_Age',
       'Payment_of_Min_Amount', 'Payment_Behaviour'],
      dtype='object')

In [10]:
X_dummies = pd.get_dummies(X, columns=cat_cols, drop_first=True)

X_dummies.shape


(100000, 6700)

Guardar datos procesados para modelos

In [11]:
X_dummies.shape, y.shape


((100000, 6700), (100000,))

In [15]:
X_dummies.info(memory_usage="deep")


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Columns: 6700 entries, Age to Payment_Behaviour_Low_spent_Small_value_payments
dtypes: bool(6684), float64(12), int64(4)
memory usage: 649.6 MB


In [16]:
# ¿Cuántas filas únicas quedan?
len(X_dummies), len(X_dummies.drop_duplicates())


(100000, 100000)

In [16]:
y.to_csv("y_labels.csv", index=False)


In [17]:
X_dummies.to_csv("X_processed.csv", index=False)
#y.to_csv("y_labels.csv", index=False)


In [18]:
# Guardar en formato binario interno de pandas
X_dummies.to_pickle("X_processed.pkl")
y.to_frame(name="Credit_Score").to_pickle("y_labels.pkl")
