# Limpieza de datos
***

### Librerías

In [87]:
import os
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")#ignorar advertencias

### Carga de datos

In [88]:
# Load the original dataset from the project's Data folder and display it
raw_csv_path = "../Data/df_original_250519.csv"
df = pd.read_csv(raw_csv_path)
df

Unnamed: 0,id,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,deposit
0,1,59.0,admin.,married,secondary,no,2343,yes,no,unknown,5,may,1042,1,-1,0,unknown,yes
1,2,59.0,admin.,married,secondary,no,2343,yes,no,unknown,5,may,1042,1,-1,0,unknown,yes
2,3,56.0,admin.,married,secondary,no,45,no,no,unknown,5,may,1467,1,-1,0,unknown,yes
3,4,41.0,technician,married,secondary,no,1270,yes,no,unknown,5,may,1389,1,-1,0,unknown,yes
4,5,55.0,services,married,secondary,no,2476,yes,no,unknown,5,may,579,1,-1,0,unknown,yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16168,19679,47.0,blue-collar,married,secondary,no,1230,yes,no,telephone,12,nov,378,1,-1,0,unknown,yes
16169,7066,31.0,management,married,tertiary,no,927,no,no,cellular,18,aug,100,4,-1,0,unknown,no
16170,9824,37.0,management,single,tertiary,no,0,no,no,cellular,7,jul,643,3,-1,0,unknown,no
16171,18110,63.0,retired,married,secondary,no,2896,no,no,cellular,1,jul,304,1,119,2,success,yes


### Duplicados exactos

In [89]:
# Rows flagged by duplicated() repeat an earlier row in every column
# (the first occurrence of each group is not flagged).
dup_mask = df.duplicated()
duplicados = df[dup_mask]
print(f"Filas duplicadas exactas: {len(duplicados)}")
duplicados

Filas duplicadas exactas: 10


Unnamed: 0,id,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,deposit
16163,9609,26.0,blue-collar,married,secondary,no,895,yes,no,cellular,4,feb,820,3,-1,0,unknown,no
16164,1027,37.0,technician,divorced,secondary,no,3068,yes,yes,cellular,20,nov,370,2,195,1,other,yes
16165,18834,45.0,management,divorced,tertiary,no,1,no,no,cellular,6,aug,490,2,-1,0,unknown,yes
16166,3028,66.0,housemaid,married,primary,no,1230,no,no,cellular,17,dec,726,3,-1,0,unknown,yes
16167,4943,49.0,management,married,secondary,no,1623,yes,no,cellular,19,nov,1081,2,-1,0,unknown,yes
16168,19679,47.0,blue-collar,married,secondary,no,1230,yes,no,telephone,12,nov,378,1,-1,0,unknown,yes
16169,7066,31.0,management,married,tertiary,no,927,no,no,cellular,18,aug,100,4,-1,0,unknown,no
16170,9824,37.0,management,single,tertiary,no,0,no,no,cellular,7,jul,643,3,-1,0,unknown,no
16171,18110,63.0,retired,married,secondary,no,2896,no,no,cellular,1,jul,304,1,119,2,success,yes
16172,20260,38.0,blue-collar,married,primary,no,4592,yes,no,cellular,12,may,360,1,-1,0,unknown,no


Se han detectado 10 filas duplicadas exactas. Las eliminamos porque no aportan información nueva al análisis y pueden sesgar los resultados.

In [90]:
# Drop exact duplicates, keeping the first occurrence and the original index labels
df = df.drop_duplicates(keep='first', ignore_index=False)

# Confirm the removal by checking the new (rows, columns) shape
print(df.shape)

(16163, 18)


### Valores nulos

In [91]:
# Count the missing values in each column
df.isna().sum()

id            0
age          13
job           0
marital       9
education    10
default       0
balance       0
housing      20
loan          0
contact       0
day           0
month         0
duration      0
campaign      0
pdays         0
previous      0
poutcome      0
deposit       0
dtype: int64

Tratamiento de nulos (acordado en grupo):
- age --> media de `age`
- marital --> moda de `marital`
- education --> categoría `unknown`
- housing --> moda de `housing`

In [92]:
# Inspect the statistics that drive the imputation of missing values
age_mean = df['age'].mean()
marital_mode = df['marital'].mode()
education_values = df['education'].unique()
housing_mode = df['housing'].mode()

print(age_mean)          # 41.171269349845204 in the recorded output
print(marital_mode)      # 0    married --> there is a single mode
print(education_values)  # ['secondary' 'tertiary' 'primary' 'unknown' nan]
print(housing_mode)      # 0    no --> there is a single mode

41.171269349845204
0    married
Name: marital, dtype: object
['secondary' 'tertiary' 'primary' 'unknown' nan]
0    no
Name: housing, dtype: object


In [93]:
# Fill missing values:
#   age       -> column mean
#   marital   -> column mode
#   education -> 'unknown'
#   housing   -> column mode
# The result is assigned back to the column instead of calling
# df[col].fillna(..., inplace=True): that form modifies an intermediate Series
# and is not guaranteed to update df (with pandas Copy-on-Write it never does).
df['age'] = df['age'].fillna(df['age'].mean())
df['marital'] = df['marital'].fillna(df['marital'].mode()[0])
df['education'] = df['education'].fillna('unknown')
df['housing'] = df['housing'].fillna(df['housing'].mode()[0])

In [94]:
# Verify: every column should now report zero missing values
df.isna().sum()

id           0
age          0
job          0
marital      0
education    0
default      0
balance      0
housing      0
loan         0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
deposit      0
dtype: int64

### Convertir columnas booleanas

In [95]:
# Report every column whose non-null values are exclusively 'yes' / 'no'
yes_no_values = {'yes', 'no'}
for col in df.columns:
    observed = set(df[col].dropna().unique())
    if observed <= yes_no_values:
        print(f"La columna '{col}' contiene solo 'yes' y 'no'")

La columna 'default' contiene solo 'yes' y 'no'
La columna 'housing' contiene solo 'yes' y 'no'
La columna 'loan' contiene solo 'yes' y 'no'
La columna 'deposit' contiene solo 'yes' y 'no'


In [96]:
# Convert the binary 'yes'/'no' columns to booleans
bool_columns = ['default', 'housing', 'loan', 'deposit']
yes_no_map = {'yes': True, 'no': False}
for col in bool_columns:
    # Skip columns that are already boolean: re-running this cell would
    # otherwise map True/False (not keys of yes_no_map) to NaN.
    if df[col].dtype == bool:
        continue
    converted = df[col].map(yes_no_map)
    # Series.map yields NaN for any value missing from the mapping; stop here
    # instead of silently writing NaN into the DataFrame.
    if converted.isna().any():
        raise ValueError(
            f"Column '{col}' contains values other than 'yes'/'no'"
        )
    df[col] = converted.astype(bool)
print(df.dtypes)

id             int64
age          float64
job           object
marital       object
education     object
default         bool
balance        int64
housing         bool
loan            bool
contact       object
day            int64
month         object
duration       int64
campaign       int64
pdays          int64
previous       int64
poutcome      object
deposit         bool
dtype: object


### Corrección columna `age`

In [97]:
# Convert 'age' to integer.
# Values that cannot be parsed as numbers are coerced to NaN and replaced with 0.
# NOTE(review): 0 is not a plausible age — consider failing instead if any appear.
age_numeric = pd.to_numeric(df['age'], errors='coerce').fillna(0)
# Round before casting: astype(int) truncates toward zero, so fractional
# (mean-imputed) ages would otherwise always be rounded down.
df['age'] = age_numeric.round().astype(int)
print(df.dtypes)

id            int64
age           int64
job          object
marital      object
education    object
default        bool
balance       int64
housing        bool
loan           bool
contact      object
day           int64
month        object
duration      int64
campaign      int64
pdays         int64
previous      int64
poutcome     object
deposit        bool
dtype: object


### Corrección columna `month`

In [98]:
# Normalize 'month': trim surrounding whitespace, lowercase, then
# capitalize so that only the first letter is upper case
month_clean = df['month'].str.strip()
month_clean = month_clean.str.lower()
df['month'] = month_clean.str.capitalize()
df

Unnamed: 0,id,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,deposit
0,1,59,admin.,married,secondary,False,2343,True,False,unknown,5,May,1042,1,-1,0,unknown,True
1,2,59,admin.,married,secondary,False,2343,True,False,unknown,5,May,1042,1,-1,0,unknown,True
2,3,56,admin.,married,secondary,False,45,False,False,unknown,5,May,1467,1,-1,0,unknown,True
3,4,41,technician,married,secondary,False,1270,True,False,unknown,5,May,1389,1,-1,0,unknown,True
4,5,55,services,married,secondary,False,2476,True,False,unknown,5,May,579,1,-1,0,unknown,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16158,21379,82,housemaid,divorced,primary,False,1381,False,False,cellular,28,Jan,86,3,93,1,failure,False
16159,21380,35,blue-collar,married,unknown,False,1084,True,False,cellular,10,Jul,1019,1,-1,0,unknown,False
16160,21381,33,technician,married,secondary,True,72,True,False,unknown,6,May,623,1,-1,0,unknown,False
16161,21382,23,student,single,secondary,False,480,False,False,cellular,9,Feb,742,2,182,1,failure,True


### Crear columna `date`

In [99]:
# Build the 'date' column from 'day' and 'month'; the dataset carries no
# year, so 2024 is assumed.
day_text = df['day'].astype(str)
date_text = day_text + '-' + df['month'] + '-2024'  # e.g. '5-May-2024'
df['date'] = pd.to_datetime(date_text, dayfirst=True)
df

Unnamed: 0,id,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,deposit,date
0,1,59,admin.,married,secondary,False,2343,True,False,unknown,5,May,1042,1,-1,0,unknown,True,2024-05-05
1,2,59,admin.,married,secondary,False,2343,True,False,unknown,5,May,1042,1,-1,0,unknown,True,2024-05-05
2,3,56,admin.,married,secondary,False,45,False,False,unknown,5,May,1467,1,-1,0,unknown,True,2024-05-05
3,4,41,technician,married,secondary,False,1270,True,False,unknown,5,May,1389,1,-1,0,unknown,True,2024-05-05
4,5,55,services,married,secondary,False,2476,True,False,unknown,5,May,579,1,-1,0,unknown,True,2024-05-05
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16158,21379,82,housemaid,divorced,primary,False,1381,False,False,cellular,28,Jan,86,3,93,1,failure,False,2024-01-28
16159,21380,35,blue-collar,married,unknown,False,1084,True,False,cellular,10,Jul,1019,1,-1,0,unknown,False,2024-07-10
16160,21381,33,technician,married,secondary,True,72,True,False,unknown,6,May,623,1,-1,0,unknown,False,2024-05-06
16161,21382,23,student,single,secondary,False,480,False,False,cellular,9,Feb,742,2,182,1,failure,True,2024-02-09


### Guardar el DataFrame limpio

In [100]:
# Persist the cleaned DataFrame in the same Data folder as the original file.
# NOTE(review): index=False is not passed, so pandas also writes the row index
# as an unnamed first column — confirm downstream readers expect that.
cleaned_csv_path = "../Data/df_cleaned_250519.csv"
df.to_csv(cleaned_csv_path)