# Limpieza de datos
***

In [49]:
import os
import pandas as pd
import numpy as np
df = pd.read_csv("C:/Users/astar/Desktop/simulacion empresarial/ProjecteData/Equip_9/Data/df_original_250519.csv")



In [50]:
# Verificar duplicados exactos
duplicados = df[df.duplicated()]
print(f"Filas duplicadas exactas: {len(duplicados)}")
duplicados.head()

Filas duplicadas exactas: 10


Unnamed: 0,id,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,deposit
16163,9609,26.0,blue-collar,married,secondary,no,895,yes,no,cellular,4,feb,820,3,-1,0,unknown,no
16164,1027,37.0,technician,divorced,secondary,no,3068,yes,yes,cellular,20,nov,370,2,195,1,other,yes
16165,18834,45.0,management,divorced,tertiary,no,1,no,no,cellular,6,aug,490,2,-1,0,unknown,yes
16166,3028,66.0,housemaid,married,primary,no,1230,no,no,cellular,17,dec,726,3,-1,0,unknown,yes
16167,4943,49.0,management,married,secondary,no,1623,yes,no,cellular,19,nov,1081,2,-1,0,unknown,yes


Se han detectado 10 filas duplicadas exactas, las eliminamos por que no aportan informacion nueva al analisis y pueden sesgar resultados

In [51]:
# Eliminar duplicados exactos
df_cleaned_250519 = df.drop_duplicates()


# Confirmar que se han eliminado
print(df_cleaned_250519.shape)



(16163, 18)


In [52]:
df = pd.read_csv("C:/Users/astar/Desktop/simulacion empresarial/ProjecteData/Equip_9/Data/df_original_250519.csv")
df_cleaned_250519 = df.drop_duplicates()

# Lista de columnas a limpiar
columnas_a_rellenar = ["education", "marital", "housing"]

# Diccionario para guardar modas y nulos
resumen_limpieza = {}

# Revisión y limpieza
for col in columnas_a_rellenar:
    nulos_antes = df_cleaned_250519[col].isna().sum()
    moda = df_cleaned_250519[col].mode()[0]
    df_cleaned_250519[col].fillna(moda, inplace=True)
    
    resumen_limpieza[col] = {
        "nulos_reemplazados": nulos_antes,
        "moda_usada": moda
    }

# Mostrar resultados
print("Resumen de limpieza de columnas con valores nulos:\n")
for col, info in resumen_limpieza.items():
    print(f"- {col}: {info['nulos_reemplazados']} nulos reemplazados con '{info['moda_usada']}'")




Resumen de limpieza de columnas con valores nulos:

- education: 10 nulos reemplazados con 'secondary'
- marital: 9 nulos reemplazados con 'married'
- housing: 20 nulos reemplazados con 'no'


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_cleaned_250519[col].fillna(moda, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned_250519[col].fillna(moda, inplace=True)


In [53]:
# Detectar columnas con solo 'yes' y 'no'
for col in df_cleaned_250519.columns:
    unique_vals = df_cleaned_250519[col].dropna().unique()
    if set(unique_vals).issubset({'yes', 'no'}):
        print(f"La columna '{col}' contiene solo 'yes' y 'no'")

# Convertir 'age' a entero (por si acaso contiene flotantes por error)
df_cleaned_250519['age'] = pd.to_numeric(df_cleaned_250519['age'], errors='coerce').fillna(0).astype(int)

# Convertir columnas binarias 'yes'/'no' a booleanas
bool_columns = ['default', 'housing', 'loan', 'deposit']
for col in bool_columns:
    df_cleaned_250519[col] = df_cleaned_250519[col].map({'yes': True, 'no': False})
print(df_cleaned_250519.dtypes)


La columna 'default' contiene solo 'yes' y 'no'
La columna 'housing' contiene solo 'yes' y 'no'
La columna 'loan' contiene solo 'yes' y 'no'
La columna 'deposit' contiene solo 'yes' y 'no'
id            int64
age           int32
job          object
marital      object
education    object
default        bool
balance       int64
housing        bool
loan           bool
contact      object
day           int64
month        object
duration      int64
campaign      int64
pdays         int64
previous      int64
poutcome     object
deposit        bool
dtype: object


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned_250519['age'] = pd.to_numeric(df_cleaned_250519['age'], errors='coerce').fillna(0).astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned_250519[col] = df_cleaned_250519[col].map({'yes': True, 'no': False})
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned_250519[c

In [54]:
# Asegurar que 'month' tenga solo la primera letra en mayúscula
df_cleaned_250519['month'] = df_cleaned_250519['month'].str.strip().str.capitalize()

# Crear columna 'date' combinando 'day' y 'month' (asumiendo año 2024)
df_cleaned_250519['date'] = pd.to_datetime(
    df_cleaned_250519['day'].astype(str) + '-' + df_cleaned_250519['month'] + '-2024',
    dayfirst=True
)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned_250519['month'] = df_cleaned_250519['month'].str.strip().str.capitalize()
  df_cleaned_250519['date'] = pd.to_datetime(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned_250519['date'] = pd.to_datetime(


In [55]:
df_cleaned_250519.to_csv("df.csv", index=False)