In [1]:
import pandas as pd
from datetime import datetime
import os

# Load the two raw inputs (customers first, then retail transactions).
customer_df, retail_df = (
    pd.read_csv(raw_csv_path)
    for raw_csv_path in (
        "../data/raw/customer_data.csv",
        "../data/raw/retail_data.csv",
    )
)


# Cleaning

Customer

In [2]:
# 1. signup_date: parse to datetime and force every value into [2020-01-01, now]
customer_df['signup_date'] = pd.to_datetime(customer_df['signup_date'], errors='coerce')
min_signup = pd.Timestamp("2020-01-01")
max_signup = pd.Timestamp.today()
# Vectorized clamp instead of a per-row lambda: clip() bounds the parsed dates and
# leaves missing values (NaT) alone, so the fillna() that follows maps unparseable
# or missing dates to the lower bound, exactly as the lambda did.
customer_df['signup_date'] = (
    customer_df['signup_date']
    .clip(lower=min_signup, upper=max_signup)
    .fillna(min_signup)
)

# Sanity check: no missing dates may remain after the clamp
assert customer_df['signup_date'].notnull().all(), "Aún hay NaT en signup_date"

# 2. email: make repeated addresses distinct by appending "_<row index label>"
# keep=False flags every member of a duplicate group, including the first one.
is_repeated_email = customer_df.duplicated(subset=['email'], keep=False)
repeated_emails = customer_df.loc[is_repeated_email, 'email']
# NOTE(review): the suffix lands after the domain (e.g. "a@b.com_12"), so the result
# is unique but no longer a deliverable address - confirm this is intended.
customer_df.loc[is_repeated_email, 'email'] = (
    repeated_emails + '_' + repeated_emails.index.astype(str)
)

# 3. phone: keep digits only; anything with fewer than 10 digits becomes a placeholder
# NOTE(review): astype(str) assumes the column was read as text; if pandas parsed it
# as a float, the trailing ".0" would contribute an extra digit - confirm the raw dtype.
phone_digits = customer_df['phone'].astype(str).str.replace(r'\D', '', regex=True)
has_enough_digits = phone_digits.str.len() >= 10
customer_df['phone'] = phone_digits.where(has_enough_digits, '0000000000')

# 4. age: force values into the 18-100 range (missing ages are left as they are)
customer_df['age'] = customer_df['age'].clip(lower=18, upper=100)

# 5. full_name: replace missing names with a placeholder
# Assign the result back instead of calling fillna(inplace=True) on the selected
# column: the in-place form operates on an intermediate object, which triggers a
# chained-assignment warning in recent pandas and does not update the DataFrame
# when Copy-on-Write is active.
customer_df['full_name'] = customer_df['full_name'].fillna('Unknown')

# Save the cleaned customer table
# The output folder is otherwise only created by a later cell, so on a fresh
# checkout this write would fail; create it here (no-op when it already exists).
os.makedirs("../data/clean", exist_ok=True)
customer_df.to_csv("../data/clean/customer_data_clean.csv", index=False)
print("Customer Data limpio listo")


Customer Data limpio listo


Retail

In [3]:
# Normalize column names: trim surrounding whitespace and lowercase them
retail_df.columns = retail_df.columns.str.strip().str.lower()

# 1. transaction_id: unique and non-null
# NOTE(review): drop_duplicates appears to treat missing ids as equal to each other,
# so only the first row with a missing id survives this step - confirm that dropping
# the others is intended.
retail_df.drop_duplicates(subset=['transaction_id'], inplace=True)
# Temporary placeholder for a missing id. Assigned back rather than filled in place
# on the selected column, which would act on an intermediate object and leave the
# DataFrame unchanged when Copy-on-Write is active.
retail_df['transaction_id'] = retail_df['transaction_id'].fillna(-1)

# 2. purchase_date: parse to datetime and force every value into [2025-01-01, today]
retail_df['purchase_date'] = pd.to_datetime(retail_df['purchase_date'], errors='coerce')
min_date = pd.Timestamp("2025-01-01")
max_date = pd.Timestamp.today().normalize()  # midnight at the start of today
# Vectorized clamp instead of a per-row lambda: clip() bounds the parsed dates and
# leaves missing values (NaT) alone; fillna() then maps unparseable or missing dates
# to the lower bound. The column keeps its datetime dtype, so no second
# pd.to_datetime pass is needed.
retail_df['purchase_date'] = (
    retail_df['purchase_date']
    .clip(lower=min_date, upper=max_date)
    .fillna(min_date)
)
# Sanity checks: nothing missing and nothing outside the allowed window
assert retail_df['purchase_date'].notnull().all(), "Aún hay NaT en purchase_date"
assert (retail_df['purchase_date'] >= min_date).all() and (retail_df['purchase_date'] <= max_date).all(), \
       "Algunos valores de purchase_date están fuera del rango"

# 3. amount: coerce to numeric, default unparseable/missing values to 0.01,
#    then bound the result to the 0.01-10000 range
amount_numeric = pd.to_numeric(retail_df['amount'], errors='coerce')
retail_df['amount'] = amount_numeric.fillna(0.01).clip(lower=0.01, upper=10000)

# 4. product_category: replace missing values and empty strings with a placeholder
# Assigned back in one chain rather than fillna(inplace=True) on the selected column,
# which would act on an intermediate object and leave the DataFrame unchanged when
# Copy-on-Write is active.
retail_df['product_category'] = (
    retail_df['product_category']
    .fillna('No specified')
    .replace("", "No specified")
)


In [4]:
# Post-cleaning report: structure and per-column null counts for both tables.
print("\n--- Customer Data ---")
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() emitted a stray "None" line after the report.
customer_df.info()
print("\nNulos por columna:")
print(customer_df.isnull().sum())

print("\n--- Retail Data ---")
retail_df.info()
print("\nNulos por columna:")
print(retail_df.isnull().sum())

print("\nLimpieza completada correctamente.")


--- Customer Data ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   id           1000 non-null   int64         
 1   full_name    1000 non-null   object        
 2   email        892 non-null    object        
 3   phone        1000 non-null   object        
 4   address      897 non-null    object        
 5   signup_date  1000 non-null   datetime64[ns]
 6   name         1000 non-null   object        
 7   gender       1000 non-null   object        
 8   age          1000 non-null   int64         
dtypes: datetime64[ns](1), int64(2), object(6)
memory usage: 70.4+ KB
None

Nulos por columna:
id               0
full_name        0
email          108
phone            0
address        103
signup_date      0
name             0
gender           0
age              0
dtype: int64

--- Retail Data ---
<class 'pandas.core.frame.DataFra

In [5]:
# Destination files for the cleaned datasets
customer_clean_path = "../data/clean/customer_data_clean.csv"
retail_clean_path = "../data/clean/retail_data_clean.csv"

# Make sure the output folder exists before anything is written into it
os.makedirs("../data/clean", exist_ok=True)

# Write both cleaned tables (customer first, then retail) without the index column
for cleaned_df, target_path in (
    (customer_df, customer_clean_path),
    (retail_df, retail_clean_path),
):
    cleaned_df.to_csv(target_path, index=False)

print(f"Cleaned customer data saved at: {customer_clean_path}")
print(f"Cleaned retail data saved at: {retail_clean_path}")

Cleaned customer data saved at: ../data/clean/customer_data_clean.csv
Cleaned retail data saved at: ../data/clean/retail_data_clean.csv
