In [132]:
import pandas as pd
from datetime import datetime
import os

# Locations of the raw input files, relative to the notebook's working directory.
customer_raw_path = "../data/raw/customer_data.csv"
retail_raw_path = "../data/raw/retail_data.csv"

# Load both raw datasets; the cleaning cells below operate on these two frames.
customer_df = pd.read_csv(customer_raw_path)
retail_df = pd.read_csv(retail_raw_path)


# Cleaning

Customer

In [133]:
import pandas as pd
import re

# --- 1. Normalize column names ---
# Trim surrounding whitespace first, then lowercase, so the steps below can
# address columns by plain lowercase labels such as 'signup_date' and 'email'.
trimmed_columns = customer_df.columns.str.strip()
customer_df.columns = trimmed_columns.str.lower()
print("Nombres de columnas normalizados.")

# --- 2. signup_date: parse to datetime and keep only 2020-01-01 .. today ---
# Values that cannot be parsed become NaT (errors='coerce').
# NOTE(review): the recorded run dropped 732 rows at this step. If the raw column
# mixes several date formats, confirm the parser is not coercing valid dates to NaT.
customer_df['signup_date'] = pd.to_datetime(customer_df['signup_date'], errors='coerce')

min_signup = pd.Timestamp("2020-01-01")
max_signup = pd.Timestamp.today().normalize()  # midnight of the current day

# Blank out dates outside [min_signup, max_signup]. `between` is inclusive on both
# ends and is False for NaT, so unparseable values simply stay NaT.
signup_in_range = customer_df['signup_date'].between(min_signup, max_signup)
customer_df['signup_date'] = customer_df['signup_date'].where(signup_in_range)

# Drop the rows left without a usable signup_date.
invalid_signup = customer_df['signup_date'].isna().sum()
if invalid_signup > 0:
    print(f"Eliminando {invalid_signup} registros con signup_date inválido...")
    # .copy() yields an independent frame, so the column assignments in the
    # following steps do not trigger SettingWithCopyWarning.
    customer_df = customer_df.dropna(subset=['signup_date']).copy()

print("signup_date limpiado y validado correctamente.")

# --- 3. email: clean, validate and ensure uniqueness ---
# Pattern for a syntactically acceptable address: the local part must not start
# with a dot, the domain must not start with a hyphen, and the final label
# (top-level domain) needs at least two letters.
email_regex = (
    r"^(?!\.)[A-Za-z0-9._%+-]+@"
    r"(?!-)[A-Za-z0-9-]+(\.[A-Za-z0-9-]+)*\.[A-Za-z]{2,}$"
)


def fix_email(row):
    """Return the row's email if it is well formed, else a placeholder built from its id.

    Args:
        row: Mapping-like record (for example a DataFrame row) exposing the
            'email' and 'id' keys.

    Returns:
        The whitespace-stripped email when it matches ``email_regex``;
        otherwise the string ``unknown_<id>@domain.com``.
    """
    raw_email = row['email']
    # Check for missing values BEFORE converting to str: str(NaN) is the non-null
    # text "nan", so a null check made after the conversion can never fire.
    if pd.isna(raw_email):
        return f"unknown_{row['id']}@domain.com"

    email = str(raw_email).strip()
    if not re.match(email_regex, email):
        return f"unknown_{row['id']}@domain.com"
    return email

# Run every row through fix_email so each record ends up with a usable address.
customer_df['email'] = customer_df.apply(fix_email, axis=1)

# Make repeated addresses unique by appending "_<row index label>" to every member
# of a duplicate group (keep=False flags all occurrences, not only the repeats).
dup_emails = customer_df.duplicated(subset=['email'], keep=False)
if dup_emails.any():
    print(f"Se encontraron {dup_emails.sum()} emails duplicados, corrigiendo...")
    repeated_emails = customer_df.loc[dup_emails, 'email']
    index_labels = repeated_emails.index.astype(str)
    customer_df.loc[dup_emails, 'email'] = repeated_emails + "_" + index_labels

print("email limpiado, validado y sin duplicados correctamente.")

# --- 4. phone: keep digits only, require exactly 10 digits, ensure uniqueness ---
def _normalize_phone(value):
    """Return the digits of ``value`` as a 10-character string, or pd.NA otherwise.

    Whole-number floats are converted to int before stringifying. Without that,
    a phone stored as 5551234567.0 would become "55512345670" (11 digits) once
    the non-digit characters are stripped, and a valid number would be rejected.
    Missing values stringify to text with no digits and therefore yield pd.NA.
    """
    if isinstance(value, float) and value.is_integer():
        value = int(value)
    digits = re.sub(r'\D', '', str(value))
    return digits if len(digits) == 10 else pd.NA


customer_df['phone'] = customer_df['phone'].map(_normalize_phone)

# Drop the rows whose phone could not be reduced to exactly 10 digits.
invalid_phones = customer_df['phone'].isna().sum()
if invalid_phones > 0:
    print(f"Eliminando {invalid_phones} registros con teléfonos inválidos...")
    # .copy() yields an independent frame, so the column assignments below and in
    # later steps do not trigger SettingWithCopyWarning.
    customer_df = customer_df.dropna(subset=['phone']).copy()

# Make repeated phones unique by appending "_<row index label>" to every member of
# a duplicate group.
# NOTE(review): the suffixed values are no longer plain 10-digit numbers; confirm
# that downstream consumers expect this.
dup_phones = customer_df.duplicated(subset=['phone'], keep=False)
if dup_phones.any():
    print(f"Se encontraron {dup_phones.sum()} duplicados en phone, corrigiendo...")
    customer_df.loc[dup_phones, 'phone'] = (
        customer_df.loc[dup_phones, 'phone'] + "_" + customer_df.loc[dup_phones].index.astype(str)
    )

print("phone limpiado y validado correctamente.")

# --- 5. age: coerce to integer and force into the 18-100 range ---
# Unparseable or missing ages become 18 (the lower bound) before the integer cast;
# everything is then clamped to [18, 100].
age_numeric = pd.to_numeric(customer_df['age'], errors='coerce')
age_as_int = age_numeric.fillna(18).astype(int)
customer_df['age'] = age_as_int.clip(lower=18, upper=100)

print("age limpiado y validado correctamente.")

# --- 6. gender: normalize and validate ---
valid_genders = ["Male", "Female", "M", "F", "Other"]

# Stringify, trim and capitalize (first letter upper, rest lower) so variants such
# as " male" or "FEMALE" collapse onto the accepted spellings. Missing values turn
# into the text "Nan", which is not in the accepted list.
gender_normalized = customer_df['gender'].astype(str).str.strip().str.capitalize()

# Anything outside the accepted list becomes "Other". The result is assigned back
# instead of using a chained `fillna(..., inplace=True)`, which does not update the
# frame under pandas Copy-on-Write and was a no-op here anyway (no nulls remain
# after the string conversion).
customer_df['gender'] = gender_normalized.where(
    gender_normalized.isin(valid_genders), "Other"
)

print("gender limpiado y normalizado correctamente.")

# --- 7. address and full_name: fill missing values ---
# Assign the filled column back rather than calling fillna(..., inplace=True) on
# `customer_df[col]`: that chained in-place call acts on a temporary object and
# does not update the frame under pandas Copy-on-Write.
customer_df['address'] = customer_df['address'].fillna("Unknown Address")
customer_df['full_name'] = customer_df['full_name'].fillna("Unknown")

print("address y full_name limpiados correctamente.")

Nombres de columnas normalizados.
Eliminando 732 registros con signup_date inválido...
signup_date limpiado y validado correctamente.
email limpiado, validado y sin duplicados correctamente.
Eliminando 83 registros con teléfonos inválidos...
phone limpiado y validado correctamente.
age limpiado y validado correctamente.
gender limpiado y normalizado correctamente.
address y full_name limpiados correctamente.


Retail

In [134]:
# --- 1. Remove duplicated transaction_id rows ---
# keep="first" marks every repeat after the first occurrence of a transaction_id.
duplicados = retail_df.duplicated(subset=["transaction_id"], keep="first")
if duplicados.any():
    print(f"Se encontraron {duplicados.sum()} duplicados en 'transaction_id', eliminando...")
    # .copy() makes the filtered frame independent of the original, so the column
    # assignments in the following steps do not trigger SettingWithCopyWarning.
    retail_df = retail_df.loc[~duplicados].copy()

# --- 2. purchase_date: parse to datetime and force into 2025-01-01 .. today ---
# Values that cannot be parsed become NaT (errors='coerce').
retail_df['purchase_date'] = pd.to_datetime(retail_df['purchase_date'], errors='coerce')

min_date = pd.Timestamp("2025-01-01")
max_date = pd.Timestamp.today().normalize()

# Missing or too-early dates are set to min_date; dates after max_date are set to
# max_date. The too-early mask is applied last so it takes precedence, matching an
# "if too early ... elif too late ..." decision per value.
# NOTE(review): missing dates are imputed with min_date rather than dropped;
# confirm this is the intended treatment.
parsed_dates = retail_df['purchase_date']
missing_or_early = parsed_dates.isna() | (parsed_dates < min_date)
after_today = parsed_dates > max_date
retail_df['purchase_date'] = (
    parsed_dates.mask(after_today, max_date).mask(missing_or_early, min_date)
)

print("purchase_date limpiado y validado correctamente.")

# --- 3. amount: convert to numeric and force into the 0.01 - 10000 range ---
# Non-numeric values become NaN, NaN is replaced by the minimum (0.01), and the
# result is clamped to [0.01, 10000]. The chain is assigned back in one step
# instead of calling fillna(..., inplace=True) on `retail_df['amount']`, which
# acts on a temporary object and does not update the frame under pandas
# Copy-on-Write.
retail_df['amount'] = (
    pd.to_numeric(retail_df['amount'], errors='coerce')
    .fillna(0.01)
    .clip(lower=0.01, upper=10000)
)

print("amount limpiado y validado correctamente.")

# --- 4. product_category: fill missing and empty values ---
# Both NaN and the empty string are replaced by the same placeholder. The result is
# assigned back instead of using chained fillna/replace with inplace=True, which
# act on a temporary object and do not update the frame under pandas Copy-on-Write.
retail_df['product_category'] = (
    retail_df['product_category']
    .fillna('No specified')
    .replace("", "No specified")
)

print("product_category limpiado correctamente.")


Se encontraron 720 duplicados en 'transaction_id', eliminando...
purchase_date limpiado y validado correctamente.
amount limpiado y validado correctamente.
product_category limpiado correctamente.


In [135]:
# Summary of both cleaned frames: structure (info) and remaining nulls per column.
print("\n--- Customer Data ---")
# DataFrame.info() prints its report itself and returns None; wrapping it in
# print() would emit an extra "None" line after the report.
customer_df.info()
print("\nNulos por columna:")
print(customer_df.isnull().sum())

print("\n--- Retail Data ---")
retail_df.info()
print("\nNulos por columna:")
print(retail_df.isnull().sum())

print("\nLimpieza completada correctamente.")


--- Customer Data ---
<class 'pandas.core.frame.DataFrame'>
Index: 185 entries, 0 to 999
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   id           185 non-null    int64         
 1   full_name    185 non-null    object        
 2   email        185 non-null    object        
 3   phone        185 non-null    object        
 4   address      185 non-null    object        
 5   signup_date  185 non-null    datetime64[ns]
 6   name         185 non-null    object        
 7   gender       185 non-null    object        
 8   age          185 non-null    int64         
dtypes: datetime64[ns](1), int64(2), object(6)
memory usage: 14.5+ KB
None

Nulos por columna:
id             0
full_name      0
email          0
phone          0
address        0
signup_date    0
name           0
gender         0
age            0
dtype: int64

--- Retail Data ---
<class 'pandas.core.frame.DataFrame'>
Index: 100 entries,

In [136]:
# Renumber the rows of both frames from 0, discarding the gaps left by dropped rows.
for cleaned_frame in (customer_df, retail_df):
    cleaned_frame.reset_index(drop=True, inplace=True)

# Make sure the output folder exists before writing into it.
clean_dir = "../data/clean"
os.makedirs(clean_dir, exist_ok=True)

# Destination files for the cleaned datasets.
customer_clean_path = f"{clean_dir}/customer_data_clean.csv"
retail_clean_path = f"{clean_dir}/retail_data_clean.csv"

# Write the cleaned frames without the index column.
customer_df.to_csv(customer_clean_path, index=False)
retail_df.to_csv(retail_clean_path, index=False)

print(f"Cleaned customer data saved at: {customer_clean_path}")
print(f"Cleaned retail data saved at: {retail_clean_path}")

Cleaned customer data saved at: ../data/clean/customer_data_clean.csv
Cleaned retail data saved at: ../data/clean/retail_data_clean.csv
