In [3]:
# Feature Pipeline Notebook
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
import joblib

# Load the raw sales data.
df = pd.read_csv('supermarket_sales.csv')

# Parse 'Date' into datetime so the .dt accessor can be used below.
df['Date'] = pd.to_datetime(df['Date'])

# Derive calendar features from 'Date'.
df['Day'] = df['Date'].dt.day
df['Month'] = df['Date'].dt.month
df['Year'] = df['Date'].dt.year
df['DayOfWeek'] = df['Date'].dt.dayofweek  # pandas convention: Monday=0 ... Sunday=6

# Convert 'Time' to minutes since midnight.
# The column is parsed once and the result reused, so the (potentially slow,
# warning-emitting) format inference is not repeated for the hour and the
# minute component.
# NOTE(review): passing an explicit `format=` to pd.to_datetime would make the
# parse deterministic and silence format-inference warnings; the actual layout
# of the 'Time' strings is not visible here - confirm before adding it.
time_parsed = pd.to_datetime(df['Time'])
df['Time'] = time_parsed.dt.hour * 60 + time_parsed.dt.minute

# Encode the categorical variables as integer codes.
# A separate LabelEncoder is fitted per column: re-fitting one shared encoder
# overwrites its classes_ on every iteration, so only the mapping of the last
# column would survive and the other columns could not be transformed or
# inverse-transformed later.
categorical_cols = ['Branch', 'City', 'Customer type', 'Gender', 'Product line', 'Payment']
label_encoders = {}
for col in categorical_cols:
    encoder = LabelEncoder()
    df[col] = encoder.fit_transform(df[col])
    label_encoders[col] = encoder

# Backward compatibility: `le` (and the 'label_encoder.joblib' artifact saved
# below) has always held the encoder as fitted on the last categorical column.
le = label_encoders[categorical_cols[-1]]

# Select the model features and the target.
# NOTE(review): 'Tax 5%' is presumably derived from the same amounts as
# 'Total', which would leak the target into the features - confirm.
features = ['Branch', 'City', 'Customer type', 'Gender', 'Product line', 'Unit price',
            'Quantity', 'Tax 5%', 'Payment', 'Day', 'Month', 'Year', 'DayOfWeek', 'Time']
target = 'Total'

X = df[features]
y = df[target]

# Split into training and test sets (80/20, fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Handle missing values (if any): fit the imputer on the training set only and
# apply the same statistics to the test set.
# The original index is carried over so the feature frames stay aligned with
# y_train / y_test, which keep the row labels produced by the split.
imputer = SimpleImputer(strategy='mean')
X_train = pd.DataFrame(imputer.fit_transform(X_train),
                       columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(imputer.transform(X_test),
                      columns=X_test.columns, index=X_test.index)

# Standardize the features, again fitting on the training set only.
scaler = StandardScaler()
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train),
                              columns=X_train.columns, index=X_train.index)
X_test_scaled = pd.DataFrame(scaler.transform(X_test),
                             columns=X_test.columns, index=X_test.index)

# Save the processed data (row labels are not written).
X_train_scaled.to_csv('X_train_scaled.csv', index=False)
X_test_scaled.to_csv('X_test_scaled.csv', index=False)
y_train.to_csv('y_train.csv', index=False)
y_test.to_csv('y_test.csv', index=False)

# Save the preprocessing objects for later use.
# 'label_encoder.joblib' is the legacy single-encoder artifact (last column
# only); 'label_encoders.joblib' holds the full {column name: encoder} mapping.
joblib.dump(le, 'label_encoder.joblib')
joblib.dump(label_encoders, 'label_encoders.joblib')
joblib.dump(imputer, 'imputer.joblib')
joblib.dump(scaler, 'scaler.joblib')

print("Feature pipeline completed. Processed data and preprocessing objects saved.")

Feature pipeline completed. Processed data and preprocessing objects saved.


  df['Time'] = pd.to_datetime(df['Time']).dt.hour * 60 + pd.to_datetime(df['Time']).dt.minute
  df['Time'] = pd.to_datetime(df['Time']).dt.hour * 60 + pd.to_datetime(df['Time']).dt.minute
