In [6]:


import warnings

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Step 1: Load CSV and clean column names
# The file is read as semicolon-delimited, so the separator is passed explicitly
# (pandas would otherwise assume a comma).
df = pd.read_csv('zara.csv', sep=';')
# Strip leading/trailing whitespace from the header so the exact-name lookups
# below (drop list, target column) match.
df.columns = df.columns.str.strip()

# Step 2: Drop irrelevant columns
drop_cols = ['Product ID', 'url', 'sku', 'name', 'description', 'currency', 'scraped_at', 'terms', 'section']
# errors='ignore' skips any listed column that is absent from the file instead of
# raising, which is what the manual membership filter used to do.
df = df.drop(columns=drop_cols, errors='ignore')

# Step 3: Define features and target
target = 'Sales Volume'
X = df.drop(columns=[target])
y = df[target]

# Step 4: Separate feature types
# 'category' columns are treated as categorical alongside plain object columns.
categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
# 'number' matches every numeric dtype (e.g. int32, float32), not only int64/float64,
# so narrower numeric columns are no longer left out.
numerical_cols = X.select_dtypes(include='number').columns.tolist()

# A column that lands in neither list (e.g. bool or datetime) is not routed to any
# preprocessing step downstream; make that visible instead of losing it silently.
unassigned_cols = [
    col for col in X.columns
    if col not in categorical_cols and col not in numerical_cols
]
if unassigned_cols:
    warnings.warn(
        f"Columns with unhandled dtypes will be excluded from preprocessing: {unassigned_cols}"
    )

# Step 5: Per-type preprocessing
# Numeric features: fill missing values with the column mean, then standardise.
numerical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical features: fill missing values with the most frequent value, then
# one-hot encode; handle_unknown='ignore' keeps transform from raising on
# categories that were not present when the encoder was fitted.
categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])

# Step 6: Route each column group to its pipeline.
# Transformer order (categorical first, numeric second) fixes the column order
# of the transformed output.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_pipeline, categorical_cols),
        ('num', numerical_pipeline, numerical_cols),
    ]
)

# Step 7: Wrap the preprocessor in a pipeline and hold out 20% of the rows.
# random_state is fixed so the split is reproducible across runs.
pipeline = Pipeline(steps=[('preprocessor', preprocessor)])

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# Step 8: Fit the preprocessing on the training rows only, then reuse the fitted
# transformers on the test rows so no test-set statistics leak into the fit.
X_train_prepared = pipeline.fit_transform(X_train)
X_test_prepared = pipeline.transform(X_test)

print("✅ Data preparation complete.")
for label, prepared in (
    ("X_train_prepared shape:", X_train_prepared),
    ("X_test_prepared shape:", X_test_prepared),
):
    print(label, prepared.shape)




✅ Data preparation complete.
X_train_prepared shape: (201, 10)
X_test_prepared shape: (51, 10)
