In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split

# Preprocessing pipeline: load the sales CSV, clean it, encode categoricals,
# split into train/test partitions, and standardize the features.

# Load CSV; this file is read with ';' as the field separator.
df = pd.read_csv("Zara_Sales_Analysis.csv", delimiter=';')

# Drop columns treated as irrelevant or mostly constant for modelling.
columns_to_drop = ['Product ID', 'url', 'sku', 'description', 'brand', 'currency', 'scraped_at', 'terms']
df_clean = df.drop(columns=columns_to_drop)

# Drop every row that still has a missing value in any remaining column.
df_clean = df_clean.dropna()

# Replace 'name' with integer codes.
# NOTE(review): the codes follow sorted label order and carry no real ordinal
# meaning; consider dropping 'name' if the downstream model is distance- or
# coefficient-based.
le = LabelEncoder()
df_clean['name'] = le.fit_transform(df_clean['name'])

# One-hot encode the categorical columns; drop_first avoids a redundant
# (perfectly collinear) dummy per category.
categorical_cols = ['Product Position', 'Promotion', 'Seasonal', 'section', 'Product Category']
df_encoded = pd.get_dummies(df_clean, columns=categorical_cols, drop_first=True)

# Define features and target.
X = df_encoded.drop('Sales Volume', axis=1)
y = df_encoded['Sales Volume']

# Split BEFORE scaling so the held-out rows cannot influence the scaler's
# mean/variance estimates (fitting on the full data leaks test information).
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

# Fit the scaler on the training partition only, then apply the same
# transform to the test partition. The original index is preserved so that
# X_train/X_test stay label-aligned with y_train/y_test.
scaler = StandardScaler()
X_train = pd.DataFrame(
    scaler.fit_transform(X_train_raw),
    columns=X.columns,
    index=X_train_raw.index,
)
X_test = pd.DataFrame(
    scaler.transform(X_test_raw),
    columns=X.columns,
    index=X_test_raw.index,
)

# Full scaled feature matrix, kept for code that refers to X_scaled; it uses
# the train-fitted scaler and keeps X's index.
X_scaled = pd.DataFrame(scaler.transform(X), columns=X.columns, index=X.index)

# View shapes
print("X_train:", X_train.shape)
print("X_test:", X_test.shape)
print("y_train:", y_train.shape)
print("y_test:", y_test.shape)


X_train: (200, 7)
X_test: (51, 7)
y_train: (200,)
y_test: (51,)
