In [None]:
#library imports
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import joblib

In [None]:
# Load the dataset.
# The CSV location can be overridden with the PET_SALES_CSV environment
# variable so the notebook is runnable on machines other than the author's.
# When the variable is unset, the original hard-coded path is used unchanged.
csv_path = os.environ.get(
    "PET_SALES_CSV",
    r"C:\Users\fedi\Documents\ESEN-HACK\enhanced_pet_sales.csv",
)
# 'order_date' is parsed to datetime here; later cells rely on its .dt accessor.
df = pd.read_csv(csv_path, parse_dates=['order_date'])

In [None]:
# Show the leading rows of the freshly loaded frame as a quick visual check
raw_preview = df.head()
print(" First 5 rows of raw data:")
print(raw_preview, "\n")

In [None]:
# Force 'sales' to a numeric dtype: entries that cannot be parsed become NaN
# and are then replaced by the mean of the remaining values.
# The filled Series is assigned back to the column rather than calling
# fillna(..., inplace=True) on df['sales']: that chained in-place form operates
# on an intermediate object, emits a FutureWarning on recent pandas 2.x, and
# leaves df unchanged once Copy-on-Write is active.
sales_numeric = pd.to_numeric(df['sales'], errors='coerce')
df['sales'] = sales_numeric.fillna(sales_numeric.mean())

In [None]:
# Force 'price' to a numeric dtype: entries that cannot be parsed become NaN
# and are then replaced by the median of the remaining values.
# The filled Series is assigned back to the column rather than calling
# fillna(..., inplace=True) on df['price']: that chained in-place form operates
# on an intermediate object, emits a FutureWarning on recent pandas 2.x, and
# leaves df unchanged once Copy-on-Write is active.
price_numeric = pd.to_numeric(df['price'], errors='coerce')
df['price'] = price_numeric.fillna(price_numeric.median())


In [None]:
# Cast each flag column to an integer dtype
# (presumably boolean inputs mapping to 0/1 — confirm against the CSV)
for flag_col in ('in_stock', 'holiday_season', 'is_discounted'):
    df[flag_col] = df[flag_col].astype(int)

In [None]:
# Coerce the remaining numeric-like columns to a numeric dtype.
# Unparseable entries become NaN and are replaced by the per-column default
# listed below (float 0.0 for rates/amounts, integer 0 for counts).
fill_defaults = {
    'discount_pct': 0.0,
    'avg_review_score': 0.0,
    'num_reviews': 0,
    'stock_left': 0,
    'ad_spend': 0.0,
    'click_through_rate': 0.0,
}
for col_name, fill_value in fill_defaults.items():
    coerced = pd.to_numeric(df[col_name], errors='coerce')
    df[col_name] = coerced.fillna(fill_value)

In [None]:
# Report the column dtypes now that the cleaning cells have run
cleaned_dtypes = df.dtypes
print(">>> Data types after cleaning:")
print(cleaned_dtypes, "\n")

In [None]:
# Derive calendar features from 'order_date', which was parsed to datetime at load time
order_dates = df['order_date']
df['month'] = order_dates.dt.month
df['weekday'] = order_dates.dt.weekday  # pandas convention: Monday=0 ... Sunday=6

In [None]:
# One-hot encode the categorical columns 'animal_type' and 'category'.
# drop_first=True omits the first level of each column; the remaining
# indicators are named '<column>_<level>' (e.g. 'animal_type_dog',
# 'category_Bed', 'category_Collar').
# dtype=int makes the indicators 0/1 integers regardless of the pandas
# version's default indicator dtype.
dummies = pd.get_dummies(df[['animal_type', 'category']], drop_first=True, dtype=int)
# Drop indicator columns left over from a previous run of this cell, so that
# re-executing it does not append duplicate columns (duplicates would inflate
# the feature matrix built by later cells).
stale_dummies = [col for col in dummies.columns if col in df.columns]
df = pd.concat([df.drop(columns=stale_dummies), dummies], axis=1)

In [None]:
# 4.1 Sales distribution: 30-bin histogram with a KDE overlay, saved to disk
fig_sales, ax_sales = plt.subplots(figsize=(8, 5))
sns.histplot(df['sales'], bins=30, kde=True, ax=ax_sales)
ax_sales.set_title("Distribution of Sales")
ax_sales.set_xlabel("Sales (units)")
ax_sales.set_ylabel("Count")
fig_sales.tight_layout()
fig_sales.savefig("hist_sales.png")
plt.close(fig_sales)  # release the figure; only the PNG on disk is kept
print("Saved plot → hist_sales.png")

In [None]:
# 4.2 Price spread per animal type, drawn as one box per 'animal_type' value
fig_price, ax_price = plt.subplots(figsize=(6, 4))
sns.boxplot(data=df, x='animal_type', y='price', ax=ax_price)
ax_price.set_title("Price Distribution by Animal Type")
ax_price.set_xlabel("Animal Type")
ax_price.set_ylabel("Price (USD)")
fig_price.tight_layout()
fig_price.savefig("boxplot_price_by_animal.png")
plt.close(fig_price)  # release the figure; only the PNG on disk is kept
print("Saved plot → boxplot_price_by_animal.png")

In [None]:
# 4.3 Correlation heatmap over the numeric features and the one-hot indicators.
#      Hand-picked numeric / flag columns:
numeric_cols = [
    'price',
    'month',
    'weekday',
    'is_discounted',
    'discount_pct',
    'avg_review_score',
    'num_reviews',
    'in_stock',
    'stock_left',
    'ad_spend',
    'click_through_rate',
    'holiday_season',
]
#      Indicator columns are discovered by name prefix rather than listed by hand.
dummy_prefixes = ('animal_type_', 'category_')
dummy_cols = [col for col in df.columns if col.startswith(dummy_prefixes)]
all_corr_cols = [*numeric_cols, *dummy_cols]

corr_matrix = df[all_corr_cols].corr()

fig_corr, ax_corr = plt.subplots(figsize=(12, 10))
heatmap_style = dict(annot=True, fmt=".2f", cmap="viridis", linewidths=0.5)
sns.heatmap(corr_matrix, ax=ax_corr, **heatmap_style)
ax_corr.set_title("Feature Correlation Matrix")
fig_corr.tight_layout()
fig_corr.savefig("corr_matrix.png")
plt.close(fig_corr)  # release the figure; only the PNG on disk is kept
print("Saved plot → corr_matrix.png\n")

In [None]:
# Assemble the modelling inputs: the regression target y and the predictor matrix X.
# The predictors are exactly the columns used for the correlation matrix.
feature_cols = all_corr_cols
y = df['sales']
X = df[feature_cols]


In [None]:
# Sanity check on the feature matrix.
# X is built by selecting feature_cols, so testing membership in X.columns
# alone can never fail. Instead:
#   1. confirm every feature name exists in the source frame df, and
#   2. confirm X's columns match feature_cols one-to-one. A mismatch means df
#      carries duplicate column names (e.g. the one-hot cell was run twice),
#      which would misalign feature names and importances later on.
missing_cols = [col for col in feature_cols if col not in df.columns]
if missing_cols:
    raise ValueError(f"Missing columns in X: {missing_cols}")
if list(X.columns) != list(feature_cols):
    raise ValueError(
        "Feature matrix columns do not match feature_cols one-to-one "
        "(duplicate column names in df?): "
        f"{X.shape[1]} columns in X vs {len(feature_cols)} feature names"
    )
print("✅ All feature columns are present.\n")

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
split_parts = train_test_split(X, y, test_size=0.20, random_state=42)
X_train, X_test, y_train, y_test = split_parts

n_train_rows, n_test_rows = len(X_train), len(X_test)
print(f"Training set size:  {n_train_rows} rows")
print(f"Testing set size:   {n_test_rows} rows\n")

In [None]:
#  MODEL SELECTION & HYPERPARAMETER TUNING (RANDOM FOREST)
# Candidate settings: 2 forest sizes x 3 depth limits = 6 combinations
param_grid = {
    'n_estimators': [50, 100],
    'max_depth': [5, 10, None],
}

rf = RandomForestRegressor(random_state=42)

# 3-fold cross-validation scored by negative MSE, run on all available cores
search_settings = dict(
    cv=3,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    verbose=1,
)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, **search_settings)

print("Starting GridSearchCV for RandomForestRegressor...")
grid_search.fit(X_train, y_train)

# Keep the winning estimator for evaluation, plotting and serialization
best_forest = grid_search.best_estimator_
print("✔︎ Best hyperparameters:", grid_search.best_params_, "\n")

In [None]:
#  EVALUATE ON TEST SET
# Predict the held-out rows, then score with mean squared error and R²
y_pred = best_forest.predict(X_test)

mse_test, r2_test = (
    metric(y_test, y_pred) for metric in (mean_squared_error, r2_score)
)

print(f"MSE on test set:  {mse_test:.2f}")
print(f"R² on test set:   {r2_test:.2f}\n")

In [None]:
#  PLOT FEATURE IMPORTANCES (SEABORN BARPLOT)
# Pair each feature name with its importance and rank from most to least important
importances = best_forest.feature_importances_
importance_df = (
    pd.DataFrame({'feature': feature_cols, 'importance': importances})
    .sort_values(by='importance', ascending=False)
)

fig_imp, ax_imp = plt.subplots(figsize=(8, 6))
# NOTE(review): newer seaborn releases may warn when `palette` is passed
# without `hue` — confirm against the installed seaborn version.
sns.barplot(
    data=importance_df,
    x='importance',
    y='feature',
    palette='magma',
    ax=ax_imp,
)
ax_imp.set_title("Feature Importances (Random Forest)")
ax_imp.set_xlabel("Relative Importance")
ax_imp.set_ylabel("Feature")
fig_imp.tight_layout()
fig_imp.savefig("feature_importances_rf.png")
plt.close(fig_imp)  # release the figure; only the PNG on disk is kept
print("Saved plot → feature_importances_rf.png\n")

In [None]:
 # PLOT ACTUAL VS. PREDICTED (SCATTER WITH SEABORN)
# Scatter of true vs. predicted sales with a dashed y = x reference line
fig_fit, ax_fit = plt.subplots(figsize=(6, 6))
sns.scatterplot(x=y_test, y=y_pred, alpha=0.6, edgecolor=None, ax=ax_fit)

# The reference line spans the observed range of the true values
sales_range = [y_test.min(), y_test.max()]
ax_fit.plot(sales_range, sales_range, 'k--')

ax_fit.set_title("Actual vs. Predicted Sales")
ax_fit.set_xlabel("True Sales")
ax_fit.set_ylabel("Predicted Sales")
fig_fit.tight_layout()
fig_fit.savefig("actual_vs_predicted_rf.png")
plt.close(fig_fit)  # release the figure; only the PNG on disk is kept
print("Saved plot → actual_vs_predicted_rf.png\n")

In [None]:
# Persist the tuned forest to disk with joblib so it can be reloaded later
model_filename = "rf_sales_model.joblib"
joblib.dump(value=best_forest, filename=model_filename)
print(f"✔︎ Saved trained model → {model_filename}\n")


In [None]:
# Report the rows holding the highest and the lowest 'sales' value in df
report_cols = ['product_name', 'animal_type', 'category', 'sales']
top_idx = df['sales'].idxmax()
worst_idx = df['sales'].idxmin()

for heading, row_label in (
    (" Top‐selling product (true sales):", top_idx),
    ("  Lowest‐selling product (true sales):", worst_idx),
):
    print(heading)
    print(df.loc[row_label, report_cols], "\n")