In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib
import numpy as np

# Step 1: Load the dataset
data = pd.read_csv('dataset.csv')

# Step 2: Data Preprocessing
# Convert Calendar_Week to datetime. errors='coerce' turns unparseable
# values into NaT so they are treated as missing by the fill step below.
data['Calendar_Week'] = pd.to_datetime(data['Calendar_Week'], errors='coerce')

# Handle missing values (fill forward).
# DataFrame.ffill() is the supported spelling; fillna(method='ffill') is
# deprecated since pandas 2.1. Note that forward fill cannot fill missing
# values in the very first row(s), since there is nothing before them.
data.ffill(inplace=True)

# Step 3: Exploratory Data Analysis (EDA)
# Plot Sales over time
plt.figure(figsize=(12, 6))
plt.plot(data['Calendar_Week'], data['Sales'], label='Sales', marker='o')
plt.title('Sales Over Time')
plt.xlabel('Date')
plt.ylabel('Sales')
plt.xticks(rotation=45)
plt.legend()
plt.show()

# Correlation Heatmap
# numeric_only=True restricts the matrix to numeric columns. Since pandas 2.0
# corr() no longer drops non-numeric columns silently and raises instead, which
# would happen here for columns such as 'Division' (dropped later as non-numeric).
correlation = data.corr(numeric_only=True)
plt.figure(figsize=(10, 8))
sns.heatmap(correlation, annot=True, cmap='coolwarm')
plt.title('Correlation Heatmap')
plt.show()

# Step 4: Feature Engineering
# Derive calendar features (month, ISO week number, day of month) from the
# Calendar_Week timestamps.
week_dates = data['Calendar_Week'].dt
data['Month'] = week_dates.month
data['Week'] = week_dates.isocalendar().week
data['Day'] = week_dates.day

# Express each channel's impressions as a ratio of Overall_Views.
# Mapping is {new ratio column: source impressions column}; insertion order
# determines the order in which the columns are appended to the frame.
ratio_sources = {
    'Google_Impression_Ratio': 'Google_Impressions',
    'Facebook_Impression_Ratio': 'Facebook_Impressions',
    'Email_Impression_Ratio': 'Email_Impressions',
}
for ratio_col, impressions_col in ratio_sources.items():
    data[ratio_col] = data[impressions_col] / data['Overall_Views']

# Step 5: Define Features and Target
# Features: everything except the target, the raw date and the non-numeric
# 'Division' column. 'Week' is cast to int64 here, once, on the full feature
# frame: assigning the converted column on the frames returned by
# train_test_split (as done previously) writes into derived slices and
# triggers pandas' SettingWithCopyWarning.
X = data.drop(columns=['Sales', 'Calendar_Week', 'Division']).astype({'Week': np.int64})
y = data['Sales']  # Target variable (Sales)

# Print feature names to see the order and names
print("Feature names used for training:")
print(X.columns.tolist())

# Step 6: Split the dataset into training and testing sets
# NOTE(review): this is a random (shuffled) split; if the rows are a weekly
# time series, confirm that a chronological split is not required.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Check the data types in X_train
print("Data types in X_train:")
print(X_train.dtypes)


# Step 7: Model Training (Linear Regression and Random Forest)
# fit() returns the fitted estimator, so each model is built and trained in
# a single expression.
lr_model = LinearRegression().fit(X_train, y_train)
rf_model = RandomForestRegressor(n_estimators=100, random_state=42).fit(
    X_train, y_train
)

# Step 8: Model Prediction
# Score the held-out test set with each fitted model (linear first, then forest).
y_pred_lr, y_pred_rf = (
    fitted.predict(X_test) for fitted in (lr_model, rf_model)
)

# Step 9: Model Evaluation (Linear Regression vs Random Forest)
# RMSE is computed as sqrt(MSE): the `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# whereas np.sqrt(mean_squared_error(...)) works on every version.

# Linear Regression
mae_lr = mean_absolute_error(y_test, y_pred_lr)
rmse_lr = np.sqrt(mean_squared_error(y_test, y_pred_lr))
r2_lr = r2_score(y_test, y_pred_lr)
print(f"Linear Regression - MAE: {mae_lr}, RMSE: {rmse_lr}, R2: {r2_lr}")

# Random Forest
mae_rf = mean_absolute_error(y_test, y_pred_rf)
rmse_rf = np.sqrt(mean_squared_error(y_test, y_pred_rf))
r2_rf = r2_score(y_test, y_pred_rf)
print(f"Random Forest - MAE: {mae_rf}, RMSE: {rmse_rf}, R2: {r2_rf}")

# Step 10: Feature Importance from Random Forest Model
importances = rf_model.feature_importances_
# argsort is ascending, so the most important feature gets the highest bar position.
indices = np.argsort(importances)
bar_positions = range(len(indices))
feature_labels = X.columns[indices].tolist()

# Horizontal bar chart: one bar per feature, labelled with its column name.
plt.figure(figsize=(10, 6))
plt.barh(bar_positions, importances[indices], align='center')
plt.yticks(bar_positions, feature_labels)
plt.title("Feature Importance")
plt.xlabel("Importance")
plt.show()

# Step 11: Hyperparameter Tuning for Random Forest
# Candidate values explored for tree count and maximum tree depth.
param_grid = {'n_estimators': [100, 200, 300], 'max_depth': [10, 20, 30]}

# Exhaustive search over the grid, scored by negative MSE with 3-fold
# cross-validation on the training split.
grid_search = GridSearchCV(
    rf_model,
    param_grid,
    cv=3,
    scoring='neg_mean_squared_error',
)
grid_search.fit(X_train, y_train)

# Keep the winning parameter combination and the estimator refit with it.
best_params, best_rf_model = (
    grid_search.best_params_,
    grid_search.best_estimator_,
)

# Step 12: Evaluate Best Model
# Score the tuned forest on the held-out test set. RMSE is computed as
# sqrt(MSE) because mean_squared_error's `squared=False` argument was
# deprecated in scikit-learn 1.4 and removed in 1.6.
y_pred_best_rf = best_rf_model.predict(X_test)
mae_best_rf = mean_absolute_error(y_test, y_pred_best_rf)
rmse_best_rf = np.sqrt(mean_squared_error(y_test, y_pred_best_rf))
r2_best_rf = r2_score(y_test, y_pred_best_rf)

print(f"Best Random Forest Model - MAE: {mae_best_rf}, RMSE: {rmse_best_rf}, R2: {r2_best_rf}")

# Step 13: Save the Best Model for Future Use
# Persist the tuned estimator to disk with joblib.
model_path = 'sales_prediction_model.pkl'
joblib.dump(best_rf_model, model_path)

# Round-trip check (optional): reload the file that was just written and
# predict on the test set with the reloaded estimator.
loaded_model = joblib.load(model_path)
y_pred_loaded_model = loaded_model.predict(X_test)

# Show the first five predictions produced by the reloaded model.
print(f"Loaded Model - First few predictions: {y_pred_loaded_model[:5]}")

import shap

# Step 14: Explain the model's predictions using SHAP
# Initialize the SHAP explainer for the tuned random forest, passing the
# training features as the second argument.
# NOTE(review): presumably X_train serves as SHAP's background/masker data;
# confirm against the installed shap version.
explainer = shap.Explainer(best_rf_model, X_train)

# Calculate SHAP values for the test set
shap_values = explainer(X_test)

# Step 15: Visualize the SHAP values
# Summary plot over the test-set SHAP values
shap.summary_plot(shap_values, X_test)

# Waterfall plot for a single prediction:
# the first instance in the test set is chosen for explanation.
# NOTE(review): initjs() looks like it is only needed for SHAP's
# JavaScript-based plots; verify whether the waterfall plot requires it.
shap.initjs()
shap.plots.waterfall(shap_values[0])
