In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_validate
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.metrics import mean_squared_error

In [None]:
# Load the three raw tables (paths are relative to the notebook's working directory).
features, sales, stores = (
    pd.read_csv(csv_path)
    for csv_path in ('Features data set.csv', 'sales data-set.csv', 'stores data-set.csv')
)

In [None]:
# Remove the features-side 'IsHoliday' before merging so the merge does not create
# suffixed duplicate columns; the 'IsHoliday' used downstream then comes from the
# other tables. errors='ignore' keeps this cell re-runnable: on a second run the
# column is already gone and a plain drop would raise KeyError.
features = features.drop(columns=['IsHoliday'], errors='ignore')

# Default (inner) merges: sales rows x per-store/date features x per-store metadata.
combined = (
    sales
    .merge(features, on=['Store', 'Date'])
    .merge(stores, on=['Store'])
)

combined.head()


In [None]:
# Column dtypes and non-null counts of the merged table, before missing values are filled.
combined.info()

In [None]:
# Replace every missing value in the merged table with 0.
combined = combined.fillna(0)

In [None]:
# Re-check non-null counts now that missing values have been replaced with 0.
combined.info()

In [None]:
# Grouping keys: every column except the ones that get summed.
group_keys = [
    'Store', 'Date', 'IsHoliday', 'Temperature', 'Fuel_Price',
    'MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5',
    'CPI', 'Unemployment', 'Type', 'Size',
]

# Sum the remaining columns within each key combination, then discard the
# summed 'Dept' column, which is not used afterwards.
merged_df = (
    combined
    .groupby(group_keys, as_index=False)
    .sum()
    .drop(columns=['Dept'])
)
print(merged_df.head())

In [None]:
# Parse 'Date' (day-first format) and derive calendar features from it,
# then order the rows by store and date.
merged_df = (
    merged_df
    .assign(Date=lambda frame: pd.to_datetime(frame['Date'], dayfirst=True))
    .assign(
        Year=lambda frame: frame['Date'].dt.year,
        Month=lambda frame: frame['Date'].dt.month,
        # isocalendar() yields a nullable UInt32 week; cast to a plain int32 column
        WeekOfYear=lambda frame: frame['Date'].dt.isocalendar().week.astype(np.int32),
        DayOfWeek=lambda frame: frame['Date'].dt.dayofweek,
    )
    .sort_values(by=['Store', 'Date'])
)
print(merged_df.head())

In [None]:
# Every row starts with weight 1; rows falling in a holiday window are raised to 5 below.
merged_df['Holiday_Weight'] = 1

# Compute the date parts once instead of once per mask.
holiday_dates = merged_df['Date']
holiday_month = holiday_dates.dt.month
holiday_iso_week = holiday_dates.dt.isocalendar().week

# NOTE(review): this mask additionally requires the row's date to be a Sunday
# (dayofweek == 6). If the data carries a single fixed weekday per week that is
# not Sunday, this mask never matches -- confirm against the data.
super_bowl_week = (holiday_month == 2) & (holiday_iso_week <= 6) & (holiday_dates.dt.dayofweek == 6)
labor_day_week = (holiday_month == 9) & (holiday_iso_week == 36)
thanksgiving_week = (holiday_month == 11) & (holiday_iso_week == 47)
# Run-up to Christmas: 1-24 December. The previous condition compared the ISO
# *week* to 24, but December dates have ISO weeks 48-53 (or 1), so it could
# never select the pre-Christmas weeks.
christmas_period = (holiday_month == 12) & (holiday_dates.dt.day <= 24)

holiday_rows = super_bowl_week | labor_day_week | thanksgiving_week | christmas_period
merged_df.loc[holiday_rows, 'Holiday_Weight'] = 5

# Holiday-weighted target: sales scaled by the weight of the week they fall in.
merged_df['Weighted_Weekly_Sales'] = merged_df['Weekly_Sales'] * merged_df['Holiday_Weight']

merged_df.head()

In [None]:
# Replace the 'Type' labels with integer codes; the encoder object is kept
# under the name `label_encoder` because later cells reuse it.
label_encoder = LabelEncoder()
label_encoder.fit(merged_df['Type'])
merged_df['Type'] = label_encoder.transform(merged_df['Type'])
merged_df.info()


In [None]:
# Column every feature is correlated against. All printed/plotted labels are
# derived from it, so they cannot drift from what is actually computed (the
# labels previously said "Weighted Weekly Sales" while the correlation used
# 'Weekly_Sales'). Change this one line to analyse the weighted target instead.
target_col = 'Weekly_Sales'
target_label = target_col.replace('_', ' ')

# Numeric candidate features only; the sales columns and the holiday weight
# they are built from are excluded.
numeric_df = merged_df.select_dtypes(include=[np.number]).drop(
    columns=['Weekly_Sales', 'Weighted_Weekly_Sales', 'Holiday_Weight']
)

# Correlation of each feature with the target, strongest positive first
correlation = numeric_df.corrwith(merged_df[target_col]).sort_values(ascending=False)

print(f"Correlation of each feature with {target_col}:")
print(correlation)

plt.figure(figsize=(10, 6))
sns.heatmap(correlation.to_frame(), annot=True, cmap="coolwarm", cbar=True, fmt=".2f")
plt.title(f"Correlation with {target_label}")
plt.xlabel(target_label)
plt.ylabel("Features")
plt.tight_layout()
plt.show()

In [None]:
pca = PCA()
scaler = StandardScaler()

# Scale a copy so the unscaled merged_df stays available.
combined_scaled = merged_df.copy()

# Columns to standardise. The scaler is re-fitted on one column at a time, in
# this order, so after the loop it holds the fit for the last column ('Type').
columns_to_scale = [
    'Weekly_Sales', 'Temperature', 'Fuel_Price',
    'MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5',
    'CPI', 'Unemployment', 'Size', 'Weighted_Weekly_Sales', 'Type',
]
for column in columns_to_scale:
    combined_scaled[column] = scaler.fit_transform(combined_scaled[[column]])
combined_scaled.head()

# PCA over the scaled explanatory features only (both sales columns are left
# out). Which features are selected will likely need tweaking.
pca_columns = [
    'Temperature', 'Fuel_Price',
    'MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5',
    'CPI', 'Unemployment', 'Size', 'Type',
]
pca.fit(combined_scaled[pca_columns])

print(pca.explained_variance_ratio_)

# Scree plot: variance explained by each principal component, numbered from 1.
PC_values = np.arange(1, pca.n_components_ + 1)
plt.plot(PC_values, pca.explained_variance_ratio_, 'o-', linewidth=2, color='blue')
plt.title('Scree Plot')
plt.xlabel('Principal Component')
plt.ylabel('Variance Explained')
plt.show()

In [None]:
# Features: everything except the date and the sales / weight columns.
# Target: the holiday-weighted weekly sales.
non_feature_columns = ['Weekly_Sales', 'Date', 'Holiday_Weight', 'Weighted_Weekly_Sales']
X = merged_df.drop(columns=non_feature_columns)
X['Type'] = label_encoder.fit_transform(X['Type'])

y = merged_df['Weighted_Weekly_Sales']

# 10-fold cross-validation of a random forest; the fitted estimator of every
# fold is kept so its feature importances can be inspected.
rf_model = RandomForestRegressor(n_estimators=100, random_state=123)
output = cross_validate(
    rf_model,
    X,
    y,
    cv=10,
    scoring='r2',
    return_estimator=True,
)

# Print the importance ranking of each fold's estimator
for idx, estimator in enumerate(output['estimator']):
    print(f"Features sorted by their score for estimator {idx}:")
    importance_table = pd.DataFrame({
        'feature': X.columns,
        'importance': estimator.feature_importances_,
    })
    feature_importances = importance_table.sort_values('importance', ascending=False)
    print(feature_importances)

In [None]:

# Same feature/target definition as the cross-validation cell, but the forest
# is fitted once on all rows.
non_feature_columns = ['Weekly_Sales', 'Date', 'Holiday_Weight', 'Weighted_Weekly_Sales']
X = merged_df.drop(columns=non_feature_columns)
X['Type'] = label_encoder.fit_transform(X['Type'])

y = merged_df['Weighted_Weekly_Sales']

rf_model = RandomForestRegressor(n_estimators=100, random_state=123)
rf_model.fit(X, y)

# Importance of every feature, most important first
importance_table = pd.DataFrame({
    'Feature': X.columns,
    'Importance': rf_model.feature_importances_,
})
feature_importances = importance_table.sort_values(by='Importance', ascending=False)

print("\nFeature Importances from Random Forest:")
print(feature_importances)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, StratifiedGroupKFold
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import numpy as np


# Linear regression on the raw weekly sales.
# 'Weighted_Weekly_Sales' is Weekly_Sales * Holiday_Weight, i.e. a direct
# function of the target, so it must not be a feature: leaving it in X leaks
# the answer and makes the reported MSE / R-squared meaningless.
X = merged_df.drop(columns=['WeekOfYear', 'Date', 'Weekly_Sales', 'DayOfWeek', 'Weighted_Weekly_Sales'])
y = merged_df['Weekly_Sales']

# Encode the remaining non-numeric columns as integer codes
label_encoder = LabelEncoder()
X['Type'] = label_encoder.fit_transform(X['Type'])
X['IsHoliday'] = label_encoder.fit_transform(X['IsHoliday'])

linear_regression_model = LinearRegression()

# Random 80/20 hold-out split (fixed seed for reproducibility)
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
linear_regression_model.fit(x_train, y_train)
y_pred = linear_regression_model.predict(x_test)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")

plt.figure(figsize=(10, 6))
plt.scatter(y_test, y_pred, alpha=0.5)
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--')  # Line of perfect prediction
plt.xlabel("Actual Weekly Sales")
plt.ylabel("Predicted Weekly Sales")
plt.title("Actual vs. Predicted Weekly Sales")
plt.grid(True)
plt.show()


# Fitted coefficients, in the column order of X
print(linear_regression_model.coef_)

In [None]:
# Column dtypes and non-null counts of the engineered per-store, per-date table.
merged_df.info()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, StratifiedGroupKFold
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import numpy as np


# Linear regression on the holiday-weighted weekly sales.
# The target is Weekly_Sales * Holiday_Weight, so the raw 'Weekly_Sales' column
# must not be a feature: leaving it in X hands the model the target (up to the
# weight) and makes the reported MSE / R-squared meaningless.
X = merged_df.drop(columns=['WeekOfYear', 'Date', 'Weighted_Weekly_Sales', 'Weekly_Sales'])
y = merged_df['Weighted_Weekly_Sales']

label_encoder = LabelEncoder()
X['Type'] = label_encoder.fit_transform(X['Type'])

linear_regression_model = LinearRegression()

# Random 80/20 hold-out split (fixed seed for reproducibility)
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
linear_regression_model.fit(x_train, y_train)
y_pred = linear_regression_model.predict(x_test)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")

plt.figure(figsize=(10, 6))
plt.scatter(y_test, y_pred, alpha=0.5)
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--')  # Line of perfect prediction
plt.xlabel("Actual Weighted Weekly Sales")
plt.ylabel("Predicted Weighted Weekly Sales")
# Title now names the weighted target, matching the axis labels
plt.title("Actual vs. Predicted Weighted Weekly Sales")
plt.grid(True)
plt.show()


# Fitted coefficients, in the column order of X
print(linear_regression_model.coef_)

In [None]:

# Date plus both sales columns, on a fresh 0..n-1 index
store_df = merged_df[['Date', 'Weekly_Sales', 'Weighted_Weekly_Sales']].reset_index(drop=True)

# One row per date: weekly sales summed over all rows sharing that date,
# indexed by the date.
unweighted_store_df = (
    store_df[['Date', 'Weekly_Sales']]
    .groupby('Date', as_index=False)['Weekly_Sales']
    .sum()
    .set_index('Date')
)

unweighted_store_df.head()

y = unweighted_store_df['Weekly_Sales']

# Positional 80/20 split: first 80% of the dates train, the rest test
train_size = int(len(y) * 0.8)
y_train = y.iloc[:train_size]
y_test = y.iloc[train_size:]

sarima_order = (1, 1, 1)
seasonal_order = (1, 1, 1, 52)  # Assuming weekly seasonality (adjust if needed)

model = SARIMAX(y_train, order=sarima_order, seasonal_order=seasonal_order)
sarima_fit = model.fit(disp=False)

# Forecast as many steps as there are test observations
forecast = sarima_fit.get_forecast(steps=len(y_test))
y_pred = forecast.predicted_mean
conf_int = forecast.conf_int()

mse = mean_squared_error(y_test, y_pred)
correlation = np.corrcoef(y_test, y_pred)[0, 1]  # Pearson correlation of actual vs forecast

print(f"Mean Squared Error: {mse}")
print(f"Correlation: {correlation}")

# Training data, held-out actuals, forecast and its confidence band on one axis
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(y_train.index, y_train, label='Training Data', color='blue')
ax.plot(y_test.index, y_test, label='Actual Test Data', color='green')
ax.plot(y_test.index, y_pred, label='SARIMA Forecast', color='orange')
ax.fill_between(y_test.index, conf_int.iloc[:, 0], conf_int.iloc[:, 1], color='pink', alpha=0.3)

ax.set_title('SARIMA: Actual vs Predicted Weekly Sales')
ax.set_xlabel('Date')
ax.set_ylabel('Weekly Sales')
ax.legend()
ax.grid(True)

# Correlation annotated in the upper-left corner (axes coordinates)
ax.text(0.05, 0.95, f"Correlation: {correlation:.2f}", fontsize=12,
        transform=ax.transAxes, color='darkred', bbox=dict(facecolor='white', alpha=0.6))

plt.show()


In [None]:

# Date plus both sales columns, on a fresh 0..n-1 index
store_df = merged_df[['Date', 'Weekly_Sales', 'Weighted_Weekly_Sales']].reset_index(drop=True)

# One row per date: holiday-weighted sales summed over all rows sharing that
# date, indexed by the date.
weighted_store_df = (
    store_df[['Date', 'Weighted_Weekly_Sales']]
    .groupby('Date', as_index=False)['Weighted_Weekly_Sales']
    .sum()
    .set_index('Date')
)

weighted_store_df.head()

y = weighted_store_df['Weighted_Weekly_Sales']

# Positional 80/20 split: first 80% of the dates train, the rest test
train_size = int(len(y) * 0.8)
y_train = y.iloc[:train_size]
y_test = y.iloc[train_size:]

sarima_order = (1, 1, 1)
seasonal_order = (1, 1, 1, 52)  # Assuming weekly seasonality (adjust if needed)

model = SARIMAX(y_train, order=sarima_order, seasonal_order=seasonal_order)
sarima_fit = model.fit(disp=False)

# Forecast as many steps as there are test observations
forecast = sarima_fit.get_forecast(steps=len(y_test))
y_pred = forecast.predicted_mean
conf_int = forecast.conf_int()

mse = mean_squared_error(y_test, y_pred)
correlation = np.corrcoef(y_test, y_pred)[0, 1]  # Pearson correlation of actual vs forecast

print(f"Mean Squared Error: {mse}")
print(f"Correlation: {correlation}")

# Training data, held-out actuals, forecast and its confidence band on one axis
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(y_train.index, y_train, label='Training Data', color='blue')
ax.plot(y_test.index, y_test, label='Actual Test Data', color='green')
ax.plot(y_test.index, y_pred, label='SARIMA Forecast', color='orange')
ax.fill_between(y_test.index, conf_int.iloc[:, 0], conf_int.iloc[:, 1], color='pink', alpha=0.3)

ax.set_title('SARIMA: Actual vs Predicted Weighted Weekly Sales')
ax.set_xlabel('Date')
ax.set_ylabel('Weekly Sales')
ax.legend()
ax.grid(True)

# Correlation annotated in the upper-left corner (axes coordinates)
ax.text(0.05, 0.95, f"Correlation: {correlation:.2f}", fontsize=12,
        transform=ax.transAxes, color='darkred', bbox=dict(facecolor='white', alpha=0.6))

plt.show()


In [None]:
# Test with using multiple variables: SARIMAX on weekly sales with 'Size' as an
# exogenous regressor.

# Build new frames instead of aliasing merged_df and sorting / re-indexing it
# in place: the in-place version removed the 'Date' column from merged_df
# itself, which broke re-running this cell and every earlier cell that reads
# merged_df['Date'].
store_df = merged_df.sort_values(by='Date')          # date-ordered copy, 'Date' still a column
unweighted_store_df = store_df.set_index('Date')     # same rows, indexed by date

unweighted_store_df.head()

# NOTE(review): this series is not aggregated per date, so the index can hold
# repeated dates and the seasonal period of 52 counts rows, not weeks -- confirm
# this is intended.
y = unweighted_store_df['Weekly_Sales']

# Positional 80/20 split over the date-ordered rows
train_size = int(len(y) * 0.8)
y_train, y_test = y.iloc[:train_size], y.iloc[train_size:]


sarima_order = (1, 1, 1)
seasonal_order = (1, 1, 1, 52)  # Assuming weekly seasonality (adjust if needed)

# Exogenous regressor, split at the same position as the target
exog = unweighted_store_df[['Size']]
exog_train, exog_test = exog.iloc[:train_size], exog.iloc[train_size:]


model = SARIMAX(y_train, exog=exog_train, order=sarima_order, seasonal_order=seasonal_order)
sarima_fit = model.fit(disp=False)


# Forecasting needs the exogenous values for the forecast horizon
forecast = sarima_fit.get_forecast(steps=len(y_test), exog=exog_test)
y_pred = forecast.predicted_mean
conf_int = forecast.conf_int()


mse = mean_squared_error(y_test, y_pred)
correlation = np.corrcoef(y_test, y_pred)[0, 1]  # Correlation coefficient

print(f"Mean Squared Error: {mse}")
print(f"Correlation: {correlation}")


plt.figure(figsize=(12, 6))

# Plot training data
plt.plot(y_train.index, y_train, label='Training Data', color='blue')

# Plot actual test data
plt.plot(y_test.index, y_test, label='Actual Test Data', color='green')

# Plot predictions
plt.plot(y_test.index, y_pred, label='SARIMA Forecast', color='orange')

# Forecast confidence band
plt.fill_between(y_test.index, conf_int.iloc[:, 0], conf_int.iloc[:, 1], color='pink', alpha=0.3)

plt.title('SARIMA: Actual vs Predicted Weekly Sales')
plt.xlabel('Date')
plt.ylabel('Weekly Sales')
plt.legend()
plt.grid(True)

# Add correlation as text to the plot
plt.text(x=0.05, y=0.95, s=f"Correlation: {correlation:.2f}", fontsize=12,
         transform=plt.gca().transAxes, color='darkred', bbox=dict(facecolor='white', alpha=0.6))

plt.show()


In [None]:
# SARIMAX on per-date aggregated weekly sales with 'Size' as exogenous regressor.
# Reuses `store_df`, `sarima_order` and `seasonal_order` from the cells above.

# Make sure 'Date' is a regular column before grouping. The original statement
# `new_unweighted_store_df.reset_index` was missing its call parentheses and did
# nothing, so when 'Date' lived in the index the grouped result could lack a
# 'Date' column and the set_index('Date') below would fail.
if 'Date' in store_df.columns:
    per_store_rows = store_df
else:
    per_store_rows = store_df.reset_index()

# Group by 'Date' and apply custom aggregation (groupby returns the dates sorted)
new_unweighted_store_df = per_store_rows.groupby('Date', as_index=False).agg({
    'Weekly_Sales': 'sum',  # Sum Weekly Sales
    'Size': 'first'         # Use the first Size value for each group (or replace with 'mean', 'mode', etc.)
})

# Index the aggregated frame by date
new_unweighted_store_df = new_unweighted_store_df.set_index('Date')

# Verify the results
print(new_unweighted_store_df.head())

# Extract the target variable (y) and exogenous variable (Size)
y = new_unweighted_store_df['Weekly_Sales']
exog = new_unweighted_store_df[['Size']]

# Split data into training and testing sets (positional 80/20 over sorted dates)
train_size = int(len(y) * 0.8)
y_train, y_test = y.iloc[:train_size], y.iloc[train_size:]
exog_train, exog_test = exog.iloc[:train_size], exog.iloc[train_size:]

# Fit the SARIMAX model
model = SARIMAX(y_train, exog=exog_train, order=sarima_order, seasonal_order=seasonal_order)
sarima_fit = model.fit(disp=False)

# Forecasting (exogenous values must be supplied for the forecast horizon)
forecast = sarima_fit.get_forecast(steps=len(y_test), exog=exog_test)
y_pred = forecast.predicted_mean
conf_int = forecast.conf_int()

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
correlation = np.corrcoef(y_test, y_pred)[0, 1]

print(f"Mean Squared Error: {mse}")
print(f"Correlation: {correlation}")

# Plot results
plt.figure(figsize=(12, 6))
plt.plot(y_train.index, y_train, label='Training Data', color='blue')
plt.plot(y_test.index, y_test, label='Actual Test Data', color='green')
plt.plot(y_test.index, y_pred, label='SARIMA Forecast', color='orange')
plt.fill_between(y_test.index, conf_int.iloc[:, 0], conf_int.iloc[:, 1], color='pink', alpha=0.3)
plt.title('SARIMA: Actual vs Predicted Weekly Sales')
plt.xlabel('Date')
plt.ylabel('Weekly Sales')
plt.legend()
plt.grid(True)
plt.text(
    x=0.05, y=0.95, s=f"Correlation: {correlation:.2f}", fontsize=12,
    transform=plt.gca().transAxes, color='darkred', bbox=dict(facecolor='white', alpha=0.6)
)
plt.show()