# Applications in Data Science

In [None]:
# Mount Google Drive at /content/drive so that files on Drive can be accessed by path
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Install required packages
!pip install -q pmdarima

In [None]:
import pandas as pd
import pmdarima as pm
import statsmodels.api as sm
import matplotlib.pyplot as plt

# Analyzing the revenue data

In [None]:
# Load the Excel workbook from the mounted Google Drive into a pandas DataFrame
data = pd.read_excel("/content/drive/MyDrive/NBADataScience/Workshop data/data_retail.xlsx")

In [None]:
# Preview the first 5 rows of the dataset
data.head()

In [None]:
# Select and display the 'revenue' column
data['revenue']

In [None]:
# Total revenue: the sum of all values in the 'revenue' column
data['revenue'].sum()

In [None]:
# Average revenue per month: group the rows on 'month', keep only the
# 'revenue' column, and take the mean within each group.
data.groupby("month")[["revenue"]].mean()

# Time series decomposition

In [None]:
# Index the rows by timestamp: parse the 'date' column to datetimes and use
# the result as the DataFrame index, named 'Date'
data = data.set_index(pd.to_datetime(data['date']).rename('Date'))

# Decompose the revenue series with statsmodels using an additive model
decomposition = sm.tsa.seasonal_decompose(data['revenue'], model='additive')

# Keep the trend, seasonal, and residual components under their own names
trend, seasonal, residual = (
    decomposition.trend,
    decomposition.seasonal,
    decomposition.resid,
)

In [None]:
# Draw the original series and its three decomposition components as four
# vertically stacked panels in a single figure
fig, axes = plt.subplots(4, 1, figsize=(10, 6))

# One entry per panel: (series to draw, y-axis label, panel title)
panels = [
    (data['revenue'], 'revenue', 'Original Time Series'),
    (trend, 'Trend', 'Trend Component'),
    (seasonal, 'Seasonality', 'Seasonal Component'),
    (residual, 'Residual', 'Residual Component'),
]
for ax, (series, y_label, panel_title) in zip(axes, panels):
    ax.plot(series)
    ax.set_ylabel(y_label)
    ax.set_title(panel_title)

# Adjust the spacing so titles and tick labels of neighbouring panels do not overlap
fig.tight_layout()

# Render the figure
plt.show()

# Forecasting revenues

In [None]:
# Set up Holt-Winters exponential smoothing on the revenue series with an
# additive trend and an additive seasonal component of 12 periods
model = sm.tsa.ExponentialSmoothing(
    data['revenue'],
    trend='add',
    seasonal='add',
    seasonal_periods=12,
)

# Fit the model to the revenue series
holt_winters_model = model.fit()

# Predict from 2023-10-01 through 2025-10-01 with the fitted model
forecast_values = holt_winters_model.predict(
    start='2023-10-01',
    end='2025-10-01',
)

# Show the first few forecasted values
print(forecast_values.head())

In [None]:
# Overlay the Holt-Winters forecast on the original revenue series
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(data['revenue'], label='Original')
ax.plot(forecast_values, label='Holt-Winters Forecast')
ax.set_xlabel('Date')
ax.set_ylabel('Revenue (milj)')
ax.set_title('Time Series Forecasting with Holt-Winters')
ax.legend()

# Render the figure
plt.show()

In [None]:
# Turn the forecast Series into a one-column DataFrame named "revenue".
# to_frame(name=...) always carries the values over; pd.DataFrame(series,
# columns=["revenue"]) would instead produce an all-NaN column whenever the
# Series has a name other than "revenue".
forecast_data = forecast_values.to_frame(name="revenue")
forecast_data.head()

In [None]:
# Write the forecast DataFrame to an Excel file on the mounted Google Drive
forecast_data.to_excel("/content/drive/MyDrive/NBADataScience/forecasted_data.xlsx")

In [None]:
# Preparations for model evaluation (prepared by your data analyst)
from sklearn.metrics import mean_absolute_error
import numpy as np

# Y true: the observed revenue values from the historic data
y_true = data['revenue']

# Y predicted: the model's predictions over the same historic period.
# The range is taken from the first and last dates of the observed series
# instead of being hard-coded, so y_pred covers exactly the dates in y_true.
# NOTE(review): assumes the date index is in chronological order - confirm.
y_pred = holt_winters_model.predict(start=y_true.index[0], end=y_true.index[-1])

def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error (MAPE), in percent.

    Computes ``mean(|y_true - y_pred| / |y_true|) * 100``.

    Parameters
    ----------
    y_true : array-like
        Actual values. Each value is used as the divisor for its own error,
        so a zero in ``y_true`` yields ``inf`` or ``nan`` in the result.
    y_pred : array-like
        Predicted values, matched to ``y_true`` by position (any pandas
        index is discarded by the array conversion).

    Returns
    -------
    float
        The MAPE expressed as a percentage (e.g. ``10.0`` means 10 %).
    """
    # Convert to float before subtracting: with unsigned-integer inputs the
    # difference would wrap around instead of going negative.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

## Model MAE
De MAE (Mean Absolute Error, gemiddelde absolute fout) van ons model is als volgt:

In [None]:
# Mean absolute error between the actual values (y_true) and the model's predictions (y_pred)
mean_absolute_error(y_true, y_pred)

Interpretatie: ...

## Model MAPE
De MAPE (Mean Absolute Percentage Error, gemiddelde absolute procentuele fout) van ons model is als volgt:

In [None]:
# Mean absolute percentage error of the predictions (y_pred) relative to the actual values (y_true), in percent
mean_absolute_percentage_error(y_true, y_pred)

Interpretatie: ...