### Import libraries.

In [None]:
# Import the necessary libraries.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns
import re
import calendar
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor

### Create functions.

In [None]:
# Standard vertical bar chart function.
def standard_vertical_bar_chart(df, x, y, title="Bar Chart", x_label=None, y_label=None,
                                figsize=(10,6), legend_title=None, **kwargs):
    """
    Draw a vertical seaborn bar chart, save it as a PNG named after its title, and show it.

    Parameters:
    -----------
    df: pandas.DataFrame
        Data to plot.
    x: str
        Column placed on the x-axis.
    y: str
        Column placed on the y-axis.
    title: str, optional
        Plot title; also the basis of the output file name (default "Bar Chart").
    x_label: str or None, optional
        Value passed to plt.xlabel.
    y_label: str or None, optional
        Value passed to plt.ylabel.
    figsize: tuple, optional
        Figure size in inches (width, height), default (10, 6).
    legend_title: str or None, optional
        Title of the legend; only used when the axes expose legend handles.
    **kwargs: dict
        Forwarded unchanged to seaborn.barplot.

    Saves:
    -------
    "<cleaned title>.png" in the working directory at 500 dpi.

    Displays:
    ---------
    The finished chart via plt.show().
    """

    # New figure, then the bars themselves.
    plt.figure(figsize=figsize)
    sns.barplot(data=df, x=x, y=y, **kwargs)

    # The current axes are needed to inspect the legend entries below.
    current_axes = plt.gca()

    # Title, axis labels, and slanted x tick labels.
    plt.title(title)
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.xticks(rotation=45)

    # Only draw a legend (outside the plot, top right) when there is something to list.
    legend_handles, legend_labels = current_axes.get_legend_handles_labels()
    if legend_handles:
        plt.legend(
            legend_handles,
            legend_labels,
            title=legend_title,
            loc="upper left",
            bbox_to_anchor=(1, 1),
        )

    # File name: title on one line, punctuation removed, spaces -> underscores, lower case.
    one_line_title = title.replace("\n", " ")
    without_punctuation = re.sub(r'[^\w\s-]', '', one_line_title).strip()
    clean_title = without_punctuation.replace(' ', '_').lower()
    filename = f"{clean_title}.png"

    plt.savefig(filename, dpi=500, bbox_inches="tight")

    plt.show()

In [None]:
# Standard horizontal bar chart function.
def standard_horizontal_bar_chart(df, x, y, title="Bar Chart", x_label=None, y_label=None,
                                  figsize=(10,6), legend_title=None, **kwargs):
    """
    Draw a horizontal seaborn bar chart, save it as a PNG named after its title, and show it.

    Rows are ordered by the `y` column, largest first, so the longest bar is at the top.
    When `palette` is given as a palette name, it is expanded to one colour per row and
    reversed before plotting.

    Parameters:
    -----------
    df: pandas.DataFrame
        Data to plot.
    x: str
        Column whose values name the bars (drawn on the vertical axis).
    y: str
        Column whose values give the bar lengths (drawn on the horizontal axis).
    title: str, optional
        Plot title; also the basis of the output file name (default "Bar Chart").
    x_label: str or None, optional
        Value passed to plt.ylabel (the axis that carries `x`).
    y_label: str or None, optional
        Value passed to plt.xlabel (the axis that carries `y`).
    figsize: tuple, optional
        Figure size in inches (width, height), default (10, 6).
    legend_title: str or None, optional
        Title of the legend; only used when the axes expose legend handles.
    **kwargs: dict
        Forwarded to seaborn.barplot (after the optional palette reversal).

    Saves:
    -------
    "<cleaned title>.png" in the working directory at 500 dpi.

    Displays:
    ---------
    The finished chart via plt.show().
    """

    # Largest value first so it becomes the top bar.
    ordered = df.sort_values(by=y, ascending=False)

    # A palette given by name is turned into an explicit, reversed colour list.
    requested_palette = kwargs.get('palette')
    if isinstance(requested_palette, str):
        kwargs['palette'] = sns.color_palette(requested_palette, len(ordered))[::-1]

    # Bars: note the axis swap relative to the vertical chart.
    plt.figure(figsize=figsize)
    sns.barplot(data=ordered, x=y, y=x, **kwargs)

    current_axes = plt.gca()

    # Title and axis labels (swapped to follow the data), unrotated x tick labels.
    plt.title(title)
    plt.xlabel(y_label)
    plt.ylabel(x_label)
    plt.xticks(rotation=0)

    # Only draw a legend (outside the plot, top right) when there is something to list.
    legend_handles, legend_labels = current_axes.get_legend_handles_labels()
    if legend_handles:
        plt.legend(
            legend_handles,
            legend_labels,
            title=legend_title,
            loc="upper left",
            bbox_to_anchor=(1, 1),
        )

    plt.tight_layout()

    # File name: title on one line, punctuation removed, spaces -> underscores, lower case.
    one_line_title = title.replace("\n", " ")
    without_punctuation = re.sub(r'[^\w\s-]', '', one_line_title).strip()
    clean_title = without_punctuation.replace(' ', '_').lower()
    filename = f"{clean_title}.png"
    plt.savefig(filename, dpi=500, bbox_inches="tight")

    plt.show()

### Load all data.

In [None]:
# Provided sales and temperature data, plus the pre-filtered versions.
sales = pd.read_csv('sales.csv')
temperatures = pd.read_csv('temperatures.csv')
filtered_sales = pd.read_csv('filtered_sales.csv')
filtered_sales_temp = pd.read_csv('filtered_sales_temp.csv')

# City holiday calendar: semicolon-separated, 'Dia' parsed as a day-first date.
holidays = pd.read_csv(
    'calendario.csv',
    sep=';',
    parse_dates=['Dia'],
    dayfirst=True,
)

# Both match calendars use the same date parsing: 'Date' column, day-first.
match_calendar_options = {'parse_dates': ['Date'], 'dayfirst': True}

# Sport's match calendar.
sport = pd.read_csv('Allcity_sport.csv', **match_calendar_options)

# Sport1's match calendar.
sport1 = pd.read_csv('International_city_country.csv', **match_calendar_options)

### Exploratory analysis.

#### The section below includes only the visualisations that were created in Python and used in the presentation, the technical report, or both. All other exploratory visualisations were created in Tableau and can be viewed in the Tableau workbook.

#### You'll also find below the code used to create new columns/data frames that were used to create visualisations in Tableau.

***Adding a district/town column and creating a new data frame called filtered_sales_temp2. This file was used to create some charts in Tableau.***

In [None]:
# Add a district/town column to filtered_sales_temp.
# Create a district/town dictionary.
# NOTE(review): the real postcode -> district/town mapping was removed for anonymity. The
# placeholder below is a one-element set, not a dict; restore the mapping before running.
district_town_dict = {'removed to ensure anonymity of employer'}

# Derive the new column from 'postcode' by substituting values through the mapping.
# NOTE(review): presumably postcodes absent from the mapping keep their postcode value
# in the new column - confirm against the unique-values check in the next cell.
filtered_sales_temp['district/town'] = filtered_sales_temp['postcode'].replace(district_town_dict)
filtered_sales_temp.head()

In [None]:
# List the distinct district/town values, in order of first appearance.
pd.unique(filtered_sales_temp['district/town'])

In [None]:
# Count missing values in the district/town column.
filtered_sales_temp['district/town'].isnull().sum()

In [None]:
# Remove the town/city column from the frame in place.
filtered_sales_temp.drop('town/city', axis=1, inplace=True)

# Show the remaining column labels.
filtered_sales_temp.columns

In [None]:
# Take district/town out of the frame and put it back at position 1 (second column).
district_town_values = filtered_sales_temp.pop('district/town')
filtered_sales_temp.insert(loc=1, column='district/town', value=district_town_values)
filtered_sales_temp.head()

In [None]:
# Distinct (district/town, postcode) pairs, exported for joining to another file in Tableau.
lookup_columns = ['district/town', 'postcode']
district_town_postcode = filtered_sales_temp[lookup_columns].drop_duplicates()
district_town_postcode.to_csv('district_town_postcode.csv', index=False)

# Export the full frame, now including the district/town column.
filtered_sales_temp.to_csv('filtered_sales_temp2.csv', index=False)

# Independent in-memory copy whose name matches the csv just written.
filtered_sales_temp2 = filtered_sales_temp.copy()

***Creating a new data frame with, for each petrol station, the percentage of days that had sales (the complement of the percentage of days with no sales). This data frame was used in Tableau to create a map visual.***

In [None]:
# Keep only the rows where sold_units > 0.
df_sales_all = filtered_sales_temp[filtered_sales_temp['sold_units'] > 0]

# Per petrol_station, count the distinct dates that have at least one such row.
sales_days_count_all = df_sales_all.groupby('petrol_station')['date'].nunique().reset_index()

# Rename the counted 'date' column to 'n_days_sales' (number of days with sales).
sales_days_count_all.rename(columns={'date':'n_days_sales'}, inplace=True)

# Sort by 'n_days_sales' ascending, so stations with the fewest sales days come first.
sales_days_count_all = sales_days_count_all.sort_values(by='n_days_sales', ascending=True)

# View sales_days_count_all.
sales_days_count_all

In [None]:
# Denominator for the percentage of days with sales.
# NOTE(review): presumably the number of calendar days covered by the data - confirm.
TOTAL_DAYS = 713

# Days with sales as a percentage of TOTAL_DAYS, rounded to 2 decimal places.
share_of_days_with_sales = sales_days_count_all['n_days_sales'] / TOTAL_DAYS * 100
sales_days_count_all['percentage'] = share_of_days_with_sales
sales_days_count_all['percentage'] = sales_days_count_all['percentage'].round(2)

# View sales_days_count_all
sales_days_count_all

In [None]:
# Export the per-station counts and percentages for the Tableau map visual.
sales_days_count_all.to_csv(path_or_buf='sales_days_count.csv', index=False)

***Creating a bar chart with error bars on the different event/holiday categories. This chart was used in the technical report.***

In [None]:
# Convert date column in filtered_sales_temp2 into datetime.
# The later cells rely on this: they merge on 'date' and use the .dt accessor.
filtered_sales_temp2['date'] = pd.to_datetime(filtered_sales_temp2['date'])
# Print the column dtypes to confirm the conversion.
filtered_sales_temp2.info()

In [None]:
# Give both match calendars a lower-case 'date' column so they share a merge key.
for match_calendar in (sport, sport1):
    match_calendar.rename(columns={'Date': 'date'}, inplace=True)

In [None]:
# Rename holiday columns in one pass: 'X' -> 'date', 'Y' -> 'holiday_name'.
holiday_column_names = {'X': 'date', 'Y': 'holiday_name'}
holidays.rename(columns=holiday_column_names, inplace=True)

In [None]:
# Remove the holiday columns that are not needed downstream.
unused_holiday_columns = ['removed to ensure anonymity of employer']
holidays = holidays.drop(columns=unused_holiday_columns)
holidays.head()

In [None]:
# Remove the match-detail columns from the sport calendar.
unused_sport_columns = ['Round Number', 'Location', 'Home Team', 'Away Team', 'Result']
sport = sport.drop(columns=unused_sport_columns)
sport.head()

In [None]:
# Remove the match-detail columns from the sport1 calendar.
unused_sport1_columns = ['Round Number', 'Unnamed: 2', 'Location', 'Home Team', 'Away Team', 'Group', 'Result']
sport1 = sport1.drop(columns=unused_sport1_columns)
sport1.head()

In [None]:
# Keep one row per date in each event calendar so the later left merges cannot multiply sales rows.
holidays, sport, sport1 = (
    event_calendar.drop_duplicates(subset='date')
    for event_calendar in (holidays, sport, sport1)
)

In [None]:
# Build filtered_sales_temp3 by left-joining each event calendar onto the sales data by date,
# in the order holidays, sport, sport1.
filtered_sales_temp3 = filtered_sales_temp2
for event_calendar in (holidays, sport, sport1):
    filtered_sales_temp3 = filtered_sales_temp3.merge(event_calendar, on='date', how='left')

filtered_sales_temp3.info()

In [None]:
# Name the two suffixed 'Number' columns after the calendar they came from.
match_column_names = {'Number_x': 'sport_match', 'Number_y': 'sport1_match'}
filtered_sales_temp3.rename(columns=match_column_names, inplace=True)

In [None]:
# Boolean flag per event source: True where the source column has a value, False where it is null.
flag_sources = {
    'is_holiday': 'holiday_name',
    'is_sport': 'sport_match',
    'is_sport1': 'sport1_match',
}
for flag_column, source_column in flag_sources.items():
    filtered_sales_temp3[flag_column] = filtered_sales_temp3[source_column].notna()

In [None]:
# A day counts as a match day when either match flag is set.
match_flags = filtered_sales_temp3[['is_sport', 'is_sport1']]
filtered_sales_temp3['is_match'] = match_flags.any(axis=1)
filtered_sales_temp3.head()

In [None]:
# Define event category.
def classify_event(row):
    """
    Return the event category for one row.

    Reads the truthiness of row['is_holiday'] and row['is_match'] and returns
    'Both', 'Holiday only', 'Match only', or 'No event'.
    """
    category_by_flags = {
        (True, True): 'Both',
        (True, False): 'Holiday only',
        (False, True): 'Match only',
        (False, False): 'No event',
    }
    flags = (bool(row['is_holiday']), bool(row['is_match']))
    return category_by_flags[flags]

# Label every row with its event category.
filtered_sales_temp3['event_category'] = filtered_sales_temp3.apply(classify_event, axis=1)

# Row count, mean and standard deviation of sold units per category, in a fixed display order.
category_order = ['No event', 'Holiday only', 'Match only', 'Both']
units_by_category = filtered_sales_temp3.groupby('event_category')['sold_units']
category_stats = units_by_category.agg(['count', 'mean', 'std'])
uplift_summary = category_stats.reindex(category_order).reset_index()

print('Average Sales per Event Category:\n')
print(uplift_summary)

In [None]:
# Bar chart of mean sales per event category with std error bars, a baseline line,
# and a label on each bar giving the mean and its uplift over the baseline.

# Baseline = mean sales of the 'No event' category.
no_event_means = uplift_summary.loc[uplift_summary['event_category'] == 'No event', 'mean']
baseline = no_event_means.values[0]

plt.figure(figsize=(8, 5))
sns.barplot(
    data=uplift_summary,
    x='event_category',
    y='mean',
    yerr=uplift_summary['std'],
    color='skyblue',
    capsize=0.2)

# Dashed reference line at the baseline, with a small caption just above it.
plt.axhline(y=baseline, color='gray', linestyle='--', linewidth=1.5, alpha=0.6)
plt.text(-0.4, baseline + 0.2, 'Baseline', color='gray', fontweight='bold', fontsize=9)

# Vertical gap between a bar top and its label: 5% of the current y-axis span.
ymin, ymax = plt.ylim()
y_offset = (ymax - ymin) * 0.05

# Label each bar with its mean and the percentage change relative to the baseline.
# The frame has a fresh 0..n-1 index, so the index value is also the bar position.
for bar_position, category_mean in zip(uplift_summary.index, uplift_summary['mean']):
    uplift_pct = ((category_mean - baseline) / baseline) * 100
    label = f"{category_mean:.0f} ({uplift_pct:+.0f}%)"
    plt.text(bar_position, category_mean + y_offset, label, ha='center', fontsize=12, fontweight='bold')

plt.title('Average daily sales by event category')
plt.ylabel('Average units sold')
plt.xlabel('Event category')
plt.tight_layout()
plt.savefig(fname='avg_daily_sales_by_event_category', dpi=500, bbox_inches='tight')
plt.show()

**Average daily sales by event category - uplift analysis**
- Holiday-only days drive the highest uplift in sales, with a +20% increase over non-event days — confirming holidays as a powerful demand trigger.
- Match-only days also show a modest +3% uplift, indicating sports events influence purchasing behaviour, though to a much lesser extent.
- Interestingly, days with both a holiday and a match do not yield the highest uplift, suggesting possible demand dilution or overlapping audience saturation.
- This chart validates the importance of event-aware forecasting and highlights holidays as a key opportunity for targeted inventory scaling.

***Creating a heatmap calendar. This chart was used in the presentation and technical report.***

In [None]:
# Boolean mask over filtered_sales_temp3 for December days first_day..last_day of one year.
def _december_mask(year, first_day, last_day):
    date_parts = filtered_sales_temp3['date'].dt
    in_december = (date_parts.year == year) & (date_parts.month == 12)
    return in_december & date_parts.day.between(first_day, last_day)

# Dec 1–14 in both years.
dec_1_14_2021 = filtered_sales_temp3[_december_mask(2021, 1, 14)]
dec_1_14_2022 = filtered_sales_temp3[_december_mask(2022, 1, 14)]

# Total sold units over those two windows.
total_2021 = dec_1_14_2021['sold_units'].sum()
total_2022 = dec_1_14_2022['sold_units'].sum()

# Year-on-year ratio for the window; falls back to 1.0 when 2021 has no sales.
if total_2021 > 0:
    uplift_factor = total_2022 / total_2021
else:
    uplift_factor = 1.0

# Dec 15–31, 2021 as an independent copy.
dec_15_31_2021 = filtered_sales_temp3[_december_mask(2021, 15, 31)].copy()

# Shift those rows one year forward and scale their sales by the uplift factor.
dec_15_31_2021['date'] = dec_15_31_2021['date'] + pd.DateOffset(years=1)
scaled_units = dec_15_31_2021['sold_units'] * uplift_factor
dec_15_31_2021['sold_units'] = scaled_units.round().astype(int)

print(f"Uplift factor applied: {uplift_factor:.2f}")
print(dec_15_31_2021[['date', 'sold_units']].head())

In [None]:
# Append the scaled 15-31 Dec rows to the observed data, renumbering the index.
heatmap_frames = [filtered_sales_temp3, dec_15_31_2021]
full_heatmap_data = pd.concat(heatmap_frames, ignore_index=True)

In [None]:
# Derive month number, day of month and abbreviated month name from the date.
heatmap_dates = full_heatmap_data['date'].dt
full_heatmap_data['month'] = heatmap_dates.month
full_heatmap_data['day'] = heatmap_dates.day
full_heatmap_data['month_name'] = heatmap_dates.strftime('%b')

# Row order for the heatmap: calendar order rather than alphabetical.
month_order = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
               'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']

# Total sold units per (month name, day), reshaped to months as rows and days as columns.
units_by_month_day = full_heatmap_data.groupby(['month_name', 'day'])['sold_units'].sum()
heatmap_totals = units_by_month_day.unstack()
heatmap_totals = heatmap_totals.reindex(index=month_order)

# Annotated heatmap of total daily sales by day and month.
plt.figure(figsize=(14, 6))
sns.heatmap(
    heatmap_totals,
    cmap='YlGnBu',
    linewidths=0.5,
    annot=True,
    fmt='.0f',
    annot_kws={'fontsize': 9},
    cbar_kws={'label': 'Total Units Sold'}
)

plt.title('Total Daily Sales by Day and Month (Including Estimated Data for 15–31 Dec 2022)')
plt.xlabel('Day of Month')
plt.ylabel('Month')
plt.tight_layout()
plt.savefig(fname='total_daily_sales_by_day_month', dpi=500, bbox_inches='tight')
plt.show()

**Heatmap of total daily sales by day and month (2021-2022), including estimated data for 15-31 Dec 2022**
- Peak sales occur in July and August, with multiple high-volume days above 50,000 units, especially mid- to late-month.
- Sales ramp up in late spring (May to June) and decline sharply after September, highlighting strong seasonality.
- Weekend-driven spikes are clearly visible, particularly on Fridays, Saturdays, and Sundays in summer months.
- The estimated values for 15-31 Dec 2022 (scaled from 2021) help complete the heatmap and preserve year-end visibility, though they should be interpreted with caution.
- Holidays such as Christmas, New Year's Eve, and New Year's Day should not be discounted, as they also drive high sales.

***Creating calendar-style heatmaps for 2021 and 2022. These were used in the appendix of the technical report.***

In [None]:
# Filter for 2022.
calendar_df = filtered_sales_temp3[filtered_sales_temp3['date'].dt.year == 2022].copy()

# Extract calendar components.
calendar_df['month'] = calendar_df['date'].dt.month
calendar_df['day'] = calendar_df['date'].dt.day
calendar_df['weekday'] = calendar_df['date'].dt.weekday  # Monday=0

# Group by date to get total units sold.
calendar_df_grouped = calendar_df.groupby('date')['sold_units'].sum().reset_index()
sales_by_date = calendar_df_grouped.set_index('date')['sold_units']

# Re-key the series by datetime.date once, up front, so it can be looked up with the
# datetime.date objects produced by the calendar module.
if len(sales_by_date) and hasattr(sales_by_date.index[0], 'date'):
    sales_by_date.index = [timestamp.date() for timestamp in sales_by_date.index]

# Create calendar-style grids for each month: 6 week rows x 7 weekday columns (Monday first).
# Cells with no day of the month stay NaN; days without sales data are shown as 0.
monthly_grids = {}
month_calendar = calendar.Calendar()

for month in range(1, 13):
    # Weekday (Monday=0) of the 1st of the month; shifts day numbers into week rows.
    first_weekday = calendar.monthrange(2022, month)[0]
    grid = np.full((6, 7), np.nan)

    for date in month_calendar.itermonthdates(2022, month):
        # itermonthdates pads the first and last week with days from the neighbouring
        # months. Those must be skipped: their day numbers would otherwise be placed in
        # (and overwrite) cells of this month's grid.
        if date.month != month:
            continue
        week_row = (date.day + first_weekday - 1) // 7
        grid[week_row, date.weekday()] = sales_by_date.get(date, 0)

    monthly_grids[month] = grid

# Plot the full-year calendar-style heatmap: one panel per month on a 4 x 3 layout.
fig, axes = plt.subplots(4, 3, figsize=(16, 10))
plt.subplots_adjust(hspace=0.8)

for i, ax in enumerate(axes.flat, start=1):
    sns.heatmap(
        monthly_grids[i],
        ax=ax,
        cmap='YlGnBu',
        linewidths=0.5,
        linecolor='white',
        cbar=False,
        annot=True,
        fmt='.0f',
        annot_kws={"fontsize": 7}
    )
    ax.set_title(calendar.month_name[i], fontweight='bold')
    ax.set_xticks(np.arange(7) + 0.5)
    ax.set_xticklabels(['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'], rotation=0)
    ax.set_yticks([])
    ax.set_yticklabels([])
    # NOTE(review): seaborn.heatmap appears to invert the y-axis itself, so this second
    # inversion would put the first week at the bottom - confirm this is intended.
    ax.invert_yaxis()

plt.suptitle('2022 Calendar-Style Sales Heatmap', fontsize=16)
plt.savefig(fname='2022_cal_heatmap', dpi=500, bbox_inches='tight')
plt.show()

33. Create a calendar style heatmap for 2021.

In [None]:
# Filter for 2021.
calendar_df_2021 = filtered_sales_temp3[filtered_sales_temp3['date'].dt.year == 2021].copy()

# Extract calendar components.
calendar_df_2021['month'] = calendar_df_2021['date'].dt.month
calendar_df_2021['day'] = calendar_df_2021['date'].dt.day
calendar_df_2021['weekday'] = calendar_df_2021['date'].dt.weekday  # Monday=0

# Group by date to get total units sold.
calendar_df_grouped_2021 = calendar_df_2021.groupby('date')['sold_units'].sum().reset_index()
sales_by_date_2021 = calendar_df_grouped_2021.set_index('date')['sold_units']

# Re-key the series by datetime.date so it can be looked up with the datetime.date
# objects produced by the calendar module.
if len(sales_by_date_2021) and hasattr(sales_by_date_2021.index[0], 'date'):
    sales_by_date_2021.index = [timestamp.date() for timestamp in sales_by_date_2021.index]

# Create calendar-style grids for each month: 6 week rows x 7 weekday columns (Monday first).
# Cells with no day of the month stay NaN; days without sales data are shown as 0.
monthly_grids_2021 = {}
month_calendar = calendar.Calendar()

for month in range(1, 13):
    # Weekday (Monday=0) of the 1st of the month; shifts day numbers into week rows.
    first_weekday = calendar.monthrange(2021, month)[0]
    grid = np.full((6, 7), np.nan)
    for date in month_calendar.itermonthdates(2021, month):
        # itermonthdates pads the first and last week with days from the neighbouring
        # months. Those must be skipped: their day numbers would otherwise be placed in
        # (and overwrite) cells of this month's grid.
        if date.month != month:
            continue
        week_row = (date.day + first_weekday - 1) // 7
        grid[week_row, date.weekday()] = sales_by_date_2021.get(date, 0)
    monthly_grids_2021[month] = grid

# Plot the full-year calendar-style heatmap: one panel per month on a 4 x 3 layout.
fig, axes = plt.subplots(4, 3, figsize=(16, 10))
plt.subplots_adjust(hspace=0.8)

for i, ax in enumerate(axes.flat, start=1):
    sns.heatmap(
        monthly_grids_2021[i],
        ax=ax,
        cmap='YlGnBu',
        linewidths=0.5,
        linecolor='white',
        cbar=False,
        annot=True,
        fmt='.0f',
        annot_kws={"fontsize": 7}
    )
    ax.set_title(calendar.month_name[i], fontweight='bold')
    ax.set_xticks(np.arange(7) + 0.5)
    ax.set_xticklabels(['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'], rotation=0)
    ax.set_yticks([])
    ax.set_yticklabels([])
    # NOTE(review): seaborn.heatmap appears to invert the y-axis itself, so this second
    # inversion would put the first week at the bottom - confirm this is intended.
    ax.invert_yaxis()

plt.suptitle('2021 Calendar-Style Sales Heatmap', fontsize=16)
plt.savefig(fname='2021_cal_heatmap', dpi=500, bbox_inches='tight')
plt.show()

***Creating weekly_city_sales and weekly_city_sales_temp data frames. These sum the total sales for each week and so can be used to create smoother time series charts. These data frames were used in Tableau.***

In [None]:
# Ensure 'date' column filtered_sales_temp3 is datetime.
filtered_sales_temp3['date'] = pd.to_datetime(filtered_sales_temp3['date'])

# Group by the start of the week.
# On its own, freq='W-MON' builds bins that END on Monday and labels them with that closing
# Monday. closed='left' and label='left' make each bin run Monday to Sunday and carry the
# Monday it starts on, which is the same key the weekly weather data uses (date minus weekday).
week_starting_monday = pd.Grouper(key='date', freq='W-MON', closed='left', label='left')

# One row per (week, sales_location): units are summed; the descriptive columns take the
# first value seen in the week.
weekly_city_sales = filtered_sales_temp3.groupby([week_starting_monday, 'sales_location'], as_index=False).agg({
    'sold_units': 'sum',
    'district/town': 'first',
    'postcode': 'first',
    'province': 'first',
    'month': 'first',
    'year': 'first',
    'season': 'first'})

# Sort by date.
weekly_city_sales.sort_values('date', inplace=True)

print(weekly_city_sales.head())
print(weekly_city_sales.columns)

In [None]:
# Make sure the date column is in datetime.
# The merge with the weekly weather data in a later cell joins on this column.
weekly_city_sales['date'] = pd.to_datetime(weekly_city_sales['date'])
# Print the column dtypes to confirm.
weekly_city_sales.info()

In [None]:
# Ensure date column in 'temperatures' is datetime.
temperatures['date'] = pd.to_datetime(temperatures['date'])

# Create a 'week_start' column for grouping: each date moved back to the Monday of its week
# (weekday is 0 for Monday, so Mondays are left unchanged).
temperatures['week_start'] = temperatures['date'] - pd.to_timedelta(temperatures['date'].dt.weekday, unit='d')

# Group by 'week_start': calendar descriptors take the first value of the week, weather
# measurements are averaged over the week.
weekly_weather = temperatures.groupby('week_start', as_index=False).agg({
    'month': 'first',
    'year': 'first',
    'season': 'first',
    'avg_temp': 'mean',
    'min_temp': 'mean',
    'max_temp': 'mean',
    'precip': 'mean',
    'wind_speed': 'mean',
    'avg_pressure': 'mean'
})

# Sort by date. sort_values returns a new frame, so the result has to be assigned back;
# calling it without assignment leaves weekly_weather untouched.
weekly_weather = weekly_weather.sort_values(by='week_start')

# Rename week_start to 'date' so it can be used as the merge key with the weekly sales.
weekly_weather.rename(columns={'week_start': 'date'}, inplace=True)

In [None]:
# Attach the weekly weather averages to every weekly sales row with the same date;
# sales rows without matching weather are kept.
weekly_city_sales_temp = weekly_city_sales.merge(weekly_weather, how='left', on='date')
weekly_city_sales_temp.head()

In [None]:
# Export the merged weekly sales and weather frame.
weekly_city_sales_temp.to_csv(path_or_buf='weekly_city_sales_temp.csv', index=False)

***Creating a feature importance chart on weekly sales, temperature, and location. This was used in the presentation.***

In [None]:
# Encode the strings to numeric: each location column is replaced in place by integer codes.

for col in ['sales_location', 'province', 'district/town']:
    le = LabelEncoder()
    weekly_city_sales_temp[col] = le.fit_transform(weekly_city_sales_temp[col])

# Feature importance on weekly sales vs temperature vs location.
features = ['avg_temp', 'max_temp', 'sales_location', 'province', 'district/town']
X = weekly_city_sales_temp[features]
y = weekly_city_sales_temp['sold_units']

# Fit the Random Forest model.
# A fixed random_state makes the importance ranking (and the saved chart) reproducible;
# an unseeded forest gives slightly different importances on every run.
model = RandomForestRegressor(random_state=42)
model.fit(X, y)

# Display labels for the features.
clean_labels = {
    'avg_temp': 'Avg temp (°C)',
    'max_temp': 'Max temp (°C)',
    'sales_location': 'sales_location (Encoded)',
    'province': 'Province (Encoded)',
    'district/town': 'District/Town (Encoded)'
}

# Map feature importance to cleaned labels, most important first.
raw_importance = model.feature_importances_
features_key = pd.Series(raw_importance, index=[clean_labels[f] for f in features]).sort_values(ascending=False)

# One colour per bar from the 'OrRd_r' palette.
color_palette = sns.color_palette('OrRd_r', len(features_key))

print('Feature Importance Ranking:')
print(features_key)

plt.figure(figsize=(8, 6))
plt.title('What Drives Weekly Sales Most?')
sns.barplot(x=features_key.values, y=features_key.index, palette=color_palette)
plt.xlabel('Importance Score')
plt.ylabel('Feature')
plt.tight_layout()
plt.savefig(fname='what_drives_weekly_sales_most.png', dpi=500, bbox_inches='tight')
plt.show()

***Creating a correlation matrix on daily sales. This was in the technical report.***

In [None]:
# Correlation matrix for daily sales, temperature, and location features.

# Replace each location column in filtered_sales_temp3 with integer codes, keeping the
# fitted encoder for every column.
label_encoders = {}
for col in ['province', 'district/town', 'sales_location']:
    le = LabelEncoder()
    encoded_values = le.fit_transform(filtered_sales_temp3[col])
    filtered_sales_temp3[col] = encoded_values
    label_encoders[col] = le

# Force the temperature columns to numeric; values that cannot be parsed become NaN.
for temperature_column in ['avg_temp', 'max_temp']:
    filtered_sales_temp3[temperature_column] = pd.to_numeric(
        filtered_sales_temp3[temperature_column], errors='coerce')

# Keep the relevant columns and drop rows with any missing value among them.
correlation_columns = ['sold_units', 'province', 'district/town', 'sales_location', 'avg_temp', 'max_temp']
correlation_data = filtered_sales_temp3[correlation_columns].dropna()

correlation_matrix_3 = correlation_data.corr()

# Display labels.
clean_labels = {
    'sold_units': 'Daily units Sold',
    'province': 'Province (Encoded)',
    'district/town': 'District/Town (Encoded)',
    'sales_location': 'sales_location (Encoded)',
    'avg_temp': 'Avg temp (°C)',
    'max_temp': 'Max temp (°C)'
}

# Apply the display labels to rows and columns alike.
correlation_matrix_3.rename(index=clean_labels, columns=clean_labels, inplace=True)

print('Correlation Matrix - Daily Sales, Temperature, and Location Features')
print(correlation_matrix_3)

plt.figure(figsize=(8, 6))
sns.heatmap(correlation_matrix_3, annot=True, cmap='Blues', fmt='.2f')
plt.title('Correlation Matrix - Daily Sales, Temperature, and Location Features')
plt.tight_layout()
plt.savefig(fname='correlation_matrix_daily_sales_temp_location.png', dpi=500, bbox_inches='tight')
plt.show()

Note: 'sales_location', 'Province', and 'District/Town' are labelled as 'Encoded' because they are categorical variables turned numeric.

There is a weak positive correlation between daily sales and temperature (0.26).

***Creating a feature importance chart on daily sales, temperature, location, and day of the week. This was used in the presentation and technical report.***

In [None]:
# Feature importance on daily sales vs temperature vs location vs day of week.
features = ['province', 'district/town', 'sales_location', 'avg_temp', 'max_temp', 'day_of_week']

# Encode the strings to numeric: each listed column of filtered_sales_temp3 is replaced
# in place by integer codes.
for col in ['province', 'district/town', 'sales_location', 'day_of_week']:
    le = LabelEncoder()
    filtered_sales_temp3[col] = le.fit_transform(filtered_sales_temp3[col])

# Build the model inputs only after encoding, so X holds the numeric codes.
X = filtered_sales_temp3[features]
y = filtered_sales_temp3['sold_units']

# Fit the Random Forest model.
# A fixed random_state makes the importance ranking (and the saved chart) reproducible;
# an unseeded forest gives slightly different importances on every run.
model = RandomForestRegressor(random_state=42)
model.fit(X, y)

# Display labels.
clean_labels = {
    'province': 'Province (Encoded)',
    'district/town': 'District/Town (Encoded)',
    'sales_location': 'sales_location (Encoded)',
    'avg_temp': 'Avg temp (°C)',
    'max_temp': 'Max temp (°C)',
    'day_of_week': 'Day of week'
}

# Map feature importance to cleaned labels, most important first.
features_key = pd.Series(model.feature_importances_, index=[clean_labels[f] for f in features]).sort_values(ascending=False)

# One colour per bar from 'YlGnBu_r', the palette the chart is drawn in.
color_palette = sns.color_palette('YlGnBu_r', len(features_key))

print("Feature Importance Ranking:")
print(features_key)

# Visualise
plt.figure(figsize=(8, 6))
plt.title('What Drives Daily Sales Most? (Including Day of Week)')
sns.barplot(x=features_key.values, y=features_key.index, palette=color_palette)
plt.xlabel('Importance Score')
plt.ylabel('Feature')
plt.tight_layout()
# NOTE(review): the file name says "weekly" although this chart is about daily sales;
# left unchanged in case the file is referenced elsewhere.
plt.savefig(fname='what_drives_weekly_sales_most_include_dayofweek.png', dpi=500, bbox_inches='tight')
plt.show()

***Creating a north_south column. This new weekly_city_sales was then used to create visuals in Tableau.***

In [None]:
# Create a north_south column.

north_districts = ['removed to ensure anonymity of employer']

# "North" for districts/towns in the list, "South" for everything else.
is_northern = weekly_city_sales['district/town'].isin(north_districts)
weekly_city_sales['north_south'] = np.where(is_northern, 'North', 'South')

# View weekly_city_sales.
weekly_city_sales.head()

In [None]:
# Count missing values in the north_south column.
weekly_city_sales['north_south'].isnull().sum()