## Meteo Bakery - Combine datasets
This notebook serves to combine sales data with the weather summary statistics.

### import libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

### load data

In [None]:
# read the raw bakery sales export (Excel) into a dataframe
sales_file = '../data/neueFische_Umsaetze_Baeckerei.xlsx'
sales = pd.read_excel(sales_file)

In [None]:
# read the engineered weather features (summary statistics) from csv
weather_file = '../data/summary_stats.csv'
weather_stats = pd.read_csv(weather_file)

In [None]:
# load the holiday calendars; they are turned into true/false columns
# on the sales dataframe in the merge section further below

# school holidays from https://www.schulferien.org/oesterreich/ferien/2012/
school_hols = pd.read_excel("../data/school_holidays.xlsx")
# parse explicitly (as for the public holidays) so that the later
# isin() comparison against the sales dates runs on datetime values
school_hols["date"] = pd.to_datetime(school_hols["date"])

# public holidays from google search "Feiertage Wien 'YEAR'"
public_hols = pd.read_excel("../data/public_holidays.xlsx")
public_hols["date"] = pd.to_datetime(public_hols["date"])

In [None]:
# read the table of Corona measures for Vienna
# NOTE(review): `corona` is not referenced in the cells shown here; the
# lockdown periods are hard-coded in the merge section -- confirm whether
# this dataframe is still needed
corona_file = "../data/corona-measures-vienna.xlsx"
corona = pd.read_excel(corona_file)

### Feature Engineering - Sales

In [None]:
# print column dtypes and non-null counts to check datatypes and missing values
sales.info()

In [None]:
# translate the numeric branch id into a location name
# branch 1: metro station (U-Bahn)
# branch 2: city center (Innenstadt)
# branch 3: train station (Bahnhof)
branch_names = {1: 'Metro', 2: 'Center', 3: 'Train_Station'}

# values without an entry in the mapping (missing ids, unexpected ids, or the
# names written by an earlier run of this cell) are kept unchanged instead of
# being silently labelled 'Train_Station'; this also makes the cell safe to re-run
sales['Branch'] = sales['Branch'].map(lambda branch: branch_names.get(branch, branch))
sales.head()

There are three missing values in the sales data, in the `SoldTurnver` column (spelled as in the source file; it is renamed to `turnover` below).

In [None]:
# list the current column names before renaming them
sales.columns

In [None]:
# give the columns short, lower-case names
new_names = {
    'Branch': 'branch',
    'PredictionGroupName': 'product',
    'SoldTurnver': 'turnover',
}
sales = sales.rename(columns=new_names)

In [None]:
# derive calendar features from the Date column
sales['year'] = sales['Date'].dt.year
sales['month'] = sales['Date'].dt.month
# Series.dt.week was deprecated in pandas 1.1 and removed in pandas 2.0;
# isocalendar().week yields the same ISO week number (as nullable UInt32)
sales['week'] = sales['Date'].dt.isocalendar().week
sales['day_of_month'] = sales['Date'].dt.day
# Monday=0 ... Sunday=6
sales['day_of_week'] = sales['Date'].dt.dayofweek

In [None]:
# lower-case the date column so it matches the 'date' key used for the merges below
sales = sales.rename(columns={'Date': 'date'})
sales.head()

### Merge dataframes

In [None]:
# add one true/false column per holiday calendar: True where the sales date
# appears in that calendar's 'date' column
holiday_calendars = (
    ("school_holiday", school_hols),
    ("public_holiday", public_hols),
)
for flag_column, calendar in holiday_calendars:
    sales[flag_column] = sales["date"].isin(calendar["date"])

In [None]:
# label every sales row with its lockdown status; rows outside the listed
# periods keep the default 'open'
sales["lock"] = 'open'

# (first day, first day after the period, label) -- the end date is exclusive
lockdown_periods = [
    ("2020-03-10", "2020-04-14", "lockdown"),
    ("2020-11-03", "2020-11-17", "lockdown_light"),
    ("2020-11-17", "2020-12-06", "lockdown"),
    ("2020-12-26", "2021-02-07", "lockdown"),
    ("2021-04-01", "2021-05-02", "lockdown"),
    ("2021-11-08", "2021-12-31", "lockdown"),
]
for period_start, period_end, label in lockdown_periods:
    in_period = (sales.date >= pd.to_datetime(period_start)) & (sales.date < pd.to_datetime(period_end))
    sales.loc[in_period, "lock"] = label

In [None]:
# print column dtypes and non-null counts of the weather features
weather_stats.info()

In [None]:
# convert the weather 'date' column to datetime so it can serve as merge key
parsed_dates = pd.to_datetime(weather_stats['date'])
weather_stats = weather_stats.assign(date=parsed_dates)

In [None]:
# left join: keep every sales row and attach the weather features of the same date
df_joined = pd.merge(sales, weather_stats, how='left', on='date')

In [None]:
# preview the first 20 rows of the merged dataframe
df_joined.head(20)

In [None]:
# write the merged dataframe to csv, leaving out the row index
combined_file = '../data/data_combined.csv'
df_joined.to_csv(combined_file, index=False)