# Store Data Preparation

In [None]:
import pandas as pd
from IPython.display import display
from sklearn import preprocessing

# Relative path to the store master data CSV.
stores_file = "../data/rossmann-store-sales/store.csv"

# Load the CSV into a DataFrame and render it in the notebook.
df_stores = pd.read_csv(stores_file)
display(df_stores)

## Data Preparation

In [None]:
# Rename PromoInterval to Promo2Interval so the column carries the same
# "Promo2" prefix as the other Promo2 columns.
# Re-running this cell is harmless: rename ignores labels that no longer exist.
df_stores.rename(columns={"PromoInterval": "Promo2Interval"}, inplace=True)

# Convert Promo2 to boolean.
# DataFrame.astype / Series.astype never convert in place (not even with
# copy=False); they return a new object, so the result has to be assigned back.
# See https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.astype.html
df_stores["Promo2"] = df_stores["Promo2"].astype("bool")

### Promo2 data cleaning

In [None]:
# Locate the problem: fraction of missing values per column, and how the
# missing Promo2Since* values relate to the Promo2 flag.
print(df_stores.isna().mean())
print("Value Count",  df_stores["Promo2"].value_counts())
print("Number of lines without promo2: ", df_stores[(df_stores["Promo2"] == False) & (df_stores["Promo2SinceWeek"].isna())].shape[0])
print("Number of lines with promo2 and Null values in Promo2Since: ", df_stores[(df_stores["Promo2"] == True) & (df_stores["Promo2SinceWeek"].isna())].shape[0])

print(df_stores.dtypes)

# Replace missing Promo2Interval, Promo2SinceWeek and Promo2SinceYear with 0.
# This fills every missing value in these three columns; the counts printed
# above show whether any of them belong to stores that do take part in Promo2.
# See https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.fillna.html
df_stores.fillna({"Promo2SinceWeek": 0, "Promo2SinceYear": 0, "Promo2Interval": 0}, inplace=True)

# Convert Promo2SinceWeek and Promo2SinceYear to int now that they contain no
# missing values. astype returns a new Series instead of converting in place,
# so the result must be assigned back to the column.
df_stores["Promo2SinceWeek"] = df_stores["Promo2SinceWeek"].astype(int)
df_stores["Promo2SinceYear"] = df_stores["Promo2SinceYear"].astype(int)

print("Value Count after fix",  df_stores["Promo2SinceWeek"].value_counts())
print("Number of lines without promo2 after fix: ", df_stores[(df_stores["Promo2"] == False) & (df_stores["Promo2SinceWeek"].isna())].shape[0])
print("Number of lines with promo2 and Null values in Promo2Since after fix: ", df_stores[(df_stores["Promo2"] == True) & (df_stores["Promo2SinceWeek"].isna())].shape[0])

# Verify the data types after the conversion.
print(df_stores.dtypes)

### Cleaning null values in Competition Distance

In [None]:
# Handle null values in CompetitionDistance by deleting the affected rows
# completely (in place), then report how many rows remain.
required_columns = ["CompetitionDistance"]
df_stores.dropna(subset=required_columns, inplace=True)
print(len(df_stores))

## Binning the Competition Distance

In [None]:
# Discretize CompetitionDistance into (up to) 25 ordinal bins.
distance_frame = df_stores[["CompetitionDistance"]]
est = preprocessing.KBinsDiscretizer(n_bins=25, encode='ordinal')
est.fit(distance_frame)
df_stores["CompetitionDistanceBin"] = est.transform(df_stores[["CompetitionDistance"]])
display(df_stores)

# Assign a representative value to each bin; see
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.KBinsDiscretizer.html
# Both per-bin statistics are computed before any new column is added.
distance_by_bin = df_stores.groupby("CompetitionDistanceBin")["CompetitionDistance"]
bin_means = distance_by_bin.mean()
bin_medians = distance_by_bin.median()

# Per-bin mean and median of the observed distances, looked up by bin label.
df_stores["CompetitionDistanceBin_mean"] = df_stores["CompetitionDistanceBin"].map(bin_means)
df_stores["CompetitionDistanceBin_median"] = df_stores["CompetitionDistanceBin"].map(bin_medians)

# Midpoint between the lower and upper edge of each fitted bin.
bin_edges = est.bin_edges_[0]
bin_centers = (bin_edges[:-1] + bin_edges[1:]) / 2
# The bin labels are stored as floats, so cast them to int before using them
# as positions into bin_centers.
bin_positions = df_stores["CompetitionDistanceBin"].astype(int).to_numpy()
df_stores["CompetitionDistanceBin_center"] = bin_centers[bin_positions]
display(df_stores)