## Create Product Location Grid (FILTERED)

In [1]:
# Libaries
import warnings
warnings.filterwarnings('ignore')
import sys

# Basics
import pandas as pd
import numpy as np
import math 
from itertools import product

# Visualisation
import seaborn as sns
import matplotlib.pyplot as plt

# Geo Data
import pgeocode
from geopy.geocoders import Nominatim

In [2]:
# Load Dataset (Select B2C Sales)

### ORDER DATA ###
df = pd.read_csv("../data/processed/b2c_orders_clean.csv")
### HOLIDAY DATA ###
holiday_df = pd.read_csv("../data/intermediate/calender.csv")
### WEATHER DATA ### 
weather_df = pd.read_csv("../data/intermediate/weather.csv")

In [3]:
### GET STATE ####

# define Germany 
nomi = pgeocode.Nominatim('de')

# get state names from zip codes and assign to new column 
df["state"] = nomi.query_postal_code(df["billing_zip"].tolist()).state_name.tolist()

In [4]:
df.business_type.value_counts()

B2C    5395860
Name: business_type, dtype: int64

In [5]:
print(df.state.isna().sum())
print((df.state.isna().sum() / len(df))*100)

342703
6.351221121378242


In [6]:
df[df['state'].isnull()].head(3)

Unnamed: 0,date,order_id,order_item_id,customer_id,order_datetime,quantity,product_variant_id,warehouse_chain,item_discount,billing_country,billing_zip,business_type,filled_prices,state
184,2022-03-28,5103674889350632864,-4046019662819641819,-5049257674105794573,2022-03-28 15:11:41,1,8.1068e+18,WH-Chain-2,4.49,DE,89387,B2C,29.95,
233,2021-05-15,3486896222173923246,6486245077222367310,-5049257674105794573,2021-05-15 20:19:24,1,3.002792e+18,WH-Chain-2,0.0,DE,93951,B2C,86.27,
733,2021-11-28,7051856995147585260,6439824637857445392,-5049257674105794573,2021-11-28 16:36:22,2,8.686326e+18,WH-Chain-2,9.88,DE,45157,B2C,32.95,


## Clean Data

There are some ZIP code that are wrong. Probably a human mistake.

- Check for other missing values 
- Delete values 
- Allign state names
- Group data according to state - sum quantity 
- Join Weather and Location data 
- Check for correlations 

In [7]:
# Check for missing values 
df.isna().sum()

date                       0
order_id                   0
order_item_id              0
customer_id                0
order_datetime             0
quantity                   0
product_variant_id         0
warehouse_chain            0
item_discount              0
billing_country            0
billing_zip                0
business_type              0
filled_prices              0
state                 342703
dtype: int64

In [8]:
# select non na rows of state
df = df[df['state'].notna()]

In [9]:
#test 
df.state.isna().sum()

0

In [10]:
# Change names 

# From get_weather_data notebook:

# Change names of station names to avoid errors
df['state'] = df['state'].str.replace('-','_')
df['state'] = df['state'].str.replace('ü','ue')
df['state'] = df['state'].str.replace('/','_')

# test
# check if both lists have the same entries
a = df.state.unique().tolist()
b = weather_df.state.unique().tolist()

# test (16 unique states) coorect!
set(a) & set(b)

{'Baden_Wuerttemberg',
 'Bayern',
 'Berlin',
 'Brandenburg',
 'Bremen',
 'Hamburg',
 'Hessen',
 'Mecklenburg_Vorpommern',
 'Niedersachsen',
 'Nordrhein_Westfalen',
 'Rheinland_Pfalz',
 'Saarland',
 'Sachsen',
 'Sachsen_Anhalt',
 'Schleswig_Holstein',
 'Thueringen'}

In [11]:
# group datasets on date and sum quantity

# Create date column 
df["date"] = pd.to_datetime(df["order_datetime"]).dt.date

# Order: Group by date and state
state_day_sales_df = df.groupby(["date", "state"]).sum()

### JOIN 

In [12]:
# Join
def join_all(state_day_sales_df, weather_df, holiday_df):

    # make date index to merge on date 
    weather_df["date"] = pd.to_datetime(weather_df["date"]).dt.date
    weather_df.set_index(["date", "state"], inplace=True)
    
    # Join state and weather
    weather_sales_df = pd.merge(state_day_sales_df, weather_df, left_index=True, right_index=True)


    
    weather_sales_df.reset_index(inplace=True)
    weather_sales_df["date"] = pd.to_datetime(holiday_df.date)
    weather_sales_df.set_index("date", inplace=True)


    holiday_df["date"] = pd.to_datetime(holiday_df.date)
    holiday_df.set_index("date", inplace=True)



    #merge weather_sales_df with holiday_df to get final_df
    final_df = pd.merge(weather_sales_df, holiday_df, left_index=True, right_index=True)

    # Drop columns useless columns
    drop_columns = ["filled_prices", 'order_id', 'order_item_id', 'customer_id',
                    'product_variant_id', 'item_discount', 'station_id']

    final_df = final_df.drop(columns=drop_columns)
    return final_df


final_df = join_all(state_day_sales_df, weather_df, holiday_df)
final_df

Unnamed: 0_level_0,state,quantity,precipitation_height,sunshine_duration,temperature_air_mean_200,sunshine_duration_h,suns_classes,temp_classes,rain_classes,is_holiday,...,holiday_Erster Weihnachtstag,holiday_Karfreitag,holiday_Neujahr,holiday_Ostermontag,holiday_Pfingstmontag,holiday_Reformationstag,holiday_Tag der Deutschen Einheit,holiday_Zweiter Weihnachtstag,xmasweek,blackweek
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2015-01-01,Baden_Wuerttemberg,3,0.0,23338.8,274.65,6.483,6,1,0,1,...,0,0,1,0,0,0,0,0,0,0
2015-01-02,Bayern,3,0.0,22798.8,274.45,6.333,6,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2015-01-03,Berlin,3,0.0,24361.2,274.25,6.767,6,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2015-01-04,Hamburg,1,0.1,32518.8,275.05,9.033,7,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2015-01-05,Hessen,1,0.0,38340.0,278.85,10.650,8,4,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-12-27,Sachsen,89,0.3,9118.8,280.95,2.533,4,1,1,0,...,0,0,0,0,0,0,0,0,0,0
2022-12-28,Sachsen_Anhalt,49,0.2,0.0,281.15,0.000,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2022-12-29,Schleswig_Holstein,150,0.6,15541.2,282.05,4.317,6,2,2,0,...,0,0,0,0,0,0,0,0,0,0
2022-12-30,Thueringen,33,1.5,0.0,279.95,0.000,0,0,4,0,...,0,0,0,0,0,0,0,0,0,0


In [20]:
final_df.to_csv("../data/final/product_location.csv")

## SOME EDA

In [19]:
#Without 0's
correlation = final_df.reset_index()
correlation.set_index(["date", "state"], inplace=True)

pd.DataFrame(np.corrcoef(correlation.values, rowvar=False), columns=correlation.columns)

Unnamed: 0,quantity,precipitation_height,sunshine_duration,temperature_air_mean_200,sunshine_duration_h,suns_classes,temp_classes,rain_classes,is_holiday,is_event,...,holiday_Erster Weihnachtstag,holiday_Karfreitag,holiday_Neujahr,holiday_Ostermontag,holiday_Pfingstmontag,holiday_Reformationstag,holiday_Tag der Deutschen Einheit,holiday_Zweiter Weihnachtstag,xmasweek,blackweek
0,1.0,0.01508,-0.151918,0.035095,-0.151918,-0.084998,-0.021489,0.036541,-0.002457,-0.023628,...,0.001409,-0.014364,0.020785,-0.011675,0.000632,-0.007999,-0.017053,0.011887,0.023345,0.015802
1,0.01508,1.0,-0.354572,-0.045202,-0.354572,-0.368872,-0.154764,0.804113,0.001691,-0.027883,...,0.014888,0.033715,-0.01062,-0.01062,-0.005761,-0.006967,-0.018667,-0.018211,0.011538,0.0039
2,-0.151918,-0.354572,1.0,0.278537,1.0,0.95054,0.385751,-0.516809,0.026899,-0.023406,...,0.005582,0.014533,0.020627,0.00289,-0.000654,-0.020048,0.035699,0.006526,0.008142,-0.028488
3,0.035095,-0.045202,0.278537,1.0,0.278537,0.182479,0.631844,-0.075772,0.025999,-0.009856,...,0.009241,0.009513,-0.004499,0.007472,0.008288,0.007875,0.008833,0.002711,0.007854,-0.003967
4,-0.151918,-0.354572,1.0,0.278537,1.0,0.95054,0.385751,-0.516809,0.026899,-0.023406,...,0.005582,0.014533,0.020627,0.00289,-0.000654,-0.020048,0.035699,0.006526,0.008142,-0.028488
5,-0.084998,-0.368872,0.95054,0.182479,0.95054,1.0,0.405549,-0.521098,0.033235,-0.029043,...,0.010475,0.012742,0.024077,0.003674,0.001407,-0.024319,0.039946,0.012742,0.014986,-0.030804
6,-0.021489,-0.154764,0.385751,0.631844,0.385751,0.405549,1.0,-0.186554,0.048098,-0.023918,...,0.019147,0.016847,0.007645,0.016847,0.019147,-0.002987,0.012246,0.009945,0.025655,-0.026074
7,0.036541,0.804113,-0.516809,-0.075772,-0.516809,-0.521098,-0.186554,1.0,-0.009456,-0.024995,...,0.008885,0.011255,-0.010076,-0.000595,-0.012446,-0.011091,-0.026667,-0.024297,0.021897,0.002928
8,-0.002457,0.001691,0.026899,0.025999,0.026899,0.033235,0.048098,-0.009456,1.0,-0.016844,...,0.327329,0.327329,0.327329,0.327329,0.327329,0.11559,0.327329,0.327329,0.105572,-0.022375
9,-0.023628,-0.027883,-0.023406,-0.009856,-0.023406,-0.029043,-0.023918,-0.024995,-0.016844,1.0,...,-0.005513,-0.005513,-0.005513,-0.005513,-0.005513,-0.001947,-0.005513,-0.005513,-0.014709,0.369037


In [None]:
plt.figure(figsize=(15,10))

test = final_df.reset_index()
test = test.set_index("date")
test.groupby('state')['quantity'].plot(legend=True)

