In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm



df = pd.read_csv("data/deponieanlieferungen-tufentobel.csv", delimiter=';')

#Check for missing values
missing_values = df.isna().sum()
missing_values

#missing values are in column "Kanton" with only 94 values missing
#also, the there are no relevant outliers in these 94 values
df[df['Kanton'].isna()].describe()

#drop missing values for Kanton
df.dropna(subset=['Kanton'], inplace=True)

#check where the anlieferungen are zero tons
df[df['Gewicht in Tonnen'] == 0].value_counts().sum()

#remove these values
df.drop(df[df['Gewicht in Tonnen'] == 0].index, inplace=True)

#test if it worked
df[df['Gewicht in Tonnen'] == 0].value_counts().sum()

#check very small values
#print(df[df['Gewicht in Tonnen'] < 0.1].value_counts().sum())

#check duplicates
duplicates = df.duplicated()
#print(f"Number of duplicate rows: {duplicates.sum()}")

# #Visualize the outliers in a plot
# plt.figure(figsize=(10, 6))
# plt.boxplot(df['Gewicht in Tonnen'], vert=False)
# plt.title('Boxplot of Gewicht in Tonnen')
# plt.xlabel('Gewicht in Tonnen')
# plt.show()


df[df['Gewicht in Tonnen'] > 35]

#there is one outlier with 56.7 tons, the other values dont go over 35 tons
#remove this outlier
df.drop(df[df['Gewicht in Tonnen'] > 50].index, inplace=True)

#Time series decomposition

# Convert the date column to datetime
df['Anlieferungsdatum'] = pd.to_datetime(df['Anlieferungsdatum'], utc=True)

# Set the date column as the index
df.set_index('Anlieferungsdatum', inplace=True)

In [8]:
df

Unnamed: 0_level_0,Deponie Typ,Material,Gewicht in Tonnen,Kanton,Baustelle Stadt intern
Anlieferungsdatum,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2021-09-20 15:11:00+00:00,Typ B,Inertstoffe,3.14,SG,nein
2021-09-20 15:44:00+00:00,Typ E,stark verschmutzte Abfälle,21.04,SG,nein
2021-09-20 15:48:00+00:00,Typ E,stark verschmutzte Abfälle,22.00,SG,nein
2021-09-20 16:12:00+00:00,Typ E,stark verschmutzte Abfälle,8.12,SG,nein
2021-09-20 16:50:00+00:00,Typ B,Inertstoffe,3.38,SG,nein
...,...,...,...,...,...
2024-02-28 13:20:00+00:00,Typ A,Sauberer Aushub,24.28,AR,nein
2024-02-28 13:54:00+00:00,Typ A,Sauberer Aushub,24.80,SG,nein
2024-02-28 15:35:00+00:00,Typ E,stark verschmutzte Abfälle,19.34,SG,nein
2024-02-28 16:10:00+00:00,Typ A,Sauberer Aushub,24.40,AR,nein


In [9]:
# Resample the data to daily frequency, sum weights, and pivot the table
daily_data = df.groupby([pd.Grouper(freq='D'), 'Material'])['Gewicht in Tonnen'].sum().unstack()

# Plot the daily data for each type of material
# daily_data.plot(figsize=(14, 10))
# plt.title('Daily Anlieferungen by Material')
# plt.xlabel('Date')
# plt.ylabel('Gewicht in Tonnen')
# plt.legend(title='Material')
# plt.show()

# Perform seasonal decomposition
daily_data.replace(np.nan, 0, inplace=True)

In [13]:
daily_data

Material,Inertstoffe,Räumungsschnee,Sauberer Aushub,Schlacke,stark verschmutzte Abfälle
Anlieferungsdatum,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2019-01-03 00:00:00+00:00,0.00,0.00,0.00,73.04,0.00
2019-01-04 00:00:00+00:00,0.00,0.00,0.00,72.58,0.00
2019-01-07 00:00:00+00:00,74.08,1977.26,0.00,74.18,42.40
2019-01-08 00:00:00+00:00,22.96,2664.14,0.00,73.16,35.26
2019-01-09 00:00:00+00:00,25.04,2657.52,0.00,71.62,7.14
...,...,...,...,...,...
2024-02-22 00:00:00+00:00,156.56,0.00,0.00,159.16,28.14
2024-02-23 00:00:00+00:00,167.80,0.00,2.20,0.00,106.96
2024-02-26 00:00:00+00:00,435.98,0.00,1805.62,181.08,111.92
2024-02-27 00:00:00+00:00,343.96,0.00,1157.26,0.00,48.02


Decomposition

In [10]:
decomposition = sm.tsa.seasonal_decompose(daily_data, model='additive', extrapolate_trend='freq')

# # Plot the decomposition
# fig = decomposition.plot()
# fig.set_size_inches(14, 10)
# plt.show()

ValueError: You must specify a period or x must be a pandas object with a PeriodIndex or a DatetimeIndex with a freq not set to None