In [35]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from preprocessing_weather import WEATHER
from preprocessing_pedestrians import PEDESTRIANS
from BO9 import BO9
from preprocessing_permits import PERMITS


# Column holding the delivered weight in tonnes; every filter below uses it.
WEIGHT_COL = 'Gewicht in Tonnen'
# Deliveries heavier than this many tonnes are treated as outliers (see the note at the end).
OUTLIER_THRESHOLD_TONS = 50

# Load the Tüfentobel landfill deliveries (semicolon-separated CSV).
df = pd.read_csv("data/deponieanlieferungen-tufentobel.csv", delimiter=';')

# --- Preprocessing of df (Tüfentobel) ---

# Per-column count of missing values, kept for interactive inspection.
missing_values = df.isna().sum()
# missing_values

# Author's finding: only the "Kanton" column has gaps (94 rows), and those rows
# contain no relevant outliers, so they are dropped.
# df[df['Kanton'].isna()].describe()
df = df.dropna(subset=['Kanton'])

# Remove deliveries recorded with a weight of zero tonnes.
# Rows are dropped by index label and the index is deliberately not reset.
df = df.drop(index=df[df[WEIGHT_COL] == 0].index)

# Verify the removal. The previous check was a bare expression in the middle of
# the cell, so its result was never shown; it also counted rows through
# DataFrame.value_counts(), which skips rows containing NaN by default.
assert not (df[WEIGHT_COL] == 0).any(), "zero-weight deliveries remain after filtering"

# Optional: inspect very small deliveries.
# print((df[WEIGHT_COL] < 0.1).sum())

# Boolean mask of fully duplicated rows, kept for interactive inspection.
duplicates = df.duplicated()
# print(f"Number of duplicate rows: {duplicates.sum()}")

# Optional: visualise the weight outliers in a boxplot.
# plt.figure(figsize=(10, 6))
# plt.boxplot(df[WEIGHT_COL], vert=False)
# plt.title('Boxplot of Gewicht in Tonnen')
# plt.xlabel('Gewicht in Tonnen')
# plt.show()

# df[df[WEIGHT_COL] > 35]

# Author's finding: there is a single outlier at 56.7 tonnes, while all other
# deliveries stay at or below 35 tonnes. Everything above the threshold is removed.
df = df.drop(index=df[df[WEIGHT_COL] > OUTLIER_THRESHOLD_TONS].index)



ModuleNotFoundError: No module named 'chardet'

In [14]:
# Parse the delivery timestamps as UTC and keep only the calendar date
# (plain `datetime.date` objects), then narrow the frame to the three
# columns used downstream.
delivery_dates = pd.to_datetime(df['Anlieferungsdatum'], utc=True).dt.date
kept_columns = ["Anlieferungsdatum", "Material", "Gewicht in Tonnen"]
df = df.assign(Anlieferungsdatum=delivery_dates)[kept_columns]

In [15]:
# Combine the pedestrian and weather features on their shared 'Date' key
# (inner join: only dates present in both sources survive), then reduce
# 'Date' to a plain calendar date so it can be matched against the deliveries.
merged_features = pd.merge(
    left=PEDESTRIANS,
    right=WEATHER,
    how='inner',
    on='Date',
).assign(Date=lambda frame: pd.to_datetime(frame['Date'], utc=True).dt.date)

In [17]:
# Attach the daily pedestrian/weather features to every delivery made on the
# same date. Inner join: rows without a counterpart on the other side are dropped.
# print(merged_features.head())
features = pd.merge(
    left=df,
    right=merged_features,
    how='inner',
    left_on='Anlieferungsdatum',
    right_on="Date",
)
features.head(5)

Unnamed: 0,Anlieferungsdatum,Material,Gewicht in Tonnen,Date,Day,Workday,Total Pedestrians,Location,Temperature mean,Temperature max,Temperature min,Precipitation in mm,Snow amount in cm
0,2021-09-20,Inertstoffe,3.14,2021-09-20,1,1,2687.0,St. Gallen,10.7,12.8,9.0,1.1,0.0
1,2021-09-20,stark verschmutzte Abfälle,21.04,2021-09-20,1,1,2687.0,St. Gallen,10.7,12.8,9.0,1.1,0.0
2,2021-09-20,stark verschmutzte Abfälle,22.0,2021-09-20,1,1,2687.0,St. Gallen,10.7,12.8,9.0,1.1,0.0
3,2021-09-20,stark verschmutzte Abfälle,8.12,2021-09-20,1,1,2687.0,St. Gallen,10.7,12.8,9.0,1.1,0.0
4,2021-09-20,Inertstoffe,3.38,2021-09-20,1,1,2687.0,St. Gallen,10.7,12.8,9.0,1.1,0.0


In [33]:
# Order the BO9 records chronologically by 'Anlieferungsdatum'. The sorted
# result is rebound to the name instead of sorting in place, so the object
# exported by the BO9 module is not mutated behind the importer's back.
BO9 = BO9.sort_values(by='Anlieferungsdatum', ascending=True)

# FIXME: this cell originally ended with a dangling `nineteen =` that had no
# right-hand side. That is a SyntaxError and prevented the whole cell from
# running. The intended expression cannot be recovered from the notebook;
# restore it here once known.
# nineteen = ...

2019-01-01 00:00:00    False
2019-01-02 00:00:00    False
2019-01-03 00:00:00    False
2019-01-04 00:00:00    False
2019-01-05 00:00:00    False
Name: Anlieferungsdatum, dtype: bool

In [22]:
# Left-join the BO9 data onto the delivery features via 'Anlieferungsdatum'
# (intended as a many-to-one join: many deliveries per BO9 date), then order
# the result from the newest delivery date to the oldest.
# NOTE(review): the 'Date' column is NOT dropped here; it is still part of new_features.
new_features = pd.merge(
    left=features,
    right=BO9,
    how='left',
    on='Anlieferungsdatum',
).sort_values(by='Anlieferungsdatum', ascending=False)
new_features.head(5)

Unnamed: 0,Anlieferungsdatum,Material,Gewicht in Tonnen,Date,Day,Workday,Total Pedestrians,Location,Temperature mean,Temperature max,Temperature min,Precipitation in mm,Snow amount in cm,Investments
86133,2024-02-28,Sauberer Aushub,24.4,2024-02-28,3,1,1605.0,St. Gallen,4.1,4.8,3.4,0.0,0.0,
71190,2024-02-28,stark verschmutzte Abfälle,3.4,2024-02-28,3,1,1605.0,St. Gallen,4.1,4.8,3.4,0.0,0.0,
71194,2024-02-28,Inertstoffe,1.62,2024-02-28,3,1,1605.0,St. Gallen,4.1,4.8,3.4,0.0,0.0,
71196,2024-02-28,Inertstoffe,17.16,2024-02-28,3,1,1605.0,St. Gallen,4.1,4.8,3.4,0.0,0.0,
71198,2024-02-28,Sauberer Aushub,23.62,2024-02-28,3,1,1605.0,St. Gallen,4.1,4.8,3.4,0.0,0.0,


In [5]:
# Show the last 20 rows of the merged frame (nothing is dropped in this cell).
new_features.iloc[-20:]

Unnamed: 0,Anlieferungsdatum,Material,Gewicht in Tonnen,Date,Day,Workday,Total Pedestrians,Location,Temperature mean,Temperature max,Temperature min,Precipitation in mm,Snow amount in cm,Investments
65408,2022-12-22,Inertstoffe,4.62,2022-12-22,4,1,3152.0,St. Gallen,9.0,11.1,7.1,0.8,0.0,0.0
65409,2022-12-22,stark verschmutzte Abfälle,1.9,2022-12-22,4,1,3152.0,St. Gallen,9.0,11.1,7.1,0.8,0.0,0.0
65410,2022-12-22,Inertstoffe,25.84,2022-12-22,4,1,3152.0,St. Gallen,9.0,11.1,7.1,0.8,0.0,0.0
65411,2022-12-22,Inertstoffe,21.34,2022-12-22,4,1,3152.0,St. Gallen,9.0,11.1,7.1,0.8,0.0,0.0
65412,2022-12-22,Inertstoffe,24.1,2022-12-22,4,1,3152.0,St. Gallen,9.0,11.1,7.1,0.8,0.0,0.0
65413,2022-12-22,Inertstoffe,7.48,2022-12-22,4,1,3152.0,St. Gallen,9.0,11.1,7.1,0.8,0.0,0.0
65414,2022-12-22,Inertstoffe,19.56,2022-12-22,4,1,3152.0,St. Gallen,9.0,11.1,7.1,0.8,0.0,0.0
65415,2022-12-22,stark verschmutzte Abfälle,3.4,2022-12-22,4,1,3152.0,St. Gallen,9.0,11.1,7.1,0.8,0.0,0.0
65416,2022-12-22,Inertstoffe,23.4,2022-12-22,4,1,3152.0,St. Gallen,9.0,11.1,7.1,0.8,0.0,0.0
65417,2022-12-22,Inertstoffe,22.68,2022-12-22,4,1,3152.0,St. Gallen,9.0,11.1,7.1,0.8,0.0,0.0


In [34]:
# Preview the first five rows of the permits data imported at the top of the notebook.
PERMITS.iloc[:5]

NameError: name 'PERMITS' is not defined