In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import chardet

In [5]:
from preprocessing_weather import WEATHER
from preprocessing_pedestrians import PEDESTRIANS
from preprocessing_permits import PERMITS

ISO-8859-1
1       SG
4       SG
25      SG
28      SG
33      SG
        ..
1956    SG
1957    SG
1961    SG
1966    SG
1972    SG
Name: BaustKanton, Length: 437, dtype: object
['Baubewilligung erteilt']
['Baugesuch eingereicht' 'Projekt' 'Baugesuch - Vorabinfo']
  BaustadiumDatum BaustadiumDatumAlt
0      03.06.2022         22.11.2021
1      30.11.2021         02.10.2015
3      05.02.2020         17.06.2019
4      05.07.2023         21.11.2019
5      15.11.2024         06.01.2017
  BaustadiumDatum BaustadiumDatumAlt
0      2022-06-03         2021-11-22
1      2021-11-30         2015-10-02
3      2020-02-05         2019-06-17
4      2023-07-05         2019-11-21
5      2024-11-15         2017-01-06
Checking for NaT in BaustadiumDatum:
0

Checking for NaT in BaustadiumDatumAlt:
0
ObjektNr                       int64
Objektname                    object
Baustelle                     object
BaustPLZ                       int64
BaustOrt                      object
BaustKanton             

In [43]:
print(PERMITS.head())

                                          Objektname BaustKanton  \
0                      Neubau Primarschule Riethüsli          SG   
1  Teilabbruch und Erweiterung der Kantonsschule ...          SG   
3  Erweiterung/Erneuerung Regionalgefängnis mit P...          SG   
4  Neubau Pflegezentrum Schachen mit Tiefgarage u...          SG   
5    Neubau Gewerbezentrum/Parkunique mit Tiefgarage          SG   

  BaustadiumDatum          BaustadiumAlt BaustadiumDatumAlt      Baukosten  \
0      2022-06-03  Baugesuch eingereicht         2021-11-22  48.00 Mio CHF   
1      2021-11-30  Baugesuch eingereicht         2015-10-02  49.00 Mio CHF   
3      2020-02-05  Baugesuch eingereicht         2019-06-17  83.00 Mio CHF   
4      2023-07-05  Baugesuch eingereicht         2019-11-21  46.70 Mio CHF   
5      2024-11-15                Projekt         2017-01-06  25.00 Mio CHF   

   Baubeginn  Bauende  
0          1        1  
1          1        1  
3          1        1  
4          1        1  
5 

In [60]:
print(PEDESTRIANS.head())

         Date  Day  Workday  Total Pedestrians
0  2024-03-24    7        0              420.0
8  2024-03-25    1        1             1766.0
19 2024-03-26    2        1             1701.0
27 2024-03-28    4        1             1669.0
33 2024-03-29    5        1              665.0


In [27]:
print(WEATHER.head())

         Location       Date  Temperature mean  Temperature max  \
27125  St. Gallen 2019-01-08               0.9              1.9   
27130  St. Gallen 2019-01-10              -3.2             -2.5   
27135  St. Gallen 2019-01-18              -2.8              0.7   
27140  St. Gallen 2019-01-27               2.5              5.0   
27145  St. Gallen 2019-02-01               1.2              4.8   

       Temperature min  Precipitation in mm  Snow amount in cm  
27125             -0.8                  4.2               35.0  
27130             -3.8                 17.3               39.0  
27135             -7.0                  0.0               32.0  
27140             -0.9                  5.3               27.0  
27145             -1.3                  0.2               37.0  


In [45]:
# Merge the datasets
# Start with Pedestrians and Weather datasets and merge them on the Date column, performing an inner join to only keep values that are in both datasets
merged_features1 = pd.merge(PEDESTRIANS, WEATHER, on='Date', how='inner')
print(merged_features1)

           Date  Day  Workday  Total Pedestrians    Location  \
0    2024-03-24    7        0              420.0  St. Gallen   
1    2024-03-25    1        1             1766.0  St. Gallen   
2    2024-03-26    2        1             1701.0  St. Gallen   
3    2024-03-28    4        1             1669.0  St. Gallen   
4    2024-03-29    5        1              665.0  St. Gallen   
...         ...  ...      ...                ...         ...   
2044 2024-10-23    3        1             1221.0  St. Gallen   
2045 2024-10-24    4        1             1335.0  St. Gallen   
2046 2024-10-25    5        1             1404.0  St. Gallen   
2047 2024-10-26    6        0             1600.0  St. Gallen   
2048 2024-10-12    6        0             2309.0  St. Gallen   

      Temperature mean  Temperature max  Temperature min  Precipitation in mm  \
0                  1.7              3.8             -0.2                  5.2   
1                  4.7              9.3              0.4             

In [49]:
num_unique_baustadium_dates = PERMITS['BaustadiumDatum'].nunique()
print(num_unique_baustadium_dates)

339


In [57]:
print(PERMITS.shape)

(1761, 8)


In [55]:
# Now merge the Permits dataset with the merged dataset from the previous step, this time performing a left merge to make sure we keep all the values from the already merged dataset
merged_features = pd.merge(merged_features1, PERMITS, left_on='Date', right_on="BaustadiumDatum", how='left')
print(merged_features.sort_values(by='Date'))

           Date  Day  Workday  Total Pedestrians    Location  \
2727 2019-03-08    5        1               80.0  St. Gallen   
2728 2019-03-11    1        1              102.0  St. Gallen   
2729 2019-03-12    2        1              115.0  St. Gallen   
2730 2019-03-13    3        1              127.0  St. Gallen   
2731 2019-03-14    4        1              106.0  St. Gallen   
...         ...  ...      ...                ...         ...   
3233 2024-11-05    2        1             1310.0  St. Gallen   
3236 2024-11-05    2        1             1310.0  St. Gallen   
3240 2024-11-06    3        1             1383.0  St. Gallen   
3241 2024-11-06    3        1             1383.0  St. Gallen   
3214 2024-11-07    4        1             1263.0  St. Gallen   

      Temperature mean  Temperature max  Temperature min  Precipitation in mm  \
2727               4.7              6.5              2.9                  5.0   
2728               0.9              4.2             -1.7             

In [67]:
# We notice that there are some NaN values in the BaustadiumDatum column, which means that there are some dates in the Pedestrians and Weather dataset that are not in the Permits dataset
# Which makes sense since there aren't new construction permits happening every day
# Find NaN values in the merged_features dataframe
nan_values = merged_features.isna().sum()
# Check for duplicate rows in the merged_features dataframe
duplicates = merged_features[merged_features.duplicated()]
print(duplicates.value_counts())

Date        Day  Workday  Total Pedestrians  Location    Temperature mean  Temperature max  Temperature min  Precipitation in mm  Snow amount in cm  Objektname                                     BaustKanton  BaustadiumDatum  BaustadiumAlt          BaustadiumDatumAlt  Baukosten          Baubeginn  Bauende
2024-10-08  2    1        1541.0             St. Gallen   12.8              15.5             10.9            10.1                 0.0                Abbruch Einfamilienhaus                        SG           2024-10-08       Baugesuch eingereicht  2024-08-30          0.03 Mio CHF       1.0        0.0        2
2023-12-08  5    1        2009.0             St. Gallen   2.9               5.0             -1.7             3.9                  14.0               Neubau Einfamilienhaus und angebaute Garage    SG           2023-12-08       Baugesuch eingereicht  2023-06-30          0.70 Mio CHF       0.0        0.0        1
2023-12-14  4    1        2170.0             St. Gallen   3.3        