In [2]:
!pip install holidays
import pandas as pd
import holidays

# Remote CSV sources for the three input tables.
kiwo_url = 'https://raw.githubusercontent.com/opencampus-sh/einfuehrung-in-data-science-und-ml/main/kiwo.csv'
wetter_url = 'https://raw.githubusercontent.com/opencampus-sh/einfuehrung-in-data-science-und-ml/main/wetter.csv'
umsatzdaten_gekuerzt_url = 'https://raw.githubusercontent.com/opencampus-sh/einfuehrung-in-data-science-und-ml/main/umsatzdaten_gekuerzt.csv'

# Download each table into its own DataFrame (same order as the URLs above).
kiwo, wetter, umsatzdaten_gekuerzt = (
    pd.read_csv(url, delimiter=',')
    for url in (kiwo_url, wetter_url, umsatzdaten_gekuerzt_url)
)

# Left join on "Datum": every wetter row is kept, and sales columns are NaN
# where no sales row shares that Datum.
# NOTE(review): sales rows whose Datum does not occur in wetter are not carried
# over by a left join - confirm that dropping them is intended.
wetter_umsatzdaten = wetter.merge(umsatzdaten_gekuerzt, how="left", on="Datum")

# Both new values are computed from the Datum column as read from the CSV:
#   KiWo  - 1 if the raw Datum value also appears in kiwo['Datum'], else 0
#   Datum - the same column parsed to pandas datetimes
wetter_umsatzdaten = wetter_umsatzdaten.assign(
    KiWo=wetter_umsatzdaten['Datum'].isin(kiwo['Datum']).astype(int),
    Datum=pd.to_datetime(wetter_umsatzdaten['Datum']),
)

# Years for which public holidays are generated (2012 through 2019 inclusive).
years = range(2012, 2019 + 1)

# (date, name) pairs of the public holidays in Schleswig-Holstein, built one
# year at a time so the list stays ordered by year.
# `subdiv` is the current keyword for the subdivision code; the older `state`
# keyword is deprecated in the holidays package.
holidays_sh = [
    (date, name)
    for year in years
    for date, name in holidays.Germany(years=year, subdiv='SH').items()
]

# Set of holiday dates for fast membership tests.
holiday_dates = {date for date, _ in holidays_sh}

# Holiday = 1 when the calendar date of 'Datum' is a public holiday, else 0.
# The column is converted once and tested with a vectorised isin instead of
# re-parsing every row inside a Python-level apply.
wetter_umsatzdaten['Holiday'] = (
    pd.to_datetime(wetter_umsatzdaten['Datum'])
    .dt.date
    .isin(holiday_dates)
    .astype(int)
)

# Missing Umsatz and Warengruppe values both become 0; Warengruppe is then
# stored as an integer so it renders without a decimal part in the ID.
wetter_umsatzdaten = wetter_umsatzdaten.fillna({'Umsatz': 0, 'Warengruppe': 0})
wetter_umsatzdaten['Warengruppe'] = wetter_umsatzdaten['Warengruppe'].astype(int)

# ID in the sample-submission layout YYMMDDW: two-digit year, month and day
# followed by the Warengruppe number, stored as an integer.
date_part = wetter_umsatzdaten['Datum'].dt.strftime('%y%m%d').astype(str)
group_part = wetter_umsatzdaten['Warengruppe'].astype(str)
wetter_umsatzdaten['ID'] = (date_part + group_part).astype(int)

# Show the first rows so the ID layout can be checked by eye.
print(wetter_umsatzdaten.loc[:, ['Datum', 'Warengruppe', 'ID']].head(5))

# Write the merged table to disk without the index column.
wetter_umsatzdaten.to_csv("wetter_umsatzdaten_kiwo_hol_id.csv", index=False)

print("Merge complete. Data saved for subsequent processing.")



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m
       Datum  Warengruppe       ID
0 2012-01-01            0  1201010
1 2012-01-02            0  1201020
2 2012-01-03            0  1201030
3 2012-01-04            0  1201040
4 2012-01-05            0  1201050
Merge complete. Data saved for subsequent processing.
