In [None]:
# MergeData.ipynb
import pandas as pd
import holidays
# Raw CSV locations in the opencampus-sh course repository on GitHub
kiwo_url = 'https://raw.githubusercontent.com/opencampus-sh/einfuehrung-in-data-science-und-ml/main/kiwo.csv'
wetter_url = 'https://raw.githubusercontent.com/opencampus-sh/einfuehrung-in-data-science-und-ml/main/wetter.csv'
umsatzdaten_gekuerzt_url = 'https://raw.githubusercontent.com/opencampus-sh/einfuehrung-in-data-science-und-ml/main/umsatzdaten_gekuerzt.csv'

# Read the three tables; a comma is already read_csv's default separator
kiwo = pd.read_csv(kiwo_url)
wetter = pd.read_csv(wetter_url)
umsatzdaten_gekuerzt = pd.read_csv(umsatzdaten_gekuerzt_url)

# Join weather and sales rows that share a 'Datum' value
# (merge defaults to an inner join, so unmatched rows are dropped)
wetter_umsatzdaten = wetter.merge(umsatzdaten_gekuerzt, on="Datum")

# KiWo = 1 when the row's date is listed in the kiwo table, else 0.
# This comparison runs on the raw 'Datum' values as read from the CSVs,
# i.e. before the datetime conversion below.
wetter_umsatzdaten['KiWo'] = (
    wetter_umsatzdaten['Datum'].isin(kiwo['Datum']).astype(int)
)

# Parse 'Datum' into datetime64 so later steps can use the .dt accessor
wetter_umsatzdaten['Datum'] = pd.to_datetime(wetter_umsatzdaten['Datum'])



# Years for which public holidays are looked up (2012 through 2019 inclusive)
years = range(2012, 2019 + 1)

# (date, name) pairs of the public holidays in Schleswig-Holstein,
# collected one calendar year at a time
holidays_sh = [
    (date, name)
    for year in years
    for date, name in holidays.Germany(years=year, state='SH').items()
]

# Only the dates are needed for the membership test below
holiday_dates = {date for date, _ in holidays_sh}

# Add a 'Holiday' column to wetter_umsatzdaten: 1 if the row's date is a
# public holiday, 0 otherwise. The column is converted once and tested with
# a vectorized isin() instead of calling pd.to_datetime() for every row.
wetter_umsatzdaten['Holiday'] = (
    pd.to_datetime(wetter_umsatzdaten['Datum'])
    .dt.date
    .isin(holiday_dates)
    .astype(int)
)
# Clean the data and build an ID column for merging with the Kaggle sample dataset

# Keep only rows that have a sales value. The explicit .copy() makes
# merged_data an independent frame, so the column assignments below cannot
# trigger SettingWithCopyWarning or write into a temporary.
merged_data = wetter_umsatzdaten.dropna(subset=['Umsatz']).copy()
print(merged_data.head())

# Convert 'Warengruppe' to integer
merged_data['Warengruppe'] = merged_data['Warengruppe'].astype(int)

# Create a new column 'ID' of the form '<YYYY-MM-DD>_<Warengruppe>'
# NOTE(review): confirm this format matches the IDs in the Kaggle sample file
merged_data['ID'] = (
    merged_data['Datum'].dt.strftime('%Y-%m-%d')
    + '_'
    + merged_data['Warengruppe'].astype(str)
)
# Print the first few rows to verify
print(merged_data.head())

# Save the merged dataset without the index column
merged_data.to_csv("wetter_umsatzdaten_kiwo_hol_id.csv", index=False)

print("Merge complete. Data saved for subsequent processing.")


Merge complete. Data saved for subsequent processing.
