# Holiday Feature
Feiertagsdaten werden aus dem Internet (schulferien.org) gecrawlt und in ein passendes Format überführt. Es werden nur Feiertage betrachtet, die in NRW gelten.

In [2]:
import sklearn 
import pandas as pd
import requests
from bs4 import BeautifulSoup
from dateutil.relativedelta import *

In [3]:
def format_dataframe(df, holyday_type):
    """Return a copy of ``df`` in the common output layout.

    The ``holiday_type`` column is removed and the ``holiday_name`` column is
    renamed to ``name``. The input frame itself is not modified.

    Args:
        df: DataFrame that contains at least a ``holiday_type`` column.
        holyday_type: String describing the holiday type of ``df``. It is part
            of the signature but is not used by the function body.

    Returns:
        A new DataFrame without ``holiday_type`` and with ``holiday_name``
        renamed to ``name``.

    Raises:
        KeyError: If ``df`` has no ``holiday_type`` column.
    """
    without_type = df.drop('holiday_type', axis=1)
    return without_type.rename(columns={'holiday_name': 'name'})

In [4]:
# Column layout of one scraped holiday record
holiday_columns = ['date', 'holiday_name', 'holiday_type', 'holiday_region']

# Holiday types (gesetzl. Feiertage, Festtage, Ereignisse) and the `class`
# attribute value of the table rows that carry them on the page
row_classes_by_type = [('holiday_legal', 'row_panel gesetzlich_row'),
                       ('holiday_not_legal', 'row_panel nicht_gesetzlich_row'),
                       ('event', 'row_panel ereignis_row')]

# Collect plain tuples and build the DataFrame once at the end; growing a
# DataFrame row by row is quadratic and DataFrame.append no longer exists in
# pandas 2.x.
holiday_records = []

# Loop over years (range end is exclusive: 2016 and 2017)
for year in range(2016, 2018):

    url = 'http://www.schulferien.org/deutschland/feiertage/' + str(year) + '/'

    # Fetch soup; raise on HTTP errors instead of silently parsing an error page,
    # and do not hang forever if the server does not answer
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    # Loop over holiday types
    for holiday_type, html_class in row_classes_by_type:

        # Loop over rows
        for row in soup.find_all('tr', {'class': html_class}):

            name = row.find('a', {'class': 'feiertag_name'}).text.replace('*', '').strip()
            # [3:] drops the first three characters of the date cell text
            # (presumably a weekday abbreviation plus separator -- TODO confirm against the page)
            date = row.find('td', {'class': 'feiertag_datum'}).div.contents[0].strip()[3:]

            # Some rows have no region cell; those are recorded as 'alle BL'
            region_cell = row.find('td', {'class': 'feiertag_regionen sf_tooltip sf_hide_w_xs'})
            region = region_cell.text.strip() if region_cell is not None else 'alle BL'

            holiday_records.append((date, name, holiday_type, region))

df_holidays = pd.DataFrame(holiday_records, columns=holiday_columns)

# Keep the index the former row-by-row append produced (every row labelled 0):
# the bridge-day cell further down looks rows up with the label `[0]`.
df_holidays.index = [0] * len(df_holidays)

In [4]:
# Convert the scraped date strings to datetime (the day comes first)
df_holidays['date'] = pd.to_datetime(df_holidays['date'], dayfirst=True)

# Sort by date (new frame instead of in-place mutation)
df_holidays = df_holidays.sort_values('date')

# Entries that are dropped:
#   - '17. Juni 1953': only a legal holiday until 1990
#   - 'Augsburger Friedensfest': only important for Augsburg
excluded_holiday_names = ['17. Juni 1953', 'Augsburger Friedensfest']
df_holidays = df_holidays[~df_holidays['holiday_name'].isin(excluded_holiday_names)]

# Split by holiday type; rows of type 'event' end up in neither frame
is_legal = df_holidays['holiday_type'] == 'holiday_legal'
is_not_legal = df_holidays['holiday_type'] == 'holiday_not_legal'

# Bring both frames into the common layout (column `name`, no `holiday_type`)
df_holidays_legal = format_dataframe(df_holidays.loc[is_legal], 'holiday_legal')
df_holidays_not_legal = format_dataframe(df_holidays.loc[is_not_legal], 'holiday_not_legal')

In [4]:
# BRÜCKENTAGE (bridge days)
#
# For every legal holiday on a Thursday the following Friday is added as a
# bridge day, and for every legal holiday on a Tuesday the preceding Monday.
# Each bridge day inherits the region of the holiday it belongs to.

# Add weekday (pandas convention: Monday == 0 ... Sunday == 6)
df_holidays_legal['weekday'] = df_holidays_legal['date'].dt.weekday

# (weekday of the holiday, offset in days from the holiday to its bridge day)
bridge_day_rules = [(3, 1),    # Thursday holiday -> Friday after
                    (1, -1)]   # Tuesday holiday  -> Monday before

bridge_day_frames = []
for holiday_weekday, day_offset in bridge_day_rules:

    # All legal holidays that fall on the given weekday
    holidays_on_weekday = df_holidays_legal.loc[df_holidays_legal['weekday'] == holiday_weekday]

    # Shift the date by the offset and keep the region of the same row; working
    # row-wise avoids any lookup by index label or by date
    bridge_days = holidays_on_weekday[['date', 'holiday_region']].copy()
    bridge_days['date'] = bridge_days['date'] + pd.Timedelta(days=day_offset)
    bridge_days['name'] = 'Brückentag'

    bridge_day_frames.append(bridge_days[['date', 'name', 'holiday_region']])

# Friday bridge days first, then Monday bridge days
df_bridge_days = pd.concat(bridge_day_frames)

In [15]:
# Combine legal holidays, non-legal holidays and bridge days into one frame
# with a fresh 0..n-1 index. The frames do not share all columns (only the
# legal holidays carry `weekday`), so missing values become NaN.
# sort=True states the alphabetical column order explicitly: it silences the
# pandas FutureWarning and keeps the order stable across pandas versions.
df_holidays = pd.concat([df_holidays_legal, df_holidays_not_legal, df_bridge_days],
                        sort=True).reset_index(drop=True)
df_holidays.head()

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


Unnamed: 0,date,holiday_region,name,weekday
0,2016-01-01,alle BL,Neujahr,4.0
1,2016-01-06,"BW, BY, ST",Heilige Drei Könige,2.0
2,2016-03-25,alle BL,Karfreitag,4.0
3,2016-03-27,BB,Ostersonntag,6.0
4,2016-03-28,alle BL,Ostermontag,0.0


In [14]:
# Only keep holidays that apply in NRW: rows that list the state code 'NW'
# and rows marked 'alle BL'. 'alle BL' is the value the crawl cell records when
# a row has no region cell (presumably "alle Bundesländer", i.e. nationwide);
# matching on 'NW' alone would drop those holidays, e.g. Neujahr.
# na=False treats a missing region as "does not apply" instead of raising.
holiday_region = df_holidays['holiday_region']
applies_in_nrw = (holiday_region.str.contains('NW', regex=False, na=False)
                  | holiday_region.str.contains('alle BL', regex=False, na=False))
df_holidays = df_holidays[applies_in_nrw]

In [16]:
# Write the final holiday frame to a pickle file (path is relative to the
# current working directory)
pickle_path = 'df_holidays.pickle'
df_holidays.to_pickle(pickle_path)