In [1]:
def get_df_holidays(df):
    """Build the table of holiday dates used by the feature pipeline.

    Two sources are combined:

    * a fixed list of holiday dates for 2010-2013, and
    * every value of ``df['Date']`` whose (calendar year, ISO week) pair is
      in the extra list below; rows of ``df`` that already carry a true
      ``is_holiday`` flag (if that column exists) are included as well.

    The input frame is only read, never modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a datetime-like ``Date`` column.

    Returns
    -------
    pandas.DataFrame
        Columns ``day`` (datetime) and ``is_holiday`` (always true), one row
        per distinct date, fixed holidays first.
    """
    import pandas as pd

    holiday_list = [
          '2010-02-12', '2011-02-11', '2012-02-10', '2013-02-08',
          '2010-09-10', '2011-09-09', '2012-09-07', '2013-09-06',
          '2010-11-26', '2011-11-25', '2012-11-23', '2013-11-29',
          '2010-12-31', '2011-12-30', '2012-12-28', '2013-12-27'
    ]

    df_holiday = pd.DataFrame({
        'day': pd.to_datetime(holiday_list, format='%Y-%m-%d'),
        'is_holiday': True,
    })

    # (calendar year, ISO week) pairs whose dates are also treated as holidays.
    # NOTE(review): these look like Easter and early-July weeks - confirm.
    list_new_holidays = [
                    (2010, 13), (2011, 16), (2012, 14), (2013, 13),
                    (2010, 26), (2011, 26), (2012, 27), (2013, 27),
    ]

    # Year/week are kept as local Series instead of being written into `df`,
    # so the caller's frame does not gain scratch columns.
    dates = df['Date']
    years = dates.dt.year
    weeks = dates.dt.isocalendar().week.astype('int64')

    flagged = pd.Series(False, index=df.index)
    for year, week in list_new_holidays:
        flagged |= (years == year) & (weeks == week)

    # Honour a pre-existing flag column, as a masked assignment into it would.
    if 'is_holiday' in df.columns:
        flagged = flagged | df['is_holiday'].eq(True)

    df_extra = pd.DataFrame({'day': dates[flagged].reset_index(drop=True)})
    df_extra['is_holiday'] = True

    # `df` usually has many rows per date, hence the de-duplication.
    return (
        pd.concat([df_holiday, df_extra], ignore_index=True)
        .drop_duplicates()
        .reset_index(drop=True)
    )

In [2]:
def count_holidays_per_key(df, keys, col_name):
    """Attach, to every row, the number of holiday-flagged rows in its group.

    Rows are grouped by ``keys``; within each group the rows whose
    ``is_holiday`` value is true are counted, and that count is merged back
    onto ``df`` as ``col_name``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the ``keys`` columns and an ``is_holiday`` column.
    keys : str or sequence of str
        Column name(s) to group by.
    col_name : str
        Name of the count column to add.

    Returns
    -------
    pandas.DataFrame
        A new frame: ``df`` plus the integer count column. Rows whose key
        has no group (e.g. a missing key value) get 0. NaN values in other
        columns are left as they are.
    """
    keys = [keys] if isinstance(keys, str) else list(keys)

    # Sum the truthy flags. A plain `.count()` counts every non-null cell,
    # so it would report the group size even when no row is a holiday.
    flags = df[keys].copy()
    flags[col_name] = df['is_holiday'].eq(True).astype('int64')
    per_key = flags.groupby(keys, as_index=False)[col_name].sum()

    merged = df.merge(per_key, how='left', on=keys)

    # The count is the last merged column (merge suffixes it if `col_name`
    # already existed in `df`); only that column gets its gaps filled.
    new_col = merged.columns[-1]
    merged[new_col] = merged[new_col].fillna(0).astype('int64')

    return merged

In [3]:
def create_features_holidays(df, df_holidays, start, end):
    """Add week-level calendar flags to ``df``.

    A daily calendar from ``start`` to ``end`` is built, each day is tagged
    (first day of month, 15th of month, last day of month, holiday), and the
    tags are collapsed per (ISO week, calendar year) with "any day in the
    week". The result is left-joined onto ``df`` by ``week`` and ``year``.

    NOTE(review): ``week`` is the ISO week while ``year`` is the calendar
    year, so days around New Year can share a (year, week) bucket with days
    from the other end of that year. This is kept because ``df`` is joined
    on the same pair.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``week`` and ``year`` columns.
    df_holidays : pandas.DataFrame
        Columns ``day`` (datetime) and ``is_holiday``.
    start, end :
        Bounds of the daily calendar, passed to ``pandas.date_range``.

    Returns
    -------
    pandas.DataFrame
        A new frame: ``df`` plus the boolean columns ``is_week_start_month``,
        ``is_week_fortnight``, ``is_week_end_month`` and ``is_holiday``.
        Weeks outside the calendar get ``False`` in all four. NaN values in
        the original columns of ``df`` are left untouched.
    """
    import pandas as pd

    calendar = pd.DataFrame({'day': pd.date_range(start, end, freq='D')})
    calendar = calendar.merge(df_holidays, 'left', 'day')

    day = calendar['day']

    daily_flags = pd.DataFrame({
        'week': day.dt.isocalendar().week.astype('int64'),
        'year': day.dt.year,
        'is_week_start_month': day.dt.is_month_start,
        'is_week_fortnight': day.dt.day == 15,
        'is_week_end_month': day.dt.is_month_end,
        # Days without a match are NaN after the merge; NaN != True, so this
        # fills them and yields a real bool column in one step.
        'is_holiday': calendar['is_holiday'].eq(True),
    })

    flag_cols = [
        'is_week_start_month', 'is_week_fortnight',
        'is_week_end_month', 'is_holiday'
    ]

    # Aggregate only the boolean flags: `any()` over a datetime column is
    # deprecated in recent pandas.
    week_flags = (
        daily_flags.groupby(['week', 'year'], as_index=False)[flag_cols].any()
    )

    merged = df.merge(week_flags, 'left', ['week', 'year'])

    # The joined flags are the trailing columns of the result. Only they are
    # filled, so NaN in the caller's own columns is not overwritten.
    for col in merged.columns[df.shape[1]:]:
        merged[col] = merged[col].eq(True)

    return merged

In [4]:
def create_features(df):
    """Derive calendar and holiday features from the ``Date`` column.

    Adds ``year``, ``week`` (ISO week) and ``month``, then the week-level
    flags from ``create_features_holidays`` and the ``qt_holiday_week``
    column from ``count_holidays_per_key``. Fully duplicated rows are dropped
    and the index is reset.

    The caller's frame is left unchanged; all new columns go onto a private
    copy.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a datetime-like ``Date`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame with the added feature columns.
    """
    import pandas as pd

    # The daily calendar spans the first to the last date present in the data.
    start = pd.Timestamp(df['Date'].dt.date.min()).to_pydatetime()
    end = pd.Timestamp(df['Date'].dt.date.max()).to_pydatetime()

    # Hand the helper its own copy so it can never write into the caller's
    # frame or into the working frame below.
    df_holidays = get_df_holidays(df.copy())

    # Private working copy: the assignments below must not leak to the caller.
    features = df.copy()
    features['year'] = features['Date'].dt.year
    features['week'] = features['Date'].dt.isocalendar().week.astype('int64')
    features['month'] = features['Date'].dt.month

    features = create_features_holidays(features, df_holidays, start, end)

    features = count_holidays_per_key(
        features, ['year', 'week'], 'qt_holiday_week'
    )

    return features.drop_duplicates().reset_index(drop=True)

In [5]:
def convert_to_numeric(df):
    """Return a copy of ``df`` with missing values filled and numbers as float.

    * Columns with object dtype: missing values become the string ``'ND'``.
    * Every other column: missing values become ``-9999`` and the column is
      cast to ``float64`` (so booleans become 0.0 / 1.0).

    The input frame is not modified. Converting a copy also avoids
    pandas' ``SettingWithCopyWarning`` when ``df`` is a column subset of
    another frame.

    Parameters
    ----------
    df : pandas.DataFrame

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns in the same order.
    """
    converted = df.copy()

    for col in converted.columns:

        if converted[col].dtype == 'O':

            converted[col] = converted[col].fillna('ND')

        else:

            converted[col] = converted[col].fillna(-9999).astype('float64')

    return converted

In [6]:
def get_make_col_transformer(df):
    """Assemble an unfitted column transformer matched to ``df``'s dtypes.

    Object-dtype columns are routed to an ``OrdinalEncoder`` whose category
    lists are the distinct values found in ``df`` (in order of appearance).
    All other columns, except ``Weekly_Sales``, are routed to a
    ``StandardScaler``. Each estimator is wrapped in a one-step pipeline.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose dtypes and category values define the transformer.

    Returns
    -------
    sklearn.compose.ColumnTransformer
        The transformer, not yet fitted.
    """
    from sklearn.compose import make_column_transformer
    from sklearn.pipeline import make_pipeline
    from sklearn.preprocessing import OrdinalEncoder, StandardScaler

    categorical_cols = [name for name in df if df[name].dtype == 'O']

    # The target column is never scaled as a feature.
    numeric_cols = [
        name for name in df
        if df[name].dtype != 'O' and name != 'Weekly_Sales'
    ]

    # One category array per categorical column, aligned with categorical_cols.
    observed_categories = [df[name].unique() for name in categorical_cols]

    ordinal_step = make_pipeline(OrdinalEncoder(categories=observed_categories))
    scaling_step = make_pipeline(StandardScaler())

    return make_column_transformer(
        (ordinal_step, categorical_cols),
        (scaling_step, numeric_cols),
    )

In [7]:
def prepare_dataframe(df, is_test = False):
    """Run the feature pipeline and build the matching column transformer.

    Steps: ``create_features`` -> keep the model columns ->
    ``convert_to_numeric`` -> ``get_make_col_transformer``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw input accepted by ``create_features``.
    is_test : bool, default False
        When true, the ``Weekly_Sales`` column is not selected.

    Returns
    -------
    tuple
        ``(prepared_frame, column_transformer)``.
    """
    df = create_features(df)

    col_selected = [

           'Store', 'Dept', 'Type', 'Size', 'is_holiday',
           'year', 'week', 'is_week_start_month', 'month',
           'is_week_fortnight', 'is_week_end_month',
           'qt_holiday_week'
    ]

    if not is_test:

        col_selected.append('Weekly_Sales')

    # Take an explicit copy of the column subset: converting a bare slice
    # column by column raises pandas' SettingWithCopyWarning.
    df = df[col_selected].copy()

    df = convert_to_numeric(df)

    col_transformer = get_make_col_transformer(df)

    return df, col_transformer