In [40]:
import numpy as np
import pandas as pd

# Import my modules.
import sys, os
from pathlib import Path
current_dir = os.path.join(Path().resolve())
sys.path.append(str(current_dir) + '/../')
from modules import utils

pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)

In [59]:
BASE = '../input/'
date_col = 'first_day_of_month'
cat_cols = ['county', 'state']
mbd = 'microbusiness_density'
idx = 'row_id'

df_census = pd.read_csv(BASE + 'census_starter.csv', index_col='cfips')
df_train = pd.read_csv(BASE + 'train.csv',  index_col=idx)
df_test = pd.read_csv(BASE + 'test.csv',  index_col=idx)
df_subm = pd.read_csv(BASE + 'sample_submission.csv',  index_col=idx)

state_dict = df_train[['cfips', 'state', 'county']]
state_dict = state_dict.set_index('cfips')
state_dict = state_dict.drop_duplicates()
state_dict = state_dict.to_dict()

df_test['state'] = df_test['cfips'].map(state_dict['state'])
df_test['county'] = df_test['cfips'].map(state_dict['county'])

df_all = pd.concat([df_train, df_test], axis=0)

df_all[date_col] = pd.to_datetime(df_all[date_col])

df_all['year'] = df_all[date_col].dt.year
df_all['month'] = df_all[date_col].dt.month
df_all['scale'] = (df_all[date_col] - df_all[date_col].min()).dt.days
df_all['scale'] = df_all['scale'].factorize()[0]

df_all = df_all.drop(columns=[date_col])
df_all.sort_index(inplace=True)

df_all[cat_cols] = df_all[cat_cols].astype('category')

In [60]:
for i in range(-3, 4):
    df_all[f'lag_{i}'] = df_all.groupby('cfips')[mbd].shift(i).bfill()

In [61]:
df_all['season_weight'] = ((df_all['lag_-3'] + df_all['lag_-2'] + df_all['lag_2'] + df_all['lag_3']) / 4) / ((df_all['lag_-1'] + df_all['lag_0'] * 2 + df_all['lag_1']) / 4)

In [62]:
df_cfips_month = df_all.loc[df_all['scale']>=3].groupby(['cfips', 'month']).mean()['season_weight'].reset_index()
df_cfips_month.rename(columns={'season_weight':'season'}, inplace=True)

In [65]:
df_all.reset_index(inplace=True)
df_all = df_all.merge(df_cfips_month, how='left', on=['cfips', 'month'])
df_all.set_index('row_id', inplace=True)
df_all.head()

Unnamed: 0_level_0,cfips,county,state,microbusiness_density,active,year,month,scale,lag_-3,lag_-2,lag_-1,lag_0,lag_1,lag_2,lag_3,season_weight,season
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
10001_2019-08-01,10001,Kent County,Delaware,4.995701,6624.0,2019,8,0,5.124666,5.09827,5.031902,4.995701,4.995701,4.995701,4.995701,1.009757,1.063181
10001_2019-09-01,10001,Kent County,Delaware,5.031902,6672.0,2019,9,1,5.086203,5.124666,5.09827,5.031902,4.995701,4.995701,4.995701,1.002207,1.130119
10001_2019-10-01,10001,Kent County,Delaware,5.09827,6760.0,2019,10,2,5.017472,5.086203,5.124666,5.09827,5.031902,4.995701,4.995701,0.987322,1.054611
10001_2019-11-01,10001,Kent County,Delaware,5.124666,6795.0,2019,11,3,5.056963,5.017472,5.086203,5.124666,5.09827,5.031902,4.995701,0.983764,0.978167
10001_2019-12-01,10001,Kent County,Delaware,5.086203,6744.0,2019,12,4,5.083041,5.056963,5.017472,5.086203,5.124666,5.09827,5.031902,0.997816,0.970705


In [66]:
df_output = df_all['season']
df_output.to_csv('../output/feature_season.csv')