In [99]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import tensorflow as tf
from sklearn.preprocessing import MinMaxScaler
import joblib
import gc
import random

In [100]:
SEED=42
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

In [101]:
VALIDATE = False

In [102]:
best_model_params = {}

In [103]:
df_train = pd.read_csv("/kaggle/input/godaddy-microbusiness-density-forecasting/train.csv")

In [104]:
df_train_revealed = pd.read_csv("/kaggle/input/godaddy-microbusiness-density-forecasting/revealed_test.csv")

In [105]:
df_train = pd.concat([df_train, df_train_revealed], axis="rows").sort_values(["cfips", "row_id"]).reset_index(drop=True)

In [106]:
df_train["is_test"] = 0

In [107]:
df_test = pd.read_csv("/kaggle/input/godaddy-microbusiness-density-forecasting/test.csv")

In [108]:
df_test["is_test"] = 1

In [109]:
df_test["is_test"] = df_test["is_test"].astype(int)

In [110]:
all_data = pd.concat([df_train, df_test], axis="rows").sort_values(["cfips", "row_id"]).reset_index(drop=True)

In [111]:
all_data["dcount"] = all_data.groupby("cfips")["row_id"].cumcount()

In [112]:
all_data["state"] = all_data.groupby(["cfips"])["state"].ffill()

In [113]:
all_data["county"] = all_data.groupby("cfips")["county"].ffill()

In [114]:
# Month-over-month relative change of microbusiness_density per county.
# The first row of each county has no predecessor; the shifted column is
# back-filled so that row compares against itself (ratio 1 -> diff 0).
lag = 1
lag_col = f"mbd_lag_{lag}"
density = all_data["microbusiness_density"]
all_data[lag_col] = all_data.groupby("cfips")["microbusiness_density"].shift(lag).bfill()

ratio = density / all_data[lag_col]
all_data["diff"] = ratio.fillna(1).clip(0, None) - 1

# A zero previous value makes the ratio undefined: use 0 when the density
# stayed at zero and 1 when it became positive.
prev_is_zero = all_data[lag_col] == 0
all_data.loc[prev_is_zero, 'diff'] = 0
all_data.loc[(density > 0) & prev_is_zero, 'diff'] = 1
all_data['diff'] = all_data['diff'].abs()

In [115]:
all_data.head(10)

Unnamed: 0,row_id,cfips,county,state,first_day_of_month,microbusiness_density,active,is_test,dcount,mbd_lag_1,diff
0,1001_2019-08-01,1001,Autauga County,Alabama,2019-08-01,3.007682,1249.0,0,0,3.007682,0.0
1,1001_2019-09-01,1001,Autauga County,Alabama,2019-09-01,2.88487,1198.0,0,1,3.007682,0.040833
2,1001_2019-10-01,1001,Autauga County,Alabama,2019-10-01,3.055843,1269.0,0,2,2.88487,0.059265
3,1001_2019-11-01,1001,Autauga County,Alabama,2019-11-01,2.993233,1243.0,0,3,3.055843,0.020489
4,1001_2019-12-01,1001,Autauga County,Alabama,2019-12-01,2.993233,1243.0,0,4,2.993233,0.0
5,1001_2020-01-01,1001,Autauga County,Alabama,2020-01-01,2.96909,1242.0,0,5,2.993233,0.008066
6,1001_2020-02-01,1001,Autauga County,Alabama,2020-02-01,2.909326,1217.0,0,6,2.96909,0.020129
7,1001_2020-03-01,1001,Autauga County,Alabama,2020-03-01,2.933231,1227.0,0,7,2.909326,0.008217
8,1001_2020-04-01,1001,Autauga County,Alabama,2020-04-01,3.000167,1255.0,0,8,2.933231,0.02282
9,1001_2020-05-01,1001,Autauga County,Alabama,2020-05-01,3.004948,1257.0,0,9,3.000167,0.001594


In [116]:
outliers = []
cnt = 0

In [117]:
# Smooth abrupt level shifts in each county's microbusiness_density history.
#
# Walking backwards from position 37 to position 3, a month-over-month jump of
# at least 20% of the mean of the preceding values is treated as a level
# shift: every earlier value is rescaled by the ratio var[i] / var[i-1] so the
# history lines up with the newer level.
#
# The accumulators are (re)initialised here so the cell can be re-run: after
# the first run `outliers` is a NumPy array and no longer has `.append`.
outliers = []
cnt = 0

for o in all_data["cfips"].unique():
    indices = all_data["cfips"] == o
    var = all_data.loc[indices, "microbusiness_density"].to_numpy(dtype=float, copy=True)
    for i in range(37, 2, -1):
        prev = var[i - 1]
        thr = 0.20 * np.mean(var[:i])
        difa = abs(var[i] - prev)
        # prev == 0 makes the rescaling ratio undefined (0/0 or x/0); applying
        # it would turn the whole earlier history into NaN/inf, so such
        # positions are skipped instead of being rescaled.
        if difa >= thr and prev != 0:
            var[:i] *= var[i] / prev
            outliers.append(o)
            cnt += 1
    # The first observation is replaced by 99% of the second one.
    var[0] = var[1] * 0.99
    all_data.loc[indices, 'microbusiness_density'] = var

outliers = np.unique(outliers)
len(outliers), cnt

  if __name__ == "__main__":
  if __name__ == "__main__":
  if __name__ == "__main__":
  import sys


(481, 732)

In [118]:
print(outliers)

[ 1013  1035  1037  1045  1057  1059  1085  1111  1125  1131  1133  2060
  2070  2100  2164  2188  2282  5011  5029  5031  5049  5061  5065  5077
  5079  5081  5091  5093  5113  5141  6005  6015  8011  8014  8031  8047
  8055  8057  8069  8079  8081  8105  8121 10003 10005 12001 12013 12029
 12037 12045 12051 12065 12077 12107 12131 13007 13019 13025 13033 13037
 13061 13087 13101 13119 13131 13149 13169 13193 13239 13243 13251 13287
 13291 13301 13307 13317 15005 16009 16021 16025 16033 16037 16045 16049
 16051 16077 16079 17013 17037 17063 17065 17075 17105 17109 17127 17147
 17175 17191 18017 18025 18031 18039 18041 18049 18073 18083 18087 18103
 18143 18145 18153 18171 19001 19003 19005 19033 19037 19051 19063 19065
 19067 19071 19125 19129 19133 19143 19165 19175 19177 19185 19189 19195
 20011 20019 20043 20065 20071 20085 20089 20093 20095 20107 20109 20135
 20145 20149 20165 20169 20175 20183 20191 20197 20205 21001 21007 21027
 21039 21045 21051 21053 21065 21069 21095 21105 21

In [69]:
# Same statements as the earlier lag/diff cell: recomputes `mbd_lag_1` and
# `diff` from the current microbusiness_density values.
# NOTE(review): this cell's execution count is lower than its neighbours', so
# it looks like it was not re-run in this session - confirm whether it is
# still needed. `mbd_lag_1` is overwritten later by build_features(), and
# `diff` is not read anywhere else in the visible notebook.
lag=1
all_data[f"mbd_lag_{lag}"] = all_data.groupby("cfips")["microbusiness_density"].shift(lag).bfill()
all_data["diff"] = (all_data["microbusiness_density"] / all_data[f"mbd_lag_{lag}"]).fillna(1).clip(0, None) - 1
# Zero previous value: ratio is undefined, so set diff explicitly.
all_data.loc[(all_data[f"mbd_lag_{lag}"] == 0), 'diff'] = 0
all_data.loc[(all_data["microbusiness_density"] > 0) & (all_data[f"mbd_lag_{lag}"] == 0), 'diff'] = 1
all_data['diff'] = all_data['diff'].abs()

In [119]:
# Training target: relative change from this row's density to the next row's
# density within the same county (NaN on each county's last row).
next_density = all_data.groupby('cfips')['microbusiness_density'].shift(-1)
all_data['target'] = next_density / all_data['microbusiness_density'] - 1

In [120]:
all_data.head(10)

Unnamed: 0,row_id,cfips,county,state,first_day_of_month,microbusiness_density,active,is_test,dcount,mbd_lag_1,diff,target
0,1001_2019-08-01,1001,Autauga County,Alabama,2019-08-01,2.856021,1249.0,0,0,3.007682,0.0,0.010101
1,1001_2019-09-01,1001,Autauga County,Alabama,2019-09-01,2.88487,1198.0,0,1,3.007682,0.040833,0.059265
2,1001_2019-10-01,1001,Autauga County,Alabama,2019-10-01,3.055843,1269.0,0,2,2.88487,0.059265,-0.020489
3,1001_2019-11-01,1001,Autauga County,Alabama,2019-11-01,2.993233,1243.0,0,3,3.055843,0.020489,0.0
4,1001_2019-12-01,1001,Autauga County,Alabama,2019-12-01,2.993233,1243.0,0,4,2.993233,0.0,-0.008066
5,1001_2020-01-01,1001,Autauga County,Alabama,2020-01-01,2.96909,1242.0,0,5,2.993233,0.008066,-0.020129
6,1001_2020-02-01,1001,Autauga County,Alabama,2020-02-01,2.909326,1217.0,0,6,2.96909,0.020129,0.008217
7,1001_2020-03-01,1001,Autauga County,Alabama,2020-03-01,2.933231,1227.0,0,7,2.909326,0.008217,0.02282
8,1001_2020-04-01,1001,Autauga County,Alabama,2020-04-01,3.000167,1255.0,0,8,2.933231,0.02282,0.001594
9,1001_2020-05-01,1001,Autauga County,Alabama,2020-05-01,3.004948,1257.0,0,9,3.000167,0.001594,0.004773


In [121]:
all_data['lastactive'] = all_data.groupby('cfips')['active'].transform('last')

In [122]:
md_mapping = all_data.loc[all_data.dcount==28].groupby('cfips')['microbusiness_density'].agg('last')

In [123]:
all_data['lasttarget'] = all_data['cfips'].map(md_mapping)

In [124]:
all_data.head()

Unnamed: 0,row_id,cfips,county,state,first_day_of_month,microbusiness_density,active,is_test,dcount,mbd_lag_1,diff,target,lastactive,lasttarget
0,1001_2019-08-01,1001,Autauga County,Alabama,2019-08-01,2.856021,1249.0,0,0,3.007682,0.0,0.010101,1475.0,3.286307
1,1001_2019-09-01,1001,Autauga County,Alabama,2019-09-01,2.88487,1198.0,0,1,3.007682,0.040833,0.059265,1475.0,3.286307
2,1001_2019-10-01,1001,Autauga County,Alabama,2019-10-01,3.055843,1269.0,0,2,2.88487,0.059265,-0.020489,1475.0,3.286307
3,1001_2019-11-01,1001,Autauga County,Alabama,2019-11-01,2.993233,1243.0,0,3,3.055843,0.020489,0.0,1475.0,3.286307
4,1001_2019-12-01,1001,Autauga County,Alabama,2019-12-01,2.993233,1243.0,0,4,2.993233,0.0,-0.008066,1475.0,3.286307


In [125]:
def build_features(raw, target='microbusiness_density', target_act='active_tmp', lags = 6):
    """Add per-county lag and rolling features to ``raw`` and list their names.

    For every step in ``1 .. lags - 1`` two columns are written:
    ``mbd_lag_<step>`` (``target`` shifted by ``step`` rows within each
    ``cfips`` group) and ``act_lag_<step>`` (``target_act`` differenced over
    ``step`` rows within each group). Afterwards ``mbd_rollmea<w>_1`` is
    written for windows 2, 4, 6, 8 and 10; despite the name it is a rolling
    *sum* of ``mbd_lag_1`` (``min_periods=1``) within each group.

    Parameters
    ----------
    raw : pandas.DataFrame
        Frame with a ``cfips`` column plus the ``target`` and ``target_act``
        columns. It is modified in place.
    target : str
        Column to lag.
    target_act : str
        Column to difference.
    lags : int
        Exclusive upper bound of the lag steps.

    Returns
    -------
    tuple
        ``(raw, feats)``: the same frame object and the list of feature
        column names in creation order.
    """
    feats = []

    for step in range(1, lags):
        shifted_name = f'mbd_lag_{step}'
        delta_name = f'act_lag_{step}'
        raw[shifted_name] = raw.groupby('cfips')[target].shift(step)
        raw[delta_name] = raw.groupby('cfips')[target_act].diff(step)
        feats.extend([shifted_name, delta_name])

    # Rolling sums are always taken over the one-step lag column.
    lag = 1
    source_name = f'mbd_lag_{lag}'
    for window in (2, 4, 6, 8, 10):
        rolled_name = f'mbd_rollmea{window}_{lag}'

        def _rolling_sum(series, width=window):
            return series.rolling(width, min_periods=1).sum()

        raw[rolled_name] = raw.groupby('cfips')[source_name].transform(_rolling_sum)
        feats.append(rolled_name)

    return raw, feats

In [126]:
all_data, feats = build_features(all_data, 'target', 'active', lags = 5)

In [132]:
all_data.head(10)

Unnamed: 0,row_id,cfips,county,state,first_day_of_month,microbusiness_density,active,is_test,dcount,mbd_lag_1,...,act_lag_2,mbd_lag_3,act_lag_3,mbd_lag_4,act_lag_4,mbd_rollmea2_1,mbd_rollmea4_1,mbd_rollmea6_1,mbd_rollmea8_1,mbd_rollmea10_1
0,1001_2019-08-01,1001,Autauga County,Alabama,2019-08-01,2.856021,1249.0,0,0,,...,,,,,,,,,,
1,1001_2019-09-01,1001,Autauga County,Alabama,2019-09-01,2.88487,1198.0,0,1,0.010101,...,,,,,,0.010101,0.010101,0.010101,0.010101,0.010101
2,1001_2019-10-01,1001,Autauga County,Alabama,2019-10-01,3.055843,1269.0,0,2,0.059265,...,20.0,,,,,0.069366,0.069366,0.069366,0.069366,0.069366
3,1001_2019-11-01,1001,Autauga County,Alabama,2019-11-01,2.993233,1243.0,0,3,-0.020489,...,45.0,0.010101,-6.0,,,0.038777,0.048878,0.048878,0.048878,0.048878
4,1001_2019-12-01,1001,Autauga County,Alabama,2019-12-01,2.993233,1243.0,0,4,0.0,...,-26.0,0.059265,45.0,0.010101,-6.0,-0.020489,0.048878,0.048878,0.048878,0.048878
5,1001_2020-01-01,1001,Autauga County,Alabama,2020-01-01,2.96909,1242.0,0,5,-0.008066,...,-1.0,-0.020489,-27.0,0.059265,44.0,-0.008066,0.030711,0.040812,0.040812,0.040812
6,1001_2020-02-01,1001,Autauga County,Alabama,2020-02-01,2.909326,1217.0,0,6,-0.020129,...,-26.0,0.0,-26.0,-0.020489,-52.0,-0.028195,-0.048683,0.020683,0.020683,0.020683
7,1001_2020-03-01,1001,Autauga County,Alabama,2020-03-01,2.933231,1227.0,0,7,0.008217,...,-15.0,-0.008066,-16.0,0.0,-16.0,-0.011912,-0.019978,0.018799,0.0289,0.0289
8,1001_2020-04-01,1001,Autauga County,Alabama,2020-04-01,3.000167,1255.0,0,8,0.02282,...,38.0,-0.020129,13.0,-0.008066,12.0,0.031037,0.002842,-0.017647,0.05172,0.05172
9,1001_2020-05-01,1001,Autauga County,Alabama,2020-05-01,3.004948,1257.0,0,9,0.001594,...,30.0,0.008217,40.0,-0.020129,15.0,0.024413,0.012502,0.004436,0.043212,0.053313


In [133]:
print(feats)

['mbd_lag_1', 'act_lag_1', 'mbd_lag_2', 'act_lag_2', 'mbd_lag_3', 'act_lag_3', 'mbd_lag_4', 'act_lag_4', 'mbd_rollmea2_1', 'mbd_rollmea4_1', 'mbd_rollmea6_1', 'mbd_rollmea8_1', 'mbd_rollmea10_1']


In [128]:
ACT_THR = 1.8
ABS_THR = 1.00

In [135]:
all_data_pre_processed = pd.DataFrame(columns=all_data.columns)

In [137]:
all_data_pre_processed.head()

Unnamed: 0,row_id,cfips,county,state,first_day_of_month,microbusiness_density,active,is_test,dcount,mbd_lag_1,...,act_lag_2,mbd_lag_3,act_lag_3,mbd_lag_4,act_lag_4,mbd_rollmea2_1,mbd_rollmea4_1,mbd_rollmea6_1,mbd_rollmea8_1,mbd_rollmea10_1


In [131]:
all_data.head()

Unnamed: 0,row_id,cfips,county,state,first_day_of_month,microbusiness_density,active,is_test,dcount,mbd_lag_1,...,act_lag_2,mbd_lag_3,act_lag_3,mbd_lag_4,act_lag_4,mbd_rollmea2_1,mbd_rollmea4_1,mbd_rollmea6_1,mbd_rollmea8_1,mbd_rollmea10_1
0,1001_2019-08-01,1001,Autauga County,Alabama,2019-08-01,2.856021,1249.0,0,0,,...,,,,,,,,,,
1,1001_2019-09-01,1001,Autauga County,Alabama,2019-09-01,2.88487,1198.0,0,1,0.010101,...,,,,,,0.010101,0.010101,0.010101,0.010101,0.010101
2,1001_2019-10-01,1001,Autauga County,Alabama,2019-10-01,3.055843,1269.0,0,2,0.059265,...,20.0,,,,,0.069366,0.069366,0.069366,0.069366,0.069366
3,1001_2019-11-01,1001,Autauga County,Alabama,2019-11-01,2.993233,1243.0,0,3,-0.020489,...,45.0,0.010101,-6.0,,,0.038777,0.048878,0.048878,0.048878,0.048878
4,1001_2019-12-01,1001,Autauga County,Alabama,2019-12-01,2.993233,1243.0,0,4,0.0,...,-26.0,0.059265,45.0,0.010101,-6.0,-0.020489,0.048878,0.048878,0.048878,0.048878


In [138]:
# Build one min-max-scaled copy of the feature matrix per forecast step TS.
#
# For every TS the fold contains the filtered training rows (dcount in
# [1, TS), non-test, lastactive > ACT_THR, lasttarget > ABS_THR) plus all
# non-test rows at dcount == TS. The scaler is fitted on that whole fold, the
# scaled features are back-filled in fold row order (no per-county grouping),
# and the fold is tagged with nn_count = TS.
#
# Folds are collected in a list and concatenated once at the end instead of
# growing the frame inside the loop, which re-copies every earlier fold on
# each iteration.
non_feature_columns = [c for c in all_data.columns if c not in feats]
fold_frames = []

for TS in range(29, 40):

    print(f"TS is {TS}")

    train_indices = (all_data.is_test==0) & (all_data.dcount  < TS) & (all_data.dcount >= 1) & (all_data.lastactive>ACT_THR)  & (all_data.lasttarget>ABS_THR)

    valid_indices = (all_data.is_test==0) & (all_data.dcount == TS)

    fold_rows = train_indices | valid_indices

    scaler = MinMaxScaler()
    all_data_train_valid_scaled = scaler.fit_transform(all_data.loc[fold_rows, feats])

    # A fresh frame has a 0..n-1 index, so it lines up with `remaining` below.
    res = pd.DataFrame(data=all_data_train_valid_scaled, columns=feats).bfill()

    remaining = all_data.loc[fold_rows, non_feature_columns].reset_index(drop=True)

    cnc = pd.concat([res, remaining], axis="columns")
    cnc["nn_count"] = TS

    fold_frames.append(cnc)

all_data_pre_processed = pd.concat([all_data_pre_processed, *fold_frames], axis="rows")

del fold_frames, cnc
gc.collect();

TS is 29
TS is 30
TS is 31
TS is 32
TS is 33
TS is 34
TS is 35
TS is 36
TS is 37
TS is 38
TS is 39


In [139]:
all_data_pre_processed["nn_count"] = all_data_pre_processed["nn_count"].astype(int)

In [156]:
all_data_pre_processed.head()

Unnamed: 0,row_id,cfips,county,state,first_day_of_month,microbusiness_density,active,is_test,dcount,mbd_lag_1,...,mbd_lag_3,act_lag_3,mbd_lag_4,act_lag_4,mbd_rollmea2_1,mbd_rollmea4_1,mbd_rollmea6_1,mbd_rollmea8_1,mbd_rollmea10_1,nn_count
0,1001_2019-09-01,1001,Autauga County,Alabama,2019-09-01,2.88487,1198.0,0,1,0.014698,...,0.014698,0.510095,0.014698,0.484372,0.021189,0.029059,0.035801,0.041794,0.04887,29
1,1001_2019-10-01,1001,Autauga County,Alabama,2019-10-01,3.055843,1269.0,0,2,0.015721,...,0.014698,0.510095,0.014698,0.484372,0.022414,0.030274,0.037008,0.042994,0.05006,29
2,1001_2019-11-01,1001,Autauga County,Alabama,2019-11-01,2.993233,1243.0,0,3,0.014061,...,0.014698,0.510095,0.014698,0.484372,0.021782,0.029854,0.03659,0.042579,0.049649,29
3,1001_2019-12-01,1001,Autauga County,Alabama,2019-12-01,2.993233,1243.0,0,4,0.014488,...,0.015721,0.510459,0.014698,0.484372,0.020557,0.029854,0.03659,0.042579,0.049649,29
4,1001_2020-01-01,1001,Autauga County,Alabama,2020-01-01,2.96909,1242.0,0,5,0.01432,...,0.014061,0.509945,0.015721,0.484721,0.020814,0.029482,0.036426,0.042416,0.049487,29


In [157]:
TS=39
train_indices = (all_data_pre_processed.is_test==0) & (all_data_pre_processed.dcount  < TS) & (all_data_pre_processed.nn_count == TS) & (all_data_pre_processed.dcount >= 1) & (all_data_pre_processed.lastactive>ACT_THR)  & (all_data_pre_processed.lasttarget>ABS_THR)
valid_indices = (all_data_pre_processed.is_test==0) & (all_data_pre_processed.dcount == TS) & (all_data_pre_processed.nn_count == TS) & (all_data_pre_processed.dcount >= 1) & (all_data_pre_processed.lastactive>ACT_THR)  & (all_data_pre_processed.lasttarget>ABS_THR)

In [158]:
all_data_pre_processed["nn_count"] = all_data_pre_processed["nn_count"].astype(int)

In [159]:
def smape(y_true, y_pred):
    """Symmetric mean absolute percentage error, in percent.

    Each element contributes ``|y_true - y_pred| / ((|y_true| + |y_pred|) / 2)``;
    elements where both values are zero contribute 0. The result is the mean
    of those terms multiplied by 100.

    Inputs are converted to flat float NumPy arrays and compared by position,
    so lists are accepted and pandas objects are not aligned on their index.

    Parameters
    ----------
    y_true, y_pred : array-like
        Actual and predicted values with the same number of elements.

    Returns
    -------
    float
        SMAPE in the range [0, 200]; NaN for empty input.

    Raises
    ------
    ValueError
        If the two inputs do not have the same number of elements.
    """
    actual = np.asarray(y_true, dtype=float).ravel()
    forecast = np.asarray(y_pred, dtype=float).ravel()

    if actual.shape != forecast.shape:
        raise ValueError(
            f"y_true and y_pred must have the same length, got {actual.size} and {forecast.size}"
        )
    if actual.size == 0:
        # The mean of nothing is undefined; return NaN without a NumPy warning.
        return float("nan")

    smap = np.zeros(actual.size)

    num = np.abs(actual - forecast)
    dem = (np.abs(actual) + np.abs(forecast)) / 2

    # Skip positions where both values are zero to avoid 0/0.
    pos_ind = (actual != 0) | (forecast != 0)
    smap[pos_ind] = num[pos_ind] / dem[pos_ind]

    return 100 * np.mean(smap)

In [160]:
def build_model(pred_length, initializer, n_features=13):
    """Build and compile a SimpleRNN(32) -> Dense(pred_length) Keras model.

    Parameters
    ----------
    pred_length : int
        Number of units of the final Dense layer.
    initializer : callable
        Initializer class (or factory) that can be called without arguments.
        If a default-constructed instance has a ``seed`` attribute, the
        instances used for the layers are created with ``seed=42``.
    n_features : int, optional
        Size of the last input dimension. Defaults to 13, the previously
        hard-coded value, so existing calls behave as before.

    Returns
    -------
    tf.keras.Sequential
        Model compiled with Huber loss, SGD (learning rate 0.01, momentum 0.9)
        and a root-mean-squared-error metric.
    """
    # Probe a default instance to find out whether this initializer is seedable.
    if 'seed' in initializer().__dict__.keys():
        a_initializer = initializer(seed=42)
        another_initializer = initializer(seed=42)
    else:
        a_initializer = initializer()
        another_initializer = initializer()

    model=tf.keras.Sequential([
        # input_shape=[None, n_features]: the sequence length is left unspecified.
        tf.keras.layers.SimpleRNN(32,return_sequences=False, input_shape=[None, n_features], kernel_initializer = a_initializer),
        tf.keras.layers.Dense(pred_length, kernel_initializer = another_initializer)
    ])

    model.compile(loss=tf.keras.losses.Huber(), optimizer=tf.keras.optimizers.SGD(learning_rate=0.01, momentum=0.9), metrics=[tf.keras.metrics.RootMeanSquaredError()])
    return model

In [161]:
def build_lstm_model(initializer, pred_length, n_features=13):
    """Build and compile an LSTM(16) -> Dense(pred_length) Keras model.

    Note that the positional order of the first two parameters is the reverse
    of ``build_model``; callers in this notebook pass them by keyword.

    Parameters
    ----------
    initializer : callable
        Initializer class (or factory) that can be called without arguments.
        If a default-constructed instance has a ``seed`` attribute, the
        instances used for the layers are created with ``seed=42``.
    pred_length : int
        Number of units of the final Dense layer.
    n_features : int, optional
        Size of the last input dimension. Defaults to 13, the previously
        hard-coded value, so existing calls behave as before.

    Returns
    -------
    tf.keras.Sequential
        Model compiled with Huber loss, SGD (learning rate 0.01, momentum 0.9)
        and a root-mean-squared-error metric.
    """
    # Probe a default instance to find out whether this initializer is seedable.
    if 'seed' in initializer().__dict__.keys():
        a_initializer = initializer(seed=42)
        another_initializer = initializer(seed=42)
    else:
        a_initializer = initializer()
        another_initializer = initializer()

    model=tf.keras.Sequential([
        # input_shape=[None, n_features]: the sequence length is left unspecified.
        tf.keras.layers.LSTM(16,return_sequences=False, input_shape=[None, n_features], kernel_initializer=a_initializer),
        tf.keras.layers.Dense(pred_length, kernel_initializer=another_initializer)
    ])

    model.compile(loss=tf.keras.losses.Huber(), optimizer=tf.keras.optimizers.SGD(learning_rate=0.01, momentum=0.9), metrics=[tf.keras.metrics.RootMeanSquaredError()])
    return model

In [162]:
all_data_pre_processed = all_data_pre_processed.sort_values(['cfips','row_id']).reset_index(drop=True)

In [163]:
predictions = {}

In [164]:
# States whose counties always receive the last observed value instead of the
# model prediction.
blacklist = [
    'North Dakota', 'Iowa', 'Kansas', 'Nebraska', 'South Dakota','New Mexico', 'Alaska', 'Vermont'
]
# Individual county FIPS codes that always receive the last observed value.
# Two codes had been split across line breaks (26|101 and 4703|5), which is a
# syntax error inside a list literal; they are rejoined as 26101 and 47035.
blacklistcfips = [
1019,1027,1029,1035,1039,1045,1049,1057,1067,1071,1077,1085,1091,1099,1101,1123,1131,1133,4001,4012,4013,4021,4023,5001,5003,5005,5017,5019,5027,5031,5035,5047,5063,5065,5071,5081,5083,5087,5091,5093,5107,5109,5115,5121,5137,5139,5141,5147,6003,6015,6027,6033,6053,6055,6057,6071,6093,6097,6103,6105,6115,8003,8007,8009,8019,8021,8023,8047,8051,8053,8055,8057,8059,8061,8065,8067,8069,8071,8073,8075,8085,8091,8093,8097,8099,8103,8105,8107,8109,8111,8115,8117,8121,9007,9009,9015,12009,12017,12019,12029,12047,12055,12065,12075,12093,12107,12127,13005,13007,13015,13017,13019,13027,13035,13047,13065,13081,13083,13099,13107,13109,13117,13119,13121,13123,13125,13127,13135,13143,13147,13161,13165,13171,13175,13181,13193,13201,13221,13225,13229,13231,13233,13245,13247,13249,13257,13279,13281,13287,13289,13293,13301,13319,15001,15005,15007,16001,16003,16005,16007,16013,16015,16017,16023,16025,16029,16031,16033,16035,16037,16043,16045,16049,16061,16063,16067,17001,17003,17007,17009,17013,17015,17023,17025,17031,17035,17045,17051,17059,17061,17063,17065,17067,17069,17075,17077,17081,17085,17087,17103,17105,17107,17109,17115,17117,17123,17127,17133,17137,17141,17143,17147,17153,17167,17169,17171,17177,17179,17181,17185,17187,17193,18001,18007,18009,18013,18015,18019,18021,18025,18035,18037,18039,18041,18053,18061,18075,18079,18083,18087,18099,18103,18111,18113,18115,18137,18139,18145,18153,18171,18179,21001,21003,21013,21017,21023,21029,21035,21037,21039,21045,21047,21055,21059,21065,21075,21077,21085,21091,21093,21097,21099,21101,21103,21115,21125,21137,21139,21141,21149,21155,21157,21161,21165,21179,21183,21191,21197,21199,21215,21217,21223,21227,21237,21239,22019,22021,22031,22039,22041,22047,22069,22085,22089,22101,22103,22109,22111,22115,22119,22121,23003,23009,23021,23027,23029,24011,24027,24029,24031,24035,24037,24039,24041,25011,25015,26003,26007,26011,26019,26021,26025,26027,26033,26037,26041,26043,26051,26053,26057,26059,26061,26065,26071,26077,26079,26083,26089,26097,
26101,26103,26109,26111,26115,26117,26119,26127,26129,26131,26135,26141,26143,26155,26161,26165,27005,27011,27013,27015,27017,27021,27023,27025,27029,27047,27051,27055,27057,27065,27069,27073,27075,27077,27079,27087,27091,27095,27101,27103,27105,27107,27109,27113,27117,27119,27123,27125,27129,27131,27133,27135,27141,27147,27149,27155,27159,27167,27169,28017,28019,28023,28025,28035,28045,28049,28061,28063,28093,28097,28099,28125,28137,28139,28147,28159,29001,29015,29019,29031,29033,29041,29049,29051,29055,29057,29063,29065,29069,29075,29085,29089,29101,29103,29111,29121,29123,29125,29135,29137,29139,29143,29157,29159,29161,29167,29171,29173,29175,29177,29183,29195,29197,29199,29203,29205,29207,29209,29213,29215,29217,29223,29227,29229,30005,30009,30025,30027,30033,30035,30037,30039,30045,30049,30051,30053,30055,30057,30059,30069,30071,30073,30077,30079,30083,30085,30089,30091,30093,30101,30103,30105,30107,30109,32005,32009,32017,32023,32027,32029,32510,33005,33007,34021,34027,34033,34035,36011,36017,36023,36033,36043,36047,36049,36051,36057,36061,36067,36083,36091,36097,36103,36107,36113,36115,36121,36123,37005,37009,37011,37017,37023,37029,37031,37049,37061,37075,37095,37117,37123,37131,37137,37151,37187,37189,37197,39005,39009,39015,39017,39019,39023,39037,39039,39043,39049,39053,39057,39063,39067,39071,39077,39085,39087,39091,39097,39105,39107,39113,39117,39119,39125,39127,39129,39135,39137,39151,39153,39157,40003,40013,40015,40023,40025,40027,40035,40039,40043,40045,40053,40055,40057,40059,40065,40067,40073,40077,40079,40099,40105,40107,40111,40115,40123,40127,40129,40133,40141,40147,40151,40153,41001,41007,41013,41015,41017,41021,41025,41031,41033,41037,41051,41055,41063,41067,41069,42005,42007,42011,42013,42015,42019,42027,42029,42031,42035,42053,42057,42067,42071,42083,42085,42093,42097,42105,42111,42113,42115,42123,42125,42127,42129,44005,44007,44009,45001,45009,45021,45025,45031,45059,45067,45071,45073,45089,47001,47005,47013,47015,47019,47021,47023,47027,
47035,47039,47041,47047,47055,47057,47059,47061,47069,47073,47075,47077,47083,47087,47099,47105,47121,47127,47131,47133,47135,47137,47147,47151,47153,47159,47161,47163,47169,47177,47183,47185,48001,48011,48017,48019,48045,48057,48059,48063,48065,48073,48077,48079,48081,48083,48087,48095,48101,48103,48107,48109,48115,48117,48119,48123,48125,48129,48149,48151,48153,48155,48159,48161,48165,48175,48189,48191,48195,48197,48211,48221,48229,48233,48235,48237,48239,48241,48243,48245,48255,48261,48263,48265,48267,48269,48275,48277,48283,48293,48299,48305,48311,48313,48319,48321,48323,48327,48333,48345,48347,48355,48369,48377,48379,48383,48387,48389,48401,48403,48413,48417,48431,48433,48437,48443,48447,48453,48455,48457,48461,48463,48465,48469,48471,48481,48483,48485,48487,48495,48499,49001,49009,49013,49019,49027,49031,49045,51005,51017,51025,51029,51031,51036,51037,51043,51057,51059,51065,51071,51073,51077,51079,51083,51091,51095,51097,51101,51111,51115,51119,51121,51127,51135,51147,51155,51159,51165,51167,51171,51173,51181,51183,51191,51197,51530,51590,51610,51620,51670,51678,51720,51735,51750,51770,51810,51820,53013,53019,53023,53031,53033,53037,53039,53041,53047,53065,53069,53071,53075,54013,54019,54025,54031,54033,54041,54049,54055,54057,54063,54067,54071,54077,54079,54085,54089,54103,55001,55003,55005,55007,55011,55017,55021,55025,55029,55037,55043,55047,55049,55051,55061,55065,55067,55075,55077,55091,55097,55101,55103,55109,55117,55123,55125,55127,56007,56009,56011,56015,56017,56019,56021,56027,56031,56037,56043,56045,
12061,  6095, 49025, 18073, 29029, 29097, 48419, 51830, 30067, 26095, 18159, 32001, 54065, 54027, 13043, 48177, 55069, 48137, 30087, 29007, 13055, 48295, 28157, 29037, 45061, 22053, 13199, 47171, 53001, 55041, 51195, 18127, 29151, 48307, 51009, 16047, 29133,  5145, 17175, 21027, 48357, 29179, 13023, 16077, 48371, 21057, 16039, 21143, 48435, 48317, 48475,  5129, 36041, 48075, 29017, 47175, 39167, 47109, 17189, 17173, 28009, 39027, 48133, 18129, 48217, 40081, 36021,  6005, 42099, 18051, 36055, 53051, 6109, 21073, 27019,  6051, 48055,  8083, 48503, 17021, 10003, 41061, 22001, 22011, 21205, 48223, 51103, 51047, 16069, 17033, 41011,  6035, 47145, 27083, 18165, 36055, 12001, 26159,  8125, 34017,
28141, 55119, 48405, 40029, 18125, 21135, 29073, 55115, 37149,55039, 26029, 12099, 13251, 48421, 39007, 41043, 22015, 37115,54099, 51137, 22049, 55131, 17159, 56001, 40005, 18017, 28091,47101, 27037, 29005, 13239, 21019, 55085, 48253, 51139, 40101,13283, 18049, 39163, 45049, 51113,
]

In [165]:
models = {"lstm_model": build_lstm_model, "simple_rnn": build_model}

if not VALIDATE:
    best_model_params = {"lstm_model": (tf.keras.initializers.Constant, -1,False), "simple_rnn": (tf.keras.initializers.RandomNormal, -1, True)}

In [166]:
all_tf_initializers = []

In [167]:
import tensorflow.keras.initializers as initializers
import inspect

In [168]:
for name, initializer in inspect.getmembers(initializers):
    if inspect.isclass(initializer):
        all_tf_initializers.append((name, initializer))

In [169]:
best_score = (None, 1e6)

In [170]:
# BATCH_SIZE is used twice below: as the number of sequences the training rows
# are reshaped into, and as the batch_size passed to model.fit.
BATCH_SIZE=4

# Initializer search. Only runs when VALIDATE is set: for every model builder
# and every initializer class collected in all_tf_initializers, fit and score
# one model per step TS in 29..38 and keep the initializer with the lowest
# mean SMAPE.
if VALIDATE:
    
    best_model_params = {}

    for m in models.keys():

        # (initializer class, mean SMAPE) of the best candidate so far.
        best_score = (None, 1e6)


        for name, initializer in all_tf_initializers:

            # SMAPE per TS for this (model, initializer) pair.
            smps = []

            for TS in range(29, 39):


                # A fresh model is built for every TS. If building raises
                # NotImplementedError, this initializer is abandoned for the
                # current model (the remaining TS values are skipped).
                try:
                    model = models[m](pred_length=1, initializer=initializer)
                except NotImplementedError as nime:
                    print(f" initializer {name} not implemented for {m}")
                    break

                # Work on a copy so the helper column 'k' does not end up in
                # all_data_pre_processed.
                all_data_pre_processed_cp = all_data_pre_processed.copy()

                # Training rows of the fold that was scaled for this TS.
                train_indices = (all_data_pre_processed_cp.is_test==0) & (all_data_pre_processed_cp.dcount  < TS) & (all_data_pre_processed_cp.nn_count == TS) & (all_data_pre_processed_cp.dcount >= 1) & (all_data_pre_processed_cp.lastactive>ACT_THR)  & (all_data_pre_processed_cp.lasttarget>ABS_THR)

                x_train = all_data_pre_processed_cp.loc[train_indices, feats]

                # All training rows are cut into BATCH_SIZE equally long
                # sequences in row order. NOTE(review): reshape raises if the
                # row count is not divisible by BATCH_SIZE, and sequences are
                # not aligned to county boundaries - confirm this is intended.
                x_train = x_train.values.reshape(BATCH_SIZE, -1, x_train.shape[1])

                x_target = all_data_pre_processed_cp.loc[train_indices, 'target']

                x_target = x_target.values.reshape(BATCH_SIZE, -1, 1)

                model.fit(x_train, x_target, epochs=1, batch_size=BATCH_SIZE, verbose=0)

                # Rows at dcount == TS of the same fold (no activity filters).
                valid_indices = (all_data_pre_processed_cp.is_test==0) & (all_data_pre_processed_cp.dcount == TS) & (all_data_pre_processed_cp.nn_count == TS)

                df_x = all_data_pre_processed_cp.loc[valid_indices, feats]

                # The validation rows are passed as one single sequence, so the
                # model returns one value for the whole step.
                ypred = model.predict(np.array([df_x]))

                # That single predicted relative change is applied to every
                # validation row: k = (1 + prediction) * current density.
                all_data_pre_processed_cp.loc[valid_indices, 'k'] = ypred[0].ravel()[0] + 1

                all_data_pre_processed_cp.loc[valid_indices,'k'] = all_data_pre_processed_cp.loc[valid_indices,'k'] * all_data_pre_processed_cp.loc[valid_indices,'microbusiness_density']

                nn_indices = (all_data_pre_processed_cp.dcount==TS) & (all_data_pre_processed_cp.nn_count == TS)

                # cfips -> density at step TS (the "last value" baseline).
                lastval = all_data_pre_processed_cp.loc[nn_indices, ['cfips', 'microbusiness_density']].set_index('cfips').to_dict()['microbusiness_density']

                # Rows at step TS+1, taken from the fold scaled for TS+1; their
                # microbusiness_density is what the prediction is scored against.
                df = all_data_pre_processed_cp.loc[(all_data_pre_processed_cp.dcount==TS+1) & (all_data_pre_processed_cp.nn_count == TS+1), ['cfips', 'microbusiness_density', 'state', 'lastactive', 'mbd_lag_1']].reset_index(drop=True)

                # cfips -> predicted density for step TS+1.
                dt = all_data_pre_processed_cp.loc[nn_indices, ["cfips", "k"]].set_index('cfips').to_dict()['k']

                df['lastval'] = df['cfips'].map(lastval)

                df['pred'] = df['cfips'].map(dt)

                # Fall back to the last value for low-activity counties,
                # low-density counties and blacklisted states / counties.
                df.loc[df['lastactive']<=ACT_THR, 'pred'] = df.loc[df['lastactive']<=ACT_THR, 'lastval']

                df.loc[df['lastval']<=ABS_THR, 'pred'] = df.loc[df['lastval']<=ABS_THR, 'lastval']

                df.loc[df['state'].isin(blacklist), 'pred'] = df.loc[df['state'].isin(blacklist), 'lastval']

                df.loc[df['cfips'].isin(blacklistcfips), 'pred'] = df.loc[df['cfips'].isin(blacklistcfips), 'lastval']

                # Unlike 'k', these columns are written to the shared frame
                # (not the copy) and are overwritten by every later candidate.
                all_data_pre_processed.loc[(all_data_pre_processed.dcount==(TS+1)) & (all_data_pre_processed.nn_count==(TS+1)), 'ypred'] = df['pred'].values

                all_data_pre_processed.loc[(all_data_pre_processed.dcount==(TS+1)) & (all_data_pre_processed.nn_count==(TS+1)), 'ypred_last'] = df['lastval'].values


                smp = smape(df['microbusiness_density'], df['pred'])

                smp_last_val = smape(df['microbusiness_density'], df['lastval'])

                smps.append(smp)

                print(f'{m} for timestep: {TS} --- ', smp)

                print(f'SMAPE LastVal for timestep: {TS} --- ', smp_last_val)



                # NOTE(review): this stores the running list object itself, not
                # a per-TS score, and the entry is replaced by every later
                # model / initializer - confirm what `predictions` should hold.
                predictions[TS] = smps

                del df, dt, lastval, nn_indices, all_data_pre_processed_cp, df_x, ypred, x_train, x_target; gc.collect()

            # If the TS loop was abandoned before any score was recorded, smps
            # is empty and the mean is NaN, which never compares as smaller.
            mean_pred = np.asarray(smps).mean()
            if mean_pred < best_score[1]:
                best_score = (initializer, mean_pred)

        # Stays (None, 1e6) if no initializer produced a usable score.
        best_model_params[m] = best_score

In [171]:
joblib.dump(best_model_params, "best_models.bin")

['best_models.bin']

In [172]:
if VALIDATE:
    for mod in best_model_params.keys():
        print(f"model {mod} best validator: {best_model_params[mod][0]} --- best score: {best_model_params[mod][1]} ---")

In [96]:
# Final fit and submission: for each model builder, train on the fold scaled
# for TS = 39 with the initializer stored in best_model_params, predict one
# relative change, and write submission_<model name>.csv.
for m in models.keys():

    TS=39

    # Work on a copy so the helper column 'k' does not end up in
    # all_data_pre_processed.
    all_data_pre_processed_cp = all_data_pre_processed.copy()

    train_indices = (all_data_pre_processed_cp.is_test==0) & (all_data_pre_processed_cp.dcount  < TS) & (all_data_pre_processed_cp.nn_count == TS) & (all_data_pre_processed_cp.dcount >= 1) & (all_data_pre_processed_cp.lastactive>ACT_THR)  & (all_data_pre_processed_cp.lasttarget>ABS_THR)

    x_train = all_data_pre_processed_cp.loc[train_indices, feats]

    # All training rows are cut into BATCH_SIZE equally long sequences in row
    # order; reshape raises if the row count is not divisible by BATCH_SIZE.
    x_train = x_train.values.reshape(BATCH_SIZE, -1, x_train.shape[1])

    x_target = all_data_pre_processed_cp.loc[train_indices, 'target']

    x_target = x_target.values.reshape(BATCH_SIZE, -1, 1)
    print(f"fitting model {m} for timestep: {TS} best initializer is {best_model_params[m][0]}")

    # Element 0 of the stored tuple is the initializer class.
    model = models[m](pred_length=1, initializer=best_model_params[m][0])

    model.fit(x_train, x_target, epochs=1, batch_size=BATCH_SIZE, verbose=0)



    # Non-test rows at dcount == TS of the fold scaled for TS.
    valid_indices = (all_data_pre_processed_cp.is_test==0) & (all_data_pre_processed_cp.dcount == TS) & (all_data_pre_processed_cp.nn_count == TS)

    df_x = all_data_pre_processed_cp.loc[valid_indices, feats]

    # The rows are passed as one single sequence, so one value comes back.
    ypred = model.predict(np.array([df_x]))
    
    print(f"ypred shape {ypred.shape}")

    print(f"ypred shape scalar {ypred[0].ravel()[0]}")

    # The single predicted relative change is applied to every county:
    # k = (1 + prediction) * density at step TS.
    all_data_pre_processed_cp.loc[valid_indices, 'k'] = ypred[0].ravel()[0] + 1

    all_data_pre_processed_cp.loc[valid_indices,'k'] = all_data_pre_processed_cp.loc[valid_indices,'k'] * all_data_pre_processed_cp.loc[valid_indices,'microbusiness_density']

    nn_indices = (all_data_pre_processed_cp.dcount==TS) & (all_data_pre_processed_cp.nn_count == TS)

    # cfips -> density at step TS (the "last value" fallback).
    lastval = all_data_pre_processed_cp.loc[nn_indices, ['cfips', 'microbusiness_density']].set_index('cfips').to_dict()['microbusiness_density']

    # Rows at dcount == TS+1 come from the unscaled all_data frame here.
    # NOTE(review): revealed_test.csv and test.csv appear to share row_ids, so
    # dcount may not advance one month per step near the end of each county's
    # series - verify which month dcount == TS+1 refers to.
    df = all_data.loc[(all_data.dcount==TS+1), ['cfips', 'microbusiness_density', 'state', 'lastactive', 'mbd_lag_1']].reset_index(drop=True)

    # cfips -> predicted density.
    dt = all_data_pre_processed_cp.loc[nn_indices, ["cfips", "k"]].set_index('cfips').to_dict()['k']

    df['lastval'] = df['cfips'].map(lastval)

    df['pred'] = df['cfips'].map(dt)

    # Fall back to the last value for low-activity counties, low-density
    # counties and blacklisted states / counties.
    df.loc[df['lastactive']<=ACT_THR, 'pred'] = df.loc[df['lastactive']<=ACT_THR, 'lastval']

    df.loc[df['lastval']<=ABS_THR, 'pred'] = df.loc[df['lastval']<=ABS_THR, 'lastval']

    df.loc[df['state'].isin(blacklist), 'pred'] = df.loc[df['state'].isin(blacklist), 'lastval']

    df.loc[df['cfips'].isin(blacklistcfips), 'pred'] = df.loc[df['cfips'].isin(blacklistcfips), 'lastval']

    all_data.loc[(all_data.dcount==(TS+1)), 'ypred'] = df['pred'].values

    all_data.loc[(all_data.dcount==(TS+1)), 'ypred_last'] = df['lastval'].values

    # Hard-coded overrides for two counties.
    # NOTE(review): these change the microbusiness_density column of all_data,
    # but the submission column below is rebuilt from 'ypred', so the overrides
    # do not appear to reach the CSV - confirm whether 'ypred' was meant.
    all_data.loc[all_data['cfips']==28055, 'microbusiness_density'] = 0
    all_data.loc[all_data['cfips']==48269, 'microbusiness_density'] = 1.762115

    # Every test row of a county receives that county's single prediction.
    dt = all_data.loc[all_data.dcount==TS+1, ['cfips', 'ypred']].set_index('cfips').to_dict()['ypred']
    test = all_data.loc[all_data.is_test==1, ['row_id', 'cfips','microbusiness_density']].copy()
    test['microbusiness_density'] = test['cfips'].map(dt)

    test[['row_id','microbusiness_density']].to_csv(f'submission_{m}.csv', index=False)

fitting model lstm_model for timestep: 39 best initializer is <class 'keras.initializers.initializers_v2.Constant'>
ypred shape (1, 1)
ypred shape scalar 1.3192348887969274e-05
fitting model simple_rnn for timestep: 39 best initializer is <class 'keras.initializers.initializers_v2.RandomNormal'>
ypred shape (1, 1)
ypred shape scalar -0.019527774304151535


In [97]:
test.shape

(25080, 3)

In [98]:
test.head(300)

Unnamed: 0,row_id,cfips,microbusiness_density
40,1001_2022-11-01,1001,3.375449
42,1001_2022-12-01,1001,3.375449
43,1001_2023-01-01,1001,3.375449
44,1001_2023-02-01,1001,3.375449
45,1001_2023-03-01,1001,3.375449
...,...,...,...
1812,1073_2023-06-01,1073,6.395634
1853,1075_2022-11-01,1075,0.915565
1855,1075_2022-12-01,1075,0.915565
1856,1075_2023-01-01,1075,0.915565
