In [1]:
import numpy as np
import pandas as pd
import seaborn as sns

In [2]:
# Make the project-local `modules` package importable and load its helpers.
import sys, os
from pathlib import Path

# The package is looked up in the parent of the current working directory.
current_dir = os.path.join(Path().resolve())
project_root = str(current_dir) + '/../'
# Guard the append: re-running this cell must not pile up duplicate
# entries on sys.path.
if project_root not in sys.path:
    sys.path.append(project_root)

from modules import utils
from modules import models
from modules import preprocess

# Reload so that edits made to the modules after the first import are picked
# up without restarting the kernel.
import importlib
for m in [utils, models, preprocess]:
    importlib.reload(m)

In [3]:
# Let pandas render up to 100 rows and 100 columns before truncating output.
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, 100)

## Trend experiment (Trend実験)

In [4]:
# Load the train / test / submission frames through the project helper.
df_train, df_test, df_subm = utils.load_dataset()

# Every keyword flag of merge_dataset is passed as False for this experiment.
merge_flags = dict.fromkeys(
    ['pop', 'unemploy', 'census', 'coord', 'co_est', 'fix_pop', 'add_location', 'outlier'],
    False,
)
df_all, df_census = utils.merge_dataset(df_train, df_test, **merge_flags)

# Lag features are currently disabled:
# df_all = preprocess.add_lag_features(df_all)

  df_census = load_census(BASE)


In [5]:
# Keyword arguments for preprocess.get_trend_dict, stored under an integer id.
# Only parameter set 1 is defined; bounds here are 60..140.
low_trend_params = {
    1: dict(
        n=3,
        thre=3,
        thre_r=0,
        lower_bound=60,
        upper_bound=140,
        use_regularize=True,
        v_regularize=[0.03, 0.02],
        v_clip=[0.999, 1.004],
    ),
}
params = low_trend_params[1]
df_trend, trend_dict = preprocess.get_trend_dict(df_all, **params)

# Report the number of entries, then display the mapping itself.
print(len(trend_dict))
trend_dict

13


{13025: 1.004,
 13193: 0.999,
 18025: 0.999,
 21147: 1.004,
 31093: 1.0033846255390906,
 40005: 1.004,
 40069: 1.00303696951699,
 46091: 1.0016915440454517,
 48305: 1.004,
 48385: 1.0026579234749724,
 5067: 1.004,
 54023: 0.999,
 8033: 1.004}

In [6]:
# Keyword arguments for preprocess.get_trend_dict, stored under an integer id.
# Only parameter set 1 is defined; bounds here are 20000..99999999 and no
# clipping limits are given (both v_clip entries are None).
high_trend_params = {
    1: dict(
        n=3,
        thre=3,
        thre_r=0.004,
        lower_bound=20000,
        upper_bound=99999999,
        use_regularize=True,
        v_regularize=[0.002, 0],
        v_clip=[None, None],
    ),
}
params = high_trend_params[1]
df_trend, high_trend_dict = preprocess.get_trend_dict(df_all, **params)

# Report the number of entries, then display the mapping itself.
print(len(high_trend_dict))
high_trend_dict

20


{12031: 1.0086328445657107,
 12101: 1.006481142747778,
 17197: 1.0080206078421285,
 20091: 1.0060327304334955,
 25025: 1.0061688635360524,
 36085: 1.0173209335038416,
 37119: 1.006098418566488,
 40109: 1.0129319463023307,
 40143: 1.008331800893501,
 42003: 1.0233533145463996,
 47065: 1.008465009610966,
 47187: 1.0075341777021485,
 48113: 1.0045559762608431,
 48121: 1.0118033817358585,
 48157: 1.013289253684673,
 48491: 1.0074687019144728,
 55133: 1.0069273342745635,
 6001: 1.0116187765330433,
 6073: 1.0100269518719414,
 6079: 1.0050638864187709}

In [7]:
# Keyword arguments for preprocess.get_trend_dict, stored under an integer id.
# Only parameter set 1 is defined; bounds here are 1000..20000 and only the
# second v_clip entry is set (1.02).
middle_trend_params = {
    1: dict(
        n=7,
        thre=7,
        thre_r=0.001,
        lower_bound=1000,
        upper_bound=20000,
        use_regularize=True,
        v_regularize=[0.001, 0],
        v_clip=[None, 1.02],
    ),
}
params = middle_trend_params[1]
df_trend, middle_trend_dict = preprocess.get_trend_dict(df_all, **params)

# Report the number of entries, then display the mapping itself.
print(len(middle_trend_dict))
middle_trend_dict

29


{10001: 1.0129351534528772,
 1095: 1.008447263901533,
 1125: 1.0154674931169532,
 13285: 1.02,
 16021: 1.02,
 16051: 1.0102600596450686,
 17049: 1.0145852062314247,
 17119: 1.0160684278423038,
 18073: 1.02,
 23017: 1.02,
 24023: 1.02,
 29209: 0.9900118427149365,
 30047: 1.0077290823975364,
 30063: 1.007380223819338,
 32017: 0.989911033357664,
 33009: 1.0036445676092536,
 37071: 1.0116879058571973,
 37097: 1.008211996600611,
 37161: 1.0085466949373592,
 37175: 1.0167317606498527,
 37195: 1.0099083923055931,
 42013: 0.984723291353719,
 46083: 1.008549269096111,
 49057: 1.0060150299560793,
 51033: 1.0088285129605443,
 53007: 1.02,
 53065: 0.9778049488905132,
 6005: 0.9940137900855888,
 8037: 1.0155565410397998}

In [8]:
# Keyword arguments for preprocess.get_trend_dict, stored under an integer id.
# Only parameter set 1 is defined; bounds here are 140..1000 and only the
# second v_clip entry is set (1.03).
lowmiddle_trend_params = {
    1: dict(
        n=10,
        thre=10,
        thre_r=0.003,
        lower_bound=140,
        upper_bound=1000,
        use_regularize=True,
        v_regularize=[0.001, 0],
        v_clip=[None, 1.03],
    ),
}
params = lowmiddle_trend_params[1]
df_trend, lowmiddle_trend_dict = preprocess.get_trend_dict(df_all, **params)

# Report the number of entries, then display the mapping itself.
print(len(lowmiddle_trend_dict))
lowmiddle_trend_dict

5


{13133: 1.0124139778496224,
 17135: 1.03,
 26085: 1.0258813587616837,
 28051: 1.03,
 29149: 1.03}

In [9]:
# Sanity check: list every key present in both the low-middle and the middle
# trend dictionaries. Nothing is printed when the two are disjoint.
shared_keys = [key for key in lowmiddle_trend_dict if key in middle_trend_dict]
for key in shared_keys:
    print(key)

In [10]:
# Start from the baseline submission, indexed by row_id.
df_sub1 = pd.read_csv('../submission/submission_13820.csv', index_col='row_id')

# First pass uses insert_trend's default method with the low-range dictionary.
df_sub1, df_extract, var_dict = utils.insert_trend(df_sub1, df_all, df_census, trend_dict)

# The remaining dictionaries are applied in order with method='replace'.
for replace_dict in (high_trend_dict, middle_trend_dict, lowmiddle_trend_dict):
    df_sub1, df_extract, var_dict = utils.insert_trend(
        df_sub1, df_all, df_census, replace_dict, method='replace'
    )

# Persist the trend-adjusted submission next to the baseline file.
df_sub1.to_csv('../submission/submission_13820_trend.csv')

In [11]:
# Compare the trend-adjusted submission against the 'submission_13820' baseline.
df_merged = utils.compare_submission(df_sub1, 'submission_13820')

# Keep an independent copy of the rows whose smape exceeds 0.001 and show its size.
exceeds_threshold = df_merged['smape'].gt(0.001)
df_diff = df_merged.loc[exceeds_threshold].copy()
df_diff.shape

(67, 6)

In [12]:
# Total of the smape column over all compared rows.
df_merged.loc[:, 'smape'].sum()

50.35634956359516

In [13]:
# NOTE(review): scratch cell holding two bare literals; only the last one is
# echoed as the cell output. Their meaning is not recorded here - presumably
# values noted from an earlier run. Confirm and document, or delete the cell.
47
25.251103

25.251103

In [14]:
# Show the rows where the adjusted density is strictly below the baseline value.
below_baseline = df_merged['microbusiness_density'].lt(df_merged['baseline'])
df_merged.loc[below_baseline]

Unnamed: 0,row_id,microbusiness_density,cfips,month,baseline,smape
186,6005_2023-01-01,13.309681,6005,2023-01-01,13.398735,0.666858
480,13193_2023-01-01,0.921605,13193,2023-01-01,0.922528,0.10005
707,18025_2023-01-01,1.314474,18025,2023-01-01,1.315789,0.10005
1584,29209_2023-01-01,5.464115,29209,2023-01-01,5.525793,1.122451
1752,32017_2023-01-01,73.652821,32017,2023-01-01,74.396026,1.004
2247,42013_2023-01-01,3.027983,42013,2023-01-01,3.071815,1.437174
2978,53065_2023-01-01,4.50312,53065,2023-01-01,4.599392,2.115277
2996,54023_2023-01-01,0.763968,54023,2023-01-01,0.764732,0.10005


In [21]:
# Submission file under comparison. Other candidates tried earlier:
#   ../submission/submission_2023-02-27_16_35_41.csv
#   ../submission/submission_2023-02-27_19_41_33.csv
#   ../submission/submission_2023-02-27_21_15_16.csv
submission_csv = '../submission/submission_2023-03-04_06_27_51.csv'
df_submission = pd.read_csv(submission_csv, index_col='row_id')

# Note: this rebinds df_merged, replacing the comparison computed earlier.
df_merged = utils.compare_submission(df_submission, 'submission_13820')
mean_smape = df_merged['smape'].mean()
print(mean_smape)

0.02302462445296298


In [16]:
# Re-import and reload the project-local helper modules.
# This cell repeats the setup done near the top of the notebook; it is kept so
# that module edits can be picked up mid-session.
import sys, os
from pathlib import Path

current_dir = os.path.join(Path().resolve())
project_root = str(current_dir) + '/../'
# The same entry is normally already on sys.path from the first setup cell,
# so only add it when it is missing instead of appending a duplicate.
if project_root not in sys.path:
    sys.path.append(project_root)

from modules import utils
from modules import models
from modules import preprocess

# Reload so the latest on-disk version of each module is used.
import importlib
for m in [utils, models, preprocess]:
    importlib.reload(m)

In [17]:
# Work on a copy so df_all itself is not handed to merge_scale41.
df_allt = utils.merge_scale41(df_all.copy(), df_submission, df_census)

In [18]:
# Preview the first 100 rows whose scale lies in 38..41 (both ends included).
in_scale_window = df_allt['scale'].between(38, 41)
df_allt[in_scale_window].head(100)

Unnamed: 0_level_0,cfips,county,state,microbusiness_density,active,year,month,scale,state_i,mbd_origin,select_lastactive40
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
10001_2022-10-01,10001,0,Delaware,6.969616,9618.0,2022,10,38,0,6.969616,9784.0
10001_2022-11-01,10001,0,Delaware,7.034833,9708.0,2022,11,39,0,7.034833,9784.0
10001_2022-12-01,10001,0,Delaware,7.089906,9784.0,2022,12,40,0,7.089906,9784.0
10001_2023-01-01,10001,0,Delaware,7.097703,9795.0,2023,1,41,0,7.097703,9784.0
10003_2022-10-01,10003,1,Delaware,20.917912,91633.0,2022,10,38,0,20.917912,98626.0
10003_2022-11-01,10003,1,Delaware,20.93252,91697.0,2022,11,39,0,20.93252,98626.0
10003_2022-12-01,10003,1,Delaware,22.514267,98626.0,2022,12,40,0,22.514267,98626.0
10003_2023-01-01,10003,1,Delaware,22.263424,97527.0,2023,1,41,0,22.263424,98626.0
10005_2022-10-01,10005,2,Delaware,59.590317,111680.0,2022,10,38,0,59.590317,110741.0
10005_2022-11-01,10005,2,Delaware,60.994167,114311.0,2022,11,39,0,60.994167,110741.0
