In [1]:
import numpy as np
import pandas as pd
import seaborn as sns

In [2]:
# Make the project-local `modules` package importable, then import and reload it.
import sys, os
from pathlib import Path

# Absolute path of the current working directory, kept as a plain string.
current_dir = str(Path().resolve())

# `modules` is looked up in the parent of the working directory.
# Path.parent replaces the former string concatenation with '/../', and the
# membership check stops re-runs of this cell from stacking duplicate entries
# on sys.path.
project_root = str(Path(current_dir).parent)
if project_root not in sys.path:
    sys.path.append(project_root)

from modules import utils
from modules import models
from modules import preprocess

# Reload so that edits made to the module files are picked up without
# restarting the kernel.
import importlib
for m in (utils, models, preprocess):
    importlib.reload(m)

In [3]:
# Allow up to 100 rows and 100 columns in displayed frames.
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, 100)

## Trend experiments (Trend実験)

In [5]:
# Disabled: mbd = 'microbusiness_density'

# Load the three frames provided by the project loader.
df_train, df_test, df_subm = utils.load_dataset()

# Merge train and test; `co_est` is the only optional flag switched on here.
merge_flags = dict(
    pop=False,
    unemploy=False,
    census=False,
    coord=False,
    co_est=True,
    fix_pop=False,
    add_location=False,
    outlier=False,
)
df_all, df_census = utils.merge_dataset(df_train, df_test, **merge_flags)

# Disabled: df_all = preprocess.add_lag_features(df_all)

  df_census = load_census(BASE)


In [143]:
# Settings for the "low" band, keyed by an integer variant id.
# NOTE(review): what lower_bound/upper_bound are compared against is decided
# inside preprocess.get_trend_dict — confirm there.
low_trend_params = {
    1: dict(
        n=3,
        thre=3,
        thre_r=0,
        lower_bound=60,
        upper_bound=140,
        use_regularize=True,
        v_regularize=[0.03, 0.02],
        v_clip=[0.999, 1.004],
    ),
}

# Run the extraction with variant 1.
params = low_trend_params[1]
df_trend, trend_dict = preprocess.get_trend_dict(df_all, **params)

# Print how many entries were selected, then display the mapping itself.
print(len(trend_dict))
trend_dict

13


{13025: 1.004,
 13193: 0.999,
 18025: 0.999,
 21147: 1.004,
 31093: 1.0033846255390906,
 40005: 1.004,
 40069: 1.00303696951699,
 46091: 1.0016915440454517,
 48305: 1.004,
 48385: 1.0026579234749724,
 5067: 1.004,
 54023: 0.999,
 8033: 1.004}

In [144]:
# Settings for the "high" band, keyed by an integer variant id.
# Neither side of v_clip is set for this band.
# NOTE(review): what lower_bound/upper_bound are compared against is decided
# inside preprocess.get_trend_dict — confirm there.
high_trend_params = {
    1: dict(
        n=3,
        thre=3,
        thre_r=0.004,
        lower_bound=20000,
        upper_bound=99999999,
        use_regularize=True,
        v_regularize=[0.002, 0],
        v_clip=[None, None],
    ),
}

# Run the extraction with variant 1.
params = high_trend_params[1]
df_trend, high_trend_dict = preprocess.get_trend_dict(df_all, **params)

# Print how many entries were selected, then display the mapping itself.
print(len(high_trend_dict))
high_trend_dict

20


{12031: 1.0086328445657107,
 12101: 1.006481142747778,
 17197: 1.0080206078421285,
 20091: 1.0060327304334955,
 25025: 1.0061688635360524,
 36085: 1.0173209335038416,
 37119: 1.006098418566488,
 40109: 1.0129319463023307,
 40143: 1.008331800893501,
 42003: 1.0233533145463996,
 47065: 1.008465009610966,
 47187: 1.0075341777021485,
 48113: 1.0045559762608431,
 48121: 1.0118033817358585,
 48157: 1.013289253684673,
 48491: 1.0074687019144728,
 55133: 1.0069273342745635,
 6001: 1.0116187765330433,
 6073: 1.0100269518719414,
 6079: 1.0050638864187709}

In [152]:
# Settings for the "middle" band, keyed by an integer variant id.
# Only the second element of v_clip is set for this band.
# NOTE(review): what lower_bound/upper_bound are compared against is decided
# inside preprocess.get_trend_dict — confirm there.
middle_trend_params = {
    1: dict(
        n=7,
        thre=7,
        thre_r=0.002,
        lower_bound=2000,
        upper_bound=20000,
        use_regularize=True,
        v_regularize=[0.002, 0.002],
        v_clip=[None, 1.015],
    ),
}

# Run the extraction with variant 1.
params = middle_trend_params[1]
df_trend, middle_trend_dict = preprocess.get_trend_dict(df_all, **params)

# Print how many entries were selected, then display the mapping itself.
print(len(middle_trend_dict))
middle_trend_dict

14


{10001: 1.0109092831459714,
 1095: 1.00643036937373,
 1125: 1.0134365581307192,
 13285: 1.015,
 16021: 1.015,
 17049: 1.0125560358189618,
 17119: 1.0140362909866192,
 18073: 1.015,
 23017: 1.015,
 37097: 1.0061955726074097,
 37175: 1.014698297128553,
 42013: 0.9857070309055211,
 53007: 1.015,
 8037: 1.0135254279577202}

In [153]:
# Print every key that appears in both the middle and the high dictionary,
# in the order the keys occur in the middle dictionary.
shared_keys = [key for key in middle_trend_dict if key in high_trend_dict]
for key in shared_keys:
    print(key)

In [154]:
# Start from the stored submission, indexed by row_id.
df_sub1 = pd.read_csv('../submission/submission_13820.csv', index_col='row_id')

# Apply the three trend dictionaries in order. The first call relies on
# insert_trend's default `method`; the other two pass method='replace'.
trend_passes = [
    (trend_dict, {}),
    (high_trend_dict, {'method': 'replace'}),
    (middle_trend_dict, {'method': 'replace'}),
]
for trend_map, extra_kwargs in trend_passes:
    df_sub1, df_extract, var_dict = utils.insert_trend(
        df_sub1, df_all, df_census, trend_map, **extra_kwargs
    )

# Save the adjusted submission under a new file name.
df_sub1.to_csv('../submission/submission_13820_trend.csv')

In [155]:
# Compare the adjusted submission with 'submission_13820'.
df_merged = utils.compare_submission(df_sub1, 'submission_13820')

# Keep an independent copy of the rows whose smape exceeds 0.001.
exceeds_threshold = df_merged['smape'] > 0.001
df_diff = df_merged.loc[exceeds_threshold].copy()
df_diff.shape

(47, 6)

In [156]:
# Sum of the smape column over all compared rows.
df_merged.loc[:, 'smape'].sum()

25.271103703018476

In [157]:
# Rows where the adjusted value is strictly lower than the baseline value.
below_baseline = df_merged['microbusiness_density'].lt(df_merged['baseline'])
df_merged.loc[below_baseline]

Unnamed: 0,row_id,microbusiness_density,cfips,month,baseline,smape
480,13193_2023-01-01,0.921605,13193,2023-01-01,0.922528,0.10005
707,18025_2023-01-01,1.314474,18025,2023-01-01,1.315789,0.10005
2247,42013_2023-01-01,3.031008,42013,2023-01-01,3.071815,1.337329
2996,54023_2023-01-01,0.763968,54023,2023-01-01,0.764732,0.10005


In [7]:
# Other submission files that were tried in this cell (left disabled):
#   ../submission/submission_2023-02-27_16_35_41.csv
#   ../submission/submission_2023-02-27_19_41_33.csv
#   ../submission/submission_2023-02-27_21_15_16.csv
submission_csv = '../submission/submission_2023-02-28_18_21_26.csv'
df_submission = pd.read_csv(submission_csv, index_col='row_id')

# Compare against 'submission_13827' and print the mean smape.
df_merged = utils.compare_submission(df_submission, 'submission_13827')
mean_smape = df_merged['smape'].mean()
print(mean_smape)

0.016963210799712435


In [235]:
# Number of distinct cfips values; dropna=False counts a missing value as its
# own entry, as taking the length of unique() does.
df_all['cfips'].nunique(dropna=False)

3135

In [None]:
# NOTE(review): unlabelled literal in a cell with no execution count —
# presumably a score noted by hand; label it or remove it.
0.01944239446655972