In [1]:
import math
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.metrics import make_scorer
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from tpot import TPOTRegressor
from tqdm import tqdm

tqdm.pandas()

# Read the three competition CSVs: training rows, test rows, and the
# submission template that the predictions are written into later.
train, test, submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/train_updated.csv',
        'data/test_updated.csv',
        'data/sample_submission.csv',
    )
)

In [2]:
# Separate the regression target (INVC_CONT) from the training features and
# take an independent copy of the test frame.
train_target = train['INVC_CONT']
train_data = train.drop(columns=['INVC_CONT'])
test_data = test.copy()

In [3]:
# Stack train features on top of test features so that the encoding steps
# in the following cell see both sets at once. Row labels are kept as-is,
# so the index restarts at 0 where the test rows begin.
data = pd.concat((train_data, test_data), axis=0)
data

Unnamed: 0,SEND_SPG_INNB,REC_SPG_INNB,DL_GD_LCLS_NM,DL_GD_MCLS_NM,SEND_격자공간명,REC_격자공간명,SEND_시군구코드,REC_시군구코드,SEND_총인구수,SEND_세대수,...,SEND_여자인구수,SEND_남여비율,REC_총인구수,REC_세대수,REC_세대당인구,REC_남자인구수,REC_여자인구수,REC_남여비율,mean_lcls,mean_mcls
0,1129000014045300,5011000220046300,패션의류,상의,다사6156,다다1502,11290,50110,442471.0,192809.0,...,228224.0,0.94,489202.0,209614.0,2.33,245389.0,243813.0,1.01,6.678694,3.672897
1,1135000009051200,5011000178037300,생활/건강,반려동물,다사6163,다다1903,11350,50110,532662.0,217054.0,...,274916.0,0.94,489202.0,209614.0,2.33,245389.0,243813.0,1.01,5.407921,4.444134
2,1135000030093100,5011000265091400,패션의류,기타패션의류,다사6158,다다1001,11350,50110,532662.0,217054.0,...,274916.0,0.94,489202.0,209614.0,2.33,245389.0,243813.0,1.01,6.678694,10.375479
3,1154500002014200,5011000315087400,식품,농산물,다사4541,다다0600,11545,50110,233066.0,111000.0,...,114767.0,1.03,489202.0,209614.0,2.33,245389.0,243813.0,1.01,4.658195,4.297401
4,1165000021008300,5011000177051200,식품,가공식품,다사5942,다다1803,11650,50110,431050.0,173388.0,...,224938.0,0.92,489202.0,209614.0,2.33,245389.0,243813.0,1.01,4.658195,6.807151
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4635,5013000858004400,4725000719072200,식품,농산물,다나0473,라바5923,50130,47250,181547.0,83748.0,...,89705.0,1.02,99814.0,48527.0,2.06,48984.0,50830.0,0.96,4.658195,4.297401
4636,5013000870018300,2826000106075300,식품,농산물,다나1673,다사2548,50130,28260,181547.0,83748.0,...,89705.0,1.02,544608.0,218156.0,2.50,274954.0,269654.0,1.02,4.658195,4.297401
4637,5013000897086300,4311100034004300,식품,농산물,나나9472,다바9950,50130,43111,181547.0,83748.0,...,89705.0,1.02,179626.0,74371.0,2.42,89423.0,90203.0,0.99,4.658195,4.297401
4638,5013000902065100,4145000013011200,식품,농산물,다나0472,다사7250,50130,41450,181547.0,83748.0,...,89705.0,1.02,273587.0,116659.0,2.35,137533.0,136054.0,1.01,4.658195,4.297401


In [4]:
# Remember how many rows belong to train so the combined frame can be split
# back positionally. A positive split point is used instead of a negative
# `-len(test_data)` slice, which silently puts every row in the test slice
# when the test frame is empty.
n_train_rows = len(train_data)

# Fit ONE encoder over the union of sender and receiver grid names so that
# both columns share the same integer code space.
# `Series.append` was removed in pandas 2.0; `pd.concat` is the replacement.
# No explicit sort is needed: LabelEncoder orders its classes itself.
grid_names = pd.concat([data['SEND_격자공간명'], data['REC_격자공간명']])
le = LabelEncoder().fit(grid_names)
data['SEND_격자공간명'] = le.transform(data['SEND_격자공간명'])
data['REC_격자공간명'] = le.transform(data['REC_격자공간명'])

# One-hot encode the remaining non-numeric columns.
data_one = pd.get_dummies(data)

# Rescale to [0, 1] every column whose mean exceeds 100; columns with a
# smaller mean are left untouched.
# NOTE(review): the scaler is fitted on train and test rows together, so
# test-set value ranges influence the training features — confirm this is
# intended.
cols = data_one.columns
for col in cols:
    if data_one[col].mean() > 100:
        mm = MinMaxScaler()
        column_values = data_one[col].to_numpy().reshape(-1, 1)
        data_one[col] = mm.fit_transform(column_values).ravel()

# Undo the concatenation: first the train rows, then the test rows.
train_data, test_data = data_one.iloc[:n_train_rows], data_one.iloc[n_train_rows:]

In [5]:
# Preview the first five processed training rows.
train_data.head(n=5)

Unnamed: 0,SEND_SPG_INNB,REC_SPG_INNB,SEND_격자공간명,REC_격자공간명,SEND_시군구코드,REC_시군구코드,SEND_총인구수,SEND_세대수,SEND_세대당인구,SEND_남자인구수,...,DL_GD_MCLS_NM_문화컨텐츠,DL_GD_MCLS_NM_반려동물,DL_GD_MCLS_NM_상의,DL_GD_MCLS_NM_생활용품,DL_GD_MCLS_NM_수산,DL_GD_MCLS_NM_스킨케어,DL_GD_MCLS_NM_음료,DL_GD_MCLS_NM_음반,DL_GD_MCLS_NM_주방용품,DL_GD_MCLS_NM_축산
0,0.004613,0.999487,0.5338,0.095481,0.004613,0.999487,0.530672,0.573835,2.29,0.495201,...,0,0,1,0,0,0,0,0,0,0
1,0.006151,0.999487,0.534579,0.098146,0.006151,0.999487,0.643163,0.649509,2.45,0.599857,...,0,1,0,0,0,0,0,0,0,0
2,0.006151,0.999487,0.534023,0.092817,0.006151,0.999487,0.643163,0.649509,2.45,0.599857,...,0,0,0,0,0,0,0,0,0,0
3,0.011148,0.999487,0.45219,0.090707,0.011148,0.999487,0.269491,0.318489,2.1,0.264357,...,0,0,0,0,0,0,0,0,0,0
4,0.013839,0.999487,0.518234,0.09748,0.013839,0.999487,0.516427,0.513217,2.49,0.475629,...,0,0,0,0,0,0,0,0,0,0


In [6]:
# Preview the first five processed test rows.
test_data.head(n=5)

Unnamed: 0,SEND_SPG_INNB,REC_SPG_INNB,SEND_격자공간명,REC_격자공간명,SEND_시군구코드,REC_시군구코드,SEND_총인구수,SEND_세대수,SEND_세대당인구,SEND_남자인구수,...,DL_GD_MCLS_NM_문화컨텐츠,DL_GD_MCLS_NM_반려동물,DL_GD_MCLS_NM_상의,DL_GD_MCLS_NM_생활용품,DL_GD_MCLS_NM_수산,DL_GD_MCLS_NM_스킨케어,DL_GD_MCLS_NM_음료,DL_GD_MCLS_NM_음반,DL_GD_MCLS_NM_주방용품,DL_GD_MCLS_NM_축산
0,1.0,0.013839,0.085835,0.517708,1.0,0.013839,0.205233,0.233429,2.17,0.200703,...,0,0,0,0,0,0,0,0,0,0
1,1.0,0.011148,0.086169,0.45176,1.0,0.011148,0.205233,0.233429,2.17,0.200703,...,0,0,0,0,0,0,0,0,0,0
2,1.0,0.776012,0.085168,0.383813,1.0,0.776012,0.205233,0.233429,2.17,0.200703,...,0,0,0,0,0,0,0,0,0,0
3,1.0,0.797027,0.085168,0.832797,1.0,0.797027,0.205233,0.233429,2.17,0.200703,...,0,0,0,0,0,0,0,0,0,0
4,1.0,0.41389,0.078497,0.897857,1.0,0.41389,0.205233,0.233429,2.17,0.200703,...,0,0,0,0,0,0,0,0,0,0


In [7]:
def rmse(y_true, y_pred):
    """Return the root-mean-squared error between targets and predictions.

    Parameters
    ----------
    y_true : array-like
        Ground-truth values (list, NumPy array or pandas Series).
    y_pred : array-like
        Predicted values with the same number of elements as ``y_true``.

    Returns
    -------
    float
        ``sqrt(mean((y_true - y_pred) ** 2))``.

    Raises
    ------
    ValueError
        If the inputs are empty, since the mean is undefined.
    """
    # Convert to plain arrays first so the subtraction is positional; two
    # pandas Series would otherwise be aligned on their index labels and
    # mismatched labels would turn into NaN.
    residuals = np.asarray(y_true, dtype=float) - np.asarray(y_pred, dtype=float)
    if residuals.size == 0:
        raise ValueError("rmse() requires at least one sample")
    # Vectorised mean instead of the built-in sum(), which iterates
    # element by element in Python.
    return float(np.sqrt(np.mean(residuals ** 2)))

# Wrap the metric as a scikit-learn scorer. greater_is_better=False declares
# it a loss, so the scorer sign-flips the value and reports negative RMSE.
# NOTE: this rebinds the name `rmse` from the plain function to the scorer
# object; the TPOT parameter dict below passes this scorer as 'scoring'.
rmse = make_scorer(rmse, greater_is_better=False)

In [8]:
# Keyword arguments that are unpacked into TPOTRegressor in the next cell.
# Alternative search spaces that were tried and are currently disabled:
#   config_dict='TPOT sparse'
#   config_dict='TPOT NN'
tpot_params = dict(
    generations=10,
    population_size=100,
    scoring=rmse,  # custom scorer object built in the previous cell
    subsample=1,
    verbosity=2,
    random_state=42,
    n_jobs=-1,
    use_dask=True,
    warm_start=True,  # the training loop calls fit() repeatedly on one instance
    memory='auto',
    periodic_checkpoint_folder='./tpot_checkpoint/',
)

In [9]:
# Number of consecutive fit() rounds; each round's pipeline and predictions
# are saved under their own index.
N_ROUNDS = 100

# Create the output folders up front. Without this, a missing folder is only
# discovered when export()/to_csv() fails — after a full, expensive fit().
PIPELINE_DIR = Path('tpot_best_pareto')
SUBMISSION_DIR = Path('submission')
for out_dir in (PIPELINE_DIR, SUBMISSION_DIR):
    out_dir.mkdir(parents=True, exist_ok=True)

tpot = TPOTRegressor(**tpot_params)
for i in range(N_ROUNDS):
    # The same estimator instance is fitted again each round (tpot_params
    # sets warm_start=True).
    tpot.fit(train_data, train_target)
    # Save the current best pipeline as a standalone Python script.
    tpot.export(str(PIPELINE_DIR / f'tpot_exdata_v{i}.py'))
    # Predict the test set and write a submission file for this round.
    results = tpot.predict(test_data)
    submission['INVC_CONT'] = results
    submission.to_csv(SUBMISSION_DIR / f'tpot_exdata_g{i}.csv', index=False)

Optimization Progress:  10%|▉         | 106/1100 [15:39<1:19:22,  4.79s/pipeline]