In [1]:
import math
import os

import pandas as pd
from sklearn.metrics import make_scorer
from sklearn.preprocessing import MinMaxScaler
from tpot import TPOTRegressor
from tqdm import tqdm

tqdm.pandas()

# Load the competition inputs. The train/test CSVs are decoded as EUC-KR
# (their category columns hold Korean text); the sample submission is read
# with pandas' default encoding.
train = pd.read_csv(
    'data/train_df.csv',
    encoding='euc-kr',
)
test = pd.read_csv(
    'data/test_df.csv',
    encoding='euc-kr',
)
submission = pd.read_csv(
    'data/sample_submission.csv',
)

In [2]:
# Region lookup table: read the spreadsheet and collapse it to one row per
# region code ('코드') by averaging the remaining columns within each code.
# The resulting frame is indexed by the code.
region = (
    pd.read_excel('data/tfile.xlsx')
    .groupby('코드')
    .mean()
)

In [3]:
# Preview the training frame as loaded from CSV.
train

Unnamed: 0,index,SEND_SPG_INNB,REC_SPG_INNB,DL_GD_LCLS_NM,DL_GD_MCLS_NM,INVC_CONT
0,0,1129000014045300,5011000220046300,패션의류,상의,3
1,1,1135000009051200,5011000178037300,생활/건강,반려동물,3
2,2,1135000030093100,5011000265091400,패션의류,기타패션의류,9
3,3,1154500002014200,5011000315087400,식품,농산물,10
4,4,1165000021008300,5011000177051200,식품,가공식품,3
...,...,...,...,...,...,...
31995,31995,5011001060063300,2635000026053400,식품,농산물,6
31996,31996,5011001095042400,1168000017002200,식품,기타식품,5
31997,31997,5011001108036200,4119700008012100,식품,농산물,9
31998,31998,5011001115011400,1132000015085100,식품,농산물,3


In [4]:
# Preview the per-code region table (indexed by '코드').
region

Unnamed: 0_level_0,위도,경도
코드,Unnamed: 1_level_1,Unnamed: 2_level_1
1111010100,37.587111,126.969069
1111010200,37.583911,126.968354
1111010300,37.584381,126.971489
1111010400,37.582416,126.971670
1111010500,37.580363,126.972065
...,...,...
5013058000,33.287047,126.540929
5013059000,33.248173,126.511445
5013060000,33.250783,126.477610
5013061000,33.306456,126.442628


In [5]:
def get_distance(input):
    """Return the straight-line distance between a sender and a receiver code.

    Parameters
    ----------
    input : sequence-like with at least two values
        ``(send_code, rec_code)``. May be a tuple, a list, or a DataFrame row
        as passed by ``DataFrame.apply(..., axis=1)``. Only the first two
        values are used.

    Returns
    -------
    int or float
        ``0`` when both codes are equal (no lookup is performed); otherwise
        the Euclidean distance between the first two columns of the global
        ``region`` table for the two codes.

    Raises
    ------
    KeyError
        If either code is not present in the index of ``region``.

    Notes
    -----
    The distance is computed directly on the stored coordinate values (the
    notebook shows them as latitude '위도' and longitude '경도'), so the result
    is in those units, not in kilometres.
    """
    # Read the two codes by position. Indexing a row Series that has string
    # column labels with ``input[0]`` depends on pandas' deprecated
    # integer-key fallback; materialising the values first works the same for
    # tuples, lists and row Series.
    codes = tuple(input)
    send_code, rec_code = codes[0], codes[1]
    if send_code == rec_code:
        return 0

    send_codnt = tuple(region.loc[send_code, :])
    rec_codnt = tuple(region.loc[rec_code, :])
    distance = math.sqrt(
        (send_codnt[0] - rec_codnt[0]) ** 2
        + (send_codnt[1] - rec_codnt[1]) ** 2
    )

    return distance

In [6]:
# Separate the model inputs from the target. 'INVC_CONT' is the value to
# predict; the 'index' column is removed from both feature frames.
train_target = train['INVC_CONT']
train_data = train.drop(columns=['index', 'INVC_CONT'])
test_data = test.drop(columns=['index'])

In [7]:
# Stack train features on top of test features so both are encoded together.
# The row labels are not reset, so the test rows keep their own 0..N labels
# and labels repeat across the two halves.
data = pd.concat(objs=[train_data, test_data], axis=0)
data

Unnamed: 0,SEND_SPG_INNB,REC_SPG_INNB,DL_GD_LCLS_NM,DL_GD_MCLS_NM
0,1129000014045300,5011000220046300,패션의류,상의
1,1135000009051200,5011000178037300,생활/건강,반려동물
2,1135000030093100,5011000265091400,패션의류,기타패션의류
3,1154500002014200,5011000315087400,식품,농산물
4,1165000021008300,5011000177051200,식품,가공식품
...,...,...,...,...
4635,5013000858004400,4725000719072200,식품,농산물
4636,5013000870018300,2826000106075300,식품,농산물
4637,5013000897086300,4311100034004300,식품,농산물
4638,5013000902065100,4145000013011200,식품,농산물


In [8]:
# One-hot encode the combined frame. As the previews show, the two category
# columns (DL_GD_LCLS_NM, DL_GD_MCLS_NM) become indicator columns while the
# numeric SEND_SPG_INNB / REC_SPG_INNB columns pass through unchanged.
data_one = pd.get_dummies(data)

# --- Disabled experiment: region-code and distance features -----------------
# Kept for reference only. NOTE(review): it uses `np`, which the import cell
# does not import, so it would need `import numpy as np` before re-enabling.
# data_one = data.copy()

# data_one['SEND_CODE'] = data_one['SEND_SPG_INNB'].apply(lambda x: int(str(x)[:5]))
# data_one['REC_CODE'] = data_one['REC_SPG_INNB'].apply(lambda x: int(str(x)[:5]))
# data_one = data_one.drop(['SEND_SPG_INNB','REC_SPG_INNB'], axis=1)
# data_one['DISTANCE'] = data_one[['SEND_CODE','REC_CODE']].progress_apply(get_distance, axis=1)

# cols = ['SEND_CODE', 'REC_CODE', 'DISTANCE']
# for col in cols:
#     mm = MinMaxScaler()
#     data_one[col] = mm.fit_transform(np.array(data_one[col]).reshape(-1, 1))

# Split the encoded frame back into its train and test parts by position.
# The cut point is the train row count rather than a negative test length:
# with an empty test frame, `iloc[:-0]` would return an empty train frame.
# The right-hand side is evaluated before rebinding, so both slices use the
# same cut point, and re-running the cell gives the same split.
train_data, test_data = (
    data_one.iloc[:len(train_data)],
    data_one.iloc[len(train_data):],
)

In [9]:
# Preview the encoded training features.
train_data

Unnamed: 0,SEND_SPG_INNB,REC_SPG_INNB,DL_GD_LCLS_NM_디지털/가전,DL_GD_LCLS_NM_생활/건강,DL_GD_LCLS_NM_식품,DL_GD_LCLS_NM_여행/문화,DL_GD_LCLS_NM_패션의류,DL_GD_LCLS_NM_화장품/미용,DL_GD_MCLS_NM_가공식품,DL_GD_MCLS_NM_건강식품,...,DL_GD_MCLS_NM_문화컨텐츠,DL_GD_MCLS_NM_반려동물,DL_GD_MCLS_NM_상의,DL_GD_MCLS_NM_생활용품,DL_GD_MCLS_NM_수산,DL_GD_MCLS_NM_스킨케어,DL_GD_MCLS_NM_음료,DL_GD_MCLS_NM_음반,DL_GD_MCLS_NM_주방용품,DL_GD_MCLS_NM_축산
0,1129000014045300,5011000220046300,0,0,0,0,1,0,0,0,...,0,0,1,0,0,0,0,0,0,0
1,1135000009051200,5011000178037300,0,1,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
2,1135000030093100,5011000265091400,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1154500002014200,5011000315087400,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1165000021008300,5011000177051200,0,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31995,5011001060063300,2635000026053400,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
31996,5011001095042400,1168000017002200,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
31997,5011001108036200,4119700008012100,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
31998,5011001115011400,1132000015085100,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
# Preview the encoded test features.
test_data

Unnamed: 0,SEND_SPG_INNB,REC_SPG_INNB,DL_GD_LCLS_NM_디지털/가전,DL_GD_LCLS_NM_생활/건강,DL_GD_LCLS_NM_식품,DL_GD_LCLS_NM_여행/문화,DL_GD_LCLS_NM_패션의류,DL_GD_LCLS_NM_화장품/미용,DL_GD_MCLS_NM_가공식품,DL_GD_MCLS_NM_건강식품,...,DL_GD_MCLS_NM_문화컨텐츠,DL_GD_MCLS_NM_반려동물,DL_GD_MCLS_NM_상의,DL_GD_MCLS_NM_생활용품,DL_GD_MCLS_NM_수산,DL_GD_MCLS_NM_스킨케어,DL_GD_MCLS_NM_음료,DL_GD_MCLS_NM_음반,DL_GD_MCLS_NM_주방용품,DL_GD_MCLS_NM_축산
0,5013000043028400,1165000021097200,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,5013000044016100,1154500002066400,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,5013000205030200,4139000102013200,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,5013000205030200,4221000040093400,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5013000268011400,2726000004017100,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4635,5013000858004400,4725000719072200,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4636,5013000870018300,2826000106075300,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4637,5013000897086300,4311100034004300,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4638,5013000902065100,4145000013011200,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
def rmse(y_true, y_pred):
    """Return the root-mean-squared error between targets and predictions.

    Both arguments must support element-wise subtraction and ``** 2``
    (for example numpy arrays or pandas Series) and have the same length.
    The mean is taken over ``len(y_true)`` elements.
    """
    squared_errors = (y_true - y_pred) ** 2
    mean_squared_error = sum(squared_errors) / len(y_true)
    return math.sqrt(mean_squared_error)

# Wrap the RMSE function defined above as a scikit-learn scorer object.
# greater_is_better=False declares the metric as a loss (lower is better);
# per scikit-learn's make_scorer documentation the scorer then reports the
# negated value so that "higher is better" still holds for model selection.
# NOTE: this rebinds the name `rmse` from the plain function to the scorer,
# so the function itself is no longer reachable under that name afterwards.
rmse = make_scorer(rmse, greater_is_better=False)

In [12]:
# Keyword arguments forwarded to TPOTRegressor via ``**tpot_params``.
# With generations=10 and population_size=100 the recorded progress bar
# reports 1100 pipelines per fit() call.
# NOTE(review): scoring is 'r2' even though the notebook also builds a custom
# RMSE scorer named `rmse` - confirm which metric is intended.
# Disabled alternatives that were tried: 'config_dict': 'TPOT sparse' and
# 'config_dict': 'TPOT NN'.
tpot_params = dict(
    generations=10,
    population_size=100,
    scoring='r2',
    subsample=1,
    verbosity=2,
    random_state=42,
    n_jobs=-1,
    use_dask=True,
    warm_start=True,
    memory='auto',
    periodic_checkpoint_folder='./tpot_checkpoint/',
)

In [13]:
# Make sure both output folders exist before any optimisation starts.
# export() and to_csv() below write into them, and a missing folder would
# otherwise only fail after the first (long) fit() call has finished.
os.makedirs('tpot_best_pareto', exist_ok=True)
os.makedirs('submission', exist_ok=True)

# One regressor is reused across all rounds. tpot_params sets
# 'warm_start': True, so each fit() call is expected to continue from the
# previous population instead of starting over (per the TPOT docs - confirm).
tpot = TPOTRegressor(**tpot_params)
for i in range(100):
    tpot.fit(train_data, train_target)

    # Snapshot this round: export the current best pipeline as a script and
    # write its test-set predictions as a submission file.
    tpot.export(f'tpot_best_pareto/tpot_v{i}.py')
    results = tpot.predict(test_data)
    submission['INVC_CONT'] = results
    submission.to_csv(f'submission/tpot_g{i}.csv', index=False)

Optimization Progress:   0%|          | 0/1100 [00:00<?, ?pipeline/s]