In [1]:
import pandas as pd
import numpy as np
import math
from tqdm import tqdm
import gc
tqdm.pandas()

from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import KFold

# Read the competition files: training rows, test rows, and the submission template.
train, test, submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/train.csv',
        'data/test.csv',
        'data/sample_submission.csv',
    )
)

In [2]:
# Build the region lookup table used by the distance feature.
# Each '코드' value is truncated to the first three characters of its string
# form, and all rows sharing that 3-digit prefix are averaged into one row.
region = pd.read_excel('data/tfile.xlsx')
region['코드'] = region['코드'].apply(lambda x: int(str(x)[:3]))
# numeric_only=True keeps the aggregation working on pandas >= 2.0 if the sheet
# carries any non-numeric column (older pandas dropped such columns silently);
# with an all-numeric sheet the result is unchanged.
region = region.groupby('코드').mean(numeric_only=True)

In [3]:
def get_distance(input):
    """Return the straight-line distance between a sender and a receiver region.

    Parameters
    ----------
    input : sequence
        At least two region codes, sender first and receiver second. May be a
        row produced by ``DataFrame.apply(axis=1)``, a tuple, or a list.
        (The name shadows the builtin but is kept so existing callers that
        pass it by keyword keep working.)

    Returns
    -------
    int or float
        ``0`` when both codes are equal; otherwise the Euclidean distance
        between the first two values of the matching rows of the module-level
        ``region`` table.

    Notes
    -----
    Codes missing from ``region`` propagate the lookup error raised by
    ``region.loc``.
    """
    # Unpack by iteration rather than `input[0]` / `input[1]`: integer keys on
    # a label-indexed Series (as handed over by apply(axis=1)) rely on a
    # deprecated positional fallback in pandas.
    send_code, rec_code, *_ = input
    if send_code == rec_code:
        return 0

    send_codnt = tuple(region.loc[send_code, :])
    rec_codnt = tuple(region.loc[rec_code, :])
    distance = math.sqrt((send_codnt[0]-rec_codnt[0])**2 + (send_codnt[1]-rec_codnt[1])**2)

    return distance

In [5]:
# Separate the regression target from the training features and drop the
# row-id column from both feature frames.
train_target = train['INVC_CONT']
train_data = train.drop(columns=['index', 'INVC_CONT'])
test_data = test.drop(columns=['index'])

In [7]:
# Stack train and test features row-wise (train first) so both go through the
# same encoding; each frame keeps its own index labels.
data = pd.concat([train_data, test_data], axis=0)
data

Unnamed: 0,SEND_SPG_INNB,REC_SPG_INNB,DL_GD_LCLS_NM,DL_GD_MCLS_NM
0,1.129000e+15,5.011000e+15,패션의류,상의
1,1.135000e+15,5.011000e+15,생활/건강,반려동물
2,1.135000e+15,5.011000e+15,패션의류,기타패션의류
3,1.154500e+15,5.011000e+15,식품,농산물
4,1.165000e+15,5.011000e+15,식품,가공식품
...,...,...,...,...
4635,5.013000e+15,4.725000e+15,식품,농산물
4636,5.013000e+15,2.826000e+15,식품,농산물
4637,5.013000e+15,4.311100e+15,식품,농산물
4638,5.013000e+15,4.145000e+15,식품,농산물


In [9]:
# One-hot encode the combined train+test frame.
data_one = pd.get_dummies(data)

# Region code = first three characters of each ID's string form.
for code_col, id_col in (('SEND_CODE', 'SEND_SPG_INNB'), ('REC_CODE', 'REC_SPG_INNB')):
    data_one[code_col] = data_one[id_col].apply(lambda x: int(str(x)[:3]))
data_one = data_one.drop(columns=['SEND_SPG_INNB', 'REC_SPG_INNB'])

# Distance between sender and receiver regions, computed row by row with a progress bar.
data_one['DISTANCE'] = data_one[['SEND_CODE', 'REC_CODE']].progress_apply(get_distance, axis=1)

# Rescale the three numeric features to [0, 1]; each scaler is fitted on the
# combined train+test rows.
cols = ['SEND_CODE', 'REC_CODE', 'DISTANCE']
for col in cols:
    mm = MinMaxScaler()
    column_2d = data_one[col].to_numpy().reshape(-1, 1)
    data_one[col] = mm.fit_transform(column_2d)

# Undo the concat: the last len(test_data) rows are the test set.
n_test = len(test_data)
train_data = data_one.iloc[:-n_test]
test_data = data_one.iloc[-n_test:]

100%|██████████| 36295/36295 [00:05<00:00, 6439.03it/s]


In [10]:
# Inspect the engineered training features (one-hot columns plus scaled codes and distance).
train_data

Unnamed: 0,DL_GD_LCLS_NM_디지털/가전,DL_GD_LCLS_NM_생활/건강,DL_GD_LCLS_NM_식품,DL_GD_LCLS_NM_여행/문화,DL_GD_LCLS_NM_패션의류,DL_GD_LCLS_NM_화장품/미용,DL_GD_MCLS_NM_가공식품,DL_GD_MCLS_NM_건강식품,DL_GD_MCLS_NM_건강용품,DL_GD_MCLS_NM_과자,...,DL_GD_MCLS_NM_생활용품,DL_GD_MCLS_NM_수산,DL_GD_MCLS_NM_스킨케어,DL_GD_MCLS_NM_음료,DL_GD_MCLS_NM_음반,DL_GD_MCLS_NM_주방용품,DL_GD_MCLS_NM_축산,SEND_CODE,REC_CODE,DISTANCE
0,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0.002564,1.000000,0.822638
1,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0.005128,1.000000,0.833088
2,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0.005128,1.000000,0.833088
3,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0.010256,1.000000,0.807545
4,0,0,1,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0.012821,1.000000,0.804634
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31650,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1.000000,0.389744,0.604867
31651,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1.000000,0.012821,0.804634
31652,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1.000000,0.769231,0.788436
31653,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1.000000,0.005128,0.833088


In [11]:
# Inspect the engineered test features; columns should match train_data.
test_data

Unnamed: 0,DL_GD_LCLS_NM_디지털/가전,DL_GD_LCLS_NM_생활/건강,DL_GD_LCLS_NM_식품,DL_GD_LCLS_NM_여행/문화,DL_GD_LCLS_NM_패션의류,DL_GD_LCLS_NM_화장품/미용,DL_GD_MCLS_NM_가공식품,DL_GD_MCLS_NM_건강식품,DL_GD_MCLS_NM_건강용품,DL_GD_MCLS_NM_과자,...,DL_GD_MCLS_NM_생활용품,DL_GD_MCLS_NM_수산,DL_GD_MCLS_NM_스킨케어,DL_GD_MCLS_NM_음료,DL_GD_MCLS_NM_음반,DL_GD_MCLS_NM_주방용품,DL_GD_MCLS_NM_축산,SEND_CODE,REC_CODE,DISTANCE
0,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1.0,0.012821,0.804634
1,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1.0,0.010256,0.807545
2,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1.0,0.774359,0.806082
3,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1.0,0.797436,0.934364
4,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1.0,0.412821,0.629841
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4635,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1.0,0.925641,0.689460
4636,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1.0,0.438462,0.804688
4637,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1.0,0.820513,0.708307
4638,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1.0,0.776923,0.809247


In [12]:
def make_prediction(train, y, test, features, model, folds=5,
                    random_state=42, early_stopping_rounds=500, verbose=50,
                    return_details=False):
    """Fit ``model`` with shuffled K-fold CV and return fold-averaged test predictions.

    For every fold the model is refitted on the training part with the
    validation part as an early-stopping eval set, scored by RMSE on the
    validation part, and used to predict ``test``. Progress and scores are
    printed.

    Parameters
    ----------
    train : pandas.DataFrame
        Training rows; must contain every column in ``features``.
    y : array-like
        Target values aligned row by row (by position) with ``train``.
    test : pandas.DataFrame
        Rows to predict; must contain every column in ``features``.
    features : sequence of column labels
        Columns fed to the model.
    model : estimator
        Must accept ``fit(X, y, eval_set=..., early_stopping_rounds=...,
        verbose=...)`` and expose ``predict`` and ``feature_importances_``.
    folds : int, default 5
        Number of CV splits.
    random_state : int, default 42
        Seed for the shuffled split.
    early_stopping_rounds : int, default 500
        Forwarded to ``model.fit``.
    verbose : int, default 50
        Forwarded to ``model.fit``.
    return_details : bool, default False
        When True, return ``(y_oof, y_preds, features_importance)`` instead of
        only ``y_preds``.

    Returns
    -------
    numpy.ndarray
        Mean of the per-fold predictions for ``test`` (default), or the
        3-tuple described under ``return_details``.
    """
    kf = KFold(n_splits=folds, random_state=random_state, shuffle=True)

    x_train = train[features]
    x_test = test[features]
    # KFold yields *positional* indices, so index the target as a plain array
    # and the features with .iloc; label-based lookups would only be correct
    # when the index happens to be 0..n-1.
    y_values = np.asarray(y)

    y_preds = np.zeros((x_test.shape[0],))
    y_oof = np.zeros((x_train.shape[0],))

    score = 0
    fold_importances = []

    for fold, (tr_idx, val_idx) in enumerate(kf.split(x_train, y_values), start=1):
        print(f'Fold: {fold}')

        x_tr, x_val = x_train.iloc[tr_idx], x_train.iloc[val_idx]
        y_tr, y_val = y_values[tr_idx], y_values[val_idx]

        print(x_tr.shape, x_val.shape)
        # NOTE(review): newer LightGBM/XGBoost releases no longer take
        # early_stopping_rounds/verbose in fit() - confirm against the
        # installed versions before upgrading.
        model.fit(
            x_tr, y_tr,
            eval_set=[(x_tr, y_tr), (x_val, y_val)],
            early_stopping_rounds=early_stopping_rounds,
            verbose=verbose,
        )

        # Collect per-fold importances and concatenate once after the loop.
        fold_importances.append(pd.DataFrame({
            'Feature': list(features),
            'Importance': model.feature_importances_,
            'fold': fold,
        }))

        y_pred_val = model.predict(x_val)
        y_oof[val_idx] = y_pred_val
        fold_rmse = math.sqrt(mean_squared_error(y_val, y_pred_val))
        print(f"Fold {fold} | rmse Score: {fold_rmse}")

        # Both the CV score and the test prediction are simple means over folds.
        score += fold_rmse / folds
        y_preds += model.predict(x_test) / folds

        del x_tr, x_val, y_tr, y_val
        gc.collect()

    print(f"rmse Score = {score}")
    print(f"OOF rmse Score = {math.sqrt(mean_squared_error(y_values, y_oof))}")

    if return_details:
        features_importance = pd.concat(fold_importances, axis=0)
        return y_oof, y_preds, features_importance
    return y_preds

# LightGBM settings: slow learning rate with a large tree budget, intended to
# be cut short by early stopping during fitting.
# NOTE(review): LightGBM reportedly applies `subsample` only when
# `subsample_freq` > 0 - confirm whether row subsampling is actually active.
lgb_params = dict(
    objective='regression',
    boosting_type='gbdt',
    metric='mse',
    n_jobs=-1,
    learning_rate=0.005,
    num_leaves=2**9,
    max_depth=-1,
    colsample_bytree=0.8,
    subsample=0.8,
    n_estimators=10000,
)

# XGBoost settings mirroring the LightGBM setup: same learning rate, tree
# budget and sampling fractions, with depth capped at 9. The evaluation metric
# is passed as the scikit-learn callable rather than a metric name.
xgb_params = dict(
    n_estimators=10000,
    max_depth=9,
    learning_rate=0.005,
    booster='gbtree',
    n_jobs=-1,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric=mean_squared_error,
)

In [13]:
# Estimators to blend, in the order their predictions are collected:
# LightGBM first, XGBoost second, both seeded with 42.
models = [
    LGBMRegressor(seed=42, **lgb_params),
    XGBRegressor(random_state=42, **xgb_params),
]
# lgb_clf = LGBMClassifier(**lgb_params)
# xgb_clf = XGBClassifier(**xgb_params)
# y_oof_lgb, y_preds_lgb, fi_lgb = make_prediction(train_enc, y, test_enc, train_enc.columns, lgb_clf)
# y_oof_xgb, y_preds_xgb, fi_xgb = make_prediction(train_enc, y, test_enc, train_enc.columns, xgb_clf)

In [14]:
# Run the cross-validated fit/predict routine once per model and keep each
# model's fold-averaged test predictions, in the same order as `models`.
preds = []
feature_names = train_data.columns

for model in models:
    fold_averaged = make_prediction(train_data, train_target, test_data, feature_names, model)
    preds.append(fold_averaged)

Fold: 1
(25324, 29) (6331, 29)




[50]	training's l2: 7.09306	valid_1's l2: 7.38631
[100]	training's l2: 6.96285	valid_1's l2: 7.30013
[150]	training's l2: 6.87405	valid_1's l2: 7.25097
[200]	training's l2: 6.81127	valid_1's l2: 7.22469
[250]	training's l2: 6.76641	valid_1's l2: 7.21083
[300]	training's l2: 6.73205	valid_1's l2: 7.20569
[350]	training's l2: 6.70528	valid_1's l2: 7.20573
[400]	training's l2: 6.68597	valid_1's l2: 7.20654
[450]	training's l2: 6.66924	valid_1's l2: 7.21042
[500]	training's l2: 6.65542	valid_1's l2: 7.21437
[550]	training's l2: 6.64316	valid_1's l2: 7.21906
[600]	training's l2: 6.63229	valid_1's l2: 7.22472
[650]	training's l2: 6.62313	valid_1's l2: 7.2304
[700]	training's l2: 6.61444	valid_1's l2: 7.23683
[750]	training's l2: 6.60681	valid_1's l2: 7.24233
[800]	training's l2: 6.59953	valid_1's l2: 7.24891
Fold 1 | rmse Score: 2.684283654414578
Fold: 2
(25324, 29) (6331, 29)




[50]	training's l2: 7.1252	valid_1's l2: 7.24483
[100]	training's l2: 6.99227	valid_1's l2: 7.16846
[150]	training's l2: 6.90252	valid_1's l2: 7.12665
[200]	training's l2: 6.84021	valid_1's l2: 7.10512
[250]	training's l2: 6.79622	valid_1's l2: 7.09555
[300]	training's l2: 6.76368	valid_1's l2: 7.09494
[350]	training's l2: 6.7383	valid_1's l2: 7.09747
[400]	training's l2: 6.71991	valid_1's l2: 7.10168
[450]	training's l2: 6.70427	valid_1's l2: 7.10706
[500]	training's l2: 6.69127	valid_1's l2: 7.11365
[550]	training's l2: 6.68001	valid_1's l2: 7.11898
[600]	training's l2: 6.66986	valid_1's l2: 7.12436
[650]	training's l2: 6.66108	valid_1's l2: 7.13
[700]	training's l2: 6.65223	valid_1's l2: 7.13511
[750]	training's l2: 6.64484	valid_1's l2: 7.14025
Fold 2 | rmse Score: 2.6635576102529894
Fold: 3
(25324, 29) (6331, 29)




[50]	training's l2: 7.06612	valid_1's l2: 7.4918
[100]	training's l2: 6.9399	valid_1's l2: 7.39372
[150]	training's l2: 6.85399	valid_1's l2: 7.33696
[200]	training's l2: 6.79521	valid_1's l2: 7.30515
[250]	training's l2: 6.75345	valid_1's l2: 7.28541
[300]	training's l2: 6.72223	valid_1's l2: 7.27382
[350]	training's l2: 6.69781	valid_1's l2: 7.26882
[400]	training's l2: 6.68034	valid_1's l2: 7.26712
[450]	training's l2: 6.66547	valid_1's l2: 7.26729
[500]	training's l2: 6.65327	valid_1's l2: 7.26874
[550]	training's l2: 6.64284	valid_1's l2: 7.27146
[600]	training's l2: 6.63376	valid_1's l2: 7.27374
[650]	training's l2: 6.62603	valid_1's l2: 7.27691
[700]	training's l2: 6.61872	valid_1's l2: 7.28062
[750]	training's l2: 6.61224	valid_1's l2: 7.28415
[800]	training's l2: 6.60587	valid_1's l2: 7.28866
[850]	training's l2: 6.60024	valid_1's l2: 7.29425
[900]	training's l2: 6.59482	valid_1's l2: 7.29832
Fold 3 | rmse Score: 2.695693955393668
Fold: 4
(25324, 29) (6331, 29)




[50]	training's l2: 7.17366	valid_1's l2: 7.01903
[100]	training's l2: 7.02888	valid_1's l2: 6.97151
[150]	training's l2: 6.9308	valid_1's l2: 6.95379
[200]	training's l2: 6.86355	valid_1's l2: 6.95352
[250]	training's l2: 6.81515	valid_1's l2: 6.96124
[300]	training's l2: 6.77931	valid_1's l2: 6.9744
[350]	training's l2: 6.75184	valid_1's l2: 6.98892
[400]	training's l2: 6.73187	valid_1's l2: 7.00343
[450]	training's l2: 6.71475	valid_1's l2: 7.01663
[500]	training's l2: 6.70063	valid_1's l2: 7.02971
[550]	training's l2: 6.68806	valid_1's l2: 7.04113
[600]	training's l2: 6.67685	valid_1's l2: 7.05197
[650]	training's l2: 6.66781	valid_1's l2: 7.06032
Fold 4 | rmse Score: 2.6365617096099743
Fold: 5
(25324, 29) (6331, 29)




[50]	training's l2: 7.21465	valid_1's l2: 6.87828
[100]	training's l2: 7.07782	valid_1's l2: 6.80388
[150]	training's l2: 6.98457	valid_1's l2: 6.76596
[200]	training's l2: 6.91975	valid_1's l2: 6.74596
[250]	training's l2: 6.87371	valid_1's l2: 6.73642
[300]	training's l2: 6.83987	valid_1's l2: 6.73401
[350]	training's l2: 6.81346	valid_1's l2: 6.73709
[400]	training's l2: 6.7943	valid_1's l2: 6.74174
[450]	training's l2: 6.7773	valid_1's l2: 6.74756
[500]	training's l2: 6.76344	valid_1's l2: 6.75499
[550]	training's l2: 6.75123	valid_1's l2: 6.76214
[600]	training's l2: 6.74059	valid_1's l2: 6.76979
[650]	training's l2: 6.73147	valid_1's l2: 6.77721
[700]	training's l2: 6.72263	valid_1's l2: 6.783
[750]	training's l2: 6.71472	valid_1's l2: 6.78786
Fold 5 | rmse Score: 2.5949418167119807
rmse Score = 2.6550077492766384
OOF rmse Score = 2.6552538282560407
Fold: 1
(25324, 29) (6331, 29)
[0]	validation_0-rmse:4.68476	validation_1-rmse:4.67585
[50]	validation_0-rmse:4.00290	validation_1-r

In [15]:
# Inspect the per-model test predictions (one array per entry in `models`).
preds

[array([4.09160631, 3.99579906, 3.99127957, ..., 4.1394914 , 3.98109589,
        4.30656905]),
 array([3.87906557, 3.78338856, 3.78323239, ..., 4.01190418, 3.76780385,
        4.0830965 ])]

In [16]:
# Blend the models with an unweighted element-wise mean of their predictions.
# Averaging over the whole list (instead of summing only preds[0] and preds[1])
# stays correct when models are added to or removed from `models`.
results = np.mean(preds, axis=0)

In [17]:
# Sanity check: largest blended prediction.
results.max()

9.311086645787451

In [18]:
# Sanity check: smallest blended prediction.
results.min()

3.3428256493088826

In [19]:
# Fill the submission template's target column with the blended predictions
# and write it out without the DataFrame index.
# NOTE(review): assumes the 'submission/' directory already exists - confirm.
submission['INVC_CONT'] = results
submission.to_csv('submission/xgb_lgb_v1.csv', index=False)