In [1]:
import pandas as pd
import numpy as np
import math
from tqdm import tqdm
import gc
tqdm.pandas()

train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')
submission = pd.read_csv('data/sample_submission.csv')

In [2]:
region = pd.read_excel('data/tfile.xlsx')
region['코드'] = region['코드'].apply(lambda x: int(str(x)[:3]))
region = region.groupby('코드').mean()
region

Unnamed: 0_level_0,위도,경도
코드,Unnamed: 1_level_1,Unnamed: 2_level_1
111,37.563457,126.983917
112,37.576807,127.045817
113,37.635206,127.008522
114,37.553908,126.918455
115,37.516894,126.887825
...,...,...
482,35.191351,128.521292
483,35.080022,128.794729
487,35.413145,128.399070
488,35.341528,127.989320


In [3]:
def get_distance(input):
    send_code, rec_code = input[0], input[1]
    if send_code==rec_code:
        return 0

    send_codnt = tuple(region.loc[send_code, :])
    rec_codnt = tuple(region.loc[rec_code, :])
    distance = math.sqrt((send_codnt[0]-rec_codnt[0])**2 + (send_codnt[1]-rec_codnt[1])**2)

    return distance

In [4]:
train_data, train_target = train.drop(['index', 'INVC_CONT'],axis=1), train['INVC_CONT']
test_data = test.drop(['index'],axis=1)

In [5]:
data = pd.concat([train_data, test_data])
data

Unnamed: 0,SEND_SPG_INNB,REC_SPG_INNB,DL_GD_LCLS_NM,DL_GD_MCLS_NM
0,1.129000e+15,5.011000e+15,패션의류,상의
1,1.135000e+15,5.011000e+15,생활/건강,반려동물
2,1.135000e+15,5.011000e+15,패션의류,기타패션의류
3,1.154500e+15,5.011000e+15,식품,농산물
4,1.165000e+15,5.011000e+15,식품,가공식품
...,...,...,...,...
4635,5.013000e+15,4.725000e+15,식품,농산물
4636,5.013000e+15,2.826000e+15,식품,농산물
4637,5.013000e+15,4.311100e+15,식품,농산물
4638,5.013000e+15,4.145000e+15,식품,농산물


In [6]:
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

In [7]:
data_one = pd.get_dummies(data)
# data_one = data.copy()

data_one['SEND_CODE'] = data_one['SEND_SPG_INNB'].apply(lambda x: int(str(x)[:3]))
data_one['REC_CODE'] = data_one['REC_SPG_INNB'].apply(lambda x: int(str(x)[:3]))
data_one = data_one.drop(['SEND_SPG_INNB','REC_SPG_INNB'], axis=1)
data_one['DISTANCE'] = data_one[['SEND_CODE','REC_CODE']].progress_apply(get_distance, axis=1)

cols = ['SEND_CODE', 'REC_CODE', 'DISTANCE']
for col in cols:
    mm = MinMaxScaler()
    data_one[col] = mm.fit_transform(np.array(data_one[col]).reshape(-1, 1))

train_data, test_data = data_one.iloc[:-len(test_data)], data_one.iloc[-len(test_data):]

100%|██████████| 36640/36640 [00:06<00:00, 5380.29it/s]


In [8]:
train_data

Unnamed: 0,DL_GD_LCLS_NM_디지털/가전,DL_GD_LCLS_NM_생활/건강,DL_GD_LCLS_NM_식품,DL_GD_LCLS_NM_여행/문화,DL_GD_LCLS_NM_패션의류,DL_GD_LCLS_NM_화장품/미용,DL_GD_MCLS_NM_가공식품,DL_GD_MCLS_NM_건강식품,DL_GD_MCLS_NM_건강용품,DL_GD_MCLS_NM_과자,...,DL_GD_MCLS_NM_생활용품,DL_GD_MCLS_NM_수산,DL_GD_MCLS_NM_스킨케어,DL_GD_MCLS_NM_음료,DL_GD_MCLS_NM_음반,DL_GD_MCLS_NM_주방용품,DL_GD_MCLS_NM_축산,SEND_CODE,REC_CODE,DISTANCE
0,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0.002564,1.000000,0.822638
1,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0.005128,1.000000,0.833088
2,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0.005128,1.000000,0.833088
3,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0.010256,1.000000,0.807545
4,0,0,1,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0.012821,1.000000,0.804634
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31995,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1.000000,0.389744,0.604867
31996,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1.000000,0.012821,0.804634
31997,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1.000000,0.769231,0.788436
31998,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1.000000,0.005128,0.833088


In [9]:
test_data

Unnamed: 0,DL_GD_LCLS_NM_디지털/가전,DL_GD_LCLS_NM_생활/건강,DL_GD_LCLS_NM_식품,DL_GD_LCLS_NM_여행/문화,DL_GD_LCLS_NM_패션의류,DL_GD_LCLS_NM_화장품/미용,DL_GD_MCLS_NM_가공식품,DL_GD_MCLS_NM_건강식품,DL_GD_MCLS_NM_건강용품,DL_GD_MCLS_NM_과자,...,DL_GD_MCLS_NM_생활용품,DL_GD_MCLS_NM_수산,DL_GD_MCLS_NM_스킨케어,DL_GD_MCLS_NM_음료,DL_GD_MCLS_NM_음반,DL_GD_MCLS_NM_주방용품,DL_GD_MCLS_NM_축산,SEND_CODE,REC_CODE,DISTANCE
0,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1.0,0.012821,0.804634
1,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1.0,0.010256,0.807545
2,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1.0,0.774359,0.806082
3,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1.0,0.797436,0.934364
4,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1.0,0.412821,0.629841
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4635,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1.0,0.925641,0.689460
4636,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1.0,0.438462,0.804688
4637,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1.0,0.820513,0.708307
4638,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1.0,0.776923,0.809247


In [10]:
from sklearn.model_selection import KFold
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error

def make_prediction(train, y, test, features, model, folds=5):

    kf = KFold(n_splits=folds, random_state=42, shuffle=True)
    
    x_train = train[features]
    x_test = test[features]

    y_preds = np.zeros((x_test.shape[0],))
    y_oof = np.zeros((x_train.shape[0],))
    
    score = 0

    features_importance= pd.DataFrame({'Feature':[], 'Importance':[]})
    best_fold = -1
    best_fold_score = 999

    for fold, (tr_idx, val_idx) in enumerate(kf.split(x_train, y)):
        print(f'Fold: {fold+1}')

        x_tr, x_val = x_train.loc[tr_idx, features], x_train.loc[val_idx, features]
        y_tr, y_val = y[tr_idx], y[val_idx]

        print(x_tr.shape, x_val.shape)
        model.fit(x_tr, y_tr, eval_set=[(x_tr, y_tr),(x_val, y_val)], early_stopping_rounds = 500, verbose=50)
        
        fold_importance_df= pd.DataFrame({'Feature':[], 'Importance':[]})
        fold_importance_df['Feature']= features
        fold_importance_df['Importance']= model.feature_importances_
        fold_importance_df["fold"] = fold + 1
        features_importance = pd.concat([features_importance, fold_importance_df], axis=0)
        
        y_pred_val = model.predict(x_val)
        y_oof[val_idx] = y_pred_val
        print(f"Fold {fold + 1} | rmse Score: {math.sqrt(mean_squared_error(y_val, y_pred_val))}")

        if math.sqrt(mean_squared_error(y_val, y_pred_val)) <= best_fold_score:
            best_fold = fold + 1
            best_fold_score = math.sqrt(mean_squared_error(y_val, y_pred_val))
            y_preds = model.predict(x_test)

        score += math.sqrt(mean_squared_error(y_val, y_pred_val)) / folds
        # y_preds += model.predict(x_test) / folds

        del x_tr, x_val, y_tr, y_val
        gc.collect()

    print(f"best fold = {best_fold} | {best_fold_score}")
    print(f"rmse Score = {score}")
    print(f"OOF rmse Score = {math.sqrt(mean_squared_error(y, y_oof))}")
    
#     return y_oof, y_preds, features_importance
    return y_preds

lgb_params = {
    'objective':'regression',
    'boosting_type':'gbdt',
    'metric':'mse',
    'n_jobs':-1,
    'learning_rate':0.005,
    'num_leaves': 2**9,
    'max_depth':-1,
    'colsample_bytree': 0.8,
    'subsample':0.8,
    'n_estimators':10000
}

xgb_params = {
    'n_estimators': 10000,
    'max_depth': 9,
    'learning_rate': 0.005,
    'booster': 'gbtree',
    'n_jobs': -1,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'eval_metric':mean_squared_error
}

In [11]:
models = [
    LGBMRegressor(**lgb_params, seed=42),
    XGBRegressor(**xgb_params, random_state=42),
]
# lgb_clf = LGBMClassifier(**lgb_params)
# xgb_clf = XGBClassifier(**xgb_params)
# y_oof_lgb, y_preds_lgb, fi_lgb = make_prediction(train_enc, y, test_enc, train_enc.columns, lgb_clf)
# y_oof_xgb, y_preds_xgb, fi_xgb = make_prediction(train_enc, y, test_enc, train_enc.columns, xgb_clf)

In [12]:
preds = []

for model in models:
    preds.append(make_prediction(train_data, train_target, test_data, train_data.columns, model))

Fold: 1
(25600, 29) (6400, 29)
Training until validation scores don't improve for 500 rounds
[50]	training's l2: 30.8415	valid_1's l2: 35.5819
[100]	training's l2: 29.9374	valid_1's l2: 34.8894
[150]	training's l2: 29.3304	valid_1's l2: 34.4779
[200]	training's l2: 28.8998	valid_1's l2: 34.2153
[250]	training's l2: 28.5953	valid_1's l2: 34.0721
[300]	training's l2: 28.3666	valid_1's l2: 33.985
[350]	training's l2: 28.193	valid_1's l2: 33.9464
[400]	training's l2: 28.0661	valid_1's l2: 33.9253
[450]	training's l2: 27.9537	valid_1's l2: 33.9212
[500]	training's l2: 27.8573	valid_1's l2: 33.9299
[550]	training's l2: 27.7746	valid_1's l2: 33.9368
[600]	training's l2: 27.6993	valid_1's l2: 33.9486
[650]	training's l2: 27.6381	valid_1's l2: 33.9645
[700]	training's l2: 27.5771	valid_1's l2: 33.976
[750]	training's l2: 27.5266	valid_1's l2: 33.994
[800]	training's l2: 27.4768	valid_1's l2: 34.0087
[850]	training's l2: 27.433	valid_1's l2: 34.0216
[900]	training's l2: 27.3922	valid_1's l2: 34.

In [13]:
preds

[array([4.30311523, 4.42681191, 4.27754341, ..., 4.3543904 , 4.28158712,
        4.42350162]),
 array([3.6159484, 3.7194867, 3.5822556, ..., 3.7215123, 3.5732675,
        3.7857187], dtype=float32)]

In [14]:
results = (preds[0] + preds[1]) / len(preds)

In [15]:
results.max()

43.447873594034334

In [16]:
results.min()

3.4979601731097585

In [17]:
submission['INVC_CONT'] = results
submission.to_csv('submission/xgb_lgb_v1.csv', index=False)