In [1]:
import numpy as np
import pandas as pd
from pandas import DataFrame
from pandas import Series
import seaborn as sns
from tqdm import tqdm
import plotly.io as pio
import plotly.express as px
import plotly.offline as pyo
import plotly.graph_objs as go
import matplotlib.font_manager as fm
font_path = './AppleGothic.ttf'
fontprop = fm.FontProperties(fname=font_path, size=18)
font_name = fm.FontProperties(fname=font_path).get_name()

In [2]:
import pandas as pd
pd.set_option("display.max_row", 100)
pd.set_option("display.max_column", 100)
import numpy as np
import os

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error as mse
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold
from lightgbm import LGBMRegressor

In [15]:
# 고객 및 주거래계좌 정보(CUS_INFO.CSV)
cus= pd.read_csv('cus_info_new.csv')

# 종목정보(IEM_INFO.CSV)
iem= pd.read_csv('iem_new2.csv', encoding='cp949')

# 국내주식 잔고이력(STK_BNC_HIST.CSV) _ 잔고정보 (국내 주식건에 한해) <- 여기세 종목 정보 들어있음
hist = pd.read_csv('stk_bnc_hist.csv', encoding='cp949')

# 국내주식 보유기간(STK_HLD_.CSV)
test = pd.read_csv('stk_hld_test.csv', encoding='cp949')
train = pd.read_csv('stk_hld_train.csv', encoding='cp949')

# submission file
submission = pd.read_csv('sample_submission.csv', encoding='cp949')

In [20]:
iem.columns=['iem_cd','iem_krl_nm','btp_cfc_cd','mkt_pr_tal_scl_tp_cd','stk_dit_cd']

In [21]:
# 앞서 언급한 것처럼 베이스라인에서는 "hist_d" column을 임의로 생성하여 모델을 학습시키도록 하겠습니다.
# 베이스라인에서는 "hold_d"값, 즉 주식 보유기간의 0.6배에 해당하는 기간을 임의로 설정하여 "hist_d"를 생성하였습니다.
# 결국 모델은 "hist_d"만큼 주식을 보유 했을때의 "hold_d"를 예측하게 될 것입니다.

train["hist_d"] = train["hold_d"]*0.6
train.hist_d = np.trunc(train["hist_d"])

In [22]:
train.head(3)

Unnamed: 0,act_id,iem_cd,byn_dt,hold_d,hist_d
0,0ad104dbed99be0cd858aa772765ddedade554601a981b...,A006360,20180726,11,6.0
1,0ad104dbed99be0cd858aa772765ddedade554601a981b...,A005930,20180131,80,48.0
2,0ad104dbed99be0cd858aa772765ddedade554601a981b...,A005070,20180517,5,3.0


In [23]:
# train과 test에 고객정보(cus_info)와 주식정보(iem_info)를 추가하겠습니다.

train_data = pd.merge(train, cus, how = "left", on = ["act_id"])
train_data = pd.merge(train_data, iem, how = "left", on = ["iem_cd"])

test_data = pd.merge(test, cus, how = "left", on = ["act_id"])
test_data = pd.merge(test_data, iem, how = "left", on = ["iem_cd"])

In [24]:
# train_data에서 Y값을 추출한 후 hold_d column을 지워주겠습니다.

train_label = train_data["hold_d"]
train_data.drop(["hold_d"], axis = 1, inplace = True)

In [25]:
# 추가적으로 약간의 전처리를 통해 train data와 test data를 구성하겠습니다.

hist["stk_p"] = hist["tot_aet_amt"] / hist["bnc_qty"]
hist = hist.fillna(0)

train_data = pd.merge(train_data, hist, how = "left", on = ["act_id", "iem_cd"])
train_data = train_data[(train_data["byn_dt"] == train_data["bse_dt"])]
train_data.reset_index(drop = True, inplace = True)

test_data = pd.merge(test_data, hist, how = "left", on = ["act_id", "iem_cd"])
test_data = test_data[(test_data["byn_dt"] == test_data["bse_dt"])]
test_data.reset_index(drop = True, inplace = True)

train_data = train_data.drop(["act_id", "iem_cd", "byn_dt", "bse_dt"], axis = 1)
test_data = test_data.drop(["act_id", "iem_cd", "byn_dt", "submit_id", "hold_d", "bse_dt"], axis = 1)

L_encoder = LabelEncoder()
L_encoder.fit(iem["iem_krl_nm"])
train_data["iem_krl_nm"] = L_encoder.transform(train_data["iem_krl_nm"])
test_data["iem_krl_nm"] = L_encoder.transform(test_data["iem_krl_nm"])

In [26]:
train_data.head(3)

Unnamed: 0,hist_d,sex_dit_cd,cus_age_stn_cd,ivs_icn_cd,cus_aet_stn_cd,mrz_pdt_tp_sgm_cd,lsg_sgm_cd,tco_cus_grd_cd,tot_ivs_te_sgm_cd,mrz_btp_dit_cd,iem_krl_nm,btp_cfc_cd,mkt_pr_tal_scl_tp_cd,stk_dit_cd,bnc_qty,tot_aet_amt,stk_par_pr,stk_p
0,6.0,1,9,3,2,2,9,5,5,8,101,1,1,1,274.0,11782000.0,5000.0,43000.0
1,48.0,1,9,3,2,2,9,5,5,8,1361,9,1,1,2.0,4990000.0,5000.0,2495000.0
2,3.0,1,9,3,2,2,9,5,5,8,2530,12,2,3,786.0,14619600.0,1000.0,18600.0


In [27]:
test_data.head(3)

Unnamed: 0,hist_d,sex_dit_cd,cus_age_stn_cd,ivs_icn_cd,cus_aet_stn_cd,mrz_pdt_tp_sgm_cd,lsg_sgm_cd,tco_cus_grd_cd,tot_ivs_te_sgm_cd,mrz_btp_dit_cd,iem_krl_nm,btp_cfc_cd,mkt_pr_tal_scl_tp_cd,stk_dit_cd,bnc_qty,tot_aet_amt,stk_par_pr,stk_p
0,153,1,9,3,2,2,9,5,5,8,418,4,1,1,300.0,3945000.0,5000.0,13150.0
1,335,1,9,3,2,2,9,5,5,8,2230,10,3,4,198.0,2524500.0,500.0,12750.0
2,139,1,9,3,2,2,9,5,5,8,1515,13,2,4,138.0,4291800.0,500.0,31100.0


In [28]:
train_data.reset_index(drop = True, inplace=True)
train_label.reset_index(drop = True, inplace=True)

In [29]:
models = []

folds = KFold(n_splits=10)
for train_idx, val_idx in folds.split(train_data):
    
    train_x = train_data.iloc[train_idx, :]
    train_y = train_label[train_idx]
    val_x = train_data.iloc[val_idx, :]
    val_y = train_label[val_idx]
    
    model = LGBMRegressor(objective= "regression",
                          max_depth= 5,
                          n_estimators= 2000,
                          learning_rate= 0.01,
                          num_leaves = 31)
    
    model.fit(train_x, train_y,
              eval_set=[(val_x, val_y)],
              eval_metric=["rmse"],
              early_stopping_rounds=300,
              verbose=500)
    
    models.append(model)

Training until validation scores don't improve for 300 rounds
[500]	valid_0's rmse: 4.22281	valid_0's l2: 17.8321
[1000]	valid_0's rmse: 4.07749	valid_0's l2: 16.6259
[1500]	valid_0's rmse: 4.04536	valid_0's l2: 16.3649
[2000]	valid_0's rmse: 4.02955	valid_0's l2: 16.2373
Did not meet early stopping. Best iteration is:
[2000]	valid_0's rmse: 4.02955	valid_0's l2: 16.2373
Training until validation scores don't improve for 300 rounds
[500]	valid_0's rmse: 1.29387	valid_0's l2: 1.67409
[1000]	valid_0's rmse: 1.29382	valid_0's l2: 1.67396
Early stopping, best iteration is:
[737]	valid_0's rmse: 1.19037	valid_0's l2: 1.41698
Training until validation scores don't improve for 300 rounds
[500]	valid_0's rmse: 0.68268	valid_0's l2: 0.466052
[1000]	valid_0's rmse: 0.477423	valid_0's l2: 0.227933
Early stopping, best iteration is:
[916]	valid_0's rmse: 0.476031	valid_0's l2: 0.226606
Training until validation scores don't improve for 300 rounds
[500]	valid_0's rmse: 0.574012	valid_0's l2: 0.3294

In [30]:
result = []
for i in models:
    result.append(i.predict(test_data))
predict = np.mean(result, axis = 0)

In [31]:
predict

array([254.86669553, 557.70346413, 232.32806352, ..., 878.80171913,
        18.99795723,   7.46440248])

In [18]:
submission["hold_d"] = np.round(predict)

In [19]:
submission.to_csv('sub3_baseline_lgbm.csv', header=True, index=False, encoding='cp949')

## ㅇ

In [14]:
# 고객 및 주거래계좌 정보(CUS_INFO.CSV)
cus= pd.read_csv('cus_info_new.csv')

# 종목정보(IEM_INFO.CSV)
iem= pd.read_csv('iem_new2.csv', encoding='cp949')

# 국내주식 잔고이력(STK_BNC_HIST.CSV) _ 잔고정보 (국내 주식건에 한해) <- 여기세 종목 정보 들어있음
hist = pd.read_csv('stk_bnc_hist.csv', encoding='cp949')

# 국내주식 보유기간(STK_HLD_.CSV)
train_6 = pd.read_csv('train_6개버젼.csv', encoding='cp949')
test_6 = pd.read_csv('test_6개버젼.csv', encoding='cp949')

# submission file
submission = pd.read_csv('sample_submission.csv', encoding='cp949')

In [15]:
cus.columns=["계좌 ID","성별","연령대","투자성향","자산구간","주거래상품군","Life Style","서비스 등급","총 투자기간","주거래업종구분"]
hist.columns= ["계좌 ID","기준일자","종목코드","잔고수량","잔고금액","주당 액면가"]

In [16]:
hist['기준일자'] = hist['기준일자'].astype('str')
hist['기준일자'] = hist['기준일자'].apply(lambda x: str(x[0:4]) + '-' + str(x[4:6]) + '-'+ str(x[6:8]))

In [17]:
train= train_6[['계좌 ID','종목코드','매수일자','보유기간(일)', 'oil_14_0.8','copper_14_0.8','base_14_0.8','exchange_14_0.8','Rank']]

In [18]:
test = test_6[['계좌 ID', '종목코드', '매수일자', '과거 보유일', '제출ID', '보유기간(일)', 'oil_14_0.8','copper_14_0.8','base_14_0.8','exchange_14_0.8','Rank']]

In [19]:
# 앞서 언급한 것처럼 베이스라인에서는 "hist_d" column을 임의로 생성하여 모델을 학습시키도록 하겠습니다.
# 베이스라인에서는 "hold_d"값, 즉 주식 보유기간의 0.6배에 해당하는 기간을 임의로 설정하여 "hist_d"를 생성하였습니다.
# 결국 모델은 "hist_d"만큼 주식을 보유 했을때의 "hold_d"를 예측하게 될 것입니다.

train["과거 보유일"] = train["보유기간(일)"]*0.6
train["과거 보유일"] = np.trunc(train["과거 보유일"])



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [20]:
train.head(3)

Unnamed: 0,계좌 ID,종목코드,매수일자,보유기간(일),oil_14_0.8,copper_14_0.8,base_14_0.8,exchange_14_0.8,Rank,과거 보유일
0,0ad104dbed99be0cd858aa772765ddedade554601a981b...,A006360,2018-07-26,11,69.14,6266.47,1.5,1134.08,75.0,6.0
1,f431b2a7651bccfc7ce8f294fdacdf0b7b31db734be701...,A023590,2018-07-26,5,69.14,6266.47,1.5,1134.08,201.0,3.0
2,43a0eb89f16d3a46767794dda9c31cd15e1ff9daf33332...,A010820,2018-07-26,3,69.14,6266.47,1.5,1134.08,758.0,1.0


In [21]:
# train과 test에 고객정보(cus_info)와 주식정보(iem_info)를 추가하겠습니다.

train_data = pd.merge(train, cus, how = "left", on = ["계좌 ID"])
train_data = pd.merge(train_data, iem, how = "left", on = ["종목코드"])

test_data = pd.merge(test, cus, how = "left", on = ["계좌 ID"])
test_data = pd.merge(test_data, iem, how = "left", on = ["종목코드"])

In [22]:
# train_data에서 Y값을 추출한 후 hold_d column을 지워주겠습니다.

train_label = train_data["보유기간(일)"]
train_data.drop(["보유기간(일)"], axis = 1, inplace = True)

In [23]:
# 추가적으로 약간의 전처리를 통해 train data와 test data를 구성하겠습니다.

hist["stk_p"] = hist["잔고금액"] / hist["잔고수량"]
hist = hist.fillna(0)

train_data = pd.merge(train_data, hist, how = "left", on = ["계좌 ID", "종목코드"])
train_data = train_data[(train_data["매수일자"] == train_data["기준일자"])]
train_data.reset_index(drop = True, inplace = True)

test_data = pd.merge(test_data, hist, how = "left", on = ["계좌 ID", "종목코드"])
test_data = test_data[(test_data["매수일자"] == test_data["기준일자"])]
test_data.reset_index(drop = True, inplace = True)

train_data = train_data.drop(["계좌 ID", "종목코드", "매수일자", "기준일자"], axis = 1)
test_data = test_data.drop(["계좌 ID", "종목코드", "매수일자", "제출ID","보유기간(일)", "기준일자"], axis = 1)

from sklearn.preprocessing import LabelEncoder
L_encoder = LabelEncoder()
L_encoder.fit(iem["종목한글명"])
train_data["종목한글명"] = L_encoder.transform(train_data["종목한글명"])
test_data["종목한글명"] = L_encoder.transform(test_data["종목한글명"])

In [24]:
train_data.head(3)

Unnamed: 0,oil_14_0.8,copper_14_0.8,base_14_0.8,exchange_14_0.8,Rank,과거 보유일,성별,연령대,투자성향,자산구간,...,총 투자기간,주거래업종구분,종목한글명,종목업종,시가총액 규모유형,시장구분,잔고수량,잔고금액,주당 액면가,stk_p
0,69.14,6266.47,1.5,1134.08,75.0,6.0,1,9,3,2,...,5,8,101,1,1,1,274.0,11782000.0,5000.0,43000.0
1,69.14,6266.47,1.5,1134.08,201.0,3.0,1,9,5,5,...,6,15,851,5,2,3,50.0,1182500.0,500.0,23650.0
2,69.14,6266.47,1.5,1134.08,758.0,1.0,1,6,4,5,...,6,10,2701,3,3,3,1659.0,7050750.0,500.0,4250.0


In [25]:
test_data.head(3)

Unnamed: 0,과거 보유일,oil_14_0.8,copper_14_0.8,base_14_0.8,exchange_14_0.8,Rank,성별,연령대,투자성향,자산구간,...,총 투자기간,주거래업종구분,종목한글명,종목업종,시가총액 규모유형,시장구분,잔고수량,잔고금액,주당 액면가,stk_p
0,153,33.78,5401.56,0.75,1227.4,38.0,1,9,3,2,...,5,8,418,4,1,1,300.0,3945000.0,5000.0,13150.0
1,153,33.78,5401.56,0.75,1227.4,199.0,2,7,3,5,...,5,13,455,10,1,2,550.0,20817500.0,500.0,37850.0
2,153,33.78,5401.56,0.75,1227.4,1121.0,1,6,4,5,...,3,16,762,13,3,4,359.0,6085050.0,500.0,16950.0


In [26]:
train_data.reset_index(drop = True, inplace=True)
train_label.reset_index(drop = True, inplace=True)

In [28]:
from sklearn.metrics import mean_squared_error as mse
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold
from lightgbm import LGBMRegressor

In [29]:
models = []

folds = KFold(n_splits=10)
for train_idx, val_idx in folds.split(train_data):
    
    train_x = train_data.iloc[train_idx, :]
    train_y = train_label[train_idx]
    val_x = train_data.iloc[val_idx, :]
    val_y = train_label[val_idx]
    
    model = LGBMRegressor(objective= "regression",
                          max_depth= 5,
                          n_estimators= 2000,
                          learning_rate= 0.01,
                          num_leaves = 31)
    
    model.fit(train_x, train_y,
              eval_set=[(val_x, val_y)],
              eval_metric=["rmse"],
              early_stopping_rounds=300,
              verbose=500)
    
    models.append(model)

Training until validation scores don't improve for 300 rounds
[500]	valid_0's rmse: 2.72936	valid_0's l2: 7.44941
[1000]	valid_0's rmse: 2.50354	valid_0's l2: 6.26771
[1500]	valid_0's rmse: 2.46921	valid_0's l2: 6.09702
[2000]	valid_0's rmse: 2.44738	valid_0's l2: 5.98969
Did not meet early stopping. Best iteration is:
[1996]	valid_0's rmse: 2.44723	valid_0's l2: 5.98893
Training until validation scores don't improve for 300 rounds
[500]	valid_0's rmse: 0.748669	valid_0's l2: 0.560506
[1000]	valid_0's rmse: 0.683412	valid_0's l2: 0.467053
[1500]	valid_0's rmse: 0.682412	valid_0's l2: 0.465686
Early stopping, best iteration is:
[1597]	valid_0's rmse: 0.682386	valid_0's l2: 0.46565
Training until validation scores don't improve for 300 rounds
[500]	valid_0's rmse: 0.959655	valid_0's l2: 0.920938
Early stopping, best iteration is:
[552]	valid_0's rmse: 0.933086	valid_0's l2: 0.87065
Training until validation scores don't improve for 300 rounds
[500]	valid_0's rmse: 0.441406	valid_0's l2: 

In [30]:
result = []
for i in models:
    result.append(i.predict(test_data))
predict = np.mean(result, axis = 0)

In [31]:
predict

array([ 63.99349322, 332.26148779, 895.36609022, ..., 926.42711408,
       196.06732705, 913.98516365])