# DACON COIN TRADER

### 암호화폐 가격 예측

#### 10가지 종류의 암호화폐 가격을 예측하여 가장 수익률이 높은 모델 만들기

#### 데이터셋

- **train_x_df** : 임의의 시점부터 2020년 12월 31일까지 10가지 종류의 암호화폐 분단위 정보를 가공한 데이터

- **train_y_df** : 임의의 시점부터 2020년 12월 31일까지 10가지 종류의 암호화폐 분단위 가격정보를 가공한 데이터

- **test_x_df** : 2021년 1월 1일부터 임의의 시점까지 10가지 종류의 암호화폐 분단위 가격정보를 가공한 데이터


- 학습용 데이터는 10가지 종류의 코인을 포함하는 7362가지의 sample이 담겨있으며 train_x_df는 입력 23시간 동안의 분단위 데이터, train_y_df는 출력 2시간 동안의 분단위 데이터이다.

- test_x_df는 train_x_df와 동일한 구성을 갖는 529가지의 sample을 의미하며, 각 sample_id에 대해 매수량(buy_quantity)과 매도 시점(sell_time)을 결정해야 한다.

### 파일 불러오기

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Folder that holds the competition CSV files (kept as a plain string).
data_path = 'D:/portfolio/open'

# Load the training inputs, the training targets and the test inputs.
train_x_df = pd.read_csv(f'{data_path}/train_x_df.csv')
train_y_df = pd.read_csv(f'{data_path}/train_y_df.csv')
test_x_df = pd.read_csv(f'{data_path}/test_x_df.csv')

In [3]:
# Preview the first rows of the training input frame.
train_x_df.head()

Unnamed: 0,sample_id,time,coin_index,open,high,low,close,volume,quote_av,trades,tb_base_av,tb_quote_av
0,0,0,7,1.010004,1.010004,1.009612,1.010004,838287.5,43160.632812,451.157288,732683.4,37725.183594
1,0,1,7,1.009808,1.009808,1.009808,1.009808,162242.0,8352.220703,39.231071,0.0,0.0
2,0,2,7,1.009808,1.0102,1.009808,1.0102,16649.67,857.377808,58.846603,16649.67,857.377808
3,0,3,7,1.0102,1.011181,1.0102,1.011181,2586971.0,133310.34375,431.541779,2189147.0,112811.046875
4,0,4,7,1.010985,1.010985,1.0102,1.0102,1129996.0,58216.867188,176.53981,0.0,0.0


In [4]:
# Preview the first rows of the training target frame.
train_y_df.head()

Unnamed: 0,sample_id,time,coin_index,open,high,low,close,volume,quote_av,trades,tb_base_av,tb_quote_av
0,0,0,7,1.000392,1.000588,1.000392,1.000588,830511.9,42356.179688,137.308746,830511.9,42356.179688
1,0,1,7,1.000588,1.001177,1.000392,1.001177,532006.6,27140.638672,294.233032,488273.8,24909.861328
2,0,2,7,1.001177,1.001177,1.001177,1.001177,511377.0,26100.681641,58.846603,511377.0,26100.681641
3,0,3,7,1.001177,1.001373,1.001177,1.001373,1134853.0,57929.410156,137.308746,1095514.0,55921.15625
4,0,4,7,1.000981,1.000981,0.999804,1.000196,5801173.0,295872.34375,666.928162,991123.9,50528.589844


In [5]:
# Preview the first rows of the test input frame.
test_x_df.head()

Unnamed: 0,sample_id,time,coin_index,open,high,low,close,volume,quote_av,trades,tb_base_av,tb_quote_av
0,0,0,7,1.003541,1.00711,1.003541,1.00609,486928.96875,182543.6875,332.67157,287449.96875,107758.375
1,0,1,7,1.006117,1.006331,1.004829,1.005634,824485.375,309019.75,407.790955,560290.25,210005.125
2,0,2,7,1.005929,1.005983,1.004266,1.005366,478614.5625,179301.84375,276.332031,214270.265625,80283.554688
3,0,3,7,1.005044,1.005956,1.00491,1.005795,334366.84375,125305.117188,166.335785,56401.246094,21135.490234
4,0,4,7,1.00558,1.006466,1.005285,1.005929,560916.6875,210316.796875,303.16037,138711.703125,52010.964844


### 변수 설명

|<center>변수명|<center>변수설명|
|:------:|:-------:|
|<center>**sample_id**| <center>개별 샘플의 인덱스|
|<center>**time**|<center>x_df는 0분 ~ 1379분, y_df는 0분 ~ 119분의 값. 동일한 샘플 내 시간 정보|
|<center>**coin_index**|<center>10가지 종류의 코인에 대한 비식별화 인덱스 (0 ~ 9)|
|<center>**open**|<center>open price|
|<center>**high**|<center>high price|
|<center>**low**|<center>low price|
|<center>**close**|<center>close price|
|<center>**volume**|<center>거래량|
|<center>**quote_av**|<center>quote asset volume|
|<center>**trades**|<center>거래 건 수|
|<center>**tb_base_av**|<center>taker buy base asset volume|
|<center>**tb_quote_av**|<center>taker buy quote asset volume|

In [6]:
# Print the row count, column dtypes and memory usage of the training inputs.
train_x_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10159560 entries, 0 to 10159559
Data columns (total 12 columns):
sample_id      int64
time           int64
coin_index     int64
open           float64
high           float64
low            float64
close          float64
volume         float64
quote_av       float64
trades         float64
tb_base_av     float64
tb_quote_av    float64
dtypes: float64(9), int64(3)
memory usage: 930.1 MB


In [7]:
# Print the row count, column dtypes and memory usage of the training targets.
train_y_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 883440 entries, 0 to 883439
Data columns (total 12 columns):
sample_id      883440 non-null int64
time           883440 non-null int64
coin_index     883440 non-null int64
open           883440 non-null float64
high           883440 non-null float64
low            883440 non-null float64
close          883440 non-null float64
volume         883440 non-null float64
quote_av       883440 non-null float64
trades         883440 non-null float64
tb_base_av     883440 non-null float64
tb_quote_av    883440 non-null float64
dtypes: float64(9), int64(3)
memory usage: 80.9 MB


In [8]:
# Print the row count, column dtypes and memory usage of the test inputs.
test_x_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 730020 entries, 0 to 730019
Data columns (total 12 columns):
sample_id      730020 non-null int64
time           730020 non-null int64
coin_index     730020 non-null int64
open           730020 non-null float64
high           730020 non-null float64
low            730020 non-null float64
close          730020 non-null float64
volume         730020 non-null float64
quote_av       730020 non-null float64
trades         730020 non-null float64
tb_base_av     730020 non-null float64
tb_quote_av    730020 non-null float64
dtypes: float64(9), int64(3)
memory usage: 66.8 MB


In [None]:
# Per-column summary statistics (count, mean, std, min, quartiles, max)
# of the training inputs.
train_x_df.describe()

Unnamed: 0,sample_id,time,coin_index,open,high,low,close,volume,quote_av,trades,tb_base_av,tb_quote_av
count,10159560.0,10159560.0,10159560.0,10159560.0,10159560.0,10159560.0,10159560.0,10159560.0,10159560.0,10159560.0,10159560.0,10159560.0
mean,3680.5,689.5,5.495925,1.000988,1.001753,1.000196,1.00098,765786.2,42689.07,117.9676,364791.1,21061.37
std,2125.226,398.3716,2.940005,0.0389512,0.03900216,0.03889973,0.03892255,9341042.0,214642.0,493.6206,5268894.0,111790.1
min,0.0,0.0,0.0,0.08903663,0.5540227,0.08903663,0.5540227,0.0,0.0,0.0,0.0,0.0
25%,1840.0,344.75,4.0,0.987131,0.987835,0.9863969,0.9871426,0.05047751,31.26231,0.0771516,0.005671815,10.66213
50%,3680.5,689.5,6.0,1.0,1.000556,0.99945,1.0,3.23124,322.0208,0.7796185,1.039115,115.2704
75%,5521.0,1034.25,8.0,1.012693,1.013415,1.01195,1.012669,42584.15,12544.63,56.41976,9016.594,4863.686
max,7361.0,1379.0,9.0,2.066901,4.451831,2.066901,2.066901,7398036000.0,37054970.0,62578.62,7340513000.0,23200230.0


In [None]:
# Per-column summary statistics of the training targets.
train_y_df.describe()

In [None]:
# Per-column summary statistics of the test inputs.
test_x_df.describe()

In [None]:
# (rows, columns) of the training input frame.
train_x_df.shape

### 암호화폐별 분류

- train_x 데이터: time은 0~1379분이며, sample_id 고유값은 7362개

- train_y 데이터: time은 0~119분이며, sample_id 고유값은 7362개

- test 데이터: time은 0~1379분이며, sample_id 고유값은 529개

In [None]:
def change_array(x):
    """Convert a long-format frame into a 3-D ``(sample, time, feature)`` array.

    Parameters
    ----------
    x : pandas.DataFrame
        One row per ``(sample_id, time)`` pair. The first two columns are
        dropped by position (they are expected to be ``sample_id`` and
        ``time``); every remaining column becomes a feature.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_samples, n_times, n_features)`` whose rows are
        ordered by ``sample_id`` and then by ``time``.

    Raises
    ------
    ValueError
        If the number of rows differs from ``n_samples * n_times``, i.e. the
        frame cannot be a complete sample-by-time grid.
    """
    n_samples = x.sample_id.nunique()
    n_times = x.time.nunique()
    if len(x) != n_samples * n_times:
        raise ValueError(
            "cannot reshape %d rows into %d samples x %d time steps"
            % (len(x), n_samples, n_times)
        )

    values = x.iloc[:, 2:].values

    # A plain reshape is only correct when rows are already grouped by sample
    # and ordered by time. lexsort is stable, so already-ordered input yields
    # the identity permutation and the reindexing copy is skipped.
    order = np.lexsort((x.time.values, x.sample_id.values))
    if not np.array_equal(order, np.arange(len(x))):
        values = values[order]

    return values.reshape([n_samples, n_times, values.shape[1]])

In [None]:
# Turn each long-format frame into a 3-D array via change_array.
train_x_array = change_array(x=train_x_df)  # training inputs
train_y_array = change_array(x=train_y_df)  # training targets
test_x_array = change_array(x=test_x_df)    # test inputs

암호화폐 10개의 시간(컬럼)별 데이터 정리<br>
(행(sample_idx), 열(시간),분단위 10가지 정보)

In [None]:
# Show the shape of each converted array, one per line.
print(
    train_x_array.shape,
    train_y_array.shape,
    test_x_array.shape,
    sep='\n',
)

In [None]:
# Plot feature column 1 of the first training sample over its input window.
plt.plot(train_x_array[0][:, 1])
plt.show()

In [None]:
# Plot feature column 1 of the first sample: the input window first, then the
# target window shifted right so it continues where the input window ends.
plt.plot(train_x_array[0][:, 1])
plt.plot(
    train_x_array.shape[1] + np.arange(train_y_array.shape[1]),
    train_y_array[0][:, 1],
)
plt.show()