In [1]:
# Mount Google Drive at /gdrive so the dataset and the per-sample pickles
# referenced by the later cells (paths under "/gdrive/My Drive/...") are
# reachable. force_remount=True is passed so the mount is redone on every run.
from google.colab import drive
drive.mount('/gdrive', force_remount=True)

Mounted at /gdrive


In [2]:
import pandas as pd
import numpy as np
import pickle
import os

# [Regression] https://sosoeasy.tistory.com/389
from sklearn.linear_model import LinearRegression, Ridge

# [머신러닝 기법] https://sosoeasy.tistory.com/390
# bagging
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

# boosting
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

# [deep learning] https://sosoeasy.tistory.com/403
from keras.models import Sequential, Model, load_model
from keras.layers import Input, Dense, Activation, Flatten, Dropout
from keras.layers import LSTM, SimpleRNN, GRU

In [3]:
class Ensemble:
    """Per-sample autoregressive forecasters for the ``open`` price column.

    Each public ``brave_*`` method takes one sample (a 1-D sequence of prices),
    fits a model that maps ``sequence`` consecutive values to the next value,
    and then rolls that model forward ``self.predictSize`` steps, feeding each
    prediction back in as the newest input.

    Every ``brave_*`` method returns ``(maxPrice, maxMinute, predL)``:

    * ``predL``     - list with the ``self.predictSize`` predicted values.
    * ``maxPrice``  - largest predicted value that is strictly greater than 1,
                      or ``1`` when no prediction exceeds 1.
    * ``maxMinute`` - 0-based step at which ``maxPrice`` was predicted, or
                      ``-1`` when no prediction exceeds 1.
    """

    def __init__(self,path):
        """Read the CSV at ``path`` and derive the sample bookkeeping fields."""
        # Load the data from the given path.
        self.csv_test_x=pd.read_csv(path)
        display(self.csv_test_x)

        # Bookkeeping attributes.
        self.sampleIds=self.csv_test_x["sample_id"].unique() # distinct sample ids, in order of appearance
        self.sampleSize=len(self.sampleIds) # number of samples
        if self.sampleSize:
            # Rows belonging to the first sample = minutes available per sample.
            first_sample_rows=self.csv_test_x["sample_id"]==self.sampleIds[0]
            self.minuteSize=int(first_sample_rows.sum())
        else:
            # Empty file: nothing to index into (previously an IndexError).
            self.minuteSize=0
        self.predictSize=120 # number of future steps (minutes) to predict

        print("샘플사이즈 :",self.sampleSize)
        print("분길이 : ",self.minuteSize)

    def preprocessing(self):
        """Return ``{sample_id: np.ndarray of that sample's "open" values}``.

        Keys follow the order of ``self.sampleIds``; values keep row order.
        """
        # One grouped pass instead of re-filtering the whole frame per sample.
        grouped=self.csv_test_x.groupby("sample_id", sort=False)["open"]
        by_id={key: np.array(values) for key, values in grouped}

        myD={}
        for sampleId in self.sampleIds:
            if sampleId in by_id:
                myD[sampleId]=by_id[sampleId]
            else:
                # Ids that groupby skips (e.g. NaN keys) fall back to the
                # plain boolean filter.
                rows=self.csv_test_x["sample_id"]==sampleId
                myD[sampleId]=np.array(self.csv_test_x.loc[rows]["open"])
        return myD

    # ------------------------------------------------------------------
    # Shared helpers
    # ------------------------------------------------------------------
    def _make_windows(self, sample, sequence=7):
        """Turn a 1-D series into supervised sliding-window arrays.

        Only the first ``self.minuteSize`` values of ``sample`` are used.

        Returns:
            x: array of shape (n, sequence); row j is ``series[j:j+sequence]``.
            y: array of shape (n,); ``y[j]`` is ``series[j+sequence]``.
            last_window: the final ``sequence`` values of the series, i.e. the
                input for the first out-of-sample prediction.

        Raises:
            ValueError: if fewer than ``sequence + 1`` values are available.
        """
        series=np.asarray(sample, dtype=float).ravel()[:self.minuteSize]
        n_rows=len(series)-sequence
        if n_rows<1:
            raise ValueError(
                f"sample needs more than {sequence} values, got {len(series)}")
        # offsets[j, k] = j + k, so series[offsets][j] == series[j:j+sequence].
        offsets=np.arange(n_rows)[:, None]+np.arange(sequence)[None, :]
        return series[offsets], series[sequence:], series[-sequence:].copy()

    def _forecast(self, predict_next, last_window):
        """Roll a one-step predictor forward ``self.predictSize`` steps.

        Args:
            predict_next: callable taking the current 1-D window (np.ndarray)
                and returning the next value as a scalar.
            last_window: 1-D window used for the first prediction.

        Returns:
            ``(maxPrice, maxMinute, predL)`` as described on the class.
        """
        window=np.array(last_window, dtype=float)
        predL=[]
        maxPrice=1     # stays 1 if no prediction ever exceeds 1
        maxMinute=-1   # stays -1 in that same case

        for minute in range(self.predictSize):
            y_pred=float(predict_next(window))
            predL.append(y_pred)
            if maxPrice<y_pred:
                maxPrice=y_pred
                maxMinute=minute
            # Slide the window: drop the oldest value, append the prediction.
            window=np.append(window[1:], y_pred)

        return maxPrice, maxMinute, predL

    def _forecast_tabular(self, model, last_window):
        """Forecast with a fitted estimator whose ``predict`` takes (1, sequence) input."""
        return self._forecast(
            lambda window: np.ravel(model.predict(window.reshape(1, -1)))[0],
            last_window)

    def _holdout_mae(self, model, x, y, train_fraction=0.7):
        """Fit ``model`` on the first ``train_fraction`` of rows; return MAE on the rest."""
        train_size=int(len(x)*train_fraction)
        model.fit(x[:train_size], y[:train_size])
        predicted=np.ravel(model.predict(x[train_size:]))
        return np.abs(predicted-y[train_size:]).mean()

    def _fit_recurrent_and_forecast(self, sample, layers, sequence=7,
                                    epoch=200, batch_size=14, verbose=0):
        """Stack ``layers`` into a Sequential model, train it, and forecast.

        The model is trained with the adam optimizer and mean-squared-error
        loss on inputs of shape (n, sequence, 1) and targets of shape (n, 1).
        """
        x, y, last_window=self._make_windows(sample, sequence)
        x_train=x.reshape(x.shape[0], sequence, 1)
        y_train=y.reshape(-1, 1)

        model=Sequential()
        for layer in layers:
            model.add(layer)
        model.compile(optimizer='adam', loss='mean_squared_error')
        model.fit(x_train, y_train,
                  batch_size=batch_size,
                  epochs=epoch,
                  verbose=verbose)

        # predict() returns a (1, 1) array; take the scalar so maxPrice is a
        # plain number like in the non-recurrent models.
        return self._forecast(
            lambda window: np.ravel(
                model.predict(window.reshape(1, sequence, 1), verbose=verbose))[0],
            last_window)

    # ------------------------------------------------------------------
    # Linear models
    # ------------------------------------------------------------------
    def brave_ridge(self,sample):
        """Ridge regression (alpha=1) on 7-step windows."""
        # transformation (https://sosoeasy.tistory.com/404?category=891830)
        x, y, last_window=self._make_windows(sample, 7)
        # `normalize=False` is no longer passed: it was the default behaviour
        # and the argument is rejected by newer scikit-learn releases.
        my_ridge=Ridge(alpha=1, random_state=123, fit_intercept=True).fit(x, y)
        return self._forecast_tabular(my_ridge, last_window)

    def brave_LinearRegression(self,sample):
        """Ordinary least-squares regression on 7-step windows."""
        x, y, last_window=self._make_windows(sample, 7)
        my_LinearRegression=LinearRegression().fit(x, y)
        return self._forecast_tabular(my_LinearRegression, last_window)

    def brave_ridge_linear_val(self,sample):
        """Pick Ridge or LinearRegression by hold-out MAE, then refit on all rows.

        The first 70% of the windows are used for training and the remaining
        30% for validation. LinearRegression is chosen only when its MAE is
        strictly lower than Ridge's.
        """
        x, y, last_window=self._make_windows(sample, 7)

        linear_MAE=self._holdout_mae(LinearRegression(), x, y)
        ridge_MAE=self._holdout_mae(Ridge(), x, y)

        # Model selection, followed by a refit on the full data.
        if linear_MAE<ridge_MAE:
            print("linear선택!")
            picked_model=LinearRegression().fit(x, y)
        else:
            print("ridge선택!")
            picked_model=Ridge().fit(x, y)

        return self._forecast_tabular(picked_model, last_window)

    def brave_ridge_alpha_val(self,sample):
        """Ridge with alpha chosen from [0.1, 0.5, 1, 10] by hold-out MAE."""
        x, y, last_window=self._make_windows(sample, 7)

        alphas=[0.1,0.5,1,10]
        # Start from +inf so some candidate is always selected; the previous
        # threshold of 1 could leave an invalid alpha of -1 in place.
        min_MAE=float("inf")
        my_alpha=alphas[0]  # fallback if every MAE is NaN (empty validation set)
        for a in alphas:
            ridge_MAE=self._holdout_mae(
                Ridge(alpha=a, random_state=123, fit_intercept=True), x, y)
            if ridge_MAE<min_MAE:
                min_MAE=ridge_MAE
                my_alpha=a

        print("최적알파 :",my_alpha)

        # Refit on all rows with the selected alpha.
        picked_model=Ridge(alpha=my_alpha, random_state=123, fit_intercept=True).fit(x, y)
        return self._forecast_tabular(picked_model, last_window)

    # ------------------------------------------------------------------
    # Tree-based models
    # ------------------------------------------------------------------
    def brave_decisionTree(self,sample):
        """Decision-tree regressor on 7-step windows."""
        x, y, last_window=self._make_windows(sample, 7)
        # Seeded like the other models so repeated runs give the same result.
        my_DecisionTree=DecisionTreeRegressor(random_state=123).fit(x, y)
        return self._forecast_tabular(my_DecisionTree, last_window)

    def brave_RandomForest(self,sample):
        """Random-forest regressor on 7-step windows."""
        x, y, last_window=self._make_windows(sample, 7)
        # Seeded like the other models so repeated runs give the same result.
        my_RandomForest=RandomForestRegressor(random_state=123).fit(x, y)
        return self._forecast_tabular(my_RandomForest, last_window)

    def brave_lightGBMF(self,sample):
        """LightGBM regressor (100 trees, learning rate 0.05) on 7-step windows."""
        x, y, last_window=self._make_windows(sample, 7)
        # n_estimators: number of trees
        my_lightGBM=LGBMRegressor(learning_rate=0.05, n_estimators=100, random_state=123).fit(x, y)
        return self._forecast_tabular(my_lightGBM, last_window)

    def brave_xgb(self,sample):
        """XGBoost regressor (100 trees, learning rate 0.05) on 7-step windows."""
        x, y, last_window=self._make_windows(sample, 7)
        my_xgb=XGBRegressor(learning_rate=0.05, n_estimators=100, random_state=123).fit(x, y)
        return self._forecast_tabular(my_xgb, last_window)

    # ------------------------------------------------------------------
    # Recurrent neural networks
    # ------------------------------------------------------------------
    def brave_lstm(self,sample):
        """Two stacked LSTM layers (128, 64) with dropout 0.3 and a Dense(1) head."""
        sequence=7
        dropout=0.3
        layers=[
            LSTM(128, input_shape=(sequence, 1), activation='relu', return_sequences=True),
            Dropout(dropout),
            LSTM(64, activation="relu", return_sequences=False),
            Dropout(dropout),
            Dense(1),
        ]
        return self._fit_recurrent_and_forecast(sample, layers, sequence)

    def brave_rnn(self,sample):
        """Four stacked SimpleRNN layers (128, 256, 128, 64), flattened into Dense(1)."""
        sequence=7
        dropout=0.3
        layers=[
            SimpleRNN(128, input_shape=(sequence, 1), return_sequences=True, activation='relu'),
            Dropout(dropout),
            SimpleRNN(256, return_sequences=True, activation="relu"),
            Dropout(dropout),
            SimpleRNN(128, return_sequences=True, activation="relu"),
            Dropout(dropout),
            SimpleRNN(64, return_sequences=True, activation="relu"),
            Dropout(dropout),
            Flatten(),
            Dense(1),
        ]
        return self._fit_recurrent_and_forecast(sample, layers, sequence)

    def brave_GRU(self,sample):
        """Four stacked GRU layers (128, 256, 128, 64) with a Dense(1) head."""
        sequence=7
        dropout=0.3
        layers=[
            GRU(128, input_shape=(sequence, 1), return_sequences=True, activation='relu'),
            Dropout(dropout),
            GRU(256, return_sequences=True, activation="relu"),
            Dropout(dropout),
            GRU(128, return_sequences=True, activation="relu"),
            Dropout(dropout),
            GRU(64, return_sequences=False, activation="relu"),
            Dropout(dropout),
            Dense(1),
        ]
        return self._fit_recurrent_and_forecast(sample, layers, sequence)

In [5]:
if __name__=="__main__":
    # Driver: pick one forecaster, run it on every sample that has no pickle
    # yet, and store one pickle per sample so an interrupted run can resume.
    season=3 # *********enter the season here*********
    modelList="""
    ---regression---
    1.  ridge
    2.  linear regression
    3.  ridge_linear_val : searcing best model (ridge or linear) by using val set
    4.  ridge_alpha_val : searching best alpha by using val set
    5.  decision Tree 
    6.  randomForest
    7.  lightGBM
    8.  xgboost
    9.  lstm
    10. rnn
    11. gru
    """

    path=f"/gdrive/My Drive/Colab Notebooks/dacon/bitcoin/season{season}/test_x_df.csv"

    # Build the Ensemble object
    my_Ensemble=Ensemble(path)

    # Preprocessing: {sample_id: series of "open" values}
    d=my_Ensemble.preprocessing()

    # Model selection: menu number -> forecasting method
    model_table={
        1: my_Ensemble.brave_ridge,
        2: my_Ensemble.brave_LinearRegression,
        3: my_Ensemble.brave_ridge_linear_val,
        4: my_Ensemble.brave_ridge_alpha_val,
        5: my_Ensemble.brave_decisionTree,
        6: my_Ensemble.brave_RandomForest,
        7: my_Ensemble.brave_lightGBMF,
        8: my_Ensemble.brave_xgb,
        9: my_Ensemble.brave_lstm,
        10: my_Ensemble.brave_rnn,
        11: my_Ensemble.brave_GRU,
    }
    print(modelList)
    modelNum=int(input("모델 번호를 입력하시오 :"))
    if modelNum not in model_table:
        # Previously an out-of-range number left brave_model undefined and
        # surfaced as a NameError a few lines later.
        raise ValueError(
            f"model number must be one of {sorted(model_table)}, got {modelNum}")
    brave_model=model_table[modelNum]

    print(brave_model)

    # Results are stored as one pickle per sample in a per-model directory.
    directory=f"/gdrive/My Drive/Colab Notebooks/dacon/bitcoin/season{season}/modelP/modelP_{modelNum}"
    os.makedirs(directory, exist_ok=True) # create the folder if it is missing
    learned_samples=os.listdir(directory)
    # (sic) variable name kept unchanged in case other cells read it.
    leared_samples_size=len(learned_samples)
    print("현재까지 구한 sample 수 :",leared_samples_size)
    print("앞으로 계산할 sample 수 :",my_Ensemble.sampleSize-leared_samples_size)

    # Transform + fit + forecast for every sample that has no pickle yet.
    for sampleId in my_Ensemble.sampleIds:
        if str(sampleId)+".pickle" not in learned_samples:
            p,m,pL=brave_model(d[sampleId])
            print("sample :",sampleId,"매도시점 :",m)

            with open(os.path.join(directory, f'{sampleId}.pickle'),'wb') as fw:
                pickle.dump([sampleId,m,p,pL], fw)

Unnamed: 0,sample_id,time,coin_index,open,high,low,close,volume,quote_av,trades,tb_base_av,tb_quote_av
0,7929,0,1,0.941549,0.941773,0.940431,0.941176,0.169500,85.595390,0.089465,0.108427,54.745537
1,7929,1,1,0.941586,0.941736,0.940282,0.940897,0.237560,119.955322,0.119287,0.105256,53.145061
2,7929,2,1,0.941270,0.941586,0.938940,0.940934,0.231588,116.848953,0.149109,0.163196,82.363953
3,7929,3,1,0.940971,0.941363,0.939052,0.940319,0.238199,120.162766,0.201297,0.157981,79.687485
4,7929,4,1,0.940077,0.940561,0.939760,0.939909,0.082302,41.511501,0.055916,0.009733,4.908966
...,...,...,...,...,...,...,...,...,...,...,...,...
1048795,8688,1375,5,1.005905,1.007405,1.005392,1.006122,32184.292969,23957.617188,66.215317,4388.184082,3269.062256
1048796,8688,1376,5,1.006243,1.006554,1.004040,1.004608,46183.425781,34342.394531,93.241982,8526.371094,6343.348145
1048797,8688,1377,5,1.004608,1.005284,1.002865,1.002865,55310.468750,41101.046875,97.295982,16837.339844,12520.099609
1048798,8688,1378,5,1.002865,1.002905,1.000013,1.000054,114685.742188,85007.203125,127.025307,35383.035156,26229.859375


샘플사이즈 : 760
분길이 :  1380

    ---regression---
    1.  ridge
    2.  linear regression
    3.  ridge_linear_val : searcing best model (ridge or linear) by using val set
    4.  ridge_alpha_val : searching best alpha by using val set
    5.  decision Tree 
    6.  randomForest
    7.  lightGBM
    8.  xgboost
    9.  lstm
    10. rnn
    11. gru
    
모델 번호를 입력하시오 :5
<bound method Ensemble.brave_decisionTree of <__main__.Ensemble object at 0x7f31484eee50>>
현재까지 구한 sample 수 : 760
앞으로 계산할 sample 수 : 0
