In [1]:
import numpy as np
import pandas as pd

df = pd.read_csv('./train.csv')
test_df = pd.read_csv('./test.csv')

In [2]:
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import scale
import talib as ta

def prep_data(data) :
    # log10 to Volume
    tf = FunctionTransformer(np.log10)
    data['Volume'] = tf.transform(data['Volume'])

    # scale to others colume
    data['Open Price'] = scale(data['Open Price'])
    data['Close Price'] = scale(data['Close Price'])
    data['High Price'] = scale(data['High Price'])
    data['Low Price'] = scale(data['Low Price'])
    
    # add some data
    data['open-close'] = data['Open Price'] -  data['Close Price'].shift(1)
    data['op-op'] = data['Open Price'] - data['Open Price'].shift(1)
    data['RSI'] = ta.RSI(np.array(data['Close Price']), timeperiod = 3)
    
    data = data.dropna()
    
    return data

In [3]:
df = prep_data(df)
test_df = prep_data(test_df)

In [4]:
df.describe()

Unnamed: 0,Open Price,Close Price,High Price,Low Price,Volume,open-close,op-op,RSI
count,2261.0,2261.0,2261.0,2261.0,2261.0,2261.0,2261.0,2261.0
mean,0.00201,0.001984,0.001987,0.002001,9.432364,0.001611,0.001603,57.05054
std,0.999359,0.999399,0.999394,0.999373,0.17235,0.005693,0.028121,25.440508
min,-2.013382,-2.020613,-1.999492,-2.019975,8.714463,-0.03815,-0.161279,1.207076
25%,-0.855123,-0.858156,-0.855304,-0.856811,9.311532,-0.000381,-0.011707,36.87997
50%,-0.0113,-0.012899,-0.015344,-0.015192,9.39879,0.001386,0.002865,60.214066
75%,0.854534,0.854332,0.856595,0.847459,9.527859,0.0039,0.01688,78.950453
max,2.136437,2.130184,2.126139,2.139159,9.96,0.045695,0.144316,99.258642


In [5]:
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score

def kFord_training(k, model, x_data, y_data, verbose=True) :
    kf = KFold(n_splits=k,
               random_state=15680,
               shuffle=True)
    kf.get_n_splits(x_data)

    train_acc_list = []
    valid_acc_list = []

    for train_index, valid_index in kf.split(x_data) :
        x_train = x_data.iloc[train_index]
        y_train = y_data.iloc[train_index]
        x_valid = x_data.iloc[valid_index]
        y_valid = y_data.iloc[valid_index]
        
        model.fit(x_train, y_train)

        y_pred_train = model.predict(x_train)
        train_acc = accuracy_score(y_train, y_pred_train)

        y_pred_valid = model.predict(x_valid)
        vaild_acc = accuracy_score(y_valid, y_pred_valid)

        train_acc_list.append(train_acc)
        valid_acc_list.append(vaild_acc)
    
    if verbose :
        print(
            f'train mean acc = {np.mean(train_acc_list)}\n'
            f'train min acc = {np.min(train_acc_list)}\n'
            f'train max acc = {np.max(train_acc_list)}\n'
        )
        print(
            f'valid mean acc = {np.mean(valid_acc_list)}\n'
            f'valid min acc = {np.min(valid_acc_list)}\n'
            f'valid max acc = {np.max(valid_acc_list)}\n'
        )
    
    return model

In [6]:
def test_model(model, attrs, verbose=True) :
    test_x = test_df[attrs]

    test_y = np.where(
        test_df['Close Price'].shift(-1) > test_df['Close Price'],
        1,
        -1)
    test_y = pd.Series(test_y)

    pred_test_y = model.predict(test_x)
    test_acc = accuracy_score(test_y, pred_test_y)
    
    if verbose :
        print(f'test acc = {test_acc}')
    
    return test_acc

In [7]:
from sklearn.linear_model import LogisticRegression

attrs = ['Open Price', 'Close Price',
         'High Price', 'Low Price', 'Volume',
         'open-close', 'op-op', 'RSI']

x = df[attrs]

# where "CP of next day" is > "CP of today"
y = np.where(
    df['Close Price'].shift(-1) > df['Close Price'],
    1,
    -1)
y = pd.Series(y)

lgr = kFord_training(5, LogisticRegression(), x, y)

test_model(lgr, attrs);

train mean acc = 0.5482089919135884
train min acc = 0.5422885572139303
train max acc = 0.5533443891652847

valid mean acc = 0.546225751626326
valid min acc = 0.5298013245033113
valid max acc = 0.5619469026548672

test acc = 0.5301204819277109


In [8]:
from sklearn.neural_network import MLPClassifier

attrs = ['Open Price', 'Close Price',
         'High Price', 'Low Price', 'Volume',
         'open-close', 'op-op', 'RSI']

x = df[attrs]

# where "CP of next day" is > "CP of today"
y = np.where(
    df['Close Price'].shift(-1) > df['Close Price'],
    1,
    -1)
y = pd.Series(y)

model = MLPClassifier(
    max_iter = 200,
    random_state=15680,
    solver = 'sgd',
    early_stopping=True,
    learning_rate_init=0.0028
)
mlp = kFord_training(5, model, x, y)

test_model(mlp, attrs);

train mean acc = 0.540801523356668
train min acc = 0.5163073521282476
train max acc = 0.5500276395798784

valid mean acc = 0.524987790345582
valid min acc = 0.4823008849557522
valid max acc = 0.5575221238938053

test acc = 0.5542168674698795


In [9]:
from sklearn.neighbors import KNeighborsClassifier

attrs = ['Open Price', 'Close Price',
         'High Price', 'Low Price', 'Volume',
         'open-close', 'op-op', 'RSI']

x = df[attrs]

# where "CP of next day" is > "CP of today"
y = np.where(
    df['Close Price'].shift(-1) > df['Close Price'],
    1,
    -1)
y = pd.Series(y)

model = KNeighborsClassifier(
    n_neighbors=20
)
knc = kFord_training(5, model, x, y)

test_model(knc, attrs);

train mean acc = 0.5981862442947504
train min acc = 0.5865118850193477
train max acc = 0.6069651741293532

valid mean acc = 0.5218904452128387
valid min acc = 0.5154867256637168
valid max acc = 0.5331858407079646

test acc = 0.5421686746987951
