# HW3 - Stock Movement Prediction

作業檔案：
- hw3.ipynb

資料：
https://www.sharecast.com/index/SP_500/prices/download

- train.csv: S&P 500 訓練資料(2009-2017)
- test.csv: S&P 500 測試資料(2018)


In [1]:
# Read data

import pandas as pd
import numpy as np

# Locations of the S&P 500 training and testing price tables.
train_data_path = './train.csv'
test_data_path = './test.csv'

# Load both CSV files into DataFrames.
train_df, test_df = (pd.read_csv(csv_path) for csv_path in (train_data_path, test_data_path))

# Show the size and the first rows of each table (training first, then testing).
for frame in (train_df, test_df):
    print(frame.shape)
    print(frame.head())

(2264, 6)
          Date  Open Price  Close Price  High Price  Low Price      Volume
0  02-Jan-2009      902.99       931.80      934.73     899.35  4048270080
1  05-Jan-2009      929.17       927.45      936.63     919.53  5413910016
2  06-Jan-2009      931.17       934.70      943.85     927.28  5392620032
3  07-Jan-2009      927.45       906.65      927.45     902.37  4704940032
4  08-Jan-2009      905.73       909.73      910.00     896.81  4991549952
(252, 6)
          Date  Open Price  Close Price  High Price  Low Price      Volume
0  02-Jan-2018     2683.73      2695.81     2695.89    2682.36  1846463232
1  03-Jan-2018     2697.85      2713.06     2714.37    2697.77  2090595328
2  04-Jan-2018     2719.31      2723.99     2729.29    2719.07  2100767744
3  05-Jan-2018     2731.33      2743.15     2743.45    2727.92  1918869120
4  08-Jan-2018     2742.67      2747.71     2748.51    2737.60  1894823936


In [2]:
# Remove the columns that are not used as features.

drop_col_names = ['Date'] # !--- or you can modify it to drop the columns you don't want ---!

# Rebind to the reduced frames instead of mutating in place.
train_df = train_df.drop(columns=drop_col_names)
test_df = test_df.drop(columns=drop_col_names)

# Show the size and the first rows of each table (training first, then testing).
for frame in (train_df, test_df):
    print(frame.shape)
    print(frame.head())

(2264, 5)
   Open Price  Close Price  High Price  Low Price      Volume
0      902.99       931.80      934.73     899.35  4048270080
1      929.17       927.45      936.63     919.53  5413910016
2      931.17       934.70      943.85     927.28  5392620032
3      927.45       906.65      927.45     902.37  4704940032
4      905.73       909.73      910.00     896.81  4991549952
(252, 5)
   Open Price  Close Price  High Price  Low Price      Volume
0     2683.73      2695.81     2695.89    2682.36  1846463232
1     2697.85      2713.06     2714.37    2697.77  2090595328
2     2719.31      2723.99     2729.29    2719.07  2100767744
3     2731.33      2743.15     2743.45    2727.92  1918869120
4     2742.67      2747.71     2748.51    2737.60  1894823936


In [3]:
# Build the training target `Tomorrow Movement`:
#   1 when the next row's `Close Price` is greater than or equal to the current row's,
#   0 otherwise.
# `diff()` compares each row with the previous one; shifting the result up by one row
# attaches that comparison to the earlier day. The last row has no following day, so
# its target becomes NaN.

train_df['Tomorrow Movement'] = train_df['Close Price'].diff().ge(0).astype(int).shift(-1)
test_df['Tomorrow Movement'] = test_df['Close Price'].diff().ge(0).astype(int).shift(-1)

### Data preprocessing

In [4]:
# !--- You can add your own data preprocessing here ---!
# Derived features:
#   'Open Close Diff' = Open Price - Close Price
#   'High Low Diff'   = High Price - Low Price
# The raw open/close columns are removed once the differences have been computed.
drop_col_names = ['Open Price', 'Close Price'] # !--- or you can modify it to drop the columns you don't want ---!

train_df = (
    train_df
    .assign(**{
        'Open Close Diff': train_df['Open Price'] - train_df['Close Price'],
        'High Low Diff': train_df['High Price'] - train_df['Low Price'],
    })
    .drop(columns=drop_col_names)
    .astype(np.float64)  # convert every column (e.g. the integer Volume) to float64 for later use
)
test_df = (
    test_df
    .assign(**{
        'Open Close Diff': test_df['Open Price'] - test_df['Close Price'],
        'High Low Diff': test_df['High Price'] - test_df['Low Price'],
    })
    .drop(columns=drop_col_names)
    .astype(np.float64)
)

for frame in (train_df, test_df):
    print(frame.head())

   High Price  Low Price        Volume  Tomorrow Movement  Open Close Diff  \
0      934.73     899.35  4.048270e+09                0.0           -28.81   
1      936.63     919.53  5.413910e+09                1.0             1.72   
2      943.85     927.28  5.392620e+09                0.0            -3.53   
3      927.45     902.37  4.704940e+09                1.0            20.80   
4      910.00     896.81  4.991550e+09                0.0            -4.00   

   High Low Diff  
0          35.38  
1          17.10  
2          16.57  
3          25.08  
4          13.19  
   High Price  Low Price        Volume  Tomorrow Movement  Open Close Diff  \
0     2695.89    2682.36  1.846463e+09                1.0           -12.08   
1     2714.37    2697.77  2.090595e+09                1.0           -15.21   
2     2729.29    2719.07  2.100768e+09                1.0            -4.68   
3     2743.45    2727.92  1.918869e+09                1.0           -11.82   
4     2748.51    2737.60  1

In [5]:
# Preview the first five rows of the preprocessed training data.
train_df.head(n=5)

Unnamed: 0,High Price,Low Price,Volume,Tomorrow Movement,Open Close Diff,High Low Diff
0,934.73,899.35,4048270000.0,0.0,-28.81,35.38
1,936.63,919.53,5413910000.0,1.0,1.72,17.1
2,943.85,927.28,5392620000.0,0.0,-3.53,16.57
3,927.45,902.37,4704940000.0,1.0,20.8,25.08
4,910.0,896.81,4991550000.0,0.0,-4.0,13.19


In [6]:
# Preview the last five rows; the final row has no next-day target yet (NaN).
train_df.tail(n=5)

Unnamed: 0,High Price,Low Price,Volume,Tomorrow Movement,Open Close Diff,High Low Diff
2259,2685.35,2678.13,1383889000.0,0.0,0.88,7.22
2260,2682.74,2677.96,1103808000.0,1.0,-1.41,4.78
2261,2685.64,2678.91,1149108000.0,1.0,-0.52,6.73
2262,2687.66,2682.69,1126090000.0,0.0,-1.44,4.97
2263,2692.12,2673.61,1332374000.0,,15.54,18.51


In [7]:
# Discard every row that contains at least one missing value
# (this removes the rows whose `Tomorrow Movement` could not be computed).
train_df = train_df.dropna(axis=0, how='any')
test_df = test_df.dropna(axis=0, how='any')

print(train_df.shape)

(2263, 6)


In [8]:
# Separate the feature columns (x) from the target column (y).

target_col = 'Tomorrow Movement'

train_y_df = train_df[target_col]
train_x_df = train_df.drop(columns=[target_col])

test_y_df = test_df[target_col]
test_x_df = test_df.drop(columns=[target_col])

# Report shape and first rows of x then y, for the training split and then the
# testing split, with a separator line between the two splits.
for split_idx, (x_part, y_part) in enumerate([(train_x_df, train_y_df), (test_x_df, test_y_df)]):
    if split_idx:
        print('-----')
    for part in (x_part, y_part):
        print(part.shape)
        print(part.head())

(2263, 5)
   High Price  Low Price        Volume  Open Close Diff  High Low Diff
0      934.73     899.35  4.048270e+09           -28.81          35.38
1      936.63     919.53  5.413910e+09             1.72          17.10
2      943.85     927.28  5.392620e+09            -3.53          16.57
3      927.45     902.37  4.704940e+09            20.80          25.08
4      910.00     896.81  4.991550e+09            -4.00          13.19
(2263,)
0    0.0
1    1.0
2    0.0
3    1.0
4    0.0
Name: Tomorrow Movement, dtype: float64
-----
(251, 5)
   High Price  Low Price        Volume  Open Close Diff  High Low Diff
0     2695.89    2682.36  1.846463e+09           -12.08          13.53
1     2714.37    2697.77  2.090595e+09           -15.21          16.60
2     2729.29    2719.07  2.100768e+09            -4.68          10.22
3     2743.45    2727.92  1.918869e+09           -11.82          15.53
4     2748.51    2737.60  1.894824e+09            -5.04          10.91
(251,)
0    1.0
1    1.0
2    

In [9]:
# Normalize data

# !--- Modify here if you want ---!

from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training features only, then apply the same
# transformation to both splits so the test set is scaled with training statistics.
scaler = StandardScaler()
scaler.fit(train_x_df)

# `transform` returns a plain array; wrap it back into a DataFrame and take the
# column names from the input frame, so this cell keeps working when the feature
# set chosen in the preprocessing step changes.
normalized_train_x_df = pd.DataFrame(scaler.transform(train_x_df), columns=train_x_df.columns)
normalized_test_x_df = pd.DataFrame(scaler.transform(test_x_df), columns=test_x_df.columns)

print(normalized_train_x_df.head())
print('------')
print(train_y_df[:5])

   High Price  Low Price    Volume  Open Close Diff  High Low Diff
0   -1.505683  -1.541181  0.813175        -2.089188       1.859140
1   -1.501760  -1.499581  1.823826         0.174385       0.009988
2   -1.486853  -1.483605  1.808070        -0.214864      -0.043625
3   -1.520714  -1.534956  1.299148         1.589025       0.817222
4   -1.556744  -1.546417  1.511255        -0.249711      -0.385536
------
0    0.0
1    1.0
2    0.0
3    1.0
4    0.0
Name: Tomorrow Movement, dtype: float64


#### Check if two classes are balanced

In [10]:
# Count the samples per class (index 0 -> class 0, index 1 -> class 1).
# np.bincount only accepts integer input while the labels are stored as float64,
# so cast explicitly instead of relying on implicit coercion.
print(np.bincount(train_y_df.astype(int)))

[1024 1239]


# Logistic Regression


#### Solver of sklearn logistic regression
Algorithm to use in the optimization problem.

*    For small datasets, ‘liblinear’ is a good choice, whereas ‘sag’ and ‘saga’ are faster for large ones.
*    For multiclass problems, only ‘newton-cg’, ‘sag’, ‘saga’ and ‘lbfgs’ handle multinomial loss; ‘liblinear’ is limited to one-versus-rest schemes.
*    ‘newton-cg’, ‘lbfgs’, ‘sag’ and ‘saga’ handle L2 or no penalty
*    ‘liblinear’ and ‘saga’ also handle L1 penalty
*    ‘saga’ also supports ‘elasticnet’ penalty
*    ‘liblinear’ does not handle no penalty

In [11]:
# Fit a logistic-regression classifier and evaluate it on both splits

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# L2-regularized model optimized with the liblinear solver.
lr_model = LogisticRegression(solver='liblinear', penalty="l2") # !--- Initialize the model here ---!
lr_model.fit(normalized_train_x_df, train_y_df) # !-- Fill the training data here --!

# Training accuracy: mean accuracy reported by the estimator itself.
lr_training_acc = lr_model.score(normalized_train_x_df, train_y_df)
print('training accuracy:')
print(lr_training_acc)

# Testing accuracy: fraction of predicted labels equal to the ground truth.
lr_predict_test_result = lr_model.predict(normalized_test_x_df)
lr_is_correct = lr_predict_test_result == test_y_df
lr_testing_acc = np.mean(lr_is_correct)
print('\ntesting accuracy:')
print(lr_testing_acc)

print('\npredicted testing labels:')
print(lr_predict_test_result)

training accuracy:
0.5457357490057446

testing accuracy:
0.47808764940239046

predicted testing labels:
[1. 0. 1. 1. 1. 1. 1. 0. 0. 1. 0. 1. 1. 0. 1. 1. 1. 0. 1. 1. 1. 1. 1. 1.
 0. 1. 1. 0. 0. 0. 0. 0. 1. 1. 1. 1. 0. 0. 1. 1. 1. 0. 0. 1. 0. 1. 0. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 1. 1. 0. 1. 0. 0. 1. 1. 1. 0. 1. 1. 1. 1.
 0. 1. 1. 1. 1. 1. 1. 0. 1. 1. 0. 1. 1. 0. 1. 1. 0. 0. 1. 1. 1. 1. 1. 1.
 1. 1. 0. 1. 1. 1. 0. 1. 0. 1. 1. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 0. 1. 0. 1. 0. 0. 0. 1. 1. 0. 1. 1. 0. 1. 1. 1. 1. 1. 0. 1. 1.
 1. 1. 1. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 1. 1. 1. 1. 1. 1. 1. 0. 1.
 1. 1. 1. 1. 1. 1. 0. 1. 1. 1. 1. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 0. 1. 1. 1. 1. 0. 1. 0. 1. 1. 0. 1. 0. 1. 0. 0. 0.
 1. 1. 1. 1. 1. 0. 0. 1. 1. 1. 1. 0. 0. 0. 1. 0. 1. 1. 1. 0. 1. 0. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 0. 0. 1.]


In [12]:
# Print precision, recall, fbeta-score and confusion matrix

from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import confusion_matrix

# Always evaluate over both classes. Deriving the label set from the predictions
# would silently drop a class the model never predicts from the weighted average,
# and the 4-way unpacking below needs a fixed 2x2 confusion matrix.
# Floats are used to match the dtype of the stored labels.
class_labels = [0.0, 1.0]

print('precision, recall, fbeta-score:')
print(precision_recall_fscore_support(test_y_df, lr_predict_test_result, average='weighted', labels=class_labels))
print('\nconfusion matrix(tn, fp, fn, tp):')
tn, fp, fn, tp = confusion_matrix(test_y_df, lr_predict_test_result, labels=class_labels).ravel()
print((tn, fp, fn, tp))

precision, recall, fbeta-score:
(0.4569589289899227, 0.47808764940239046, 0.44465064182688885, None)

confusion matrix(tn, fp, fn, tp):
(26, 93, 38, 94)


# SVM

In [13]:
# Fit a support-vector classifier and evaluate it on both splits

from sklearn.svm import SVC

# RBF-kernel SVM with gamma set to "scale".
svc_model = SVC(gamma="scale", kernel='rbf') # !--- Initialize the model here ---!
svc_model.fit(normalized_train_x_df, train_y_df) # !-- Fill the training data here --!

# Training accuracy: mean accuracy reported by the estimator itself.
svc_training_acc = svc_model.score(normalized_train_x_df, train_y_df)
print('training accuracy:')
print(svc_training_acc)

# Testing accuracy: fraction of predicted labels equal to the ground truth.
svc_predict_test_result = svc_model.predict(normalized_test_x_df)
svc_is_correct = svc_predict_test_result == test_y_df
svc_testing_acc = np.mean(svc_is_correct)
print('\ntesting accuracy:')
print(svc_testing_acc)

print('\npredicted testing labels:')
print(svc_predict_test_result)

training accuracy:
0.5612019443216969

testing accuracy:
0.4940239043824701

predicted testing labels:
[0. 0. 1. 0. 1. 1. 1. 0. 0. 1. 0. 1. 1. 0. 1. 1. 1. 0. 1. 1. 1. 0. 1. 1.
 1. 0. 1. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 1. 1. 0. 0. 1. 0. 1. 0. 1.
 1. 1. 1. 1. 1. 1. 0. 1. 1. 0. 1. 0. 0. 1. 0. 1. 0. 1. 0. 0. 1. 0. 1. 0.
 0. 1. 1. 1. 1. 1. 0. 0. 1. 1. 0. 1. 0. 0. 1. 1. 0. 0. 1. 1. 1. 1. 1. 1.
 1. 1. 0. 1. 1. 1. 0. 1. 0. 1. 1. 0. 1. 0. 1. 1. 1. 1. 1. 1. 0. 1. 1. 1.
 1. 1. 1. 0. 1. 0. 1. 0. 0. 0. 1. 1. 0. 1. 1. 0. 1. 1. 1. 1. 1. 0. 1. 1.
 1. 1. 1. 0. 1. 1. 1. 1. 1. 1. 1. 0. 1. 1. 0. 1. 1. 1. 1. 1. 1. 1. 0. 1.
 1. 1. 1. 1. 1. 1. 0. 1. 1. 1. 1. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 0. 1. 1. 1. 0. 1. 0. 0. 1. 0. 1. 0. 0. 0. 1. 1. 0. 0. 0. 1. 0. 0. 0.
 1. 1. 1. 0. 1. 1. 0. 1. 0. 1. 1. 0. 0. 1. 0. 0. 0. 1. 0. 1. 1. 0. 1. 0.
 0. 1. 1. 1. 1. 1. 0. 1. 1. 1. 1.]


In [14]:
# Print precision, recall, fbeta-score and confusion matrix

# Always evaluate over both classes. Deriving the label set from the predictions
# would silently drop a class the model never predicts from the weighted average,
# and the 4-way unpacking below needs a fixed 2x2 confusion matrix.
# Floats are used to match the dtype of the stored labels.
class_labels = [0.0, 1.0]

print('precision, recall, fbeta-score:')
print(precision_recall_fscore_support(test_y_df, svc_predict_test_result, average='weighted', labels=class_labels))
print('\nconfusion matrix(tn, fp, fn, tp):')
tn, fp, fn, tp = confusion_matrix(test_y_df, svc_predict_test_result, labels=class_labels).ravel()
print((tn, fp, fn, tp))

precision, recall, fbeta-score:
(0.48651573294107453, 0.4940239043824701, 0.4827219901202527, None)

confusion matrix(tn, fp, fn, tp):
(40, 79, 48, 84)


# Neural Network

The first column of the target `y` indicates whether the sample belongs to class '0'; the second column indicates whether it belongs to class '1'.

In [15]:
# Define NN output groundtruth
def _to_two_column_target(labels):
    """Expand a 0/1 label Series into the two-column target used by the network.

    Column 0 is 1 where the label is 0 (and 0 otherwise); column 1 is the
    original label as float64. The frame is built from the raw values, so the
    result does not depend on the index of `labels` lining up with a fresh
    0..n-1 index.
    """
    label_values = labels.values
    return pd.DataFrame({
        0: np.where(label_values == 0, 1, 0),
        1: label_values.astype('float64'),
    })


train_y_df_NN = _to_two_column_target(train_y_df)
test_y_df_NN = _to_two_column_target(test_y_df)

print(train_y_df_NN.shape)
print(train_y_df_NN.head())

(2263, 2)
   0    1
0  1  0.0
1  0  1.0
2  1  0.0
3  0  1.0
4  1  0.0


In [None]:
# Define NN structure

import torch
import torch.nn.functional as F

# !--- You can modify the NN structure here ---!
class M_NN(torch.nn.Module):
    """Fully connected classifier that returns raw (un-activated) output scores.

    The output is deliberately left without an activation: a loss such as
    BCEWithLogitsLoss applies the sigmoid itself, and clamping the scores with
    ReLU would make every predicted probability >= 0.5.

    Args:
        D_in: number of input features.
        H: width of the hidden layers.
        D_out: number of output units.
    """

    def __init__(self, D_in, H, D_out):
        super().__init__()
        self.linear1 = torch.nn.Linear(D_in, H)
        self.linear2 = torch.nn.Linear(H, D_out)
        # NOTE: this single H -> H layer is applied three times in forward(),
        # so the three hidden steps share one set of weights.
        self.linear3 = torch.nn.Linear(H, H)

    def forward(self, x):
        """Return the output-layer scores (logits) for the batch `x`."""
        # Input layer
        hidden = F.relu(self.linear1(x))
        # Hidden layers (shared weights, see __init__)
        for _ in range(3):
            hidden = F.relu(self.linear3(hidden))
        # Output layer: raw logits, no activation
        return self.linear2(hidden)

# N = batch size, D_in = input size, H = hidden size, D_out = output size
# D_in follows the number of feature columns so it stays correct if the features change.
N, D_in, H, D_out = 250, normalized_train_x_df.shape[1], 100, 2  # !--- You can modify here ---!

# Fix the seed so weight initialization (and therefore the results) is reproducible.
torch.manual_seed(0)

model = M_NN(D_in, H, D_out)
criterion = torch.nn.BCEWithLogitsLoss(reduction='mean') # !--- You can modify here ---!
optimizer = torch.optim.SGD(model.parameters(), lr=3e-2) # !--- You can modify here ---!

# Train NN
# !--- You can modify here ---!

# Convert the full training set to float tensors once instead of on every batch.
train_x_tensor = torch.tensor(normalized_train_x_df.values).float()
train_y_tensor = torch.tensor(train_y_df_NN.values).float()
n_train = len(train_x_tensor)

for t in range(1000):
    # Step through the data in batches of N; the final slice may be shorter,
    # so trailing samples are no longer skipped.
    for batch_start in range(0, n_train, N):
        data = train_x_tensor[batch_start:batch_start + N]
        target = train_y_tensor[batch_start:batch_start + N]

        y_pred = model(data)
        loss = criterion(y_pred, target)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Report the loss of the last batch every 10 epochs.
    if (t%10 == 0):
        print('epoch:%d - loss:%.10f' % (t, loss.item()))

In [None]:
# Predict
x_train = torch.tensor(normalized_train_x_df.values).float()
x_test = torch.tensor(normalized_test_x_df.values).float()

# Inference only: disable autograd so no computation graph is built.
with torch.no_grad():
    nn_predict_train_y = model(x_train) # !-- Predict training data here --!
    nn_predict_test_y = model(x_test) # !-- Predict testing data here --!

# Column 0 scores class 0 and column 1 scores class 1; pick the larger one
# (ties go to class 1).
result_train = np.where((nn_predict_train_y[:, 0] > nn_predict_train_y[:, 1]).numpy(), 0, 1) # !-- You can modify here --!
print('training accuracy:')
print(accuracy_score(train_y_df, result_train))

result_test = np.where((nn_predict_test_y[:, 0] > nn_predict_test_y[:, 1]).numpy(), 0, 1) # !-- You can modify here --!
print('\ntesting accuracy:')
print(accuracy_score(test_y_df, result_test))

# The model is trained with BCEWithLogitsLoss, so its outputs are scores, not
# probabilities; apply the sigmoid to report per-class probabilities.
print('\npredicted testing prob:')
print(torch.sigmoid(nn_predict_test_y))
print('\npredicted testing labels:')
print(result_test)

In [None]:
# Print precision, recall, fbeta-score and confusion matrix

# Always evaluate over both classes. Deriving the label set from the predictions
# would silently drop a class the model never predicts from the weighted average,
# and the 4-way unpacking below needs a fixed 2x2 confusion matrix.
# Floats are used to match the dtype of the stored labels.
class_labels = [0.0, 1.0]

print('\nprecision, recall, fbeta-score:')
print(precision_recall_fscore_support(test_y_df, result_test, average='weighted', labels=class_labels))
print('\nconfusion matrix(tn, fp, fn, tp):')
tn, fp, fn, tp = confusion_matrix(test_y_df, result_test, labels=class_labels).ravel()
print((tn, fp, fn, tp))

# Discussion

一開始使用的特徵為：
*    Open Price
*    Close Price
*    High Price
*    Low Price
*    Volume

但是發現不管怎麼調整模型或其餘參數，準確度始終無法提升，一直停留在 50% 上下。我認為這可能並不是模型的問題，而是輸入資料（特徵）選得不好，也就是 Garbage in, garbage out，因此我後來試著調整輸入特徵。
調整為最高價、最低價、高低價差、開盤收盤價差與成交量，但是結果並沒有任何提升。