In [1]:
import os
import yfinance as yf
from datetime import datetime
import pandas as pd
import pytz
import logging
import numpy as np
import plotly.graph_objects as go
from PIL import Image

import matplotlib.pyplot as plt

import torchvision.transforms as transforms
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from torch.autograd import Variable

from torch.optim import Adam, SGD
import gc

In [18]:
def dta_to_candlestick(data):
    l = len(data)
    # Make candlestick picture
    layout = go.Layout(xaxis=dict(ticks='',
                                  showgrid=False,
                                  showticklabels=False,
                                  rangeslider=dict(visible=False)),
                       yaxis=dict(ticks='',
                                  showgrid=False,
                                  showticklabels=False),
                       width=300,
                       height=300,
                       paper_bgcolor='rgba(0,0,0,0)',
                       plot_bgcolor='rgba(0,0,0,0)')
    fig = go.Figure(data=[go.Candlestick(x=np.linspace(1,l,l),
                                         open=data.Open,
                                         high=data.High,
                                         low=data.Low,
                                         close=data.Close)],
                    layout=layout)
    fig.write_image("images/fig-33.png")

    # Convert to numpy array
    im = Image.open('images/fig-33.png')
    #im = im.resize((300,300),Image.ANTIALIAS)
    data = np.asarray(im)

    # Return the first channel of the image
    return data[:, :, 0]

In [19]:
START = datetime(1980, 1, 1)
END = datetime(2020, 4, 23)

In [20]:
tic = yf.Ticker('MSFT')
hist = tic.history(start=START, end=END)

In [21]:
hist

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1986-03-13,0.06,0.06,0.06,0.06,1031788800,0.0,0.0
1986-03-14,0.06,0.07,0.06,0.06,308160000,0.0,0.0
1986-03-17,0.06,0.07,0.06,0.07,133171200,0.0,0.0
1986-03-18,0.07,0.07,0.06,0.06,67766400,0.0,0.0
1986-03-19,0.06,0.06,0.06,0.06,47894400,0.0,0.0
...,...,...,...,...,...,...,...
2020-04-16,173.82,176.79,172.42,176.55,50479600,0.0,0.0
2020-04-17,179.00,179.50,175.38,178.10,52765600,0.0,0.0
2020-04-20,176.14,178.25,174.50,174.57,36669600,0.0,0.0
2020-04-21,173.02,173.19,165.65,167.35,56203700,0.0,0.0


In [22]:
def dta_transformation(data, est_h):
    # Make sure data has sufficient columns
    assert 'Open' in data.columns
    assert 'High' in data.columns
    assert 'Low' in data.columns
    assert 'Close' in data.columns

    data['lag_close'] = data['Close'].shift(1)
    data['Indicator'] = np.where(data['Close'] > data['lag_close'], 1, 0)

    x = []
    y = []
    for i in range(est_h, data.shape[0]):
        sub_dta = data.iloc[i - est_h:i]

        y_i = data.iloc[i]['Indicator']
        x_i = dta_to_candlestick(sub_dta)

        y.append(y_i)
        x.append(x_i)

        print("{}/{}".format(i - est_h, data.shape[0] - est_h))

    return x, y

In [50]:
test_data = hist.iloc[5020:5050]
test_data

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2006-02-03,20.07,20.23,19.97,20.11,75022700,0.0,0.0
2006-02-06,20.09,20.11,19.79,19.84,60170500,0.0,0.0
2006-02-07,19.68,19.83,19.58,19.68,72159500,0.0,0.0
2006-02-08,19.73,19.78,19.51,19.65,51795200,0.0,0.0
2006-02-09,19.69,19.74,19.46,19.47,52861700,0.0,0.0
2006-02-10,19.44,19.64,19.36,19.49,52127000,0.0,0.0
2006-02-13,19.45,19.5,19.24,19.27,46707000,0.0,0.0
2006-02-14,19.29,19.49,19.24,19.46,58432900,0.0,0.0
2006-02-15,19.49,19.73,19.42,19.7,62808900,0.09,0.0
2006-02-16,19.68,19.71,19.47,19.65,48868500,0.0,0.0


In [63]:
temp = np.where(test_data['Close'] > test_data['Open'], 1, 0)

In [74]:
count_1 = 0
count_0 = 0

for i in range(len(temp)-1):
    if (temp[i] == 1) & (temp[i+1] == 1):
        count_1 += 1

    elif (temp[i] == 0) & (temp[i+1] == 0):
        print(i)
        count_0 += 1


1
2
3


In [75]:
temp[7]

1

In [76]:
temp, count_1, count_0

(array([1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1,
        1, 0, 1, 0, 1, 1, 0, 1]), 4, 3)

In [51]:
(test_data['Close'] <= 19.8).sum()

18

In [39]:
def is_valid(data):
    # check if the entry is valid
    if isinstance(data, float):
        return data
    else:
        return 1e-5

In [40]:
def momentum_ret(data):
    x_0 = is_valid(data['Close'][0])
    x_T = is_valid(data['Close'][-1])
    mid = data.shape[0] // 2
    x_t = is_valid(data['Close'][mid])
    
    ttl_ret = (x_T - x_0) / x_0
    half_ret = (x_t - x_0) / x_0
    return ttl_ret + half_ret

In [47]:
def mean_cutoff(data):
    mu = data['Close'].mean()
    upr = (data['Close'] > mu).sum()
    lwr = (data['Close'] <= mu).sum()
    return upr - lwr

In [48]:
def half_return_diff(data):
    x_0 = is_valid(data['Close'][0])
    x_T = is_valid(data['Close'][-1])
    mid = data.shape[0] // 2
    x_t = is_valid(data['Close'][mid])
    
    first_ret = (x_t - x_0) / x_0
    second_ret = (x_T - x_t) / x_t
    return second_ret - first_ret

In [None]:
def consec_trend(data):
    

In [29]:
(test_data['Close'][-1] - test_data['Close'][0]) / test_data['Close'][0]

-0.12915851272015658

In [32]:
mid_point = test_data.shape[0] // 2

In [34]:
(test_data['Close'][-1] - test_data['Close'][mid_point]) / test_data['Close'][mid_point]

-0.10999999999999996

In [24]:
x, y = dta_transformation(hist.iloc[5000:5050], 30)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



0/20
1/20
2/20
3/20
4/20
5/20
6/20
7/20
8/20
9/20
10/20
11/20
12/20
13/20
14/20
15/20
16/20
17/20
18/20
19/20


In [7]:
x = np.stack(x, axis=2)

In [8]:
x.shape

(512, 512, 990)

In [35]:
im = Image.fromarray(x[:,:,0])
im.show()

In [36]:
x[:,:,0]

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=uint8)

In [37]:
sub_x = x[:,:,568:1568]

In [9]:
y = np.array(y)
y.shape

(990,)

In [24]:
sub_y = y[568:1568]

In [25]:
np.savez_compressed('test_1', x=sub_x, y=sub_y)

In [7]:
8568 % 1000

568

In [27]:
x.shape[2] // 1000

8

In [42]:
for i in range(8):
    sub_x = x[:,:,(1000*i+568):(1000*(i+1)+568)]
    sub_y = y[(1000*i+568):(1000*(i+1)+568)]
    np.savez_compressed('test_{}'.format(i), x=sub_x, y=sub_y)

In [19]:
350000000/1024/1024

333.7860107421875

# Pytorch Machine Learning

In [2]:
from ResNet_CNN import *

import math as m
import torch
from torch.nn import Linear, ReLU, Conv1d, Conv2d, Flatten, Sequential, CrossEntropyLoss, MSELoss, MaxPool1d, MaxPool2d, Dropout, BatchNorm1d, BatchNorm2d

from torch.optim import Adam
from torch import nn
import torchvision
from functools import partial
from collections import OrderedDict

import torchvision.models as models

from torchsummary import summary

In [3]:
model = res_conv1(1, 64)

In [4]:
print(model)

ResNet(
  (encoder): ResNetEncoder(
    (gate): Sequential(
      (0): Conv2d(1, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
      (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): LeakyReLU(negative_slope=0.01, inplace=True)
      (3): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    )
    (blocks): ModuleList(
      (0): ResNetLayer(
        (blocks): Sequential(
          (0): ResNetBasicBlock(
            (blocks): Sequential(
              (0): Sequential(
                (0): Conv2dAuto(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
                (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
              )
              (1): LeakyReLU(negative_slope=0.01, inplace=True)
              (2): Sequential(
                (0): Conv2dAuto(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
                (

In [5]:
print(summary(model, (1,512,512)))

------------------------------------------------------------------------------------------
Layer (type:depth-idx)                   Output Shape              Param #
├─ResNetEncoder: 1-1                     [-1, 128, 64, 64]         --
|    └─Sequential: 2-1                   [-1, 64, 128, 128]        --
|    |    └─Conv2d: 3-1                  [-1, 64, 256, 256]        3,136
|    |    └─BatchNorm2d: 3-2             [-1, 64, 256, 256]        128
|    |    └─LeakyReLU: 3-3               [-1, 64, 256, 256]        --
|    |    └─MaxPool2d: 3-4               [-1, 64, 128, 128]        --
├─ResNetDecoder: 1-2                     [-1, 1]                   --
|    └─AdaptiveAvgPool2d: 2-2            [-1, 128, 1, 1]           --
Total params: 322,843
Trainable params: 322,843
Non-trainable params: 0
------------------------------------------------------------------------------------------
Input size (MB): 1.00
Forward/backward pass size (MB): 64.00
Params size (MB): 1.23
Estimated Total Size (M

## Putting everything together

In [10]:
ticker_list = os.listdir('D:/GitHub/Backtesting/images_npy')

In [9]:
os.getcwd()

'D:\\GitHub\\Backtesting'

In [12]:
ticker_dta = os.listdir('D:/GitHub/Backtesting/images_npy/{}'.format(ticker_list[0]))

In [13]:
ticker_list

['CI', 'DDS', 'IPG', 'LLY', 'MTG', 'NAVI', 'NKE', 'SEQ', 'WY']

In [14]:
ticker_dta

['CI_0.npz',
 'CI_1.npz',
 'CI_2.npz',
 'CI_3.npz',
 'CI_4.npz',
 'CI_5.npz',
 'CI_6.npz',
 'CI_7.npz',
 'CI_8.npz']

In [10]:
temp = np.load('C:/Users/Andyy/Documents/GitHub/Backtesting/images_npy/{}/{}'.format(ticker_list[0], ticker_dta[0]))

In [12]:
dta_x = temp['x']
dta_y = temp['y']

In [3]:
model = res_conv1(1, 64)
lr = 0.0001
optimizer = Adam(model.parameters(), lr=lr)

criterion = nn.BCELoss()

train_losses = []
val_losses = []

In [10]:
loaded = np.load('test.npz')
dta_x = loaded['x']
dta_y = loaded['y']

In [16]:
train_X, train_Y, val_X, val_Y = data_preprocessing(dta_x, dta_y)

((891, 512, 512), (891, 1)) ((99, 512, 512), (99, 1))


In [13]:
dta_y.shape

(990,)

In [25]:
np.savez_compressed('test', x=x, y=y)

In [13]:
def dataLoader(path):
    loaded = np.load(path)
    dta_x = loaded['x']
    dta_y = loaded['y']
    return dta_x, dta_y

In [14]:
for comp in ticker_list:
    ticker_dta = os.listdir('D:/GitHub/Backtesting/images_npy/{}'.format(comp))
    for dta in ticker_dta:
        try:
            path = 'D:/GitHub/Backtesting/images_npy/{}/{}'.format(comp, dta)
            dta_x, dta_y = dataLoader(path)
            print("Train on {}".format(dta))
            # Begin training
            
            train_X, train_Y, val_X, val_Y = data_preprocessing(dta_x, dta_y)
            gc.collect()
            
            train(0)
        except RuntimeError:
            model_path = './cnn_res.pth'
            torch.save(model.state_dict(), model_path)
            print("{} breaks the computer!!!".format(dta))
            break

Train on CI_0.npz
((900, 256, 256), (900, 1)) ((100, 256, 256), (100, 1))
Epoch:  1 	 train loss:  tensor(0.7072, grad_fn=<BinaryCrossEntropyBackward>) 	 val loss:  tensor(0.7103, grad_fn=<BinaryCrossEntropyBackward>)
Train on CI_1.npz
((900, 256, 256), (900, 1)) ((100, 256, 256), (100, 1))
Epoch:  1 	 train loss:  tensor(0.7052, grad_fn=<BinaryCrossEntropyBackward>) 	 val loss:  tensor(0.7158, grad_fn=<BinaryCrossEntropyBackward>)
Train on CI_2.npz
((900, 256, 256), (900, 1)) ((100, 256, 256), (100, 1))
Epoch:  1 	 train loss:  tensor(0.7063, grad_fn=<BinaryCrossEntropyBackward>) 	 val loss:  tensor(0.7024, grad_fn=<BinaryCrossEntropyBackward>)
Train on CI_3.npz
((900, 256, 256), (900, 1)) ((100, 256, 256), (100, 1))
Epoch:  1 	 train loss:  tensor(0.6973, grad_fn=<BinaryCrossEntropyBackward>) 	 val loss:  tensor(0.7002, grad_fn=<BinaryCrossEntropyBackward>)
Train on CI_4.npz
((900, 256, 256), (900, 1)) ((100, 256, 256), (100, 1))
Epoch:  1 	 train loss:  tensor(0.6984, grad_fn=<Binar

RuntimeError: [enforce fail at ..\c10\core\CPUAllocator.cpp:72] data. DefaultCPUAllocator: not enough memory: you tried to allocate 471859200 bytes. Buy new RAM!
(no backtrace available)

In [20]:
model_path = './cnn_res.pth'
torch.save(model.state_dict(), model_path)

In [21]:
test = res_conv1(1,64)

In [22]:
test.load_state_dict(torch.load(model_path))

<All keys matched successfully>

In [14]:
def data_preprocessing(X, Y):
    X = X.reshape((-1, 512, 512)).astype(np.float32)
    Y = Y.reshape((-1, 1)).astype(np.float32)
    train_X, val_X, train_Y, val_Y = train_test_split(X, Y, test_size=0.1, random_state=42)
    print((train_X.shape, train_Y.shape), (val_X.shape, val_Y.shape))

    train_X = train_X.reshape(-1, 1, 512, 512)
    train_X = torch.from_numpy(train_X)
    train_Y = torch.from_numpy(train_Y)

    val_X =  val_X.reshape(-1, 1, 512, 512)
    val_X = torch.from_numpy(val_X)
    val_Y = torch.from_numpy(val_Y)
    
    return train_X, train_Y, val_X, val_Y

In [22]:
trans = transforms.Compose([transforms.ToTensor(),
                           transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])

In [20]:
train_X, train_Y, val_X, val_Y = data_preprocessing(dta_x, dta_y)

((900, 256, 256), (900, 1))((100, 256, 256), (100, 1))


In [17]:
model = res_conv1(1, 64)
lr = 0.0001
optimizer = Adam(model.parameters(), lr=lr)

criterion = nn.BCELoss()

In [18]:
def train(epochs):
    # dataset
    x_train, y_train = Variable(train_X), Variable(train_Y)
    x_val, y_val = Variable(val_X), Variable(val_Y)
 
    optimizer.zero_grad()
    
    output_train = model(x_train)
    output_val = model(x_val)
    
    loss_train = criterion(output_train, y_train.type(torch.float))
    loss_val = criterion(output_val, y_val.type(torch.float))
    
    train_losses.append(loss_train)
    val_losses.append(loss_val)
    
    loss_train.backward()
    optimizer.step()
    
    print('Epoch: ', epochs+1, '\t', 'train loss: ', loss_train, '\t', 'val loss: ', loss_val)

In [23]:
model_path = './cnn_res-1.pth'
torch.save(model.state_dict(), model_path)
print("Breaks the computer!!!")

Breaks the computer!!!


In [19]:
n_epochs = 2
train_losses = []
val_losses = []
tr_loss = 0
try:
    for epochs in range(n_epochs):
        train(epochs)
except RuntimeError:
    model_path = './cnn_res-1.pth'
    torch.save(model.state_dict(), model_path)
    print("Breaks the computer!!!")

Breaks the computer!!!
