In [None]:
ll '/content/drive/My Drive/mafs_5440_group2/img_data/monthly_20d/'

ls: cannot access '/content/drive/My Drive/mafs_5440_group2/img_data/monthly_20d/': No such file or directory


In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive/; the image data path used later in
# this notebook lives under this mount point, so this must run first.
drive.mount('/content/drive/')

Mounted at /content/drive/


In [None]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"


import torch
device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')

import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms
import torchvision
import os
import pandas as pd
import numpy as np

from torchsummary import summary
import re
import time
import scipy.stats
import torch.optim as optim

use_gpu = torch.cuda.is_available()


In [None]:
class Model(nn.Module):
    """Three-stage convolutional classifier producing two logits per image.

    Every stage is Conv2d -> BatchNorm2d -> LeakyReLU -> MaxPool2d, where the
    pooling window (2, 1) shrinks the height only. The flattened feature map
    is passed to one linear layer whose input size is fixed at 46080, which
    corresponds to 256 channels x 3 x 60 for a 1x64x60 input.
    """

    def __init__(self):
        super().__init__()
        # Stage 1: single-channel input, strided along the height.
        self.conv1 = nn.Conv2d(1, 64, kernel_size=(5, 3), stride=(3, 1), padding=(7, 1))
        self.batchnorm1 = nn.BatchNorm2d(64, affine=True)
        self.relu1 = nn.LeakyReLU()
        self.pool1 = nn.MaxPool2d((2, 1))
        # Stage 2: 64 -> 128 channels, spatial size preserved by the padding.
        self.conv2 = nn.Conv2d(64, 128, kernel_size=(5, 3), stride=1, padding=(2, 1))
        self.batchnorm2 = nn.BatchNorm2d(128, affine=True)
        self.relu2 = nn.LeakyReLU()
        self.pool2 = nn.MaxPool2d((2, 1))
        # Stage 3: 128 -> 256 channels, spatial size preserved by the padding.
        self.conv3 = nn.Conv2d(128, 256, kernel_size=(5, 3), stride=1, padding=(2, 1))
        self.batchnorm3 = nn.BatchNorm2d(256, affine=True)
        self.relu3 = nn.LeakyReLU()
        self.pool3 = nn.MaxPool2d((2, 1))
        # Classifier head over the flattened stage-3 output.
        self.fc = nn.Linear(46080, 2)

    def forward(self, x):
        """Return the (batch, 2) logits for a batch of images `x`."""
        stages = (
            (self.conv1, self.batchnorm1, self.relu1, self.pool1),
            (self.conv2, self.batchnorm2, self.relu2, self.pool2),
            (self.conv3, self.batchnorm3, self.relu3, self.pool3),
        )
        y = x
        for conv, norm, activation, pool in stages:
            y = pool(activation(norm(conv(y))))
        # Collapse everything except the batch dimension before the head.
        return self.fc(y.view(y.shape[0], -1))



def data_set(dir, year_index, in_size, out_size,height,width):
    """Load and stack the yearly image/label files found under `dir`.

    For each year the function reads
    ``monthly_20d/20d_month_has_vb_[20]_ma_{year}_images.dat`` as a uint8
    memory map and ``..._labels_w_delay.feather`` as a DataFrame.

    Args:
        dir: Root folder that contains the ``monthly_20d`` sub-folder.
        year_index: Non-empty sequence of years to load, in the order they
            should be stacked.
        in_size: Key into `height` and `width` selecting the image size.
        out_size: Kept for backward compatibility; no longer used. The image
            width was previously looked up with this key, which reshaped the
            data incorrectly whenever it differed from `in_size`.
        height: Mapping from window size to image height in pixels.
        width: Mapping from window size to image width in pixels.

    Returns:
        (image, label): images with shape ``(n, height[in_size],
        width[in_size])`` and the row-wise concatenation of the label frames
        (original per-file indices are kept).

    Raises:
        ValueError: If `year_index` is empty.
    """
    if len(year_index) == 0:
        raise ValueError("year_index must contain at least one year")

    folder = os.path.join(dir, "monthly_20d")
    # Both image dimensions are selected by the same window key.
    img_shape = (-1, height[in_size], width[in_size])

    images, labels = [], []
    for year in year_index:
        prefix = os.path.join(folder, f"20d_month_has_vb_[20]_ma_{year}")
        raw = np.memmap(f"{prefix}_images.dat", dtype=np.uint8, mode='r')
        images.append(raw.reshape(img_shape))
        labels.append(pd.read_feather(f"{prefix}_labels_w_delay.feather"))

    # Concatenate once at the end instead of re-copying the accumulated data
    # on every iteration; a single year is returned as-is (still memory-mapped).
    image = images[0] if len(images) == 1 else np.concatenate(images, axis=0)
    label = labels[0] if len(labels) == 1 else pd.concat(labels)
    return image, label

def sampling(image, label, train_frac=0.7, seed=None):
    """Randomly split aligned images and labels into train and validation sets.

    Args:
        image: Array indexable by an integer index array along axis 0, with
            one entry per row of `label`.
        label: DataFrame whose rows are aligned with `image`.
        train_frac: Fraction of rows assigned to the training split; the
            remainder goes to validation. Defaults to the original 0.7.
        seed: Optional seed for a reproducible shuffle. When None, numpy's
            global random state is used, as before.

    Returns:
        (train_image, train_label, validation_image, validation_label). The
        label frames are independent copies, so adding columns to them later
        does not raise pandas' SettingWithCopyWarning.

    Raises:
        ValueError: If `train_frac` is outside [0, 1].
    """
    if not 0.0 <= train_frac <= 1.0:
        raise ValueError("train_frac must be between 0 and 1")

    n_rows = len(label)
    if seed is None:
        order = np.random.permutation(n_rows)
    else:
        order = np.random.default_rng(seed).permutation(n_rows)

    # Single cut point so both splits are guaranteed to be complementary.
    cut = int(train_frac * n_rows)
    train_index, validation_index = order[:cut], order[cut:]

    train_image = image[train_index]
    train_label = label.iloc[train_index, :].copy()
    validation_image = image[validation_index]
    validation_label = label.iloc[validation_index, :].copy()
    return train_image, train_label, validation_image, validation_label


class Stock(torch.utils.data.Dataset):
    """Dataset pairing each image with a binary label derived from 'Ret_20d'.

    The label is 1 when 'Ret_20d' is strictly positive and 0 otherwise
    (non-positive or missing returns).

    Args:
        image: Indexable collection of images, aligned row-by-row with `label`.
        label: DataFrame containing a 'Ret_20d' column.
        transform: Callable applied to each image in `__getitem__`.
    """
    def __init__(self, image, label,transform):
        self.image = image
        # Work on a private copy: the caller's frame is left untouched and
        # pandas does not emit SettingWithCopyWarning when `label` is a slice.
        self.label = label.copy()
        self.transform = transform
        self.pre_process()
    def __len__(self):
        """Number of samples (rows of the label frame)."""
        return len(self.label)
    def __getitem__(self, index):
        """Return (transformed image, label) for the sample at position `index`."""
        image = self.transform(self.image[index])
        # Select the label by column name rather than by "last column", so an
        # input frame that already carries 'Label_20d' elsewhere still works.
        return image, self.label['Label_20d'].iloc[index]
    def pre_process(self):
        """Create/overwrite 'Label_20d' as 1.0 where 'Ret_20d' > 0, else 0.0."""
        # NaN > 0 evaluates to False, so missing returns map to 0 as before.
        self.label['Label_20d'] = (self.label['Ret_20d'] > 0).astype(float)


def create_loader(img_dir,batch_size, train_years=None, test_years=None, num_workers=4):
    """Build the train / validation / test DataLoaders for the 20-day images.

    The years in `train_years` are loaded and randomly split into training
    and validation sets by `sampling`; the years in `test_years` form the
    test set.

    Args:
        img_dir: Root folder passed to `data_set`.
        batch_size: Batch size used by all three loaders.
        train_years: Years used for training/validation. Defaults to
            1993-2006 inclusive.
        test_years: Years used for testing. Defaults to 2007-2019 inclusive.
        num_workers: Worker processes per DataLoader.

    Returns:
        Dict with keys 'train', 'validation' and 'test' mapping to DataLoaders.
    """
    IMAGE_WIDTH = {5: 15, 20: 60, 60: 180}
    IMAGE_HEIGHT = {5: 32, 20: 64, 60: 96}

    in_size = 20
    out_size = 20

    if train_years is None:
        train_years = np.arange(1993, 2007, 1)
    if test_years is None:
        test_years = np.arange(2007, 2020, 1)

    image, label = data_set(img_dir, train_years, in_size, out_size, IMAGE_HEIGHT, IMAGE_WIDTH)
    train_image, train_label, validation_image, validation_label = sampling(image, label)
    test_image, test_label = data_set(img_dir, test_years, in_size, out_size, IMAGE_HEIGHT, IMAGE_WIDTH)

    mytransform = torchvision.transforms.ToTensor()

    def make_loader(images, labels, shuffle):
        # One place for the DataLoader settings shared by all three splits.
        return torch.utils.data.DataLoader(
            dataset=Stock(images, labels, mytransform),
            batch_size=batch_size,
            shuffle=shuffle,
            num_workers=num_workers,
        )

    # Only the training split is shuffled; shuffling the evaluation splits
    # does not change the averaged metrics and only makes runs less
    # reproducible.
    loaders = {
        'train': make_loader(train_image, train_label, True),
        'validation': make_loader(validation_image, validation_label, False),
        'test': make_loader(test_image, test_label, False),
    }

    return loaders

class AverageMeter():
    """Keeps the most recent value and a count-weighted running average."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Set the latest value, running average, weighted sum and count to 0."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record `val` as seen `n` times and refresh the running average."""
        self.val = val
        self.sum = self.sum + val * n
        self.count = self.count + n
        self.avg = self.sum / self.count

def accuracy(output, target):
    """Top-1 accuracy, in percent, of `output` scores against `target` labels.

    Returns a one-element list holding the accuracy as a tensor.
    """
    n_samples = target.size(0)
    # Index of the highest score per row, laid out as a single row.
    top1 = output.topk(1, 1, True, True)[1].t()
    hits = top1.eq(target.view(1, -1).expand_as(top1))
    return [hits.sum() / n_samples * 100]

def _run_epoch(model, loader, criterion, optimizer, scheduler, training):
    """Run one pass over `loader` and return (mean loss, error rate).

    When `training` is True the model is put in train mode and the optimizer
    (and scheduler, if given) is stepped after every batch. Otherwise the
    model is put in eval mode and gradient tracking is disabled.
    """
    loss_meter = AverageMeter()
    acc_meter = AverageMeter()

    model.train(training)
    # Send batches to wherever the model's parameters live.
    device = next(model.parameters()).device

    with torch.set_grad_enabled(training):
        for inputs, labels in loader:
            inputs = inputs.to(device)
            labels = labels.to(device).long()

            outputs = model(inputs)
            loss = criterion(outputs, labels)

            if training:
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                if scheduler is not None:
                    # NOTE(review): the scheduler is stepped once per batch,
                    # not once per epoch - confirm this is intended for the
                    # scheduler being used.
                    scheduler.step()

            n_samples = outputs.shape[0]
            loss_meter.update(loss.item(), n_samples)
            acc_meter.update(accuracy(outputs.detach(), labels)[-1].item(), n_samples)

    return loss_meter.avg, 1 - acc_meter.avg / 100


def train_model(model, data_loader, criterion, optimizer, scheduler = None):
    """Train with early stopping on validation loss, then evaluate on test.

    Each epoch runs one training pass and one validation pass. Training stops
    after the validation loss has failed to decrease relative to the previous
    epoch for two consecutive epochs. The model is then evaluated once on the
    test loader.

    Args:
        model: Network to train; batches are moved to its parameters' device.
        data_loader: Dict with 'train', 'validation' and 'test' loaders that
            yield (inputs, labels) batches.
        criterion: Loss called as ``criterion(outputs, labels)`` with integer
            class labels.
        optimizer: Optimizer over the model's parameters.
        scheduler: Optional learning-rate scheduler, stepped after each
            training batch.

    Returns:
        (model, log_saver) where log_saver maps 'train_loss', 'train_error',
        'validation_loss', 'validation_error' (one entry per epoch) and
        'test_loss', 'test_error' (one entry) to lists of floats.
    """
    log_saver = {
    'train_loss': [],
    'train_error': [],
    'validation_loss': [],
    'validation_error': [],
    'test_loss':[],
    'test_error':[]
    }

    epoch = 0
    stopping = 0  # consecutive epochs without validation improvement
    while stopping < 2:
        epoch += 1
        print('Epoch {}'.format(epoch))
        print('-' * 10)
        for mode in ('train', 'validation'):
            epoch_loss, epoch_error = _run_epoch(
                model, data_loader[mode], criterion, optimizer, scheduler,
                training=(mode == 'train'))

            log_saver[f'{mode}_loss'].append(epoch_loss)
            log_saver[f'{mode}_error'].append(epoch_error)

            if mode == 'validation':
                history = log_saver['validation_loss']
                # NOTE(review): the check only starts at epoch 3, so epoch 2
                # is never compared with epoch 1 - kept as in the original;
                # confirm whether this warm-up is intended.
                if epoch > 2 and history[-1] >= history[-2]:
                    stopping += 1
                else:
                    stopping = 0

            print(
                f'{mode} loss: {epoch_loss:.4f}; error: {epoch_error:.4f}'
            )

    print("Testing...")
    epoch_loss, epoch_error = _run_epoch(
        model, data_loader["test"], criterion, optimizer, scheduler,
        training=False)
    log_saver['test_loss'].append(epoch_loss)
    log_saver['test_error'].append(epoch_error)

    print(f'test loss: {epoch_loss:.4f}; error: {epoch_error:.4f}')

    return model, log_saver

In [None]:
# Root folder of the image dataset on the mounted Google Drive.
img_dir = "/content/drive/My Drive/mafs_5440_group2/img_data/"
# Batch size shared by every DataLoader built by create_loader.
batch_size = 128
# Dict of DataLoaders keyed by 'train', 'validation' and 'test'.
data_loader = create_loader(img_dir,batch_size)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.label['Label_20d'] = self.label['Ret_20d']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.label['Label_20d'] = self.label['Ret_20d']


In [None]:
# model=Model()

# if use_gpu:
#   model.cuda()


# criterion = nn.CrossEntropyLoss()
# optimizer = optim.Adam(model.parameters(), lr=0.0001)


# model, log = train_model(model=model, data_loader=data_loader, criterion=criterion, optimizer=optimizer)

# train_validation_loss_error = {key: log[key] for key in ['train_loss', 'train_error', 'validation_loss', 'validation_error']}
# train_validation_loss_error = pd.DataFrame(train_validation_loss_error)
# train_validation_loss_error.to_csv('/content/drive/My Drive/train_validation_loss_error.csv')

# test_loss_error = {key: log[key] for key in ['test_loss', 'test_error']}
# test_loss_error = pd.DataFrame(test_loss_error)
# test_loss_error.to_csv('/content/drive/My Drive/test_loss_error.csv')

In [None]:
# Parameter tuning

class Model_new(nn.Module):
    """Convolutional two-class classifier used for the tuning run.

    Three Conv2d -> BatchNorm2d -> LeakyReLU -> MaxPool2d stages (64, 128 and
    256 output channels) followed by a linear layer mapping the 46080
    flattened features to two logits.
    """

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(
            in_channels=1, out_channels=64,
            kernel_size=(5, 3), stride=(3, 1), padding=(7, 1))
        self.batchnorm1 = nn.BatchNorm2d(num_features=64, affine=True)
        self.relu1 = nn.LeakyReLU()
        self.pool1 = nn.MaxPool2d(kernel_size=(2, 1))

        self.conv2 = nn.Conv2d(
            in_channels=64, out_channels=128,
            kernel_size=(5, 3), stride=1, padding=(2, 1))
        self.batchnorm2 = nn.BatchNorm2d(num_features=128, affine=True)
        self.relu2 = nn.LeakyReLU()
        self.pool2 = nn.MaxPool2d(kernel_size=(2, 1))

        self.conv3 = nn.Conv2d(
            in_channels=128, out_channels=256,
            kernel_size=(5, 3), stride=1, padding=(2, 1))
        self.batchnorm3 = nn.BatchNorm2d(num_features=256, affine=True)
        self.relu3 = nn.LeakyReLU()
        self.pool3 = nn.MaxPool2d(kernel_size=(2, 1))

        self.fc = nn.Linear(in_features=46080, out_features=2)

    def forward(self, x):
        """Map a batch of images `x` to a (batch, 2) tensor of logits."""
        features = self.pool1(self.relu1(self.batchnorm1(self.conv1(x))))
        features = self.pool2(self.relu2(self.batchnorm2(self.conv2(features))))
        features = self.pool3(self.relu3(self.batchnorm3(self.conv3(features))))
        # Flatten all non-batch dimensions for the linear head.
        flat = features.view(features.shape[0], -1)
        return self.fc(flat)

In [None]:
# parameter tuning

# Fresh network for this run, moved to the GPU when one is available.
model_new = Model_new()
if use_gpu:
    model_new.cuda()

# Cross-entropy objective optimised with Adam at a learning rate of 5e-5.
criterion_new = nn.CrossEntropyLoss()
optimizer_new = optim.Adam(model_new.parameters(), lr=5e-5)
# gamma = 0.9  # exponential decay
# scheduler = optim.lr_scheduler.ExponentialLR(optimizer_new, gamma=gamma)

model, log = train_model(
    model=model_new,
    data_loader=data_loader,
    criterion=criterion_new,
    optimizer=optimizer_new,
)

# Per-epoch training/validation curves, saved to Drive.
train_validation_loss_error = pd.DataFrame(
    {key: log[key] for key in ['train_loss', 'train_error', 'validation_loss', 'validation_error']}
)
train_validation_loss_error.to_csv('/content/drive/My Drive/train_validation_loss_error(LR:5e-5).csv')

# Final test-set metrics, saved to Drive.
test_loss_error = pd.DataFrame(
    {key: log[key] for key in ['test_loss', 'test_error']}
)
test_loss_error.to_csv('/content/drive/My Drive/test_loss_error(LR:5e-5).csv')

Epoch 1
----------
train loss: 0.7083; error: 0.4807
validation loss: 0.7301; error: 0.4883
Epoch 2
----------
train loss: 0.6960; error: 0.4671
validation loss: 0.6890; error: 0.4610
Epoch 3
----------
train loss: 0.6916; error: 0.4578
validation loss: 0.6874; error: 0.4559
Epoch 4
----------
train loss: 0.6872; error: 0.4489
validation loss: 0.6897; error: 0.4588
Epoch 5
----------
train loss: 0.6808; error: 0.4353
validation loss: 0.6901; error: 0.4591
Testing...
test loss: 0.6990; error: 0.4770
