In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from matplotlib.pyplot import figure
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data
from sklearn.linear_model import LogisticRegression

In [2]:
df = pd.read_csv('datasets/alldat.csv')

In [3]:
import os
os.listdir("./")

['.ipynb_checkpoints',
 'data exploration.ipynb',
 'datasets',
 'Price prediction.ipynb',
 'Untitled.ipynb']

In [3]:
dat = np.array(df)[:,1:]
dat.shape

(97890, 42)

In [6]:
print(dat[0,])

[2.248500e+02 1.162400e+02 6.262000e+01 7.991200e+02 1.157300e+02
 7.767800e+02 3.656500e+01 8.559000e+01 5.402000e+01 1.169545e+03
 2.255000e+01 1.765400e+02 8.720000e+01 9.079000e+01 2.954000e+01
 1.618000e+02 7.890000e+01 1.141900e+02 5.581000e+01 1.184000e+02
 1.352000e+02 2.544000e+03 1.390000e+03 4.120000e+02 3.000000e+02
 1.479000e+04 1.000000e+02 1.000000e+00 0.000000e+00 1.300000e+03
 0.000000e+00 2.946300e+04 0.000000e+00 3.150000e+03 1.000000e+02
 0.000000e+00 4.300000e+01 1.000000e+02 0.000000e+00 2.510000e+03
 2.000000e+00 4.000000e+02]


In [6]:
nMin = 390
nDay = int(dat.shape[0]/nMin)
print(nMin)
print(nDay)

window = 20

390
251


In [9]:
def createCNNFeatureVectors(dat, window):
    vec_list = []
    for day in range(nDay):
        base = day * nMin
        for min_ind in range(window, nMin - 1):
            vec = np.zeros((2, 22, window))
            vec[0,:21,:] = dat[(base+min_ind-window):(base+min_ind), :21].T
            vec[0, 21,:] = dat[(base+min_ind-window):(base+min_ind), 0].T%1
            vec[1,:21,:] = dat[(base+min_ind-window):(base+min_ind), 21:].T
            vec_list.append(vec)
    return torch.from_numpy(np.array(vec_list)).float()

def createClassLabels(dat, window, yCol):
    labels_per_day = nMin - window - 1
    labels = np.zeros(nDay * labels_per_day)
    for day in range(nDay):
        base = day * nMin
        labels[day * labels_per_day : (day + 1) * labels_per_day] = (dat[(base + 1):(base + 1 + labels_per_day), yCol] > dat[(base):(base + labels_per_day), yCol])
    
    return torch.from_numpy(labels).int()

In [10]:
X_dat = createCNNFeatureVectors(dat, 20)
len(X_dat)

92619

In [15]:
pred_col = 0
Y_dat = createClassLabels(dat, window, pred_col)
Y_dat.shape

torch.Size([92619])

In [16]:
print(Y_dat.shape)
print(X_dat.shape)
batch_size = 10

train_X = X_dat[0:int(.8*len(X_dat))]
train_Y = Y_dat[0:int(.8*len(X_dat))]

train = torch.utils.data.TensorDataset(X_dat, Y_dat)

val_X = X_dat[int(.8*len(X_dat)):]
val_Y = Y_dat[int(.8*len(X_dat)):]

train = torch.utils.data.TensorDataset(train_X, train_Y)
val = torch.utils.data.TensorDataset(val_X, val_Y)


torch.Size([92619])
torch.Size([92619, 2, 22, 20])


In [16]:

class priceNet(nn.Module):

    def __init__(self):

        super(priceNet, self).__init__()
        self.conv1 = nn.Conv2d(2, 2, [3,3], stride = 1, padding = 1, bias = False)
        #self.batch1 = nn.BatchNorm2d(4)
        self.relu = nn.Tanh()
        self.conv2 = nn.Conv2d(2, 2, [3,3], stride = 1, padding = 0, bias = True)
        self.conv3 = nn.Conv2d(2, 2, [3,3], stride = 1, padding = 1, bias = True)
        self.conv4 = nn.Conv2d(2, 1, [3,3], stride = 1, padding = 1, bias = True)
        self.pool1 = nn.MaxPool2d([2,2])
        self.pool2 = nn.MaxPool2d([2,2])
        self.lin = nn.Linear(16, 1)
        self.drop = nn.Dropout2d(p=0.2)
        '''
        self.conv1.weight = nn.Parameter(torch.rand((4,1,3,3))-.5)
        self.batch1.weight = nn.Parameter(torch.rand((4,))-.5)
        self.batch1.bias = nn.Parameter(torch.rand((4,))-.5)

        self.conv2.weight = nn.Parameter(torch.rand((4,4,3,3))-.5)
        self.conv2.bias = nn.Parameter(torch.rand(4,)-.5)

        self.conv3.weight = nn.Parameter(torch.rand((4,4,3,3))-.5)
        self.conv3.bias = nn.Parameter(torch.rand(4,)-.5)

        self.lin.weight = nn.Parameter(torch.rand((36,64*64))-.5)
        '''
        '''
        for param in self.parameters():
            param.data = param.data.half()
        '''
    def forward(self, x):
        #print(x.element_size() * x.nelement())
        #print(x.shape)
        f1 = self.conv2(x.view((x.shape[0], 2, 21, -1)))
        #print(f1.shape)
        #f2 = self.conv2(self.relu(f1))
        #print(f2.shape)
        f3 = self.pool1(self.drop(f1))
        #print(f3.shape)
        f4 = self.pool2(self.conv3(self.drop(self.relu(f3))))
        #print(f4.shape)
        f5 = self.conv4(self.drop(self.relu(f4)))
        #print(f5.shape)
        f6 = self.lin(f5.view((x.shape[0], 16)))
        #print(f6.shape)
        #f7 = self.soft(f6)
        return f6.view(-1)


In [9]:

def fit_and_validate(net, loss_func, optimizer, train, val, n_epochs, batch_size =100):
    """
    @param net: the neural network
    @param optimizer: a optim.Optimizer used for some variant of stochastic gradient descent
    @param train: a torch.utils.data.Dataset
    @param val: a torch.utils.data.Dataset
    @param n_epochs: the number of epochs over which to do gradient descent
    @param batch_size: the number of samples to use in each batch of gradient descent
    @return train_epoch_loss, validation_epoch_loss: two arrays of length n_epochs+1, containing the mean loss at the beginning of training and after each epoch
    """
    net.eval() #put the net in evaluation mode
    train_dl = torch.utils.data.DataLoader(train, batch_size)
    val_dl = torch.utils.data.DataLoader(val, batch_size)
    with torch.no_grad():
        # compute the mean loss on the training set at the beginning of iteration
        total=0
        b = 0
        t_l = 0
        for X,Y in train_dl:
            b+=1
            X_temp = X.float().cuda()
            Y_temp = Y.float().cuda()
            pred = net(X_temp)
            t_l += loss_func(pred, Y_temp)
            total += Y.shape[0]

            del X_temp
            del Y_temp
            
        print("avg initial loss:", t_l/total)

    for i in range(n_epochs):
        print(torch.cuda.memory_allocated(0))
        print("base epoch #", i)
        net.train() #put the net in train mode
        first = True
        train_epoch_loss = []
        val_epoch_loss = []
        first = True
        batch = 0
        for X,Y in train_dl:
            X_temp = X.float().cuda()
            pred = net(X_temp)
            #print(pred)
            Y_temp = Y.float().cuda()
            
            loss = loss_func(pred, Y_temp)
            if batch%1000 == 0:
                print("batchnum: ", batch)
                print(pred)
                print(Y_temp)
                print(loss)
                
            batch+=1
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            del Y_temp
            del X_temp
        with torch.no_grad():
            net.eval() #put the net in evaluation mode
            t_l = 0
            v_l = 0
            total=0
            b = 0
            for X,Y in train_dl:
                b+=1
                X_temp = X.float().cuda()
                Y_temp = Y.float().cuda()
                pred = net(X_temp)
                t_l += loss_func(pred, Y_temp)

                total += Y.shape[0]

                del X_temp
                del Y_temp

                break
            t_l/=total
            #print("bias:", sum(pred))
            total=0
            b=0
            for X,Y in val_dl:
                b+=1
                X_temp = X.float().cuda()
                Y_temp = Y.float().cuda()
                pred = net(X_temp)
                #print(pred)
                v_l += loss_func(pred, Y_temp)
                total += Y.shape[0]
                del X_temp
                del Y_temp
            v_l/=total

            train_epoch_loss.append(t_l)
            val_epoch_loss.append(v_l)
            #print("bias:", sum(pred))

            print("train loss:", train_epoch_loss[len(train_epoch_loss)-1])
            print("val loss:", val_epoch_loss[len(train_epoch_loss)-1])

    return train_epoch_loss, val_epoch_loss

In [18]:

predNet = priceNet().cuda()
optimizer = optim.Adam(predNet.parameters(), lr = .0001)
print(torch.cuda.memory_allocated(0))
tl, vl = fit_and_validate(predNet, nn.L1Loss(), optimizer, train, val, 1000, 10)

4608
avg initial loss: tensor(0.0527, device='cuda:0')
5632
base epoch # 0
batchnum:  0
tensor([ 0.1073,  0.0860, -0.1380,  0.1123, -0.1879, -0.1308, -0.1809,  0.0828,
         0.1396, -0.1592], device='cuda:0', grad_fn=<ViewBackward>)
tensor([1., 1., 1., 1., 0., 0., 0., 0., 1., 1.], device='cuda:0')
tensor(0.6434, device='cuda:0', grad_fn=<L1LossBackward>)
batchnum:  1000
tensor([ 0.0266, -0.0011, -0.0376,  0.0307,  0.0415,  0.0212,  0.0521,  0.0531,
         0.0797,  0.0480], device='cuda:0', grad_fn=<ViewBackward>)
tensor([0., 1., 1., 0., 1., 1., 1., 0., 0., 0.], device='cuda:0')
tensor(0.5162, device='cuda:0', grad_fn=<L1LossBackward>)
batchnum:  2000
tensor([0.0575, 0.0260, 0.0290, 0.0223, 0.0247, 0.0041, 0.0608, 0.0319, 0.0561,
        0.0319], device='cuda:0', grad_fn=<ViewBackward>)
tensor([0., 1., 1., 0., 1., 0., 1., 1., 0., 0.], device='cuda:0')
tensor(0.4999, device='cuda:0', grad_fn=<L1LossBackward>)
batchnum:  3000
tensor([0.0703, 0.0230, 0.0107, 0.0115, 0.0325, 0.0178, 0.

KeyboardInterrupt: 

In [20]:
reg_train_X = train_X.view(train_X[:,0,:,:].shape[0], -1)
reg_val_X = val_X.view(val_X[:,0,:,:].shape[0], -1)

In [21]:
iters = [1, 2, 5, 10, 20]
for i in iters:
    reg = LogisticRegression(max_iter=i)
    reg.fit(reg_train_X, train_Y)
    print()
    print("train acc it = " + str(i),":", reg.score(reg_train_X, train_Y))
    print("val acc it = " + str(i), ":", reg.score(reg_val_X, val_Y))


train acc it = 1 : 0.5601457588231324
val acc it = 1 : 0.5525264521701576

train acc it = 2 : 0.5623861259194277
val acc it = 2 : 0.5492334269056359

train acc it = 5 : 0.5650988595721709
val acc it = 5 : 0.5501511552580436

train acc it = 10 : 0.5652608138200959
val acc it = 10 : 0.5486396026776075

train acc it = 20 : 0.5658276536878332
val acc it = 20 : 0.5489635068019866


In [25]:
print(float(sum(val_Y))/val_Y.shape[0])

0.4465018354567048


In [26]:
from sklearn.ensemble import RandomForestRegressor
rfreg = RandomForestRegressor()
rfreg.fit(reg_train_X, train_Y)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)