In [1]:
import Pkg
Pkg.add("DataFrames")
Pkg.add("Flux")
Pkg.add("CSV")
Pkg.add("MLJ")
Pkg.add("CUDA")
Pkg.add("IterTools")
Pkg.add("ProgressMeter")
Pkg.add("MLUtils")
Pkg.add("StatsBase")

[32m[1m    Updating[22m[39m registry at `/opt/julia/registries/General.toml`
[32m[1m   Resolving[22m[39m package versions...
[32m[1m   Installed[22m[39m TableTraits ───────────────── v1.0.1
[32m[1m   Installed[22m[39m SentinelArrays ────────────── v1.3.18
[32m[1m   Installed[22m[39m Tables ────────────────────── v1.10.1
[32m[1m   Installed[22m[39m DataAPI ───────────────────── v1.14.0
[32m[1m   Installed[22m[39m PooledArrays ──────────────── v1.4.2
[32m[1m   Installed[22m[39m InlineStrings ─────────────── v1.4.0
[32m[1m   Installed[22m[39m Crayons ───────────────────── v4.1.1
[32m[1m   Installed[22m[39m IteratorInterfaceExtensions ─ v1.0.0
[32m[1m   Installed[22m[39m DataValueInterfaces ───────── v1.0.0
[32m[1m   Installed[22m[39m LaTeXStrings ──────────────── v1.3.0
[32m[1m   Installed[22m[39m InvertedIndices ───────────── v1.3.0
[32m[1m   Installed[22m[39m Reexport ──────────────────── v1.2.2
[32m[1m   Installed[22m[39m Orde

In [2]:
using DataFrames
using CSV
using Flux
using MLJ
using CUDA
using IterTools: ncycle
using ProgressMeter
using MLUtils
using StatsBase

## Part 1 - Data Preprocessing

In [3]:
dataset_train = CSV.read("../data/Google_Stock_Price_Train.csv", DataFrame)
first(dataset_train, 10)

Row,Date,Open,High,Low,Close,Volume
Unnamed: 0_level_1,String15,Float64,Float64,Float64,String15,String15
1,1/3/2012,325.25,332.83,324.97,663.59,7380500
2,1/4/2012,331.27,333.87,329.08,666.45,5749400
3,1/5/2012,329.83,330.75,326.89,657.21,6590300
4,1/6/2012,328.34,328.77,323.68,648.24,5405900
5,1/9/2012,322.04,322.29,309.46,620.76,11688800
6,1/10/2012,313.7,315.72,307.3,621.43,8824000
7,1/11/2012,310.59,313.52,309.4,624.25,4817800
8,1/12/2012,314.43,315.26,312.08,627.92,3764400
9,1/13/2012,311.96,312.3,309.37,623.28,4631800
10,1/17/2012,314.81,314.81,311.67,626.86,3832800


In [26]:
#training_set = select(dataset_train, :Open)
training_set = Float32.(dataset_train.Open)
@show training_set isa AbstractVector
summarystats(training_set)

training_set isa AbstractVector = true


Summary Stats:
Length:         1258
Missing Count:  0
Mean:           533.709839
Minimum:        279.119995
1st Quartile:   404.114990
Median:         537.469971
3rd Quartile:   654.922485
Maximum:        816.679993


### Apply Normalization

In [5]:
scale_class = StatsBase.fit(UnitRangeTransform, training_set, dims=1)

training_set_scaled = StatsBase.transform(scale_class, training_set)

#training_set_scaled = [[Float32(x)] for x ∈ training_set_scaled]
@show size(training_set_scaled)
first(training_set_scaled, 10)

size(training_set_scaled) = (1258,)


10-element Vector{Float32}:
 0.085813686
 0.097012416
 0.09433364
 0.091561876
 0.07984228
 0.06432773
 0.058542304
 0.06568569
 0.06109085
 0.06639259

### Data Structure for Time Steps

In [6]:
function batch_timeseries(X, s::Int, r::Int)
    if isa(X, AbstractVector)       # If X is passed in format T×1, reshape it
        X = permutedims(X)
    end
    T = size(X, 2)
    @assert s ≤ T "s cannot be longer than the total series"
    X = X[:, ((T - s) % r)+1:end]   # Ensure uniform sequence lengths
    [X[:, t:r:end-s+t] for t ∈ 1:s] # Output
end

@show size(permutedims(training_set_scaled[begin:end-1]))
@show size(permutedims(training_set_scaled[begin+1:end]))
X_train = batch_timeseries(permutedims(training_set_scaled[begin:end-1]), 60, 1)
y_train = batch_timeseries(permutedims(training_set_scaled[begin+1:end]), 60, 1)
@show size(X_train), size(X_train[1]), typeof(X_train)
@show size(y_train), size(y_train[1]), typeof(y_train)

size(permutedims(training_set_scaled[begin:end - 1])) = (1, 1257)
size(permutedims(training_set_scaled[begin + 1:end])) = (1, 1257)
(size(X_train), size(X_train[1]), typeof(X_train)) = ((60,), (1, 1198), Vector{Matrix{Float32}})
(size(y_train), size(y_train[1]), typeof(y_train)) = ((60,), (1, 1198), Vector{Matrix{Float32}})


((60,), (1, 1198), Vector{Matrix{Float32}})

In [7]:
X_train

60-element Vector{Matrix{Float32}}:
 [0.085813686 0.097012416 … 0.9243806 0.9304822]
 [0.097012416 0.09433364 … 0.9304822 0.92990553]
 [0.09433364 0.091561876 … 0.92990553 0.9311332]
 [0.091561876 0.07984228 … 0.9311332 0.92750585]
 [0.07984228 0.06432773 … 0.92750585 0.94415504]
 [0.06432773 0.058542304 … 0.94415504 0.93876034]
 [0.058542304 0.06568569 … 0.93876034 0.93403524]
 [0.06568569 0.06109085 … 0.93403524 0.93483526]
 [0.06109085 0.06639259 … 0.93483526 0.9313937]
 [0.06639259 0.061425738 … 0.9313937 0.94636875]
 [0.061425738 0.07474513 … 0.94636875 0.9656969]
 [0.07474513 0.027978288 … 0.9656969 0.97510976]
 [0.027978288 0.023792708 … 0.97510976 0.95966965]
 ⋮
 [0.0448694 0.050654832 … 0.9411414 0.9576234]
 [0.050654832 0.052143015 … 0.9576234 0.9641343]
 [0.052143015 0.056123994 … 0.9641343 0.9640227]
 [0.056123994 0.058188852 … 0.9640227 0.96971506]
 [0.058188852 0.06540666 … 0.96971506 0.95077753]
 [0.06540666 0.06882953 … 0.95077753 0.96294373]
 [0.06882953 0.072438434 … 

In [8]:
y_train

60-element Vector{Matrix{Float32}}:
 [0.097012416 0.09433364 … 0.9304822 0.92990553]
 [0.09433364 0.091561876 … 0.92990553 0.9311332]
 [0.091561876 0.07984228 … 0.9311332 0.92750585]
 [0.07984228 0.06432773 … 0.92750585 0.94415504]
 [0.06432773 0.058542304 … 0.94415504 0.93876034]
 [0.058542304 0.06568569 … 0.93876034 0.93403524]
 [0.06568569 0.06109085 … 0.93403524 0.93483526]
 [0.06109085 0.06639259 … 0.93483526 0.9313937]
 [0.06639259 0.061425738 … 0.9313937 0.94636875]
 [0.061425738 0.07474513 … 0.94636875 0.9656969]
 [0.07474513 0.027978288 … 0.9656969 0.97510976]
 [0.027978288 0.023792708 … 0.97510976 0.95966965]
 [0.023792708 0.024090357 … 0.95966965 0.97808623]
 ⋮
 [0.050654832 0.052143015 … 0.9576234 0.9641343]
 [0.052143015 0.056123994 … 0.9641343 0.9640227]
 [0.056123994 0.058188852 … 0.9640227 0.96971506]
 [0.058188852 0.06540666 … 0.96971506 0.95077753]
 [0.06540666 0.06882953 … 0.95077753 0.96294373]
 [0.06882953 0.072438434 … 0.96294373 0.9612323]
 [0.072438434 0.0799352

## Build the Model

In [9]:
lstm_layer_1 = Flux.LSTM(1 => 50)

Recur(
  LSTMCell(1 => 50),                    [90m# 10_500 parameters[39m
) [90m        # Total: 5 trainable arrays, [39m10_500 parameters,
[90m          # plus 2 non-trainable, 100 parameters, summarysize [39m41.336 KiB.

In [10]:
dropout_layer_1 = Flux.Dropout(0.2)

Dropout(0.2)

In [11]:
lstm_layer_2 = Flux.LSTM(50 => 50)
dropout_layer_2 = Flux.Dropout(0.2)

lstm_layer_3 = Flux.LSTM(50 => 50)
dropout_layer_3 = Flux.Dropout(0.2)

lstm_layer_4 = Flux.LSTM(50 => 50)
dropout_layer_4 = Flux.Dropout(0.2)

Dropout(0.2)

In [12]:
output_layer = Flux.Dense(50 => 1)

Dense(50 => 1)      [90m# 51 parameters[39m

In [13]:
model = Chain(lstm_layer_1, 
    dropout_layer_1, 
    lstm_layer_2, 
    dropout_layer_2,
    lstm_layer_3, 
    dropout_layer_3,
    lstm_layer_4,
    #x -> x[:, end],
    dropout_layer_4,
    output_layer)

Chain(
  Recur(
    LSTMCell(1 => 50),                  [90m# 10_500 parameters[39m
  ),
  Dropout(0.2),
  Recur(
    LSTMCell(50 => 50),                 [90m# 20_300 parameters[39m
  ),
  Dropout(0.2),
  Recur(
    LSTMCell(50 => 50),                 [90m# 20_300 parameters[39m
  ),
  Dropout(0.2),
  Recur(
    LSTMCell(50 => 50),                 [90m# 20_300 parameters[39m
  ),
  Dropout(0.2),
  Dense(50 => 1),                       [90m# 51 parameters[39m
) [90m        # Total: 22 trainable arrays, [39m71_451 parameters,
[90m          # plus 8 non-trainable, 400 parameters, summarysize [39m280.699 KiB.

## Train the RNN

#### Load Data into the dataloader

In [14]:
@show size(X_train), typeof(X_train)
@show size(y_train), typeof(y_train)

#train_data_loader = Flux.DataLoader(X_train, transpose(y_train); batchsize = 16, partial = false)

(size(X_train), typeof(X_train)) = ((60,), Vector{Matrix{Float32}})
(size(y_train), typeof(y_train)) = ((60,), Vector{Matrix{Float32}})


((60,), Vector{Matrix{Float32}})

#### Setup the Optimizer

In [15]:
#optimizer = Flux.setup(Flux.Adam(0.01), model)
optimizer = Flux.Adam(0.01)

Adam(0.01, (0.9, 0.999), 1.0e-8, IdDict{Any, Any}())

#### Setup the Loss Function

#### Train on the dataset

In [16]:
number_epochs = 100

theta = Flux.params(model)

@showprogress  for epoch in 1:number_epochs
    Flux.reset!(model)
    
    delta = gradient(theta) do 
        #model(X_train[1])
        Flux.Losses.mse.([model(x) for x in X_train], y_train) |> mean
    end
    Flux.update!(optimizer, theta, delta)
    #Flux.train!(loss, model, train_data_loader, optimizer)
end

[32mProgress: 100%|█████████████████████████████████████████| Time: 0:04:52[39m


In [17]:
vcat([model(x) for x in X_train]...)

60×1198 Matrix{Float32}:
 0.183013  0.160393  0.184184  0.143454  …  0.658746  0.650613  0.640387
 0.179766  0.166007  0.178721  0.144786     0.655012  0.654235  0.649732
 0.17559   0.17117   0.173413  0.147072     0.656182  0.656509  0.652724
 0.172645  0.173731  0.169463  0.150133     0.656325  0.657268  0.654614
 0.170427  0.174839  0.166442  0.152926     0.656505  0.657515  0.655625
 0.168658  0.174919  0.164237  0.15519   …  0.656605  0.657561  0.656214
 0.167155  0.174313  0.162713  0.156912     0.656663  0.657532  0.656564
 0.165821  0.173258  0.161738  0.158157     0.65669   0.657479  0.656776
 0.16461   0.171932  0.161175  0.159023     0.656696  0.657422  0.656905
 0.163516  0.170455  0.160912  0.159582     0.656691  0.657369  0.656985
 0.162541  0.168926  0.160833  0.159925  …  0.656679  0.657324  0.657035
 0.161698  0.167405  0.160876  0.160108     0.656663  0.657285  0.657067
 0.160978  0.165966  0.160975  0.160182     0.656647  0.657254  0.657087
 ⋮                        

In [18]:
y_train

60-element Vector{Matrix{Float32}}:
 [0.097012416 0.09433364 … 0.9304822 0.92990553]
 [0.09433364 0.091561876 … 0.92990553 0.9311332]
 [0.091561876 0.07984228 … 0.9311332 0.92750585]
 [0.07984228 0.06432773 … 0.92750585 0.94415504]
 [0.06432773 0.058542304 … 0.94415504 0.93876034]
 [0.058542304 0.06568569 … 0.93876034 0.93403524]
 [0.06568569 0.06109085 … 0.93403524 0.93483526]
 [0.06109085 0.06639259 … 0.93483526 0.9313937]
 [0.06639259 0.061425738 … 0.9313937 0.94636875]
 [0.061425738 0.07474513 … 0.94636875 0.9656969]
 [0.07474513 0.027978288 … 0.9656969 0.97510976]
 [0.027978288 0.023792708 … 0.97510976 0.95966965]
 [0.023792708 0.024090357 … 0.95966965 0.97808623]
 ⋮
 [0.050654832 0.052143015 … 0.9576234 0.9641343]
 [0.052143015 0.056123994 … 0.9641343 0.9640227]
 [0.056123994 0.058188852 … 0.9640227 0.96971506]
 [0.058188852 0.06540666 … 0.96971506 0.95077753]
 [0.06540666 0.06882953 … 0.95077753 0.96294373]
 [0.06882953 0.072438434 … 0.96294373 0.9612323]
 [0.072438434 0.0799352

## Testing the model

In [19]:
dataset_test = DataFrame(CSV.File("../data/Google_Stock_Price_Test.csv"))
real_stock_price = Float32.(dataset_test.Open)

20-element Vector{Float32}:
 778.81
 788.36
 786.08
 795.26
 806.4
 807.86
 805.0
 807.14
 807.48
 807.08
 805.81
 805.12
 806.91
 807.25
 822.3
 829.62
 837.81
 834.71
 814.66
 796.86

In [20]:
@show first(dataset_train)
@show first(dataset_test)

first(dataset_train) = DataFrameRow
 Row │ Date      Open     High     Low      Close     Volume
     │ String15  Float64  Float64  Float64  String15  String15
─────┼──────────────────────────────────────────────────────────
   1 │ 1/3/2012   325.25   332.83   324.97  663.59    7,380,500
first(dataset_test) = DataFrameRow
 Row │ Date      Open     High     Low      Close    Volume
     │ String15  Float64  Float64  Float64  Float64  String15
─────┼─────────────────────────────────────────────────────────
   1 │ 1/3/2017   778.81   789.63    775.8   786.14  1,657,300


Row,Date,Open,High,Low,Close,Volume
Unnamed: 0_level_1,String15,Float64,Float64,Float64,Float64,String15
1,1/3/2017,778.81,789.63,775.8,786.14,1657300


In [21]:
dataset_total = Float32.(DataFrames.vcat(training_set, real_stock_price))[1258-59:end]
@show size(dataset_total), dataset_total[1]

(size(dataset_total), dataset_total[1]) = ((80,), 779.0f0)


((80,), 779.0f0)

In [22]:
test_set_scaled = StatsBase.transform(scale_class, dataset_total)

inputs = batch_timeseries(permutedims(test_set_scaled), 60, 1)
@show size(inputs), typeof(inputs)
@show size(inputs[1]), typeof(inputs[1])
@show size(inputs[:, 1][3])

#y_train = batch_timeseries(permutedims(dataset_total), 60, 1)

(size(inputs), typeof(inputs)) = ((60,), Vector{Matrix{Float32}})
(size(inputs[1]), typeof(inputs[1])) = ((1, 21), Matrix{Float32})
size((inputs[:, 1])[3]) = (1, 21)


(1, 21)

In [23]:
Flux.reset!(model)
preds = [model(x) for x in inputs]
preds[60]

1×21 Matrix{Float32}:
 0.656513  0.656512  0.656512  0.656512  …  0.656533  0.656534  0.656535

In [24]:
predicted_stock_prices = StatsBase.reconstruct(scale_class, vec(preds[60]))

21-element Vector{Float32}:
 632.0349
 632.0348
 632.0348
 632.0348
 632.0349
 632.0355
 632.03625
 632.0371
 632.03796
 632.03876
 632.0395
 632.04004
 632.0406
 632.04114
 632.0415
 632.0422
 632.0432
 632.0443
 632.04565
 632.04663
 632.0468