In [1]:
using DataFrames
using Dates

#### Loading data

- loading the data as exported by MATLAB

In [2]:
@time dat = readtable("../data/input_data/dat.txt", separator = '\t', nastrings = ["NaN"])

head(dat)

elapsed time: 18.539977886 seconds (3963227384 bytes allocated, 33.41% gc time)


Unnamed: 0,Date,Option_Price,Bid,Ask,Volume,Open_Interest,Strike,Expiry,DAX,EONIA_matched,Time_to_Maturity,IsCall
1,2006-07-03,3931.1,,,1,104,1800,2006-12-15,5712.69,0.031667592146348,0.466666666666667,1
2,2006-07-03,0.1,,,0,5515,1800,2006-12-15,5712.69,0.0316675921463482,0.466666666666667,0
3,2006-07-03,3734.0,,,0,2152,2000,2006-12-15,5712.69,0.0316675921463482,0.466666666666667,1
4,2006-07-03,0.1,,,0,20941,2000,2006-12-15,5712.69,0.0316675921463482,0.466666666666667,0
5,2006-07-03,3536.9,,,0,2,2200,2006-12-15,5712.69,0.0316675921463482,0.466666666666667,1
6,2006-07-03,0.1,,,0,4626,2200,2006-12-15,5712.69,0.0316675921463482,0.466666666666667,0


- some element types are not perfect yet:

In [3]:
eltypes(dat)

12-element Array{Type{T<:Top},1}:
 UTF8String
 Float64   
 Float64   
 Float64   
 Int64     
 Int64     
 Int64     
 UTF8String
 Float64   
 Float64   
 Float64   
 Int64     

- transform dates to `Date` type, `IsCall` to `Bool`:

In [4]:
@time begin
    dat[:Date] = Date(array(dat[:, 1]));
    dat[:Expiry] = Date(array(dat[:, :Expiry]));
    dat[:IsCall] = bool(array(dat[:, :IsCall]));
end

head(dat)

Use convert(Array, da).
 in array at /home/jovyan/.julia/v0.3/DataArrays/src/deprecated.jl:22
 in include_string at loading.jl:97
 in execute_request_0x535c5df2 at /home/jovyan/.julia/v0.3/IJulia/src/execute_request.jl:157
 in eventloop at /home/jovyan/.julia/v0.3/IJulia/src/IJulia.jl:123
 in anonymous at task.jl:340
Use convert(Array, da).
 in array at /home/jovyan/.julia/v0.3/DataArrays/src/deprecated.jl:22
 in include_string at loading.jl:97
 in execute_request_0x535c5df2 at /home/jovyan/.julia/v0.3/IJulia/src/execute_request.jl:157
 in eventloop at /home/jovyan/.julia/v0.3/IJulia/src/IJulia.jl:123
 in anonymous at task.jl:340


elapsed time: 102.985029114 seconds (10928008748 bytes allocated, 69.85% gc time)


Unnamed: 0,Date,Option_Price,Bid,Ask,Volume,Open_Interest,Strike,Expiry,DAX,EONIA_matched,Time_to_Maturity,IsCall
1,2006-07-03,3931.1,,,1,104,1800,2006-12-15,5712.69,0.031667592146348,0.466666666666667,True
2,2006-07-03,0.1,,,0,5515,1800,2006-12-15,5712.69,0.0316675921463482,0.466666666666667,False
3,2006-07-03,3734.0,,,0,2152,2000,2006-12-15,5712.69,0.0316675921463482,0.466666666666667,True
4,2006-07-03,0.1,,,0,20941,2000,2006-12-15,5712.69,0.0316675921463482,0.466666666666667,False
5,2006-07-03,3536.9,,,0,2,2200,2006-12-15,5712.69,0.0316675921463482,0.466666666666667,True
6,2006-07-03,0.1,,,0,4626,2200,2006-12-15,5712.69,0.0316675921463482,0.466666666666667,False


#### Option IDs

- create IDs for individual options: searching a single option determined through three separate columns is too costly

In [5]:
function optParamsToString(expi::Date, stri::Int, isc::Bool)
    y, m, d = yearmonthday(expi)
    dateStr = string(y, lpad(m, 2, "0"), lpad(d, 2, "0"))
    
    if isc
        return string("c_", dateStr, "_", stri)
    else
        return string("p_", dateStr, "_", stri)
    end
end

optParamsToString (generic function with 1 method)

In [6]:
nObs = size(dat, 1)
optIDs = Array(String, nObs)

@time begin
    for ii=1:nObs
        optIDs[ii] = optParamsToString(dat[ii, :Expiry], dat[ii, :Strike], dat[ii, :IsCall]) 
    end
end

optIDs[1:4]

elapsed time: 12.360254704 seconds (2390824040 bytes allocated, 59.73% gc time)


4-element Array{String,1}:
 "c_20061215_1800"
 "p_20061215_1800"
 "c_20061215_2000"
 "p_20061215_2000"

#### Create data table with option IDs and option parameters

- create a table that contains for each option `ID` its option parameters: expiry, strike, type

In [7]:
uniqueOpts = unique(optIDs)

nOpts = size(uniqueOpts, 1)

12917

- define function that decomposes option ID into its parameters:

In [8]:
function decodeOptID(id::String)
    opt = id
    x1, x2, x3 = split(opt, "_")
    if x1 == "c"
        isCall = true
    else
        isCall = false
    end
    expi = Date(x2, "yyyymmdd")
    stri = int(x3)

    return (expi, stri, isCall)
end

decodeOptID (generic function with 1 method)

- decode each option ID

In [9]:
@time begin
    expis = Array(Date, nOpts)
    stris = Array(Int, nOpts)
    iscs = Array(Bool, nOpts)
    for ii=1:nOpts
        x1, x2, x3 = decodeOptID(uniqueOpts[ii])
        expis[ii] = x1
        stris[ii] = x2
        iscs[ii] = x3
    end
end

opts = DataFrame(ID = uniqueOpts, Expiry = expis, Strike = stris, IsCall = iscs)

head(opts)

elapsed time: 1.250250312 seconds (137238272 bytes allocated, 64.89% gc time)


Unnamed: 0,ID,Expiry,Strike,IsCall
1,c_20061215_1800,2006-12-15,1800,True
2,p_20061215_1800,2006-12-15,1800,False
3,c_20061215_2000,2006-12-15,2000,True
4,p_20061215_2000,2006-12-15,2000,False
5,c_20061215_2200,2006-12-15,2200,True
6,p_20061215_2200,2006-12-15,2200,False


#### Create data table for underlying

- get **underlying**: get value for each day

In [10]:
function getAllDAXobs(df::DataFrame)
    vals1, vals2 = Date[], Float64[]
    valsSet = Set{Int}()
    nObs = size(df, 1)
    for ii=1:nObs
        currDat, currVal = df[ii, :Date], df[ii, :DAX]
        currKey = Dates.value(currDat)
        if !in(currKey, valsSet)
            push!(valsSet, currKey)
            push!(vals1, currDat)
            push!(vals2, currVal)
        end
    end
    return DataFrame(Date = vals1, DAX = vals2)
end

@time daxVals = getAllDAXobs(dat[[:Date, :DAX]])
size(daxVals, 1)

elapsed time: 1.673485434 seconds (188855812 bytes allocated, 64.56% gc time)


1908

#### Create data table for cohort / date

In [11]:
function getAllCohortParams(df::DataFrame)
    vals1, vals2, vals3, vals4 = Date[], Date[], Float64[], Float64[]
    valsSet = Set{Array{Int, 1}}()
    nObs = size(df, 1)
    for ii=1:nObs
        currDat, currExp = df[ii, :Date], df[ii, :Expiry]
        currEON, currTTM = df[ii, :EONIA_matched], df[ii, :Time_to_Maturity]
        currKeys = Int[Dates.value(currDat), Dates.value(currExp)]
        if !in(currKeys, valsSet)
            push!(valsSet, currKeys)
            push!(vals1, currDat)
            push!(vals2, currExp)
            push!(vals3, currEON)
            push!(vals4, currTTM)
        end
    end
    return DataFrame(Date = vals1, Expiry = vals2, EONIA_matched = vals3, Time_to_Maturity = vals4)
end

@time cohortParams = getAllCohortParams(dat[[:Date, :Expiry, :EONIA_matched, :Time_to_Maturity]])
size(cohortParams, 1)

elapsed time: 5.844227435 seconds (727837404 bytes allocated, 73.85% gc time)


21053

In [12]:
head(cohortParams)

Unnamed: 0,Date,Expiry,EONIA_matched,Time_to_Maturity
1,2006-07-03,2006-12-15,0.031667592146348,0.466666666666667
2,2006-07-03,2006-09-15,0.0297573099811956,0.211764705882353
3,2006-07-03,2006-08-18,0.02903277602483,0.133333333333333
4,2006-07-03,2006-07-21,0.0283102283088403,0.0549019607843137
5,2006-07-03,2007-06-15,0.0342512630396996,0.949019607843137
6,2006-07-03,2007-03-16,0.0330875802131789,0.709803921568627


#### Create data table for option prices

In [13]:
optPrices = DataFrame(Date = dat[:Date], ID = optIDs, Price = dat[:Option_Price])

head(optPrices)

Unnamed: 0,Date,ID,Price
1,2006-07-03,c_20061215_1800,3931.1
2,2006-07-03,p_20061215_1800,0.1
3,2006-07-03,c_20061215_2000,3734.0
4,2006-07-03,p_20061215_2000,0.1
5,2006-07-03,c_20061215_2200,3536.9
6,2006-07-03,p_20061215_2200,0.1


#### Create data table with all observations

In [14]:
keys = DataFrame(Date = dat[:Date], ID = optIDs)
obsData = dat[[:Bid, :Ask, :Volume, :Open_Interest]]
addObs = [keys obsData]

head(addObs)

Unnamed: 0,Date,ID,Bid,Ask,Volume,Open_Interest
1,2006-07-03,c_20061215_1800,,,1,104
2,2006-07-03,p_20061215_1800,,,0,5515
3,2006-07-03,c_20061215_2000,,,0,2152
4,2006-07-03,p_20061215_2000,,,0,20941
5,2006-07-03,c_20061215_2200,,,0,2
6,2006-07-03,p_20061215_2200,,,0,4626


#### Write relational database to disk

In [15]:
writetable("../data/rel_data/opts.csv", opts)
writetable("../data/rel_data/daxVals.csv", daxVals)
writetable("../data/rel_data/cohortParams.csv", cohortParams)
writetable("../data/rel_data/optPrices.csv", optPrices)
writetable("../data/rel_data/addObs.csv", addObs)

### Session info

In [16]:
versioninfo()

Julia Version 0.3.6
Commit a05f87b* (2015-01-08 22:33 UTC)
Platform Info:
  System: Linux (x86_64-linux-gnu)
  CPU: Intel(R) Core(TM) i5-4210U CPU @ 1.70GHz
  WORD_SIZE: 64
  BLAS: libopenblas (DYNAMIC_ARCH NO_AFFINITY Haswell)
  LAPACK: libopenblas
  LIBM: libopenlibm
  LLVM: libLLVM-3.3


In [17]:
Pkg.status()

20 required packages:
 - DataArrays                    0.2.15
 - DataFrames                    0.6.6
 - Dates                         0.3.2
 - Debug                         0.1.3
 - Distributions                 0.7.3
 - Docile                        0.5.3
 - GLM                           0.4.6
 - Gadfly                        0.3.12
 - IJulia                        0.2.5
 - JuMP                          0.9.1
 - Lexicon                       0.1.10
 - MAT                           0.2.12
 - NLopt                         0.2.1
 - Plotly                        0.0.3+             master
 - Quandl                        0.4.1
 - RDatasets                     0.1.2
 - Requires                      0.1.2              master
 - Taro                          0.1.4
 - TimeSeries                    0.5.9
 - Winston                       0.11.10
58 additional packages:
 - ArrayViews                    0.6.2
 - AssetMgmt                     0.0.0-             master (unregistered)
 - BinDeps     