In [30]:
import StatsBase: predict
import Base: getindex, show
import MLBase: Kfold
using MLMetrics
using SparseRegression

In [41]:
function Fakedata(N,d)
    n_obs = 100
    x = randn((n_obs,d))
    y = sum(x*randn(d),2)
    
    hcat(x,y)
end

Fakedata (generic function with 1 method)

In [42]:
Fakedata(1000,3)

100×4 Array{Float64,2}:
  0.268218   -0.520109   0.838051    0.147381  
 -0.34653    -0.1857     2.6975     -2.68419   
  0.133557   -0.952651  -1.45983     3.17738   
  0.230711    1.39974   -0.0653775  -2.13541   
 -0.563177   -1.34831   -1.40915     3.39354   
 -0.685602   -0.522715  -0.972277    1.49574   
  0.747316   -0.177215   0.420758    0.277596  
  0.0694265   0.562688  -0.875036    0.00932123
 -0.426148    0.560386  -1.1989      0.0714531 
  0.152492    0.513041   0.881208   -1.68396   
  0.424058   -0.165807  -0.125117    0.643866  
 -0.22453    -0.480042   0.891437   -0.251014  
  0.466396   -0.497403   0.859016    0.19888   
  ⋮                                            
 -0.106053    0.289055   0.535672   -1.09705   
  0.722829    1.07108    1.44111    -2.87523   
  1.47042     0.59728    0.147044   -0.323645  
  2.12663     0.177319  -0.942741    1.875     
  1.37237     0.673599   0.284274   -0.648203  
  1.45504     1.7839     0.704609   -2.88832   
 -0.463601   -1.

In [31]:
immutable Task 
    task_type::String
    target::Int
    features::Array{Int}
end

function Task(;task_type="regression", target=nothing, data=nothing)
    if target == nothing || data == nothing
        throw("Requires target and data to be set")
    end
    
    features = size(data,2)
    features = deleteat!( collect(1:features), target)
    
    Task(task_type, target, features)
end

immutable Learner
    name::String
    parameters::Union{Void,Dict{Any}}
    Learner(learner::String) = new(learner, Dict())
    Learner(learner::String, parameters::Dict{Any}) = new(learner, parameters)
end

function show(io::IO,l::Learner)
    println("Learner: $(l.name)")
    for (key, value) in l.parameters
       println(" ▁ ▂ ▃ $key: $value") 
    end
end

immutable Resampling
    method::String
    iterations::Int
    Resampling() = new("KFold", 3)
end

abstract type Parameter end

immutable DiscreteParameter <: Parameter 
    name::String
    values::Array{Any}
    DiscreteParameter(;name=nothing,values=nothing) = new(name, values)
end

immutable ContinuousParameter <: Parameter
    name::String
    lower::Real
    upper::Real
    transform::Function
    ContinuousParameter(;name=nothing, lower=nothing, upper=nothing, transform=nothing) = new(name, lower, upper, transform)
end


immutable ParametersSet
   parameters::Array{Parameter}
end

getindex(p::ParametersSet, i::Int64) = p.parameters[i]

immutable MLRModel{T}
    model::T
    parameters
end

In [4]:
#### ABSTRACT FUNCTIONS ####

function MLRModel(learner::Learner, task::Task, data)
    # Calls function with name "makeModelname"
    f_name = learner.name
    f_name = "make" * titlecase(f_name)
    
    f = getfield(Main, Symbol(f_name))
    f(learner, task, data)
end    

function learnᵧ(learner::Learner, task::Task, data)
    modelᵧ = MLRModel(learner, task, data)
    learnᵧ!(modelᵧ, learner=learner, task=task, data=data)
    modelᵧ
end

learnᵧ (generic function with 1 method)

In [5]:
### TRANSITION ###
function makeRidge(learner::Learner, task::Task, data)
    if isempty(learner.parameters)
        model = SModel(data[:, task.features], data[:, task.target])
    else
        parameters = []
        push!(parameters, get_λ(learner.parameters, data))
        model = SModel(data[:, task.features], data[:, task.target], L2DistLoss(), L2Penalty(), parameters...)
    end
    MLRModel(model, copy(learner.parameters))
end

function makeGlm(learner::Learner, task::Task, data)
    if isempty(learner.parameters)
        model = SModel(data[:, task.features], data[:, task.target])
    else
        parameters = []
        if get(learner.parameters, "λ", false) !== false
            # Add λ
            push!(parameters, get_λ(learner.parameters, task))
        end
        if get(learner.parameters, "penalty", false) !== false
            # Add penalty
            push!(parameters, learner.parameters["penalty"])
        end
        if get(learner.parameters, "loss", false) !== false
            # Add penalty
            push!(parameters, learner.parameters["loss"])
        end
        model = SModel(data[:, task.features], data[:, task.target], parameters...)
    end
    MLRModel(model, copy(learner.parameters))
end

makeGlm (generic function with 1 method)

In [6]:
#### MODEL WRAPPERS ####
using SparseRegression

function get_λ(parameters, task)
    if get(parameters, "λ", false) == false
        lambda = fill(0.0, task.features)
    elseif typeof(parameters["λ"]) <: Real
        lambda = fill(parameters["λ"], length(task.features) )
    elseif typeof(parameters["λ"]) <: Vector{Float64}
        lambda = copy(parameters["λ"])
    end
    lambda
end


function predictᵧ(modelᵧ::MLRModel{<:SModel}; data=data, task=task)
    predict(modelᵧ.model, data[:, task.features])
end

function learnᵧ!(modelᵧ::MLRModel{<:SModel}; learner=nothing::Learner, data=nothing::Matrix{Real}, task=nothing::Task)
    learn!(modelᵧ.model)
end

learnᵧ! (generic function with 1 method)

In [35]:
function update_parameters!(array, range) 
    array[1] += 1
    for i in 1:length(array)
        if array[i] > range[i][end]
            array[i+1] += 1
            array[i] = range[i][1]
        end
    end
end

function parameters_dictionary(ps::ParametersSet, array, discrete_dictionary)
    dict = Dict()
    for i in 1:length(array)
        if typeof(ps[i]) <: ContinuousParameter
            dict[ps[i].name] = ps[i].transform( convert(Float64, array[i]) )
        else
            dict[ps[i].name] = discrete_dictionary[ps[i].name][array[i]]
        end
    end
    dict
end

function get_samples(sampler::Resampling, n_obs::Int64)
    trainᵢ = []
    testᵢ = []
    if sampler.method == "KFold"
        kfold = Kfold(n_obs, sampler.iterations)
        for train in kfold
            push!(trainᵢ, collect(train))
            push!(testᵢ, setdiff(1:n_obs, trainᵢ[end]))
        end
    end  
    trainᵢ, testᵢ
end

function tune(;learner=nothing::Learner, task=nothing::Task, data=nothing::Matrix{Real}, 
                parameters_set=nothing::ParametersSet, sampler=Resampling()::Resampling, 
                measure=nothing::Function)
    
    n_parameters = length(parameters_set.parameters)
    n_obs        = size(data,1)
        
    parameters_array = Array{Any}(n_parameters)
    parameters_range = Array{Tuple}(n_parameters)
    
    # For discrete parameters, the range is set to 1:n_discrete_values
    # The discrete dictionary variable allows to connect this range to 
    # the actual discrete value
    discrete_dictionary = Dict()
    
    total_parameters = 1
    
    # Prepare parameters
    for i in 1:n_parameters
        if typeof(parameters_set[i]) <: ContinuousParameter 
            lower = parameters_set[i].lower
            upper = parameters_set[i].upper
            parameters_array[i] = lower
            parameters_range[i] = Tuple(lower:upper)
            params = length(lower:upper)
        else
            parameters_array[i] = 1
            parameters_range[i] = Tuple(1:length(parameters_set[i].values))
            discrete_dictionary[parameters_set[i].name] = parameters_set[i].values
            params = length(parameters_set[i].values)
        end
        total_parameters *= params
    end
    
    
    
    # Loop over parameters
    for i in 1:total_parameters
        # Set new parametersparameters_set[i].values
        update_parameters!(parameters_array, parameters_range) 
        pd = parameters_dictionary(parameters_set, parameters_array, discrete_dictionary)

        # Update learner with new parameters
        lrn = Learner(learner.name, pd)
                
        # Get training/testing validation sets
        trainⱼ, testⱼ = get_samples(sampler, n_obs)
        
        scores = []
        for j in 1:length(trainⱼ)  
            modelᵧ = learnᵧ(lrn, task, data[trainⱼ[j], :])
            preds = predictᵧ(modelᵧ, data=data[testⱼ[j],:], task=task)
            
            score = measure( data[testⱼ[j], task.target], preds)
            push!(scores, score)
        end
        println("Trained:")
        println(lrn)
        println("Average CV accuracy: $(mean(scores))\n")
    end
end

tune (generic function with 1 method)

In [43]:
ps = ParametersSet([
    ContinuousParameter(
        name = "λ",
        lower = -4,
        upper = 1,
        transform = x->10^x
    )
        ,
    DiscreteParameter(
        name = "penalty",
        values = [L1Penalty(), L2Penalty()]
    )
])

data = Fakedata(1000,3)

task = Task(task_type="regression", target=4, data=data)
lrn = Learner("glm")

tune(learner=lrn, task=task, data=data, parameters_set=ps, measure=mean_squared_error)

Trained:
Learner: glm
 ▁ ▂ ▃ penalty: L1Penalty
 ▁ ▂ ▃ λ: 0.001

Average CV accuracy: 3.4671119891542536e-6

Trained:
Learner: glm
 ▁ ▂ ▃ penalty: L1Penalty
 ▁ ▂ ▃ λ: 0.01

Average CV accuracy: 0.00025142169009761316

Trained:
Learner: glm
 ▁ ▂ ▃ penalty: L1Penalty
 ▁ ▂ ▃ λ: 0.1

Average CV accuracy: 0.021201706276891653

Trained:
Learner: glm
 ▁ ▂ ▃ penalty: L1Penalty
 ▁ ▂ ▃ λ: 1.0

Average CV accuracy: 0.09469438716592128

Trained:
Learner: glm
 ▁ ▂ ▃ penalty: L1Penalty
 ▁ ▂ ▃ λ: 10.0

Average CV accuracy: 0.09492975725555254

Trained:
Learner: glm
 ▁ ▂ ▃ penalty: L2Penalty
 ▁ ▂ ▃ λ: 0.0001

Average CV accuracy: 1.0414126711916376e-9

Trained:
Learner: glm
 ▁ ▂ ▃ penalty: L2Penalty
 ▁ ▂ ▃ λ: 0.001

Average CV accuracy: 1.0006589244492341e-7

Trained:
Learner: glm
 ▁ ▂ ▃ penalty: L2Penalty
 ▁ ▂ ▃ λ: 0.01

Average CV accuracy: 9.794488525262816e-6

Trained:
Learner: glm
 ▁ ▂ ▃ penalty: L2Penalty
 ▁ ▂ ▃ λ: 0.1

Average CV accuracy: 0.0007761718700458964

Trained:
Learner: glm
 ▁ ▂ ▃ pen

[1m[36mINFO: [39m[22m[36mConverged after 29 iterations: [0.00323168, -0.248301, -0.170411]
[39m[1m[36mINFO: [39m[22m[36mConverged after 24 iterations: [0.00340201, -0.247928, -0.170603]
[39m[1m[36mINFO: [39m[22m[36mConverged after 14 iterations: [0.00355639, -0.248206, -0.170575]
[39m[1m[36mINFO: [39m[22m[36mConverged after 16 iterations: [0.0, -0.239335, -0.158007]
[39m[1m[36mINFO: [39m[22m[36mConverged after 15 iterations: [0.0, -0.239382, -0.159363]
[39m[1m[36mINFO: [39m[22m[36mConverged after 9 iterations: [0.0, -0.241246, -0.162587]
[39m[1m[36mINFO: [39m[22m[36mConverged after 10 iterations: [-0.0, -0.160881, -0.0709844]
[39m[1m[36mINFO: [39m[22m[36mConverged after 13 iterations: [0.0, -0.150698, -0.0537748]
[39m[1m[36mINFO: [39m[22m[36mConverged after 14 iterations: [0.0, -0.164663, -0.0475805]
[39m[1m[36mINFO: [39m[22m[36mConverged after 2 iterations: [0.0, -0.0, -0.0]
[39m[1m[36mINFO: [39m[22m[36mConverged after 2

LoadError: [91mBoundsError: attempt to access 2-element Array{Any,1} at index [3][39m

In [9]:
data = FakeData(1000,3)

task = Task(task_type="regression", target=4, data=data)
lrn  = Learner("ridge")

train = 1:80
test  = 81:100


modelᵧ = learnᵧ(lrn, task, data[train,:])
pred = predictᵧ(modelᵧ, data=data[test,:], task=task)

mean_squared_error(data[test,task.target],pred)

[1m[36mINFO: [39m[22m[36mSweep finished
[39m

0.04912842417017403

In [10]:
2*data[2,1]+2*data[2,2]+2*data[2,3]+3

1.5219312808631484

In [11]:
mean_squared_error([1,1,1,1,1,1], [2,2,2,2,2,2])

1.0

In [3]:

# What is the alternative
g(x)=x
setfield!(x->x^2,:g,Main)

LoadError: [91mtype ##5#6 is immutable[39m

In [None]:
addf