# Logistic Regression

In [1]:
using MLJ
using RDatasets
using DataFrames
using CategoricalArrays
using Gadfly
import StatsBase: countmap, cor, var
using PrettyPrinting

In [2]:
sMarket = dataset("ISLR","Smarket")

Row,Year,Lag1,Lag2,Lag3,Lag4,Lag5,Volume,Today,Direction
Unnamed: 0_level_1,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Cat…
1,2001.0,0.381,-0.192,-2.624,-1.055,5.01,1.1913,0.959,Up
2,2001.0,0.959,0.381,-0.192,-2.624,-1.055,1.2965,1.032,Up
3,2001.0,1.032,0.959,0.381,-0.192,-2.624,1.4112,-0.623,Down
4,2001.0,-0.623,1.032,0.959,0.381,-0.192,1.276,0.614,Up
5,2001.0,0.614,-0.623,1.032,0.959,0.381,1.2057,0.213,Up
6,2001.0,0.213,0.614,-0.623,1.032,0.959,1.3491,1.392,Up
7,2001.0,1.392,0.213,0.614,-0.623,1.032,1.445,-0.403,Down
8,2001.0,-0.403,1.392,0.213,0.614,-0.623,1.4078,0.027,Up
9,2001.0,0.027,-0.403,1.392,0.213,0.614,1.164,1.303,Up
10,2001.0,1.303,0.027,-0.403,1.392,0.213,1.2326,0.287,Up


In [3]:
describe(sMarket, :mean, :std, :eltype)

Row,variable,mean,std,eltype
Unnamed: 0_level_1,Symbol,Union…,Union…,DataType
1,Year,2003.02,1.40902,Float64
2,Lag1,0.0038344,1.1363,Float64
3,Lag2,0.0039192,1.13628,Float64
4,Lag3,0.001716,1.1387,Float64
5,Lag4,0.001636,1.13877,Float64
6,Lag5,0.0056096,1.14755,Float64
7,Volume,1.47831,0.360357,Float64
8,Today,0.0031384,1.13633,Float64
9,Direction,,,"CategoricalValue{String, UInt8}"


In [4]:
y = sMarket.Direction
X = select(sMarket, Not(:Direction)); #All other cols other than Direction

To see the predictor pairwise correlations, convert to matrix.

In [5]:
cm = X |> Matrix |> cor
round.(cm, sigdigits=3)

8×8 Matrix{Float64}:
 1.0      0.0297    0.0306    0.0332    0.0357    0.0298    0.539    0.0301
 0.0297   1.0      -0.0263   -0.0108   -0.00299  -0.00567   0.0409  -0.0262
 0.0306  -0.0263    1.0      -0.0259   -0.0109   -0.00356  -0.0434  -0.0103
 0.0332  -0.0108   -0.0259    1.0      -0.0241   -0.0188   -0.0418  -0.00245
 0.0357  -0.00299  -0.0109   -0.0241    1.0      -0.0271   -0.0484  -0.0069
 0.0298  -0.00567  -0.00356  -0.0188   -0.0271    1.0      -0.022   -0.0349
 0.539    0.0409   -0.0434   -0.0418   -0.0484   -0.022     1.0      0.0146
 0.0301  -0.0262   -0.0103   -0.00245  -0.0069   -0.0349    0.0146   1.0

The target needs to be converted to a categorical object, given ordered factor with Up postive and Down negative.

In [14]:
classes(y[1])

2-element CategoricalArray{String,1,UInt8}:
 "Down"
 "Up"

In [9]:
LogisticClassifier = @load LogisticClassifier pkg=MLJLinearModels

[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mFor silent loading, specify `verbosity=0`. 


import MLJLinearModels ✔


MLJLinearModels.LogisticClassifier

In [10]:
LC = LogisticClassifier()

LogisticClassifier(
  lambda = 2.220446049250313e-16, 
  gamma = 0.0, 
  penalty = :l2, 
  fit_intercept = true, 
  penalize_intercept = false, 
  scale_penalty_with_samples = true, 
  solver = nothing)

In [11]:
X2 = select(X, Not([:Year, :Today]))
classif = machine(LC, X2, y)

untrained Machine; caches model-specific representations of data
  model: LogisticClassifier(lambda = 2.220446049250313e-16, …)
  args: 
    1:	Source @489 ⏎ Table{AbstractVector{Continuous}}
    2:	Source @598 ⏎ AbstractVector{Multiclass{2}}


In [12]:
fit!(classif)
ŷ = MLJ.predict(classif, X2)

[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mTraining machine(LogisticClassifier(lambda = 2.220446049250313e-16, …), …).
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mSolver: MLJLinearModels.LBFGS()


1250-element CategoricalDistributions.UnivariateFiniteVector{Multiclass{2}, String, UInt8, Float64}:
 UnivariateFinite{Multiclass{2}}(Down=>0.493, Up=>0.507)
 UnivariateFinite{Multiclass{2}}(Down=>0.519, Up=>0.481)
 UnivariateFinite{Multiclass{2}}(Down=>0.519, Up=>0.481)
 UnivariateFinite{Multiclass{2}}(Down=>0.485, Up=>0.515)
 UnivariateFinite{Multiclass{2}}(Down=>0.489, Up=>0.511)
 UnivariateFinite{Multiclass{2}}(Down=>0.493, Up=>0.507)
 UnivariateFinite{Multiclass{2}}(Down=>0.507, Up=>0.493)
 UnivariateFinite{Multiclass{2}}(Down=>0.491, Up=>0.509)
 UnivariateFinite{Multiclass{2}}(Down=>0.482, Up=>0.518)
 UnivariateFinite{Multiclass{2}}(Down=>0.511, Up=>0.489)
 UnivariateFinite{Multiclass{2}}(Down=>0.503, Up=>0.497)
 UnivariateFinite{Multiclass{2}}(Down=>0.48, Up=>0.52)
 UnivariateFinite{Multiclass{2}}(Down=>0.482, Up=>0.518)
 ⋮
 UnivariateFinite{Multiclass{2}}(Down=>0.462, Up=>0.538)
 UnivariateFinite{Multiclass{2}}(Down=>0.472, Up=>0.528)
 UnivariateFinite{Multiclass{2}}(Down=>0.45

In [13]:
ŷ = predict_mode(classif, X2)
misclassification_rate(ŷ, y)

0.4784

In [15]:
cm = confusion_matrix(ŷ, y)

[33m[1m│ [22m[39musing: negative='Down' and positive='Up'.
[33m[1m└ [22m[39m[90m@ MLJBase ~/.julia/packages/MLJBase/9Nkjh/src/measures/confusion_matrix.jl:116[39m


              ┌───────────────────────────┐
              │       Ground Truth        │
┌─────────────┼─────────────┬─────────────┤
│  Predicted  │    Down     │     Up      │
├─────────────┼─────────────┼─────────────┤
│    Down     │     145     │     141     │
├─────────────┼─────────────┼─────────────┤
│     Up      │     457     │     507     │
└─────────────┴─────────────┴─────────────┘


In [18]:
@show accuracy(cm) # Correctly predicted/ total
@show precision(ŷ, y) #Predicted postive rate (rate which is predicted postive and is)
@show recall(ŷ, y) #True postive rate (rate which supposed to be postive and are)
@show f1score(ŷ, y) #Harmonic mean of precision and recall

accuracy(cm) = 0.5216
precision(ŷ, y) = 0.5259336099585062
recall(ŷ, y) = 0.7824074074074074
f1score(ŷ, y) = 0.6290322580645161


[33m[1m│ [22m[39musing: negative='Down' and positive='Up'.
[33m[1m└ [22m[39m[90m@ MLJBase ~/.julia/packages/MLJBase/9Nkjh/src/measures/confusion_matrix.jl:116[39m
[33m[1m│ [22m[39musing: negative='Down' and positive='Up'.
[33m[1m└ [22m[39m[90m@ MLJBase ~/.julia/packages/MLJBase/9Nkjh/src/measures/confusion_matrix.jl:116[39m
[33m[1m│ [22m[39musing: negative='Down' and positive='Up'.
[33m[1m└ [22m[39m[90m@ MLJBase ~/.julia/packages/MLJBase/9Nkjh/src/measures/confusion_matrix.jl:116[39m


0.6290322580645161

In [19]:
train = 1:findlast(X.Year .< 2005) #train index
test = last(train)+1:length(y); #test index

In [20]:
X3 = select(X2, [:Lag1, :Lag2])
classif = machine(LogisticClassifier(), X3, y)
fit!(classif, rows=train) #Only train on the given index
ŷ = predict_mode(classif, rows=test) #predict on given index
accuracy(ŷ, y[test])

[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mTraining machine(LogisticClassifier(lambda = 2.220446049250313e-16, …), …).
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mSolver: MLJLinearModels.LBFGS()


0.5595238095238095