In [1]:
using CSV, DataFrames, GLM, Statistics, Dates, Gadfly, Random, MLBase, DecisionTree;
include("utils/precipitation.jl");
include("utils/random-forest.jl");
include("utils/reg-log.jl");

┌ Info: Loading DataFrames support into Gadfly.jl
└ @ Gadfly /home/chaime/.julia/packages/Gadfly/09PWZ/src/mapping.jl:228


In [2]:
# Submission number; interpolated into the name of the CSV file written at the end.
no_soumission = 20;

In [3]:
"""
    partitionTrainTest(data, at = 0.8; rng = Random.GLOBAL_RNG)

Randomly split the rows of `data` into a training part and a test part.

The row indices `1:n` (with `n = size(data, 1)`) are shuffled; the first
`floor(Int, at * n)` shuffled rows form the training part and the remaining rows
form the test part.

# Arguments
- `data`: any object supporting `size(data, 1)` and `data[rows, :]`
  (e.g. a `DataFrame` or a `Matrix`).
- `at`: fraction of rows assigned to the training part; must lie in `[0, 1]`.

# Keywords
- `rng`: random number generator used for the shuffle; pass a seeded generator to
  obtain a reproducible split.

# Returns
A tuple `(train, test)` of row subsets of `data`.

# Throws
- `ArgumentError` if `at` is outside `[0, 1]`.
"""
function partitionTrainTest(data, at = 0.8; rng::AbstractRNG = Random.GLOBAL_RNG)
    # Adapted from https://discourse.julialang.org/t/simple-tool-for-train-test-split/473/2
    0 <= at <= 1 || throw(ArgumentError("at must be within [0, 1], got $at"))
    n = size(data, 1)
    idx = shuffle(rng, 1:n)
    n_train = floor(Int, at * n)  # computed once and shared by both index ranges
    train_idx = view(idx, 1:n_train)
    test_idx = view(idx, (n_train + 1):n)
    return data[train_idx, :], data[test_idx, :]
end

partitionTrainTest (generic function with 2 methods)

In [27]:
# Formula passed to the GLM fits below: SURVERSE is the response, with four predictors.
val_form = @formula(SURVERSE ~ FS_sum + FS_max3 + SS_sum + SS_max3);

In [28]:
# Feature columns handed to the decision-tree and random-forest steps.
names_ft = [
    :FS_sum, :FS_max, :FS_max3,
    :SS_sum, :SS_max, :SS_max3,
];

In [32]:
# Search ranges (min, max, step) handed to `find_best_rf`, one row per parameter.
# How each row is interpreted is defined by that helper, which is not in this file.
params_rf = DataFrame(
    param = ["nft", "ntrees", "podata", "maxd"],
    min   = Int8[2, 20, 75, 5],
    max   = Int8[6, 50, 95, 12],
    step  = Int8[1, 10, 5, 1],
);

params_rf

Unnamed: 0_level_0,param,min,max,step
Unnamed: 0_level_1,String,Int8,Int8,Int8
1,nft,2,6,1
2,ntrees,20,50,10
3,podata,75,95,5
4,maxd,5,12,1


## Ouvrage 3260

In [33]:
# Load the parsed records for structure 3260 and split them at random into a training
# part and a validation part.
data_set = CSV.read("data/parsed/ouvrage_3260.csv");
(train_set, val_set) = partitionTrainTest(data_set);
val_labels = val_set.SURVERSE;  # validation targets
size(data_set)

(1097, 10)

#### Train GLM

In [34]:
# Fit a GLM with a Bernoulli family and logit link on the training split.
val_model = glm(val_form, train_set, Bernoulli(), LogitLink())
# Predict on the validation split; `predict` is called through the GLM module name.
val_pred_glm = GLM.predict(val_model, val_set);

# Helper not defined in this file; presumably scans candidate thresholds and returns
# the best one with its score (the displayed tuple is (threshold, score)) — TODO confirm.
best_threshold_glm, f1_score_glm = find_best_threshold(val_pred_glm, val_labels)

(0.35, 0.7142857142857143)

In [35]:
# Evaluate the GLM validation predictions at a fixed threshold of 0.5
# (helper not defined in this file; presumably returns the same score as above — TODO confirm).
evaluate_threshold(val_pred_glm, val_labels, 0.5) 

0.6666666666666666

#### Train Decision Tree 

In [36]:
# Training inputs for the tree: a Float64 feature matrix and Int64 labels.
train_features = Matrix{Float64}(train_set[!, names_ft]);
train_labels = convert(Vector{Int64}, train_set.SURVERSE);

# Fit a decision tree on the training labels and features.
dt_model = build_tree(train_labels, train_features)

Decision Tree
Leaves: 47
Depth:  11

In [37]:
# Replace the fitted tree by its pruned version. The 0.9 argument is presumably the
# purity threshold of DecisionTree's `prune_tree` — TODO confirm.
dt_model = prune_tree(dt_model, 0.9)

Decision Tree
Leaves: 45
Depth:  11

In [38]:
# Validation inputs in the same layout as the training ones.
val_features = Matrix{Float64}(val_set[!, names_ft]);
val_labels = convert(Vector{Int64}, val_set.SURVERSE);

# Predict with the tree and score the predictions against the validation labels.
val_pred_dt = apply_tree(dt_model, val_features);
r = roc(val_labels, val_pred_dt);
f1score(r)

0.6470588235294118

#### Train Random Forest

In [39]:
# Search the random-forest settings described by params_rf (helper not defined in this
# file; presumably fits on train_set and scores on val_set — TODO confirm).
# The displayed tuple is (selected parameter values, score).
best_params, f1_score_rf = find_best_rf(train_set, val_set, names_ft, params_rf)

(Int8[3, 30, 85, 8], 0.7777777777777777)

#### Train ensemble model

Get probabilities for RF

In [40]:
# Helper not defined in this file; presumably trains a forest with best_params on
# train_set and returns per-class scores for val_set — TODO confirm.
val_pred_rf = get_rf_probas(train_set, val_set, names_ft, best_params);
# Evaluated but neither stored nor displayed; column 2 is extracted again when combining.
val_pred_rf[:, 2];

Combine them

In [41]:
# Ensemble score: mean of the GLM predictions and column 2 of the RF output
# (presumably the positive-class probability — TODO confirm).
val_pred = (val_pred_glm + val_pred_rf[:, 2]) ./ 2;
# Choose the threshold for the averaged scores with the same helper used for the GLM.
best_threshold, f1_score = find_best_threshold(val_pred, val_labels)

(0.45, 0.7142857142857143)

In [42]:
# Evaluate the averaged validation scores at a fixed threshold of 0.5.
evaluate_threshold(val_pred, val_labels, 0.5) 

0.7142857142857143

#### Get full model

In [43]:
# Load the rows to predict for structure 3260 and show their dimensions.
test_set = CSV.read("data/parsed/test_3260.csv");
size(test_set)

(45, 9)

In [44]:
# Refit the GLM on the whole data_set (training and validation rows together),
# then predict the test rows.
test_model_glm = glm(val_form, data_set, Bernoulli(), LogitLink());
test_pred_glm = GLM.predict(test_model_glm, test_set);

In [45]:
# Same helper as on validation, now given the whole data_set and the test rows.
test_pred_rf = get_rf_probas(data_set, test_set, names_ft, best_params);
# Evaluated but neither stored nor displayed.
test_pred_rf[:, 2];

In [46]:
# Average the GLM predictions with column 2 of the RF output for the test rows.
test_pred = (test_pred_glm + test_pred_rf[:, 2]) ./ 2;

In [47]:
# Binarize the averaged scores: 1 where the score reaches best_threshold, 0 otherwise.
# A single comparison replaces the two in-place masked writes (whose result depended on
# their order) and the Float64 -> Int round trip.
test_pred = Int.(test_pred .>= best_threshold);

#### Get prediction for ouvrage 3260

In [49]:
# Attach the binary predictions to a copy of the test rows, so that `test_set` itself is
# not modified through an alias.
pred_3260 = copy(test_set);
pred_3260[!, :SURVERSE] = test_pred;
# Columns shown in the preview.
vis_ft = [:DATE, :SURVERSE, :FS_sum, :FS_max3, :SS_sum, :SS_max3];
first(pred_3260[!, vis_ft], 10)

Unnamed: 0_level_0,DATE,SURVERSE,FS_sum,FS_max3,SS_sum,SS_max3
Unnamed: 0_level_1,Date,Int64,Float64,Float64,Float64,Float64
1,2019-05-02,0,-0.068677,0.0211726,-0.0891232,-0.0297023
2,2019-05-09,0,0.71368,1.3347,0.50862,0.923961
3,2019-05-10,1,4.38952,2.62861,3.48538,2.006
4,2019-05-15,0,-0.366718,-0.370924,-0.292356,-0.304797
5,2019-05-20,0,0.17969,0.452479,0.305387,0.575507
6,2019-05-23,1,1.86859,2.39336,2.25403,3.10638
7,2019-05-24,0,-0.230116,-0.194481,-0.124988,-0.048042
8,2019-05-26,0,-0.354299,-0.351319,-0.208672,-0.17642
9,2019-05-30,0,-0.304626,-0.2729,-0.172807,-0.121401
10,2019-06-02,0,0.763353,0.668132,0.448846,0.39211


## Ouvrage 3350

In [50]:
# Load the parsed records for structure 3350 and split them at random into a training
# part and a validation part.
data_set = CSV.read("data/parsed/ouvrage_3350.csv");
(train_set, val_set) = partitionTrainTest(data_set);
val_labels = val_set.SURVERSE;  # validation targets
size(data_set)

(729, 10)

#### Train GLM

In [51]:
# Fit a GLM with a Bernoulli family and logit link on the training split.
val_model = glm(val_form, train_set, Bernoulli(), LogitLink())
# Predict on the validation split; `predict` is called through the GLM module name.
val_pred_glm = GLM.predict(val_model, val_set);

# Helper not defined in this file; presumably scans candidate thresholds and returns
# the best one with its score (the displayed tuple is (threshold, score)) — TODO confirm.
best_threshold_glm, f1_score_glm = find_best_threshold(val_pred_glm, val_labels)

(0.3, 0.8387096774193549)

In [52]:
# Evaluate the GLM validation predictions at a fixed threshold of 0.5
# (helper not defined in this file; presumably returns the same score as above — TODO confirm).
evaluate_threshold(val_pred_glm, val_labels, 0.5) 

0.8070175438596491

#### Train Random Forest

In [53]:
# Search the random-forest settings described by params_rf (helper not defined in this
# file; presumably fits on train_set and scores on val_set — TODO confirm).
# The displayed tuple is (selected parameter values, score).
best_params, f1_score_rf = find_best_rf(train_set, val_set, names_ft, params_rf)

(Int8[2, 20, 80, 6], 0.8375219170075978)

#### Train ensemble model

Get probabilities for RF

In [54]:
# Helper not defined in this file; presumably trains a forest with best_params on
# train_set and returns per-class scores for val_set — TODO confirm.
val_pred_rf = get_rf_probas(train_set, val_set, names_ft, best_params);
# Evaluated but neither stored nor displayed; column 2 is extracted again when combining.
val_pred_rf[:, 2];

Combine them

In [55]:
# Ensemble score: mean of the GLM predictions and column 2 of the RF output
# (presumably the positive-class probability — TODO confirm).
val_pred = (val_pred_glm + val_pred_rf[:, 2]) ./ 2;
# Choose the threshold for the averaged scores with the same helper used for the GLM.
best_threshold, f1_score = find_best_threshold(val_pred, val_labels)

(0.4, 0.8524590163934426)

In [56]:
# Evaluate the averaged validation scores at a fixed threshold of 0.5.
evaluate_threshold(val_pred, val_labels, 0.5) 

0.8275862068965517

#### Get full model

In [57]:
# Load the rows to predict for structure 3350 and show their dimensions.
test_set = CSV.read("data/parsed/test_3350.csv");
size(test_set)

(70, 9)

In [58]:
# Refit the GLM on the whole data_set (training and validation rows together),
# then predict the test rows.
test_model_glm = glm(val_form, data_set, Bernoulli(), LogitLink());
test_pred_glm = GLM.predict(test_model_glm, test_set);

In [59]:
# Same helper as on validation, now given the whole data_set and the test rows.
test_pred_rf = get_rf_probas(data_set, test_set, names_ft, best_params);
# Evaluated but neither stored nor displayed.
test_pred_rf[:, 2];

In [60]:
# Average the GLM predictions with column 2 of the RF output for the test rows.
test_pred = (test_pred_glm + test_pred_rf[:, 2]) ./ 2;

In [61]:
# Binarize the averaged scores: 1 where the score reaches best_threshold, 0 otherwise.
# A single comparison replaces the two in-place masked writes (whose result depended on
# their order) and the Float64 -> Int round trip.
test_pred = Int.(test_pred .>= best_threshold);

#### Get prediction for ouvrage 3350

In [62]:
# Attach the binary predictions to a copy of the test rows, so that `test_set` itself is
# not modified through an alias.
pred_3350 = copy(test_set);
pred_3350[!, :SURVERSE] = test_pred;
# Columns shown in the preview.
vis_ft = [:DATE, :SURVERSE, :FS_sum, :FS_max3, :SS_sum, :SS_max3];
first(pred_3350[!, vis_ft], 10)

Unnamed: 0_level_0,DATE,SURVERSE,FS_sum,FS_max,SS_sum,SS_max
Unnamed: 0_level_1,Date,Int64,Float64,Float64,Float64,Float64
1,2019-05-01,1,0.251784,0.266921,0.512872,0.288286
2,2019-05-02,0,-0.221197,-0.15615,-0.0597502,-0.0342671
3,2019-05-08,0,-0.402043,-0.416502,-0.340659,-0.35682
4,2019-05-09,1,0.794321,0.982888,0.620914,0.852754
5,2019-05-10,1,3.56265,1.14561,3.81896,1.41722
6,2019-05-11,0,-0.402043,-0.416502,-0.319051,-0.303061
7,2019-05-13,0,-0.276842,-0.123606,-0.146184,0.127009
8,2019-05-14,1,1.04472,0.169289,0.685739,0.234527
9,2019-05-18,0,-0.37422,-0.351414,-0.340659,-0.35682
10,2019-05-19,1,0.390896,0.462185,0.361613,0.261407


## Ouvrage 4240

In [78]:
# Load the parsed records for structure 4240 and split them at random into a training
# part and a validation part.
data_set = CSV.read("data/parsed/ouvrage_4240.csv");
(train_set, val_set) = partitionTrainTest(data_set);
val_labels = val_set.SURVERSE;  # validation targets
size(data_set)

(1100, 10)

#### Train GLM

In [79]:
# Fit a GLM with a Bernoulli family and logit link on the training split.
val_model = glm(val_form, train_set, Bernoulli(), LogitLink())
# Predict on the validation split; `predict` is called through the GLM module name.
val_pred_glm = GLM.predict(val_model, val_set);

# Helper not defined in this file; presumably scans candidate thresholds and returns
# the best one with its score (the displayed tuple is (threshold, score)) — TODO confirm.
best_threshold_glm, f1_score_glm = find_best_threshold(val_pred_glm, val_labels)

(0.3, 0.6451612903225806)

In [80]:
# Evaluate the GLM validation predictions at a fixed threshold of 0.5
# (helper not defined in this file; presumably returns the same score as above — TODO confirm).
evaluate_threshold(val_pred_glm, val_labels, 0.5) 

0.6

#### Train Random Forest

In [81]:
# Search the random-forest settings described by params_rf (helper not defined in this
# file; presumably fits on train_set and scores on val_set — TODO confirm).
# The displayed tuple is (selected parameter values, score).
best_params, f1_score_rf = find_best_rf(train_set, val_set, names_ft, params_rf)

(Int8[5, 50, 95, 9], 0.6547619047619048)

#### Train ensemble model

Get probabilities for RF

In [82]:
# Helper not defined in this file; presumably trains a forest with best_params on
# train_set and returns per-class scores for val_set — TODO confirm.
val_pred_rf = get_rf_probas(train_set, val_set, names_ft, best_params);
# Evaluated but neither stored nor displayed; column 2 is extracted again when combining.
val_pred_rf[:, 2];

Combine them

In [83]:
# Ensemble score: mean of the GLM predictions and column 2 of the RF output
# (presumably the positive-class probability — TODO confirm).
val_pred = (val_pred_glm + val_pred_rf[:, 2]) ./ 2;
# Choose the threshold for the averaged scores with the same helper used for the GLM.
best_threshold, f1_score = find_best_threshold(val_pred, val_labels)

(0.15, 0.6666666666666666)

In [85]:
# Evaluate the averaged validation scores at a fixed threshold of 0.5.
evaluate_threshold(val_pred, val_labels, 0.5) 

0.5185185185185185

#### Get full model

In [86]:
# Load the rows to predict for structure 4240 and show their dimensions.
test_set = CSV.read("data/parsed/test_4240.csv");
size(test_set)

(49, 9)

In [87]:
# Refit the GLM on the whole data_set (training and validation rows together),
# then predict the test rows.
test_model_glm = glm(val_form, data_set, Bernoulli(), LogitLink());
test_pred_glm = GLM.predict(test_model_glm, test_set);

In [88]:
# Same helper as on validation, now given the whole data_set and the test rows.
test_pred_rf = get_rf_probas(data_set, test_set, names_ft, best_params);
# Evaluated but neither stored nor displayed.
test_pred_rf[:, 2];

In [89]:
# Average the GLM predictions with column 2 of the RF output for the test rows.
test_pred = (test_pred_glm + test_pred_rf[:, 2]) ./ 2;

In [90]:
# Binarize the averaged scores: 1 where the score reaches best_threshold, 0 otherwise.
# A single comparison replaces the two in-place masked writes (whose result depended on
# their order) and the Float64 -> Int round trip.
test_pred = Int.(test_pred .>= best_threshold);

#### Get prediction for ouvrage 4240

In [92]:
# Attach the binary predictions to a copy of the test rows, so that `test_set` itself is
# not modified through an alias.
pred_4240 = copy(test_set);
pred_4240[!, :SURVERSE] = test_pred;
# Columns shown in the preview.
vis_ft = [:DATE, :SURVERSE, :FS_sum, :FS_max3, :SS_sum, :SS_max3];
first(pred_4240[!, vis_ft], 10)

Unnamed: 0_level_0,DATE,SURVERSE,FS_sum,FS_max3,SS_sum,SS_max3
Unnamed: 0_level_1,Date,Int64,Float64,Float64,Float64,Float64
1,2019-05-07,0,-0.358123,-0.373363,-0.373886,-0.388186
2,2019-05-09,0,0.623037,1.13181,0.529064,0.928872
3,2019-05-10,1,4.26944,2.45726,2.97993,1.10685
4,2019-05-15,0,-0.358123,-0.373363,-0.373886,-0.388186
5,2019-05-21,0,-0.358123,-0.373363,-0.338706,-0.334792
6,2019-05-22,0,-0.358123,-0.373363,-0.373886,-0.388186
7,2019-05-23,1,2.76109,3.80518,3.74216,4.70207
8,2019-05-24,0,-0.153104,-0.0588493,-0.115901,0.00337169
9,2019-05-25,0,-0.270258,-0.283502,-0.2918,-0.2636
10,2019-06-01,0,-0.358123,-0.373363,-0.373886,-0.388186


## Ouvrage 4350

In [93]:
# Load the parsed records for structure 4350 and split them at random into a training
# part and a validation part.
data_set = CSV.read("data/parsed/ouvrage_4350.csv");
(train_set, val_set) = partitionTrainTest(data_set);
val_labels = val_set.SURVERSE;  # validation targets
size(data_set)

(1100, 10)

#### Train GLM

In [94]:
# Fit a GLM with a Bernoulli family and logit link on the training split.
val_model = glm(val_form, train_set, Bernoulli(), LogitLink())
# Predict on the validation split; `predict` is called through the GLM module name.
val_pred_glm = GLM.predict(val_model, val_set);

# Helper not defined in this file; presumably scans candidate thresholds and returns
# the best one with its score (the displayed tuple is (threshold, score)) — TODO confirm.
best_threshold_glm, f1_score_glm = find_best_threshold(val_pred_glm, val_labels)

(0.55, 0.7058823529411765)

In [95]:
# Evaluate the GLM validation predictions at a fixed threshold of 0.5
# (helper not defined in this file; presumably returns the same score as above — TODO confirm).
evaluate_threshold(val_pred_glm, val_labels, 0.5) 

0.6666666666666666

#### Train Random Forest

In [96]:
# Search the random-forest settings described by params_rf (helper not defined in this
# file; presumably fits on train_set and scores on val_set — TODO confirm).
# The displayed tuple is (selected parameter values, score).
best_params, f1_score_rf = find_best_rf(train_set, val_set, names_ft, params_rf)

(Int8[4, 30, 80, 12], 0.9181286549707601)

#### Train ensemble model

Get probabilities for RF

In [97]:
# Helper not defined in this file; presumably trains a forest with best_params on
# train_set and returns per-class scores for val_set — TODO confirm.
val_pred_rf = get_rf_probas(train_set, val_set, names_ft, best_params);
# Evaluated but neither stored nor displayed; column 2 is extracted again when combining.
val_pred_rf[:, 2];

Combine them

In [98]:
# Ensemble score: mean of the GLM predictions and column 2 of the RF output
# (presumably the positive-class probability — TODO confirm).
val_pred = (val_pred_glm + val_pred_rf[:, 2]) ./ 2;
# Choose the threshold for the averaged scores with the same helper used for the GLM.
best_threshold, f1_score = find_best_threshold(val_pred, val_labels)

(0.55, 0.7777777777777778)

In [103]:
# Evaluate the averaged validation scores at a fixed threshold of 0.5.
evaluate_threshold(val_pred, val_labels, 0.5) 

0.7

#### Get full model

In [99]:
# Load the rows to predict for structure 4350 and show their dimensions.
test_set = CSV.read("data/parsed/test_4350.csv");
size(test_set)

(65, 9)

In [100]:
# Refit the GLM on the whole data_set (training and validation rows together),
# then predict the test rows.
test_model_glm = glm(val_form, data_set, Bernoulli(), LogitLink());
test_pred_glm = GLM.predict(test_model_glm, test_set);

In [101]:
# Same helper as on validation, now given the whole data_set and the test rows.
test_pred_rf = get_rf_probas(data_set, test_set, names_ft, best_params);
# Evaluated but neither stored nor displayed.
test_pred_rf[:, 2];

In [102]:
# Average the GLM predictions with column 2 of the RF output for the test rows.
test_pred = (test_pred_glm + test_pred_rf[:, 2]) ./ 2;

In [104]:
# Binarize the averaged scores: 1 where the score reaches best_threshold, 0 otherwise.
# A single comparison replaces the two in-place masked writes (whose result depended on
# their order) and the Float64 -> Int round trip.
test_pred = Int.(test_pred .>= best_threshold);

#### Get prediction for ouvrage 4350

In [106]:
# Attach the binary predictions to a copy of the test rows, so that `test_set` itself is
# not modified through an alias.
pred_4350 = copy(test_set);
pred_4350[!, :SURVERSE] = test_pred;
# Columns shown in the preview.
vis_ft = [:DATE, :SURVERSE, :FS_sum, :FS_max3, :SS_sum, :SS_max3];
first(pred_4350[!, vis_ft], 10)

Unnamed: 0_level_0,DATE,SURVERSE,FS_sum,FS_max3,SS_sum,SS_max3
Unnamed: 0_level_1,Date,Int64,Float64,Float64,Float64,Float64
1,2019-05-03,0,0.0306699,0.119197,0.039479,0.0507577
2,2019-05-04,0,-0.366718,-0.370924,-0.0493965,-0.0514987
3,2019-05-07,0,-0.354299,-0.351319,-0.0493965,-0.0514987
4,2019-05-08,0,-0.391555,-0.410134,-0.0493965,-0.0514987
5,2019-05-10,1,4.38952,2.62861,0.588891,0.338935
6,2019-05-11,0,-0.366718,-0.370924,-0.0493965,-0.0514987
7,2019-05-12,0,-0.391555,-0.410134,-0.0493965,-0.0514987
8,2019-05-21,0,-0.391555,-0.410134,-0.0493965,-0.0514987
9,2019-05-22,0,-0.391555,-0.410134,-0.0493965,-0.0514987
10,2019-05-23,0,1.86859,2.39336,0.380842,0.524855


## Ouvrage 4380

In [107]:
# Load the parsed records for structure 4380 and split them at random into a training
# part and a validation part.
data_set = CSV.read("data/parsed/ouvrage_4380.csv");
(train_set, val_set) = partitionTrainTest(data_set);
val_labels = val_set.SURVERSE;  # validation targets
size(data_set)

(1103, 10)

#### Train GLM

In [108]:
# Fit a GLM with a Bernoulli family and logit link on the training split.
val_model = glm(val_form, train_set, Bernoulli(), LogitLink())
# Predict on the validation split; `predict` is called through the GLM module name.
val_pred_glm = GLM.predict(val_model, val_set);

# Helper not defined in this file; presumably scans candidate thresholds and returns
# the best one with its score (the displayed tuple is (threshold, score)) — TODO confirm.
best_threshold_glm, f1_score_glm = find_best_threshold(val_pred_glm, val_labels)

(0.55, 0.7407407407407407)

In [109]:
# Evaluate the GLM validation predictions at a fixed threshold of 0.5
# (helper not defined in this file; presumably returns the same score as above — TODO confirm).
evaluate_threshold(val_pred_glm, val_labels, 0.5) 

0.7142857142857143

#### Train Random Forest

In [110]:
# Search the random-forest settings described by params_rf (helper not defined in this
# file; presumably fits on train_set and scores on val_set — TODO confirm).
# The displayed tuple is (selected parameter values, score).
best_params, f1_score_rf = find_best_rf(train_set, val_set, names_ft, params_rf)

(Int8[3, 20, 85, 6], 0.6683569979716024)

#### Train ensemble model

Get probabilities for RF

In [111]:
# Helper not defined in this file; presumably trains a forest with best_params on
# train_set and returns per-class scores for val_set — TODO confirm.
val_pred_rf = get_rf_probas(train_set, val_set, names_ft, best_params);
# Evaluated but neither stored nor displayed; column 2 is extracted again when combining.
val_pred_rf[:, 2];

Combine them

In [112]:
# Ensemble score: mean of the GLM predictions and column 2 of the RF output
# (presumably the positive-class probability — TODO confirm).
val_pred = (val_pred_glm + val_pred_rf[:, 2]) ./ 2;
# Choose the threshold for the averaged scores with the same helper used for the GLM.
best_threshold, f1_score = find_best_threshold(val_pred, val_labels)

(0.6, 0.7407407407407407)

In [113]:
# Evaluate the averaged validation scores at a fixed threshold of 0.5.
evaluate_threshold(val_pred, val_labels, 0.5) 

0.6896551724137931

#### Get full model

In [114]:
# Load the rows to predict for structure 4380 and show their dimensions.
test_set = CSV.read("data/parsed/test_4380.csv");
size(test_set)

(54, 9)

In [115]:
# Refit the GLM on the whole data_set (training and validation rows together),
# then predict the test rows.
test_model_glm = glm(val_form, data_set, Bernoulli(), LogitLink());
test_pred_glm = GLM.predict(test_model_glm, test_set);

In [116]:
# Same helper as on validation, now given the whole data_set and the test rows.
test_pred_rf = get_rf_probas(data_set, test_set, names_ft, best_params);
# Evaluated but neither stored nor displayed.
test_pred_rf[:, 2];

In [117]:
# Average the GLM predictions with column 2 of the RF output for the test rows.
test_pred = (test_pred_glm + test_pred_rf[:, 2]) ./ 2;

In [118]:
# Binarize the averaged scores: 1 where the score reaches best_threshold, 0 otherwise.
# A single comparison replaces the two in-place masked writes (whose result depended on
# their order) and the Float64 -> Int round trip.
test_pred = Int.(test_pred .>= best_threshold);

#### Get prediction for ouvrage 4380

In [120]:
# Attach the binary predictions to a copy of the test rows, so that `test_set` itself is
# not modified through an alias.
pred_4380 = copy(test_set);
pred_4380[!, :SURVERSE] = test_pred;
# Columns shown in the preview.
vis_ft = [:DATE, :SURVERSE, :FS_sum, :FS_max3, :SS_sum, :SS_max3];
first(pred_4380[!, vis_ft], 10)

Unnamed: 0_level_0,DATE,SURVERSE,FS_sum,FS_max3,SS_sum,SS_max3
Unnamed: 0_level_1,Date,Int64,Float64,Float64,Float64,Float64
1,2019-05-02,0,-0.068677,0.0211726,-0.035388,-0.0117939
2,2019-05-04,0,-0.366718,-0.370924,-0.116085,-0.121026
3,2019-05-05,0,-0.391555,-0.410134,-0.116085,-0.121026
4,2019-05-06,0,-0.391555,-0.410134,-0.116085,-0.121026
5,2019-05-08,0,-0.391555,-0.410134,-0.116085,-0.121026
6,2019-05-09,0,0.71368,1.3347,0.201957,0.366876
7,2019-05-10,1,4.38952,2.62861,1.38394,0.796521
8,2019-05-17,0,-0.329463,-0.351319,-0.0923509,-0.084615
9,2019-05-18,0,-0.391555,-0.410134,-0.116085,-0.121026
10,2019-05-28,0,-0.354299,-0.351319,-0.0686164,-0.0482044


# Prédiction

In [121]:
# Load the list of rows to submit and add a placeholder SURVERSE column of zeros.
test_set = CSV.read("data/test.csv");
test_set.SURVERSE = zeros(Float64, nrow(test_set));
size(test_set)

(283, 3)

Pour chaque ligne de test_set :

    - On vérifie l'ID de l'ouvrage pour savoir quelle prédiction charger
    - On va chercher la prédiction à la date de cette ligne pour cet ouvrage
    - On la met à la ligne courante

In [122]:
# Per-structure prediction tables, keyed by the structure ID found in NO_OUVRAGE.
preds_by_ouvrage = Dict(
    "3260-01D" => pred_3260,
    "3350-07D" => pred_3350,
    "4240-01D" => pred_4240,
    "4350-01D" => pred_4350,
    "4380-01D" => pred_4380,
);

# For every requested (structure, date) row, copy the matching prediction into test_set.
for i in 1:size(test_set, 1)
    curr_ouvrage = test_set[i, :NO_OUVRAGE];
    # Fail early with a readable message instead of passing `nothing` further down.
    haskey(preds_by_ouvrage, curr_ouvrage) ||
        error("No predictions available for ouvrage $(curr_ouvrage) (row $i)");
    pred_to_use = preds_by_ouvrage[curr_ouvrage];

    curr_date = test_set[i, :DATE];
    # Use the first row with a matching date; report a missing date explicitly rather
    # than failing with a BoundsError.
    match_idx = findfirst(d -> d == curr_date, pred_to_use[!, :DATE]);
    match_idx === nothing &&
        error("No prediction for ouvrage $(curr_ouvrage) on $(curr_date) (row $i)");

    test_set[i, :SURVERSE] = pred_to_use[match_idx, :SURVERSE];
end

In [123]:
# Store the SURVERSE column as integers and preview the first rows.
test_set.SURVERSE = Int.(test_set.SURVERSE);
first(test_set, 10)

Unnamed: 0_level_0,NO_OUVRAGE,DATE,SURVERSE
Unnamed: 0_level_1,String,Date,Int64
1,3260-01D,2019-05-02,0
2,3260-01D,2019-05-09,0
3,3260-01D,2019-05-10,1
4,3260-01D,2019-05-15,0
5,3260-01D,2019-05-20,0
6,3260-01D,2019-05-23,1
7,3260-01D,2019-05-24,0
8,3260-01D,2019-05-26,0
9,3260-01D,2019-05-30,0
10,3350-07D,2019-05-01,1


In [124]:
# Build "<NO_OUVRAGE>_<DATE>" identifiers, pair them with the predictions and write the
# submission file; its name carries the submission number.
ID = string.(test_set[:, :NO_OUVRAGE], "_", test_set[:, :DATE])
sampleSubmission = DataFrame(ID = ID, Surverse = test_set[:, :SURVERSE])
CSV.write("submissions/mc-submission-$(no_soumission).csv", sampleSubmission)

"submissions/mc-submission-20.csv"