In [1]:
using CSV, DataFrames, GLM, Statistics, Dates, Gadfly, Random, MLBase, DecisionTree;
include("utils/precipitation.jl");
include("utils/random-forest.jl");
include("utils/reg-log.jl");

┌ Info: Loading DataFrames support into Gadfly.jl
└ @ Gadfly /home/chaime/.julia/packages/Gadfly/09PWZ/src/mapping.jl:228


In [2]:
# Submission number; interpolated into the output CSV file name in the last cell.
no_soumission = 23;

In [3]:
"""
    partitionTrainTest(data, at = 0.8; rng = Random.GLOBAL_RNG)

Randomly split the rows of `data` in two and return the tuple `(train, test)`.

The row indices `1:nrow(data)` are shuffled; the first `floor(Int, at * nrow(data))`
shuffled rows form `train` and the remaining rows form `test`.

# Arguments
- `data`: a table supporting `nrow(data)` and `data[rows, :]` indexing.
- `at`: fraction of the rows placed in `train`; must satisfy `0 <= at <= 1`.

# Keywords
- `rng`: random number generator used for the shuffle. Pass a seeded generator
  (for example `MersenneTwister(1234)`) to obtain a reproducible split.

# Throws
- `ArgumentError` if `at` lies outside `[0, 1]`.

Adapted from https://discourse.julialang.org/t/simple-tool-for-train-test-split/473/2
"""
function partitionTrainTest(data, at = 0.8; rng = Random.GLOBAL_RNG)
    0 <= at <= 1 || throw(ArgumentError("`at` must be in [0, 1], got $at"))
    n = nrow(data)
    n_train = floor(Int, at * n)  # size of the first part, computed once
    idx = shuffle(rng, 1:n)
    train_idx = view(idx, 1:n_train)
    test_idx = view(idx, (n_train + 1):n)
    return data[train_idx, :], data[test_idx, :]
end

partitionTrainTest (generic function with 2 methods)

In [4]:
# Model formula shared by every GLM fit below: `SURVERSE` is the response and the
# six FS_*/SS_* columns are the predictors.
val_form = @formula(
    SURVERSE ~ FS_sum + FS_max + FS_max3 + SS_sum + SS_max + SS_max3
);

In [5]:
# Feature columns handed to the tree-based models (the same six predictors that
# appear on the right-hand side of `val_form`).
names_ft = [
    :FS_sum, :FS_max, :FS_max3,
    :SS_sum, :SS_max, :SS_max3,
];

## Ouvrage 3260

In [7]:
# Read the table stored under data/parsed/oversampled/ for ouvrage 3260 and split
# it into training and validation parts (default ratio of `partitionTrainTest`).
data_set_3260 = CSV.read("data/parsed/oversampled/ouvrage_3260.csv")
(train_set_3260, val_set_3260) = partitionTrainTest(data_set_3260)
# `SURVERSE` column of the validation part (the column itself, not a copy).
val_labels_3260 = val_set_3260.SURVERSE
size(data_set_3260)

(10000, 10)

#### Train GLM

In [8]:
# Fit a logistic regression (Bernoulli family, logit link) on the training part
# and predict on the validation part.
val_model_glm_3260 = glm(
    val_form,
    train_set_3260,
    Bernoulli(),
    LogitLink(),
)
val_pred_glm_3260 = GLM.predict(val_model_glm_3260, val_set_3260)

# Score the predictions against the validation labels with a 0.5 cut-off.
# NOTE(review): `evaluate_threshold` presumably comes from the included utils files.
evaluate_threshold(val_pred_glm_3260, val_labels_3260, 0.5)

0.8763213530655392

#### Train Decision Tree 

In [9]:
# Plain Float64 feature matrix and Int64 label vector for DecisionTree.
train_features_3260 = Matrix{Float64}(train_set_3260[:, names_ft])
train_labels_3260 = convert(Vector{Int64}, train_set_3260[!, :SURVERSE])

# Grow a tree, then prune it with a 0.9 threshold.
dt_model_3260 = prune_tree(build_tree(train_labels_3260, train_features_3260), 0.9)

Decision Tree
Leaves: 257
Depth:  26

In [10]:
# Validation features/labels in the same plain-array form used for training.
val_features_3260 = Matrix{Float64}(val_set_3260[:, names_ft])
val_labels_3260 = convert(Vector{Int64}, val_set_3260[!, :SURVERSE])

# Apply the pruned tree and report the F1 score of its predictions.
val_pred_dt_3260 = apply_tree(dt_model_3260, val_features_3260)
r = roc(val_labels_3260, val_pred_dt_3260)
f1score(r)

0.9598393574297188

#### Train Random Forest

In [11]:
# Settings forwarded to the `get_rf_*` helpers; the meaning of each entry is
# defined by those helpers (not visible in this notebook).
estimated_params_3260 = [5, 100, 80, 20]
# NOTE(review): presumably fits a random forest on the training part and returns
# a validation score — confirm in utils/random-forest.jl.
get_rf_direct(
    train_set_3260, val_set_3260, names_ft, estimated_params_3260,
)

0.978744939271255

#### Train ensemble model

Combine them

In [12]:
# Random-forest output for the validation part.
val_pred_rf_3260 = get_rf_probas(
    train_set_3260, val_set_3260, names_ft, estimated_params_3260,
)
# Average the GLM prediction with column 2 of the random-forest output
# (presumably the positive-class probability) and score it at a 0.5 cut-off.
val_pred_3260 = (val_pred_glm_3260 .+ val_pred_rf_3260[:, 2]) ./ 2
evaluate_threshold(val_pred_3260, val_labels_3260, 0.5)

0.9655172413793104

#### Get full model

In [13]:
# Rows of ouvrage 3260 that must be predicted for the submission.
test_set_3260 = CSV.read("data/parsed/test_3260.csv")
size(test_set_3260)

(45, 9)

In [14]:
# Final models: the GLM is refit on the whole data set (training + validation
# rows) and the random-forest helper receives that same full set.
test_model_glm_3260 = glm(val_form, data_set_3260, Bernoulli(), LogitLink());
test_pred_glm_3260 = GLM.predict(test_model_glm_3260, test_set_3260);
test_pred_rf_3260 = get_rf_probas(
    data_set_3260, test_set_3260, names_ft, estimated_params_3260,
);
# Average both scores and threshold at 0.5 in a single pass; the result is
# directly a vector of Int (1 when the averaged score is >= 0.5, else 0).
test_pred_3260 = Int.(((test_pred_glm_3260 .+ test_pred_rf_3260[:, 2]) ./ 2) .>= 0.5);

#### Get prediction for ouvrage 3260

In [15]:
# Work on a copy: plain assignment would only alias `test_set_3260`, so adding
# the prediction column would silently modify the test set as well.
pred_3260 = copy(test_set_3260);
pred_3260[!, :SURVERSE] = test_pred_3260;
# Columns shown in the preview below.
vis_ft = [:DATE, :SURVERSE, :FS_sum, :FS_max, :SS_sum, :SS_max];
first(pred_3260[!, vis_ft], 10)

Unnamed: 0_level_0,DATE,SURVERSE,FS_sum,FS_max,SS_sum,SS_max
Unnamed: 0_level_1,Date,Int64,Float64,Float64,Float64,Float64
1,2019-05-02,0,-0.0635098,-0.0332178,-0.0891232,-0.016819
2,2019-05-09,1,0.742849,1.01886,0.50862,0.620324
3,2019-05-10,1,4.53146,1.68837,3.48538,1.14162
4,2019-05-15,0,-0.370694,-0.35203,-0.292356,-0.306429
5,2019-05-20,0,0.192477,0.859456,0.305387,0.79409
6,2019-05-23,1,1.93319,1.59272,2.25403,2.56071
7,2019-05-24,0,-0.229901,-0.128861,-0.124988,0.0411031
8,2019-05-26,0,-0.357895,-0.320149,-0.208672,-0.161624
9,2019-05-30,0,-0.306698,-0.192624,-0.172807,-0.016819
10,2019-06-02,0,0.794047,0.381238,0.448846,0.127986


## Ouvrage 3350

In [16]:
# Read the table stored under data/parsed/oversampled/ for ouvrage 3350 and split
# it into training and validation parts (default ratio of `partitionTrainTest`).
data_set_3350 = CSV.read("data/parsed/oversampled/ouvrage_3350.csv")
(train_set_3350, val_set_3350) = partitionTrainTest(data_set_3350)
# `SURVERSE` column of the validation part (the column itself, not a copy).
val_labels_3350 = val_set_3350.SURVERSE
size(data_set_3350)

(10000, 10)

#### Train GLM

In [18]:
# Fit a logistic regression (Bernoulli family, logit link) on the training part
# and predict on the validation part.
val_model_glm_3350 = glm(
    val_form,
    train_set_3350,
    Bernoulli(),
    LogitLink(),
)
val_pred_glm_3350 = GLM.predict(val_model_glm_3350, val_set_3350)

# Score the predictions against the validation labels with a 0.5 cut-off.
# NOTE(review): `evaluate_threshold` presumably comes from the included utils files.
evaluate_threshold(val_pred_glm_3350, val_labels_3350, 0.5)

0.8684491978609625

#### Train Decision Tree 

In [19]:
# Plain Float64 feature matrix and Int64 label vector for DecisionTree.
train_features_3350 = Matrix{Float64}(train_set_3350[:, names_ft])
train_labels_3350 = convert(Vector{Int64}, train_set_3350[!, :SURVERSE])

# Grow a tree, then prune it with a 0.9 threshold.
dt_model_3350 = prune_tree(build_tree(train_labels_3350, train_features_3350), 0.9)

Decision Tree
Leaves: 328
Depth:  34

In [21]:
# Validation features/labels in the same plain-array form used for training.
val_features_3350 = Matrix{Float64}(val_set_3350[:, names_ft])
val_labels_3350 = convert(Vector{Int64}, val_set_3350[!, :SURVERSE])

# Apply the pruned tree and report the F1 score of its predictions.
val_pred_dt_3350 = apply_tree(dt_model_3350, val_features_3350)
r = roc(val_labels_3350, val_pred_dt_3350)
f1score(r)

0.9365951073389915

#### Train Random Forest

In [23]:
# Settings forwarded to the `get_rf_*` helpers; the meaning of each entry is
# defined by those helpers (not visible in this notebook).
estimated_params_3350 = [5, 100, 80, 25]
# NOTE(review): presumably fits a random forest on the training part and returns
# a validation score — confirm in utils/random-forest.jl.
get_rf_direct(
    train_set_3350, val_set_3350, names_ft, estimated_params_3350,
)

0.9642492339121552

In [24]:
# Random-forest output for the validation part, used by the ensemble cell below.
val_pred_rf_3350 = get_rf_probas(
    train_set_3350, val_set_3350, names_ft, estimated_params_3350,
);

#### Train ensemble model

Combine them

In [25]:
# Average the GLM prediction with column 2 of the random-forest output
# (presumably the positive-class probability) and score it at a 0.5 cut-off.
val_pred_3350 = (val_pred_glm_3350 .+ val_pred_rf_3350[:, 2]) ./ 2
evaluate_threshold(val_pred_3350, val_labels_3350, 0.5)

0.9530201342281879

#### Get full model

In [26]:
# Rows of ouvrage 3350 that must be predicted for the submission.
test_set_3350 = CSV.read("data/parsed/test_3350.csv")
size(test_set_3350)

(70, 9)

In [27]:
# Final models: the GLM is refit on the whole data set (training + validation
# rows) and the random-forest helper receives that same full set.
test_model_glm_3350 = glm(val_form, data_set_3350, Bernoulli(), LogitLink());
test_pred_glm_3350 = GLM.predict(test_model_glm_3350, test_set_3350);
test_pred_rf_3350 = get_rf_probas(
    data_set_3350, test_set_3350, names_ft, estimated_params_3350,
);
# Average both scores and threshold at 0.5 in a single pass; the result is
# directly a vector of Int (1 when the averaged score is >= 0.5, else 0).
test_pred_3350 = Int.(((test_pred_glm_3350 .+ test_pred_rf_3350[:, 2]) ./ 2) .>= 0.5);

#### Get prediction for ouvrage 3350

In [28]:
# Work on a copy: plain assignment would only alias `test_set_3350`, so adding
# the prediction column would silently modify the test set as well.
pred_3350 = copy(test_set_3350);
pred_3350[!, :SURVERSE] = test_pred_3350;
# Columns shown in the preview below.
vis_ft = [:DATE, :SURVERSE, :FS_sum, :FS_max3, :SS_sum, :SS_max3];
first(pred_3350[!, vis_ft], 10)

Unnamed: 0_level_0,DATE,SURVERSE,FS_sum,FS_max3,SS_sum,SS_max3
Unnamed: 0_level_1,Date,Int64,Float64,Float64,Float64,Float64
1,2019-05-01,1,0.256732,0.476482,0.534936,0.609527
2,2019-05-02,0,-0.21833,-0.18342,-0.0552547,0.0260547
3,2019-05-08,0,-0.399971,-0.425384,-0.344782,-0.362927
4,2019-05-09,1,0.801656,1.46634,0.646292,1.21068
5,2019-05-10,1,3.58217,2.3462,3.94245,2.37762
6,2019-05-11,0,-0.399971,-0.425384,-0.322511,-0.327565
7,2019-05-13,0,-0.274219,-0.227413,-0.14434,-0.0446692
8,2019-05-14,1,1.05316,0.542472,0.713106,0.415036
9,2019-05-18,0,-0.372026,-0.381391,-0.344782,-0.362927
10,2019-05-19,1,0.396456,0.586466,0.379036,0.415036


## Ouvrage 4240

In [29]:
# Read the table stored under data/parsed/oversampled/ for ouvrage 4240 and split
# it into training and validation parts (default ratio of `partitionTrainTest`).
data_set_4240 = CSV.read("data/parsed/oversampled/ouvrage_4240.csv")
(train_set_4240, val_set_4240) = partitionTrainTest(data_set_4240)
# `SURVERSE` column of the validation part (the column itself, not a copy).
val_labels_4240 = val_set_4240.SURVERSE
size(data_set_4240)

(10000, 10)

#### Train GLM

In [30]:
# Fit a logistic regression (Bernoulli family, logit link) on the training part
# and predict on the validation part.
val_model_glm_4240 = glm(
    val_form,
    train_set_4240,
    Bernoulli(),
    LogitLink(),
)
val_pred_glm_4240 = GLM.predict(val_model_glm_4240, val_set_4240)

# Score the predictions against the validation labels with a 0.5 cut-off.
# NOTE(review): `evaluate_threshold` presumably comes from the included utils files.
evaluate_threshold(val_pred_glm_4240, val_labels_4240, 0.5)

0.8218298555377207

#### Train Decision Tree 

In [31]:
# Plain Float64 feature matrix and Int64 label vector for DecisionTree.
train_features_4240 = Matrix{Float64}(train_set_4240[:, names_ft])
train_labels_4240 = convert(Vector{Int64}, train_set_4240[!, :SURVERSE])

# Grow a tree, then prune it with a 0.9 threshold.
dt_model_4240 = prune_tree(build_tree(train_labels_4240, train_features_4240), 0.9)

Decision Tree
Leaves: 236
Depth:  29

In [32]:
# Validation features/labels in the same plain-array form used for training.
val_features_4240 = Matrix{Float64}(val_set_4240[:, names_ft])
val_labels_4240 = convert(Vector{Int64}, val_set_4240[!, :SURVERSE])

# Apply the pruned tree and report the F1 score of its predictions.
val_pred_dt_4240 = apply_tree(dt_model_4240, val_features_4240)
r = roc(val_labels_4240, val_pred_dt_4240)
f1score(r)

0.9548577036310107

#### Train Random Forest

In [33]:
# Settings forwarded to the `get_rf_*` helpers; the meaning of each entry is
# defined by those helpers (not visible in this notebook).
estimated_params_4240 = [5, 100, 80, 20]
# NOTE(review): presumably fits a random forest on the training part and returns
# a validation score — confirm in utils/random-forest.jl.
get_rf_direct(
    train_set_4240, val_set_4240, names_ft, estimated_params_4240,
)

0.9708641975308642

In [34]:
# Random-forest output for the validation part, used by the ensemble cell below.
val_pred_rf_4240 = get_rf_probas(
    train_set_4240, val_set_4240, names_ft, estimated_params_4240,
);

#### Train ensemble model

Combine them

In [35]:
# Average the GLM prediction with column 2 of the random-forest output
# (presumably the positive-class probability) and score it at a 0.5 cut-off.
val_pred_4240 = (val_pred_glm_4240 .+ val_pred_rf_4240[:, 2]) ./ 2
evaluate_threshold(val_pred_4240, val_labels_4240, 0.5)

0.9625615763546798

#### Get full model

In [36]:
# Rows of ouvrage 4240 that must be predicted for the submission.
test_set_4240 = CSV.read("data/parsed/test_4240.csv")
size(test_set_4240)

(49, 9)

In [37]:
# Final models: the GLM is refit on the whole data set (training + validation
# rows) and the random-forest helper receives that same full set.
test_model_glm_4240 = glm(val_form, data_set_4240, Bernoulli(), LogitLink());
test_pred_glm_4240 = GLM.predict(test_model_glm_4240, test_set_4240);
test_pred_rf_4240 = get_rf_probas(
    data_set_4240, test_set_4240, names_ft, estimated_params_4240,
);
# Average both scores and threshold at 0.5 in a single pass; the result is
# directly a vector of Int (1 when the averaged score is >= 0.5, else 0).
# No trailing semicolon: the resulting vector is displayed under the cell.
test_pred_4240 = Int.(((test_pred_glm_4240 .+ test_pred_rf_4240[:, 2]) ./ 2) .>= 0.5)

49-element Array{Int64,1}:
 0
 0
 1
 0
 0
 0
 1
 0
 0
 0
 0
 1
 0
 ⋮
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0

#### Get prediction for ouvrage 4240

In [38]:
# Work on a copy: plain assignment would only alias `test_set_4240`, so adding
# the prediction column would silently modify the test set as well.
pred_4240 = copy(test_set_4240);
pred_4240[!, :SURVERSE] = test_pred_4240;
# Columns shown in the preview below.
vis_ft = [:DATE, :SURVERSE, :FS_sum, :FS_max3, :SS_sum, :SS_max3];
first(pred_4240[!, vis_ft], 10)

Unnamed: 0_level_0,DATE,SURVERSE,FS_sum,FS_max3,SS_sum,SS_max3
Unnamed: 0_level_1,Date,Int64,Float64,Float64,Float64,Float64
1,2019-05-07,0,-0.358123,-0.373363,-0.373593,-0.388305
2,2019-05-09,0,0.623037,1.13181,0.555808,0.972262
3,2019-05-10,1,4.26944,2.45726,3.07847,1.15612
4,2019-05-15,0,-0.358123,-0.373363,-0.373593,-0.388305
5,2019-05-21,0,-0.358123,-0.373363,-0.337383,-0.333147
6,2019-05-22,0,-0.358123,-0.373363,-0.373593,-0.388305
7,2019-05-23,1,2.76109,3.80518,3.86303,4.8701
8,2019-05-24,0,-0.153104,-0.0588493,-0.10805,0.0161877
9,2019-05-25,0,-0.270258,-0.283502,-0.289102,-0.259603
10,2019-06-01,0,-0.358123,-0.373363,-0.373593,-0.388305


## Ouvrage 4350

In [39]:
# Read the table stored under data/parsed/oversampled/ for ouvrage 4350 and split
# it into training and validation parts (default ratio of `partitionTrainTest`).
data_set_4350 = CSV.read("data/parsed/oversampled/ouvrage_4350.csv")
(train_set_4350, val_set_4350) = partitionTrainTest(data_set_4350)
# `SURVERSE` column of the validation part (the column itself, not a copy).
val_labels_4350 = val_set_4350.SURVERSE
size(data_set_4350)

(10000, 10)

#### Train GLM

In [40]:
# Fit a logistic regression (Bernoulli family, logit link) on the training part
# and predict on the validation part.
val_model_glm_4350 = glm(
    val_form,
    train_set_4350,
    Bernoulli(),
    LogitLink(),
)
val_pred_glm_4350 = GLM.predict(val_model_glm_4350, val_set_4350)

# Score the predictions against the validation labels with a 0.5 cut-off.
# NOTE(review): `evaluate_threshold` presumably comes from the included utils files.
evaluate_threshold(val_pred_glm_4350, val_labels_4350, 0.5)

0.9323456790123457

#### Train Decision Tree 

In [42]:
# Plain Float64 feature matrix and Int64 label vector for DecisionTree.
train_features_4350 = Matrix{Float64}(train_set_4350[:, names_ft])
train_labels_4350 = convert(Vector{Int64}, train_set_4350[!, :SURVERSE])

# Grow a tree, then prune it with a 0.9 threshold.
dt_model_4350 = prune_tree(build_tree(train_labels_4350, train_features_4350), 0.9)

Decision Tree
Leaves: 95
Depth:  18

In [43]:
# Validation features/labels in the same plain-array form used for training.
val_features_4350 = Matrix{Float64}(val_set_4350[:, names_ft])
val_labels_4350 = convert(Vector{Int64}, val_set_4350[!, :SURVERSE])

# Apply the pruned tree and report the F1 score of its predictions.
val_pred_dt_4350 = apply_tree(dt_model_4350, val_features_4350)
r = roc(val_labels_4350, val_pred_dt_4350)
f1score(r)

0.9807787087235091

#### Train Random Forest

In [45]:
# Settings forwarded to the `get_rf_*` helpers; the meaning of each entry is
# defined by those helpers (not visible in this notebook).
estimated_params_4350 = [5, 100, 80, 13]
# NOTE(review): presumably fits a random forest on the training part and returns
# a validation score — confirm in utils/random-forest.jl.
get_rf_direct(
    train_set_4350, val_set_4350, names_ft, estimated_params_4350,
)

0.9877750611246944

In [46]:
# Random-forest output for the validation part, used by the ensemble cell below.
val_pred_rf_4350 = get_rf_probas(
    train_set_4350, val_set_4350, names_ft, estimated_params_4350,
);

#### Train ensemble model

Combine them

In [47]:
# Average the GLM prediction with column 2 of the random-forest output
# (presumably the positive-class probability) and score it at a 0.5 cut-off.
val_pred_4350 = (val_pred_glm_4350 .+ val_pred_rf_4350[:, 2]) ./ 2
evaluate_threshold(val_pred_4350, val_labels_4350, 0.5)

0.9805825242718447

#### Get full model

In [48]:
# Rows of ouvrage 4350 that must be predicted for the submission.
test_set_4350 = CSV.read("data/parsed/test_4350.csv")
size(test_set_4350)

(65, 9)

In [49]:
# Final models: the GLM is refit on the whole data set (training + validation
# rows) and the random-forest helper receives that same full set.
test_model_glm_4350 = glm(val_form, data_set_4350, Bernoulli(), LogitLink());
test_pred_glm_4350 = GLM.predict(test_model_glm_4350, test_set_4350);
test_pred_rf_4350 = get_rf_probas(
    data_set_4350, test_set_4350, names_ft, estimated_params_4350,
);
# Average both scores and threshold at 0.5 in a single pass; the result is
# directly a vector of Int (1 when the averaged score is >= 0.5, else 0).
test_pred_4350 = Int.(((test_pred_glm_4350 .+ test_pred_rf_4350[:, 2]) ./ 2) .>= 0.5);

#### Get prediction for ouvrage 4350

In [50]:
# Work on a copy: plain assignment would only alias `test_set_4350`, so adding
# the prediction column would silently modify the test set as well.
pred_4350 = copy(test_set_4350);
pred_4350[!, :SURVERSE] = test_pred_4350;
# Columns shown in the preview below.
vis_ft = [:DATE, :SURVERSE, :FS_sum, :FS_max3, :SS_sum, :SS_max3];
first(pred_4350[!, vis_ft], 10)

Unnamed: 0_level_0,DATE,SURVERSE,FS_sum,FS_max3,SS_sum,SS_max3
Unnamed: 0_level_1,Date,Int64,Float64,Float64,Float64,Float64
1,2019-05-03,0,0.038885,0.13156,0.039479,0.0507577
2,2019-05-04,0,-0.370694,-0.376504,-0.0493965,-0.0514987
3,2019-05-07,0,-0.357895,-0.356181,-0.0493965,-0.0514987
4,2019-05-08,0,-0.396293,-0.417149,-0.0493965,-0.0514987
5,2019-05-10,1,4.53146,2.73285,0.588891,0.338935
6,2019-05-11,0,-0.370694,-0.376504,-0.0493965,-0.0514987
7,2019-05-12,0,-0.396293,-0.417149,-0.0493965,-0.0514987
8,2019-05-21,0,-0.396293,-0.417149,-0.0493965,-0.0514987
9,2019-05-22,0,-0.396293,-0.417149,-0.0493965,-0.0514987
10,2019-05-23,1,1.93319,2.48898,0.380842,0.524855


## Ouvrage 4380

In [51]:
# Read the table stored under data/parsed/oversampled/ for ouvrage 4380 and split
# it into training and validation parts (default ratio of `partitionTrainTest`).
data_set_4380 = CSV.read("data/parsed/oversampled/ouvrage_4380.csv")
(train_set_4380, val_set_4380) = partitionTrainTest(data_set_4380)
# `SURVERSE` column of the validation part (the column itself, not a copy).
val_labels_4380 = val_set_4380.SURVERSE
size(data_set_4380)

(10000, 10)

#### Train GLM

In [53]:
# Fit a logistic regression (Bernoulli family, logit link) on the training part
# and predict on the validation part.
val_model_glm_4380 = glm(
    val_form,
    train_set_4380,
    Bernoulli(),
    LogitLink(),
)
val_pred_glm_4380 = GLM.predict(val_model_glm_4380, val_set_4380)

# Score the predictions against the validation labels with a 0.5 cut-off.
# NOTE(review): `evaluate_threshold` presumably comes from the included utils files.
evaluate_threshold(val_pred_glm_4380, val_labels_4380, 0.5)

0.9226037929267042

#### Train Decision Tree 

In [54]:
# Plain Float64 feature matrix and Int64 label vector for DecisionTree.
train_features_4380 = Matrix{Float64}(train_set_4380[:, names_ft])
train_labels_4380 = convert(Vector{Int64}, train_set_4380[!, :SURVERSE])

# Grow a tree, then prune it with a 0.9 threshold.
dt_model_4380 = prune_tree(build_tree(train_labels_4380, train_features_4380), 0.9)

Decision Tree
Leaves: 222
Depth:  23

In [55]:
# Validation features/labels in the same plain-array form used for training.
val_features_4380 = Matrix{Float64}(val_set_4380[:, names_ft])
val_labels_4380 = convert(Vector{Int64}, val_set_4380[!, :SURVERSE])

# Apply the pruned tree and report the F1 score of its predictions.
val_pred_dt_4380 = apply_tree(dt_model_4380, val_features_4380)
r = roc(val_labels_4380, val_pred_dt_4380)
f1score(r)

0.9698825931597754

#### Train Random Forest

In [57]:
# Settings forwarded to the `get_rf_*` helpers; the meaning of each entry is
# defined by those helpers (not visible in this notebook).
estimated_params_4380 = [5, 100, 80, 18]
# NOTE(review): presumably fits a random forest on the training part and returns
# a validation score — confirm in utils/random-forest.jl.
get_rf_direct(
    train_set_4380, val_set_4380, names_ft, estimated_params_4380,
)

0.9771225216065074

In [58]:
# Random-forest output for the validation part, used by the ensemble cell below.
val_pred_rf_4380 = get_rf_probas(
    train_set_4380, val_set_4380, names_ft, estimated_params_4380,
);

#### Train ensemble model

Combine them

In [59]:
# Average the GLM prediction with column 2 of the random-forest output
# (presumably the positive-class probability) and score it at a 0.5 cut-off.
val_pred_4380 = (val_pred_glm_4380 .+ val_pred_rf_4380[:, 2]) ./ 2
evaluate_threshold(val_pred_4380, val_labels_4380, 0.5)

0.9671882887430591

#### Get full model

In [61]:
# Rows of ouvrage 4380 that must be predicted for the submission.
test_set_4380 = CSV.read("data/parsed/test_4380.csv")
size(test_set_4380)

(54, 9)

In [62]:
# Final models: the GLM is refit on the whole data set (training + validation
# rows) and the random-forest helper receives that same full set.
test_model_glm_4380 = glm(val_form, data_set_4380, Bernoulli(), LogitLink());
test_pred_glm_4380 = GLM.predict(test_model_glm_4380, test_set_4380);
test_pred_rf_4380 = get_rf_probas(
    data_set_4380, test_set_4380, names_ft, estimated_params_4380,
);
# Average both scores and threshold at 0.5 in a single pass; the result is
# directly a vector of Int (1 when the averaged score is >= 0.5, else 0).
test_pred_4380 = Int.(((test_pred_glm_4380 .+ test_pred_rf_4380[:, 2]) ./ 2) .>= 0.5);

#### Get prediction for ouvrage 4380

In [63]:
# Work on a copy: plain assignment would only alias `test_set_4380`, so adding
# the prediction column would silently modify the test set as well.
pred_4380 = copy(test_set_4380);
pred_4380[!, :SURVERSE] = test_pred_4380;
# Columns shown in the preview below.
vis_ft = [:DATE, :SURVERSE, :FS_sum, :FS_max3, :SS_sum, :SS_max3];
first(pred_4380[!, vis_ft], 10)

Unnamed: 0_level_0,DATE,SURVERSE,FS_sum,FS_max3,SS_sum,SS_max3
Unnamed: 0_level_1,Date,Int64,Float64,Float64,Float64,Float64
1,2019-05-02,0,-0.0635098,0.0299474,-0.035388,-0.0117939
2,2019-05-04,0,-0.370694,-0.376504,-0.116085,-0.121026
3,2019-05-05,0,-0.396293,-0.417149,-0.116085,-0.121026
4,2019-05-06,0,-0.396293,-0.417149,-0.116085,-0.121026
5,2019-05-08,0,-0.396293,-0.417149,-0.116085,-0.121026
6,2019-05-09,1,0.742849,1.39156,0.201957,0.366876
7,2019-05-10,1,4.53146,2.73285,1.38394,0.796521
8,2019-05-17,0,-0.332296,-0.356181,-0.0923509,-0.084615
9,2019-05-18,0,-0.396293,-0.417149,-0.116085,-0.121026
10,2019-05-28,0,-0.357895,-0.356181,-0.0686164,-0.0482044


# Prédiction

In [64]:
# Submission template: one row per (ouvrage, date) pair to predict. A zero-filled
# `SURVERSE` column is added as a placeholder and filled in by the loop below.
test_set_final = CSV.read("data/test.csv")
test_set_final[!, :SURVERSE] = zeros(nrow(test_set_final))
size(test_set_final)

(283, 3)

Pour chaque ligne de `test_set_final` :

    - On vérifie l'ID de l'ouvrage pour savoir quelle table de prédictions utiliser
    - On va chercher la prédiction à cette date pour cet ouvrage
    - On l'écrit à la ligne courante

In [66]:
# Prediction table to use for each ouvrage ID appearing in `test_set_final`.
preds_by_ouvrage = Dict(
    "3260-01D" => pred_3260,
    "3350-07D" => pred_3350,
    "4240-01D" => pred_4240,
    "4350-01D" => pred_4350,
    "4380-01D" => pred_4380,
)

# For every row of the submission template, copy the prediction made for the
# same ouvrage on the same date.
for i in 1:nrow(test_set_final)
    curr_ouvrage = test_set_final[i, :NO_OUVRAGE]
    curr_date = test_set_final[i, :DATE]

    # Fail with a clear message instead of passing `nothing` further down.
    haskey(preds_by_ouvrage, curr_ouvrage) ||
        error("No prediction table for ouvrage $(curr_ouvrage) (row $(i))")
    pred_to_use = preds_by_ouvrage[curr_ouvrage]

    # First row with a matching date; avoids building a filtered DataFrame per row.
    match_idx = findfirst(d -> d == curr_date, pred_to_use[!, :DATE])
    match_idx === nothing &&
        error("No prediction for ouvrage $(curr_ouvrage) on $(curr_date) (row $(i))")

    test_set_final[i, :SURVERSE] = pred_to_use[match_idx, :SURVERSE]
end

In [68]:
# The placeholder column was created as floating-point zeros; store it as Int.
test_set_final[!, :SURVERSE] = Int.(test_set_final[!, :SURVERSE])
first(test_set_final, 10)

Unnamed: 0_level_0,NO_OUVRAGE,DATE,SURVERSE
Unnamed: 0_level_1,String,Date,Int64
1,3260-01D,2019-05-02,0
2,3260-01D,2019-05-09,1
3,3260-01D,2019-05-10,1
4,3260-01D,2019-05-15,0
5,3260-01D,2019-05-20,0
6,3260-01D,2019-05-23,1
7,3260-01D,2019-05-24,0
8,3260-01D,2019-05-26,0
9,3260-01D,2019-05-30,0
10,3350-07D,2019-05-01,1


In [None]:
# Build and write the submission file. The table filled in above is
# `test_set_final`; `test_set` is not defined anywhere in this notebook.
# Each ID has the form "<NO_OUVRAGE>_<DATE>".
ID = test_set_final[:, :NO_OUVRAGE] .* "_" .* string.(test_set_final[:, :DATE])
sampleSubmission = DataFrame(ID = ID, Surverse = test_set_final[:, :SURVERSE])
CSV.write("submissions/mc-submission-$(no_soumission).csv", sampleSubmission)