In [1]:
using CSV, DataFrames, GLM, Statistics, Dates, Gadfly, Random, MLBase, DecisionTree;
include("utils/precipitation.jl");
include("utils/random-forest.jl");
include("utils/reg-log.jl");

┌ Info: Loading DataFrames support into Gadfly.jl
└ @ Gadfly /home/chaime/.julia/packages/Gadfly/09PWZ/src/mapping.jl:228


In [2]:
no_soumission = 22;

In [3]:
function partitionTrainTest(data, at = 0.8) # https://discourse.julialang.org/t/simple-tool-for-train-test-split/473/2
    n = nrow(data)
    idx = shuffle(1:n)
    train_idx = view(idx, 1:floor(Int, at*n))
    test_idx = view(idx, (floor(Int, at*n)+1):n)
    return data[train_idx,:], data[test_idx,:]
end

partitionTrainTest (generic function with 2 methods)

In [6]:
val_form = @formula(SURVERSE ~ FS_sum + FS_max + 
                               SS_sum + SS_max);

In [7]:
names_ft = [:FS_sum, :FS_max
            :SS_sum, :SS_max];

LoadError: syntax: missing separator in array expression

In [None]:
params_rf = DataFrame(param=String[], min=Int8[], max=Int8[], step=Int8[]);

push!(params_rf, ["nft", 2, 6, 1]);
push!(params_rf, ["ntrees", 50, 100, 10]);
push!(params_rf, ["podata", 75, 95, 5]);
push!(params_rf, ["maxd", 15, 27, 2]);

params_rf

## Ouvrage 3260

In [8]:
data_set = CSV.read("data/parsed/oversampled/ouvrage_3260.csv");
train_set, val_set = partitionTrainTest(data_set);
val_labels = val_set[!, :SURVERSE];
size(data_set)

(10000, 10)

#### Train GLM

In [9]:
val_model = glm(val_form, train_set, Bernoulli(), LogitLink())
val_pred_glm = GLM.predict(val_model, val_set);

best_threshold_glm, f1_score_glm = find_best_threshold(val_pred_glm, val_labels)

(0.3, 0.8262243285939969)

In [10]:
evaluate_threshold(val_pred_glm, val_labels, 0.5) 

0.8233305156382079

#### Train Decision Tree 

In [11]:
train_features = convert(Matrix{Float64}, train_set[:, names_ft]);
train_labels = convert(Array{Int64}, train_set[!,:SURVERSE]);

dt_model = build_tree(train_labels, train_features)

Decision Tree
Leaves: 286
Depth:  31

In [12]:
dt_model = prune_tree(dt_model, 0.9)

Decision Tree
Leaves: 268
Depth:  31

In [13]:
val_features = convert(Matrix{Float64}, val_set[:, names_ft]);
val_labels = convert(Array{Int64}, val_set[!,:SURVERSE]);

val_pred_dt = apply_tree(dt_model, val_features);
r = roc(val_labels, val_pred_dt);
f1score(r)

0.9185303514376997

#### Train Random Forest [TODO: RUN LATER, TOO LONG]

In [None]:
# best_params, f1_score_rf = find_best_rf(train_set, val_set, names_ft, params_rf)

#### QUICK RANDOM FOREST ESTIMATED

In [14]:
estimated_params = [2, 60, 80, 25];
get_rf_direct(train_set, val_set, names_ft, estimated_params)

0.9537254901960784

QUICK REPLACEMENT

In [15]:
val_pred_rf = get_rf_probas(train_set, val_set, names_ft, estimated_params);

#### Train ensemble model

Get probabilities for RF

In [None]:
# val_pred_rf = get_rf_probas(train_set, val_set, names_ft, best_params);
# val_pred_rf[:, 2];

Combine them

In [16]:
val_pred = (val_pred_glm + val_pred_rf[:, 2]) ./ 2;
best_threshold, f1_score = find_best_threshold(val_pred, val_labels)

(0.4, 0.9278033794162827)

In [17]:
evaluate_threshold(val_pred, val_labels, 0.5) 

0.9117174959871589

#### Get full model

In [18]:
test_set = CSV.read("data/parsed/test_3260.csv");
size(test_set)

(45, 9)

In [19]:
test_model_glm = glm(val_form, data_set, Bernoulli(), LogitLink());
test_pred_glm = GLM.predict(test_model_glm, test_set);

In [None]:
# test_pred_rf = get_rf_probas(data_set, test_set, names_ft, best_params);
# test_pred_rf[:, 2];

In [20]:
test_pred_rf = get_rf_probas(data_set, test_set, names_ft, estimated_params);

In [21]:
test_pred = (test_pred_glm + test_pred_rf[:, 2]) ./ 2;

In [22]:
test_pred[test_pred .>= 0.5] .= 1.0;
test_pred[test_pred .< 0.5] .= 0.0;
test_pred = convert(Array{Int}, trunc.(test_pred));

#### Get prediction for ouvrage 3260

In [24]:
pred_3260 = test_set;
pred_3260[!, :SURVERSE] = test_pred;
vis_ft = [:DATE, :SURVERSE, :FS_sum, :FS_max, :SS_sum, :SS_max];
first(pred_3260[!, vis_ft], 10)

Unnamed: 0_level_0,DATE,SURVERSE,FS_sum,FS_max,SS_sum,SS_max
Unnamed: 0_level_1,Date,Int64,Float64,Float64,Float64,Float64
1,2019-05-02,0,-0.0635098,-0.0332178,-0.0891232,-0.016819
2,2019-05-09,1,0.742849,1.01886,0.50862,0.620324
3,2019-05-10,1,4.53146,1.68837,3.48538,1.14162
4,2019-05-15,0,-0.370694,-0.35203,-0.292356,-0.306429
5,2019-05-20,0,0.192477,0.859456,0.305387,0.79409
6,2019-05-23,1,1.93319,1.59272,2.25403,2.56071
7,2019-05-24,0,-0.229901,-0.128861,-0.124988,0.0411031
8,2019-05-26,0,-0.357895,-0.320149,-0.208672,-0.161624
9,2019-05-30,0,-0.306698,-0.192624,-0.172807,-0.016819
10,2019-06-02,0,0.794047,0.381238,0.448846,0.127986


## Ouvrage 3350

In [25]:
data_set = CSV.read("data/parsed/oversampled/ouvrage_3350.csv");
train_set, val_set = partitionTrainTest(data_set);
val_labels = val_set[!, :SURVERSE];
size(data_set)

(10000, 10)

#### Train GLM

In [26]:
val_model = glm(val_form, train_set, Bernoulli(), LogitLink())
val_pred_glm = GLM.predict(val_model, val_set);

best_threshold_glm, f1_score_glm = find_best_threshold(val_pred_glm, val_labels)

(0.3, 0.8368421052631579)

In [27]:
evaluate_threshold(val_pred_glm, val_labels, 0.5) 

0.804490177736202

#### Train Decision Tree 

In [28]:
train_features = convert(Matrix{Float64}, train_set[:, names_ft]);
train_labels = convert(Array{Int64}, train_set[!,:SURVERSE]);

dt_model = build_tree(train_labels, train_features)

Decision Tree
Leaves: 332
Depth:  34

In [29]:
dt_model = prune_tree(dt_model, 0.9)

Decision Tree
Leaves: 307
Depth:  34

In [30]:
val_features = convert(Matrix{Float64}, val_set[:, names_ft]);
val_labels = convert(Array{Int64}, val_set[!,:SURVERSE]);

val_pred_dt = apply_tree(dt_model, val_features);
r = roc(val_labels, val_pred_dt);
f1score(r)

0.9043927648578811

#### QUICK RANDOM FOREST ESTIMATED

In [31]:
estimated_params = [3, 40, 80, 20];
get_rf_direct(train_set, val_set, names_ft, estimated_params)

0.941696113074205

#### Train Random Forest

In [None]:
# best_params, f1_score_rf = find_best_rf(train_set, val_set, names_ft, params_rf)

QUICK REPLACEMENT

In [32]:
val_pred_rf = get_rf_probas(train_set, val_set, names_ft, estimated_params);

#### Train ensemble model

Get probabilities for RF

In [None]:
# val_pred_rf = get_rf_probas(train_set, val_set, names_ft, best_params);
# val_pred_rf[:, 2];

Combine them

In [33]:
val_pred = (val_pred_glm + val_pred_rf[:, 2]) ./ 2;
best_threshold, f1_score = find_best_threshold(val_pred, val_labels)

(0.35, 0.9184549356223176)

In [34]:
evaluate_threshold(val_pred, val_labels, 0.5) 

0.9042076991942704

#### Get full model

In [35]:
test_set = CSV.read("data/parsed/test_3350.csv");
size(test_set)

(70, 9)

In [36]:
test_model_glm = glm(val_form, data_set, Bernoulli(), LogitLink());
test_pred_glm = GLM.predict(test_model_glm, test_set);

In [None]:
# test_pred_rf = get_rf_probas(data_set, test_set, names_ft, best_params);
# test_pred_rf[:, 2];

In [37]:
test_pred_rf = get_rf_probas(data_set, test_set, names_ft, estimated_params);

In [38]:
test_pred = (test_pred_glm + test_pred_rf[:, 2]) ./ 2;

In [39]:
test_pred[test_pred .>= 0.5] .= 1.0;
test_pred[test_pred .< 0.5] .= 0.0;
test_pred = convert(Array{Int}, trunc.(test_pred));

#### Get prediction for ouvrage 3350

In [40]:
pred_3350 = test_set;
pred_3350[!, :SURVERSE] = test_pred;
vis_ft = [:DATE, :SURVERSE, :FS_sum, :FS_max3, :SS_sum, :SS_max3];
first(pred_3350[!, vis_ft], 10)

Unnamed: 0_level_0,DATE,SURVERSE,FS_sum,FS_max3,SS_sum,SS_max3
Unnamed: 0_level_1,Date,Int64,Float64,Float64,Float64,Float64
1,2019-05-01,1,0.256732,0.476482,0.534936,0.609527
2,2019-05-02,0,-0.21833,-0.18342,-0.0552547,0.0260547
3,2019-05-08,0,-0.399971,-0.425384,-0.344782,-0.362927
4,2019-05-09,1,0.801656,1.46634,0.646292,1.21068
5,2019-05-10,1,3.58217,2.3462,3.94245,2.37762
6,2019-05-11,0,-0.399971,-0.425384,-0.322511,-0.327565
7,2019-05-13,0,-0.274219,-0.227413,-0.14434,-0.0446692
8,2019-05-14,1,1.05316,0.542472,0.713106,0.415036
9,2019-05-18,0,-0.372026,-0.381391,-0.344782,-0.362927
10,2019-05-19,0,0.396456,0.586466,0.379036,0.415036


## Ouvrage 4240

In [41]:
data_set = CSV.read("data/parsed/oversampled/ouvrage_4240.csv");
train_set, val_set = partitionTrainTest(data_set);
val_labels = val_set[!, :SURVERSE];
size(data_set)

(10000, 10)

#### Train GLM

In [42]:
val_model = glm(val_form, train_set, Bernoulli(), LogitLink())
val_pred_glm = GLM.predict(val_model, val_set);

best_threshold_glm, f1_score_glm = find_best_threshold(val_pred_glm, val_labels)

(0.45, 0.8051001821493625)

In [43]:
evaluate_threshold(val_pred_glm, val_labels, 0.5) 

0.7988826815642458

#### Train Decision Tree 

In [44]:
train_features = convert(Matrix{Float64}, train_set[:, names_ft]);
train_labels = convert(Array{Int64}, train_set[!,:SURVERSE]);

dt_model = build_tree(train_labels, train_features)

Decision Tree
Leaves: 297
Depth:  24

In [45]:
dt_model = prune_tree(dt_model, 0.9)

Decision Tree
Leaves: 269
Depth:  24

In [46]:
val_features = convert(Matrix{Float64}, val_set[:, names_ft]);
val_labels = convert(Array{Int64}, val_set[!,:SURVERSE]);

val_pred_dt = apply_tree(dt_model, val_features);
r = roc(val_labels, val_pred_dt);
f1score(r)

0.9124579124579124

#### QUICK RANDOM FOREST ESTIMATED

In [47]:
estimated_params = [3, 40, 80, 20];
get_rf_direct(train_set, val_set, names_ft, estimated_params)

0.9532710280373832

In [48]:
val_pred_rf = get_rf_probas(train_set, val_set, names_ft, estimated_params);

#### Train Random Forest

In [None]:
# best_params, f1_score_rf = find_best_rf(train_set, val_set, names_ft, params_rf)

#### Train ensemble model

Get probabilities for RF

In [None]:
# val_pred_rf = get_rf_probas(train_set, val_set, names_ft, best_params);
# val_pred_rf[:, 2];

Combine them

In [49]:
val_pred = (val_pred_glm + val_pred_rf[:, 2]) ./ 2;
best_threshold, f1_score = find_best_threshold(val_pred, val_labels)

(0.5, 0.9258312020460358)

In [50]:
evaluate_threshold(val_pred, val_labels, 0.5) 

0.9258312020460358

#### Get full model

In [51]:
test_set = CSV.read("data/parsed/test_4240.csv");
size(test_set)

(49, 9)

In [52]:
test_model_glm = glm(val_form, data_set, Bernoulli(), LogitLink());
test_pred_glm = GLM.predict(test_model_glm, test_set);

In [None]:
# test_pred_rf = get_rf_probas(data_set, test_set, names_ft, best_params);
# test_pred_rf[:, 2];

In [53]:
test_pred_rf = get_rf_probas(data_set, test_set, names_ft, estimated_params);

In [54]:
test_pred = (test_pred_glm + test_pred_rf[:, 2]) ./ 2;

In [55]:
test_pred[test_pred .>= 0.5] .= 1.0;
test_pred[test_pred .< 0.5] .= 0.0;
test_pred = convert(Array{Int}, trunc.(test_pred));

#### Get prediction for ouvrage 4240

In [56]:
pred_4240 = test_set;
pred_4240[!, :SURVERSE] = test_pred;
vis_ft = [:DATE, :SURVERSE, :FS_sum, :FS_max3, :SS_sum, :SS_max3];
first(pred_4240[!, vis_ft], 10)

Unnamed: 0_level_0,DATE,SURVERSE,FS_sum,FS_max3,SS_sum,SS_max3
Unnamed: 0_level_1,Date,Int64,Float64,Float64,Float64,Float64
1,2019-05-07,0,-0.358123,-0.373363,-0.373593,-0.388305
2,2019-05-09,0,0.623037,1.13181,0.555808,0.972262
3,2019-05-10,1,4.26944,2.45726,3.07847,1.15612
4,2019-05-15,0,-0.358123,-0.373363,-0.373593,-0.388305
5,2019-05-21,0,-0.358123,-0.373363,-0.337383,-0.333147
6,2019-05-22,0,-0.358123,-0.373363,-0.373593,-0.388305
7,2019-05-23,1,2.76109,3.80518,3.86303,4.8701
8,2019-05-24,0,-0.153104,-0.0588493,-0.10805,0.0161877
9,2019-05-25,0,-0.270258,-0.283502,-0.289102,-0.259603
10,2019-06-01,0,-0.358123,-0.373363,-0.373593,-0.388305


## Ouvrage 4350

In [57]:
data_set = CSV.read("data/parsed/oversampled/ouvrage_4350.csv");
train_set, val_set = partitionTrainTest(data_set);
val_labels = val_set[!, :SURVERSE];
size(data_set)

(10000, 10)

#### Train GLM

In [58]:
val_model = glm(val_form, train_set, Bernoulli(), LogitLink())
val_pred_glm = GLM.predict(val_model, val_set);

best_threshold_glm, f1_score_glm = find_best_threshold(val_pred_glm, val_labels)

(0.3, 0.8764607679465777)

In [59]:
evaluate_threshold(val_pred_glm, val_labels, 0.5) 

0.8665480427046264

#### Train Decision Tree 

In [60]:
train_features = convert(Matrix{Float64}, train_set[:, names_ft]);
train_labels = convert(Array{Int64}, train_set[!,:SURVERSE]);

dt_model = build_tree(train_labels, train_features)

Decision Tree
Leaves: 153
Depth:  16

In [61]:
dt_model = prune_tree(dt_model, 0.9)

Decision Tree
Leaves: 142
Depth:  16

In [62]:
val_features = convert(Matrix{Float64}, val_set[:, names_ft]);
val_labels = convert(Array{Int64}, val_set[!,:SURVERSE]);

val_pred_dt = apply_tree(dt_model, val_features);
r = roc(val_labels, val_pred_dt);
f1score(r)

0.9500438212094654

#### QUICK RANDOM FOREST ESTIMATED

In [63]:
estimated_params = [3, 40, 80, 20];
get_rf_direct(train_set, val_set, names_ft, estimated_params)

0.9734816082121471

In [64]:
val_pred_rf = get_rf_probas(train_set, val_set, names_ft, estimated_params);

#### Train Random Forest

In [None]:
# best_params, f1_score_rf = find_best_rf(train_set, val_set, names_ft, params_rf)

#### Train ensemble model

Get probabilities for RF

In [None]:
# val_pred_rf = get_rf_probas(train_set, val_set, names_ft, best_params);
# val_pred_rf[:, 2];

Combine them

In [65]:
val_pred = (val_pred_glm + val_pred_rf[:, 2]) ./ 2;
best_threshold, f1_score = find_best_threshold(val_pred, val_labels)

(0.5, 0.9527027027027027)

In [66]:
evaluate_threshold(val_pred, val_labels, 0.5) 

0.9527027027027027

#### Get full model

In [67]:
test_set = CSV.read("data/parsed/test_4350.csv");
size(test_set)

(65, 9)

In [68]:
test_model_glm = glm(val_form, data_set, Bernoulli(), LogitLink());
test_pred_glm = GLM.predict(test_model_glm, test_set);

In [None]:
# test_pred_rf = get_rf_probas(data_set, test_set, names_ft, best_params);
# test_pred_rf[:, 2];

In [69]:
test_pred_rf = get_rf_probas(data_set, test_set, names_ft, estimated_params);

In [70]:
test_pred = (test_pred_glm + test_pred_rf[:, 2]) ./ 2;

In [71]:
test_pred[test_pred .>= 0.5] .= 1.0;
test_pred[test_pred .< 0.5] .= 0.0;
test_pred = convert(Array{Int}, trunc.(test_pred));

#### Get prediction for ouvrage 4350

In [72]:
pred_4350 = test_set;
pred_4350[!, :SURVERSE] = test_pred;
vis_ft = [:DATE, :SURVERSE, :FS_sum, :FS_max3, :SS_sum, :SS_max3];
first(pred_4350[!, vis_ft], 10)

Unnamed: 0_level_0,DATE,SURVERSE,FS_sum,FS_max3,SS_sum,SS_max3
Unnamed: 0_level_1,Date,Int64,Float64,Float64,Float64,Float64
1,2019-05-03,0,0.038885,0.13156,0.039479,0.0507577
2,2019-05-04,0,-0.370694,-0.376504,-0.0493965,-0.0514987
3,2019-05-07,0,-0.357895,-0.356181,-0.0493965,-0.0514987
4,2019-05-08,0,-0.396293,-0.417149,-0.0493965,-0.0514987
5,2019-05-10,1,4.53146,2.73285,0.588891,0.338935
6,2019-05-11,0,-0.370694,-0.376504,-0.0493965,-0.0514987
7,2019-05-12,0,-0.396293,-0.417149,-0.0493965,-0.0514987
8,2019-05-21,0,-0.396293,-0.417149,-0.0493965,-0.0514987
9,2019-05-22,0,-0.396293,-0.417149,-0.0493965,-0.0514987
10,2019-05-23,1,1.93319,2.48898,0.380842,0.524855


## Ouvrage 4380

In [73]:
data_set = CSV.read("data/parsed/oversampled/ouvrage_4380.csv");
train_set, val_set = partitionTrainTest(data_set);
val_labels = val_set[!, :SURVERSE];
size(data_set)

(10000, 10)

#### Train GLM

In [74]:
val_model = glm(val_form, train_set, Bernoulli(), LogitLink())
val_pred_glm = GLM.predict(val_model, val_set);

best_threshold_glm, f1_score_glm = find_best_threshold(val_pred_glm, val_labels)

(0.2, 0.862992125984252)

In [75]:
evaluate_threshold(val_pred_glm, val_labels, 0.5) 

0.8465608465608465

#### Train Decision Tree 

In [76]:
train_features = convert(Matrix{Float64}, train_set[:, names_ft]);
train_labels = convert(Array{Int64}, train_set[!,:SURVERSE]);

dt_model = build_tree(train_labels, train_features)

Decision Tree
Leaves: 254
Depth:  22

In [77]:
dt_model = prune_tree(dt_model, 0.9)

Decision Tree
Leaves: 233
Depth:  22

In [78]:
val_features = convert(Matrix{Float64}, val_set[:, names_ft]);
val_labels = convert(Array{Int64}, val_set[!,:SURVERSE]);

val_pred_dt = apply_tree(dt_model, val_features);
r = roc(val_labels, val_pred_dt);
f1score(r)

0.8995708154506438

#### QUICK RANDOM FOREST ESTIMATED

In [79]:
estimated_params = [3, 40, 80, 20];
get_rf_direct(train_set, val_set, names_ft, estimated_params)

0.9367720465890182

In [80]:
val_pred_rf = get_rf_probas(train_set, val_set, names_ft, estimated_params);

#### Train Random Forest

In [None]:
# best_params, f1_score_rf = find_best_rf(train_set, val_set, names_ft, params_rf)

#### Train ensemble model

Get probabilities for RF

In [None]:
# val_pred_rf = get_rf_probas(train_set, val_set, names_ft, best_params);
# val_pred_rf[:, 2];

Combine them

In [81]:
val_pred = (val_pred_glm + val_pred_rf[:, 2]) ./ 2;
best_threshold, f1_score = find_best_threshold(val_pred, val_labels)

(0.55, 0.9148211243611585)

In [82]:
evaluate_threshold(val_pred, val_labels, 0.5) 

0.914095079232694

#### Get full model

In [83]:
test_set = CSV.read("data/parsed/test_4380.csv");
size(test_set)

(54, 9)

In [84]:
test_model_glm = glm(val_form, data_set, Bernoulli(), LogitLink());
test_pred_glm = GLM.predict(test_model_glm, test_set);

In [None]:
# test_pred_rf = get_rf_probas(data_set, test_set, names_ft, best_params);
# test_pred_rf[:, 2];

In [85]:
test_pred_rf = get_rf_probas(data_set, test_set, names_ft, estimated_params);

In [86]:
test_pred = (test_pred_glm + test_pred_rf[:, 2]) ./ 2;

In [87]:
test_pred[test_pred .>= 0.5] .= 1.0;
test_pred[test_pred .< 0.5] .= 0.0;
test_pred = convert(Array{Int}, trunc.(test_pred));

#### Get prediction for ouvrage 4380

In [88]:
pred_4380 = test_set;
pred_4380[!, :SURVERSE] = test_pred;
vis_ft = [:DATE, :SURVERSE, :FS_sum, :FS_max3, :SS_sum, :SS_max3];
first(pred_4380[!, vis_ft], 10)

Unnamed: 0_level_0,DATE,SURVERSE,FS_sum,FS_max3,SS_sum,SS_max3
Unnamed: 0_level_1,Date,Int64,Float64,Float64,Float64,Float64
1,2019-05-02,0,-0.0635098,0.0299474,-0.035388,-0.0117939
2,2019-05-04,0,-0.370694,-0.376504,-0.116085,-0.121026
3,2019-05-05,0,-0.396293,-0.417149,-0.116085,-0.121026
4,2019-05-06,0,-0.396293,-0.417149,-0.116085,-0.121026
5,2019-05-08,0,-0.396293,-0.417149,-0.116085,-0.121026
6,2019-05-09,0,0.742849,1.39156,0.201957,0.366876
7,2019-05-10,1,4.53146,2.73285,1.38394,0.796521
8,2019-05-17,0,-0.332296,-0.356181,-0.0923509,-0.084615
9,2019-05-18,0,-0.396293,-0.417149,-0.116085,-0.121026
10,2019-05-28,0,-0.357895,-0.356181,-0.0686164,-0.0482044


# Prédiction

In [89]:
test_set = CSV.read("data/test.csv");
test_set[!, :SURVERSE] = zeros(size(test_set, 1));
size(test_set)

(283, 3)

Pour chaque ligne de test_set

    - On check l'ID de l'ouvrage pour savoir quel prediction load
    - On va chercher la prediction à telle date pour cet ouvrage
    - On le met à la ligne courante

In [90]:
for i=1:size(test_set, 1)
    curr_ouvrage = test_set[i, 1];
    pred_to_use = nothing;
    if curr_ouvrage == "3260-01D"
        pred_to_use = pred_3260;
    elseif curr_ouvrage == "3350-07D"
        pred_to_use = pred_3350;
    elseif curr_ouvrage == "4240-01D"
        pred_to_use = pred_4240;
    elseif curr_ouvrage == "4350-01D"
        pred_to_use = pred_4350;
    elseif curr_ouvrage == "4380-01D"
        pred_to_use = pred_4380;
    end
    
    curr_date = test_set[i, :DATE];
    pred_res = filter(row -> row.DATE == curr_date, pred_to_use);
    
    test_set[i, :SURVERSE] = pred_res[1, :SURVERSE];
end

In [91]:
test_set[!, :SURVERSE] = convert(Array{Int}, test_set[!, :SURVERSE]);
first(test_set, 10)

Unnamed: 0_level_0,NO_OUVRAGE,DATE,SURVERSE
Unnamed: 0_level_1,String,Date,Int64
1,3260-01D,2019-05-02,0
2,3260-01D,2019-05-09,1
3,3260-01D,2019-05-10,1
4,3260-01D,2019-05-15,0
5,3260-01D,2019-05-20,0
6,3260-01D,2019-05-23,1
7,3260-01D,2019-05-24,0
8,3260-01D,2019-05-26,0
9,3260-01D,2019-05-30,0
10,3350-07D,2019-05-01,1


In [92]:
ID = test_set[:,:NO_OUVRAGE].*"_".*string.(test_set[:,:DATE])
sampleSubmission = DataFrame(ID = ID, Surverse=test_set[:, :SURVERSE])
CSV.write("submissions/mc-submission-$(no_soumission).csv",sampleSubmission)

"submissions/mc-submission-22.csv"