In [245]:
using CSV, DataFrames, GLM, Statistics, Dates, Gadfly, Random, MLBase, DecisionTree;
include("utils/precipitation.jl");
include("utils/random-forest.jl");
include("utils/reg-log.jl");

In [342]:
# Submission counter; interpolated into the output file name in the last cell.
no_soumission = 19;

In [231]:
"""
    partitionTrainTest(data, at = 0.8; rng = Random.GLOBAL_RNG)

Randomly split the rows of `data` into two disjoint parts and return them as a
`(first_part, second_part)` tuple.

# Arguments
- `data`: any two-dimensional, row-indexable table (e.g. a `DataFrame` or a `Matrix`).
- `at`: fraction of the rows, within `[0, 1]`, assigned to the first part. The first
  part receives `floor(Int, at * n)` of the `n` rows, the second part the remainder.

# Keywords
- `rng`: random number generator used to shuffle the row indices; pass a seeded
  generator to obtain a reproducible split.

# Throws
- `ArgumentError` if `at` lies outside `[0, 1]`.
"""
function partitionTrainTest(data, at = 0.8; rng = Random.GLOBAL_RNG) # https://discourse.julialang.org/t/simple-tool-for-train-test-split/473/2
    0 <= at <= 1 || throw(ArgumentError("at must be within [0, 1], got $at"))
    n = size(data, 1)
    cut = floor(Int, at * n)
    idx = shuffle(rng, 1:n)
    return data[idx[1:cut], :], data[idx[(cut + 1):n], :]
end

partitionTrainTest (generic function with 2 methods)

In [232]:
# Model formula used by every glm call below: SURVERSE is the response and the
# predictors are the _sum and _max3 columns of the five name prefixes. The _max columns
# listed in names_ft are not part of this formula.
val_form = @formula(SURVERSE ~ McTavish_sum + McTavish_max3 + 
                               Bellevue_sum + Bellevue_max3 + 
                               Assomption_sum + Assomption_max3 + 
                               Trudeau_sum + Trudeau_max3 + 
                               StHubert_sum + StHubert_max3);

In [233]:
# Feature columns handed to the random-forest helpers: the _sum, _max and _max3 column
# of each of the five name prefixes, grouped by prefix.
names_ft = [Symbol(prefix, suffix)
            for prefix in ("McTavish", "Bellevue", "Assomption", "Trudeau", "StHubert")
            for suffix in ("_sum", "_max", "_max3")];

In [234]:
# Search grid passed to find_best_rf: one row per parameter with its min, max and step.
# NOTE(review): the parameter names presumably stand for number of features, number of
# trees, portion of data (%) and maximum depth -- confirm in utils/random-forest.jl.
# The numeric columns are Int8, so every bound has to stay within -128:127.
params_rf = DataFrame(param=String[], min=Int8[], max=Int8[], step=Int8[]);

for grid_row in (["nft", 3, 7, 1],
                 ["ntrees", 75, 120, 15],
                 ["podata", 75, 95, 5],
                 ["maxd", 5, 12, 1])
    push!(params_rf, grid_row);
end

params_rf

Unnamed: 0_level_0,param,min,max,step
Unnamed: 0_level_1,String,Int8,Int8,Int8
1,nft,3,7,1
2,ntrees,75,120,15
3,podata,75,95,5
4,maxd,5,12,1


## Ouvrage 3260

In [235]:
# Load the parsed data for structure 3260 and hold out 20% of the rows for validation
# (partitionTrainTest defaults to an 80/20 split).
# DataFrame(CSV.File(path)) is used because CSV.read(path) without a sink type is
# rejected by recent CSV.jl releases.
data_set = DataFrame(CSV.File("data/parsed/ouvrage_3260.csv"));
train_set, val_set = partitionTrainTest(data_set);
val_labels = val_set[!, :SURVERSE];
size(data_set)

(1097, 17)

#### Train GLM

In [236]:
# Fit a logistic regression (Bernoulli family, logit link) of val_form on the training
# split and predict on the validation split.
val_model = glm(val_form, train_set, Bernoulli(), LogitLink())
val_pred_glm = GLM.predict(val_model, val_set);

# find_best_threshold is defined outside this cell (presumably utils/reg-log.jl); judging
# by the displayed result it returns a (threshold, F1 score) pair -- confirm there.
best_threshold_glm, f1_score_glm = find_best_threshold(val_pred_glm, val_labels)

(0.65, 0.6666666666666666)

In [237]:
# Score of the GLM predictions at a fixed 0.5 cut-off, for comparison with the tuned
# threshold. evaluate_threshold is defined elsewhere; presumably it returns F1 -- confirm.
evaluate_threshold(val_pred_glm, val_labels, 0.5) 

0.5714285714285714

#### Train Random Forest

In [238]:
# Search the params_rf grid using the train and validation splits. find_best_rf is
# defined elsewhere (presumably utils/random-forest.jl); per the displayed result it
# returns the chosen parameter vector and a score -- confirm there.
best_params, f1_score_rf = find_best_rf(train_set, val_set, names_ft, params_rf)

(Int8[3, 75, 90, 7], 0.6862745098039216)

#### Train ensemble model

Get probabilities for RF

In [279]:
# Random-forest output for the validation split, using best_params (get_rf_probas is
# defined elsewhere). Later cells read column 2 -- presumably P(SURVERSE = 1); confirm.
# The former stand-alone `val_pred_rf[:, 2];` statement was dropped: it only allocated
# a copy whose value was discarded.
val_pred_rf = get_rf_probas(train_set, val_set, names_ft, best_params);

Combine them

In [280]:
# Ensemble score: unweighted mean of the GLM prediction and column 2 of the RF output,
# then pick the decision threshold against the validation labels.
val_pred = (val_pred_glm + val_pred_rf[:, 2]) ./ 2;
best_threshold, f1_score = find_best_threshold(val_pred, val_labels)

(0.75, 0.6896551724137931)

#### Get full model

In [281]:
# Load the test rows for structure 3260. DataFrame(CSV.File(path)) is used because
# CSV.read(path) without a sink type is rejected by recent CSV.jl releases.
test_set = DataFrame(CSV.File("data/parsed/test_3260.csv"));
size(test_set)

(45, 16)

In [282]:
# Refit the logistic regression on the whole data_set (training and validation rows)
# and predict on the test rows.
test_model_glm = glm(val_form, data_set, Bernoulli(), LogitLink());
test_pred_glm = GLM.predict(test_model_glm, test_set);

In [284]:
# Random-forest output for the test rows, fitted on the whole data_set with best_params.
# The former stand-alone `test_pred_rf[:, 2];` statement was dropped: it only allocated
# a copy whose value was discarded.
test_pred_rf = get_rf_probas(data_set, test_set, names_ft, best_params);

In [285]:
# Ensemble score on the test rows: unweighted mean of the GLM prediction and column 2
# of the RF output.
test_pred = (test_pred_glm + test_pred_rf[:, 2]) ./ 2;

In [286]:
# Binarise the ensemble score in one pass: 1 when it reaches best_threshold, else 0.
test_pred = Int.(test_pred .>= best_threshold);

#### Get prediction for ouvrage 3260

In [289]:
# Keep the predictions for structure 3260 in their own table. copy() leaves test_set
# itself untouched instead of adding the SURVERSE column to it through an alias.
pred_3260 = copy(test_set);
pred_3260[!, :SURVERSE] = test_pred;
# Columns shown in the preview below.
vis_ft = [:DATE, :SURVERSE, :McTavish_sum, :Bellevue_sum, :Assomption_sum, :Trudeau_sum, :StHubert_sum];
first(pred_3260[!, vis_ft], 10)

Unnamed: 0_level_0,DATE,SURVERSE,McTavish_sum,Bellevue_sum,Assomption_sum,Trudeau_sum,StHubert_sum
Unnamed: 0_level_1,Date,Int64,Float64,Float64,Float64,Float64,Float64
1,2019-05-02,0,26.0,19.0,15.0,13.0,17.0
2,2019-05-09,0,89.0,67.0,77.0,86.0,67.0
3,2019-05-10,1,385.0,265.0,286.0,285.0,316.0
4,2019-05-15,0,2.0,0.0,0.0,0.0,0.0
5,2019-05-20,0,46.0,5.0,49.0,53.0,50.0
6,2019-05-23,0,182.0,0.0,351.0,159.0,213.0
7,2019-05-24,0,13.0,0.0,22.0,15.0,14.0
8,2019-05-26,0,3.0,0.0,21.0,5.0,7.0
9,2019-05-30,0,7.0,0.0,13.0,12.0,10.0
10,2019-06-02,0,93.0,0.0,62.0,68.0,62.0


## Ouvrage 3350

In [290]:
# Load the parsed data for structure 3350 and hold out 20% of the rows for validation
# (partitionTrainTest defaults to an 80/20 split).
# DataFrame(CSV.File(path)) is used because CSV.read(path) without a sink type is
# rejected by recent CSV.jl releases.
data_set = DataFrame(CSV.File("data/parsed/ouvrage_3350.csv"));
train_set, val_set = partitionTrainTest(data_set);
val_labels = val_set[!, :SURVERSE];
size(data_set)

(729, 17)

#### Train GLM

In [291]:
# Fit a logistic regression (Bernoulli family, logit link) of val_form on the training
# split and predict on the validation split.
val_model = glm(val_form, train_set, Bernoulli(), LogitLink())
val_pred_glm = GLM.predict(val_model, val_set);

# find_best_threshold is defined outside this cell (presumably utils/reg-log.jl); judging
# by the displayed result it returns a (threshold, F1 score) pair -- confirm there.
best_threshold_glm, f1_score_glm = find_best_threshold(val_pred_glm, val_labels)

(0.3, 0.8253968253968254)

In [292]:
# Score of the GLM predictions at a fixed 0.5 cut-off, for comparison with the tuned
# threshold. evaluate_threshold is defined elsewhere; presumably it returns F1 -- confirm.
evaluate_threshold(val_pred_glm, val_labels, 0.5) 

0.7868852459016393

#### Train Random Forest

In [293]:
# Search the params_rf grid using the train and validation splits. find_best_rf is
# defined elsewhere (presumably utils/random-forest.jl); per the displayed result it
# returns the chosen parameter vector and a score -- confirm there.
best_params, f1_score_rf = find_best_rf(train_set, val_set, names_ft, params_rf)

(Int8[7, 90, 90, 9], 0.8333333333333334)

#### Train ensemble model

Get probabilities for RF

In [294]:
# Random-forest output for the validation split, using best_params (get_rf_probas is
# defined elsewhere). Later cells read column 2 -- presumably P(SURVERSE = 1); confirm.
# The former stand-alone `val_pred_rf[:, 2];` statement was dropped: it only allocated
# a copy whose value was discarded.
val_pred_rf = get_rf_probas(train_set, val_set, names_ft, best_params);

Combine them

In [295]:
# Ensemble score: unweighted mean of the GLM prediction and column 2 of the RF output,
# then pick the decision threshold against the validation labels.
val_pred = (val_pred_glm + val_pred_rf[:, 2]) ./ 2;
best_threshold, f1_score = find_best_threshold(val_pred, val_labels)

(0.55, 0.8333333333333334)

#### Get full model

In [296]:
# Load the test rows for structure 3350. DataFrame(CSV.File(path)) is used because
# CSV.read(path) without a sink type is rejected by recent CSV.jl releases.
test_set = DataFrame(CSV.File("data/parsed/test_3350.csv"));
size(test_set)

(70, 16)

In [297]:
# Refit the logistic regression on the whole data_set (training and validation rows)
# and predict on the test rows.
test_model_glm = glm(val_form, data_set, Bernoulli(), LogitLink());
test_pred_glm = GLM.predict(test_model_glm, test_set);

In [298]:
# Random-forest output for the test rows, fitted on the whole data_set with best_params.
# The former stand-alone `test_pred_rf[:, 2];` statement was dropped: it only allocated
# a copy whose value was discarded.
test_pred_rf = get_rf_probas(data_set, test_set, names_ft, best_params);

In [299]:
# Ensemble score on the test rows: unweighted mean of the GLM prediction and column 2
# of the RF output.
test_pred = (test_pred_glm + test_pred_rf[:, 2]) ./ 2;

In [300]:
# Binarise the ensemble score in one pass: 1 when it reaches best_threshold, else 0.
test_pred = Int.(test_pred .>= best_threshold);

#### Get prediction for ouvrage 3350

In [301]:
# Keep the predictions for structure 3350 in their own table. copy() leaves test_set
# itself untouched instead of adding the SURVERSE column to it through an alias.
pred_3350 = copy(test_set);
pred_3350[!, :SURVERSE] = test_pred;
# Columns shown in the preview below.
vis_ft = [:DATE, :SURVERSE, :McTavish_sum, :Bellevue_sum, :Assomption_sum, :Trudeau_sum, :StHubert_sum];
first(pred_3350[!, vis_ft], 10)

Unnamed: 0_level_0,DATE,SURVERSE,McTavish_sum,Bellevue_sum,Assomption_sum,Trudeau_sum,StHubert_sum
Unnamed: 0_level_1,Date,Int64,Float64,Float64,Float64,Float64,Float64
1,2019-05-01,1,79.0,52.0,58.0,47.0,68.0
2,2019-05-02,0,26.0,19.0,15.0,13.0,17.0
3,2019-05-08,0,0.0,0.0,0.0,0.0,0.0
4,2019-05-09,1,89.0,67.0,77.0,86.0,67.0
5,2019-05-10,1,385.0,265.0,286.0,285.0,316.0
6,2019-05-11,0,2.0,0.0,0.0,0.0,0.0
7,2019-05-13,0,18.0,22.0,0.0,9.0,5.0
8,2019-05-14,1,95.0,130.0,62.0,104.0,84.0
9,2019-05-18,0,0.0,0.0,0.0,2.0,0.0
10,2019-05-19,1,65.0,56.0,37.0,57.0,62.0


## Ouvrage 4240

In [302]:
# Load the parsed data for structure 4240 and hold out 20% of the rows for validation
# (partitionTrainTest defaults to an 80/20 split).
# DataFrame(CSV.File(path)) is used because CSV.read(path) without a sink type is
# rejected by recent CSV.jl releases.
data_set = DataFrame(CSV.File("data/parsed/ouvrage_4240.csv"));
train_set, val_set = partitionTrainTest(data_set);
val_labels = val_set[!, :SURVERSE];
size(data_set)

(1100, 17)

#### Train GLM

In [303]:
# Fit a logistic regression (Bernoulli family, logit link) of val_form on the training
# split and predict on the validation split.
val_model = glm(val_form, train_set, Bernoulli(), LogitLink())
val_pred_glm = GLM.predict(val_model, val_set);

# find_best_threshold is defined outside this cell (presumably utils/reg-log.jl); judging
# by the displayed result it returns a (threshold, F1 score) pair -- confirm there.
best_threshold_glm, f1_score_glm = find_best_threshold(val_pred_glm, val_labels)

(0.55, 0.7272727272727273)

In [304]:
# Score of the GLM predictions at a fixed 0.5 cut-off, for comparison with the tuned
# threshold. evaluate_threshold is defined elsewhere; presumably it returns F1 -- confirm.
evaluate_threshold(val_pred_glm, val_labels, 0.5) 

0.6956521739130435

#### Train Random Forest

In [305]:
# Search the params_rf grid using the train and validation splits. find_best_rf is
# defined elsewhere (presumably utils/random-forest.jl); per the displayed result it
# returns the chosen parameter vector and a score -- confirm there.
best_params, f1_score_rf = find_best_rf(train_set, val_set, names_ft, params_rf)

(Int8[3, 75, 80, 9], 0.8)

#### Train ensemble model

Get probabilities for RF

In [306]:
# Random-forest output for the validation split, using best_params (get_rf_probas is
# defined elsewhere). Later cells read column 2 -- presumably P(SURVERSE = 1); confirm.
# The former stand-alone `val_pred_rf[:, 2];` statement was dropped: it only allocated
# a copy whose value was discarded.
val_pred_rf = get_rf_probas(train_set, val_set, names_ft, best_params);

Combine them

In [307]:
# Ensemble score: unweighted mean of the GLM prediction and column 2 of the RF output,
# then pick the decision threshold against the validation labels.
val_pred = (val_pred_glm + val_pred_rf[:, 2]) ./ 2;
best_threshold, f1_score = find_best_threshold(val_pred, val_labels)

(0.5, 0.7619047619047619)

#### Get full model

In [308]:
# Load the test rows for structure 4240. DataFrame(CSV.File(path)) is used because
# CSV.read(path) without a sink type is rejected by recent CSV.jl releases.
test_set = DataFrame(CSV.File("data/parsed/test_4240.csv"));
size(test_set)

(49, 16)

In [309]:
# Refit the logistic regression on the whole data_set (training and validation rows)
# and predict on the test rows.
test_model_glm = glm(val_form, data_set, Bernoulli(), LogitLink());
test_pred_glm = GLM.predict(test_model_glm, test_set);

In [310]:
# Random-forest output for the test rows, fitted on the whole data_set with best_params.
# The former stand-alone `test_pred_rf[:, 2];` statement was dropped: it only allocated
# a copy whose value was discarded.
test_pred_rf = get_rf_probas(data_set, test_set, names_ft, best_params);

In [311]:
# Ensemble score on the test rows: unweighted mean of the GLM prediction and column 2
# of the RF output.
test_pred = (test_pred_glm + test_pred_rf[:, 2]) ./ 2;

In [312]:
# Binarise the ensemble score in one pass: 1 when it reaches best_threshold, else 0.
test_pred = Int.(test_pred .>= best_threshold);

#### Get prediction for ouvrage 4240

In [313]:
# Keep the predictions for structure 4240 in their own table. copy() leaves test_set
# itself untouched instead of adding the SURVERSE column to it through an alias.
pred_4240 = copy(test_set);
pred_4240[!, :SURVERSE] = test_pred;
# Columns shown in the preview below.
vis_ft = [:DATE, :SURVERSE, :McTavish_sum, :Bellevue_sum, :Assomption_sum, :Trudeau_sum, :StHubert_sum];
first(pred_4240[!, vis_ft], 10)

Unnamed: 0_level_0,DATE,SURVERSE,McTavish_sum,Bellevue_sum,Assomption_sum,Trudeau_sum,StHubert_sum
Unnamed: 0_level_1,Date,Int64,Float64,Float64,Float64,Float64,Float64
1,2019-05-07,0,3.0,0.0,0.0,0.0,0.0
2,2019-05-09,0,89.0,67.0,77.0,86.0,67.0
3,2019-05-10,1,385.0,265.0,286.0,285.0,316.0
4,2019-05-15,0,2.0,0.0,0.0,0.0,0.0
5,2019-05-21,0,0.0,0.0,3.0,2.0,0.0
6,2019-05-22,0,0.0,0.0,0.0,2.0,0.0
7,2019-05-23,0,182.0,0.0,351.0,159.0,213.0
8,2019-05-24,0,13.0,0.0,22.0,15.0,14.0
9,2019-05-25,0,0.0,0.0,7.0,4.0,6.0
10,2019-06-01,0,0.0,0.0,0.0,0.0,0.0


## Ouvrage 4350

In [314]:
# Load the parsed data for structure 4350 and hold out 20% of the rows for validation
# (partitionTrainTest defaults to an 80/20 split).
# DataFrame(CSV.File(path)) is used because CSV.read(path) without a sink type is
# rejected by recent CSV.jl releases.
data_set = DataFrame(CSV.File("data/parsed/ouvrage_4350.csv"));
train_set, val_set = partitionTrainTest(data_set);
val_labels = val_set[!, :SURVERSE];
size(data_set)

(1100, 17)

#### Train GLM

In [315]:
# Fit a logistic regression (Bernoulli family, logit link) of val_form on the training
# split and predict on the validation split.
val_model = glm(val_form, train_set, Bernoulli(), LogitLink())
val_pred_glm = GLM.predict(val_model, val_set);

# find_best_threshold is defined outside this cell (presumably utils/reg-log.jl); judging
# by the displayed result it returns a (threshold, F1 score) pair -- confirm there.
best_threshold_glm, f1_score_glm = find_best_threshold(val_pred_glm, val_labels)

(0.55, 0.7058823529411765)

In [318]:
# Score of the GLM predictions at a fixed 0.5 cut-off, for comparison with the tuned
# threshold. evaluate_threshold is defined elsewhere; presumably it returns F1 -- confirm.
evaluate_threshold(val_pred_glm, val_labels, 0.5) 

0.6666666666666666

#### Train Random Forest

In [317]:
# Search the params_rf grid using the train and validation splits. find_best_rf is
# defined elsewhere (presumably utils/random-forest.jl); per the displayed result it
# returns the chosen parameter vector and a score -- confirm there.
best_params, f1_score_rf = find_best_rf(train_set, val_set, names_ft, params_rf)

(Int8[4, 105, 80, 7], 0.7017543859649122)

#### Train ensemble model

Get probabilities for RF

In [319]:
# Random-forest output for the validation split, using best_params (get_rf_probas is
# defined elsewhere). Later cells read column 2 -- presumably P(SURVERSE = 1); confirm.
# The former stand-alone `val_pred_rf[:, 2];` statement was dropped: it only allocated
# a copy whose value was discarded.
val_pred_rf = get_rf_probas(train_set, val_set, names_ft, best_params);

Combine them

In [320]:
# Ensemble score: unweighted mean of the GLM prediction and column 2 of the RF output,
# then pick the decision threshold against the validation labels.
val_pred = (val_pred_glm + val_pred_rf[:, 2]) ./ 2;
best_threshold, f1_score = find_best_threshold(val_pred, val_labels)

(0.55, 0.7058823529411765)

#### Get full model

In [321]:
# Load the test rows for structure 4350. DataFrame(CSV.File(path)) is used because
# CSV.read(path) without a sink type is rejected by recent CSV.jl releases.
test_set = DataFrame(CSV.File("data/parsed/test_4350.csv"));
size(test_set)

(65, 16)

In [322]:
# Refit the logistic regression on the whole data_set (training and validation rows)
# and predict on the test rows.
test_model_glm = glm(val_form, data_set, Bernoulli(), LogitLink());
test_pred_glm = GLM.predict(test_model_glm, test_set);

In [323]:
# Random-forest output for the test rows, fitted on the whole data_set with best_params.
# The former stand-alone `test_pred_rf[:, 2];` statement was dropped: it only allocated
# a copy whose value was discarded.
test_pred_rf = get_rf_probas(data_set, test_set, names_ft, best_params);

In [324]:
# Ensemble score on the test rows: unweighted mean of the GLM prediction and column 2
# of the RF output.
test_pred = (test_pred_glm + test_pred_rf[:, 2]) ./ 2;

In [325]:
# Binarise the ensemble score in one pass: 1 when it reaches best_threshold, else 0.
test_pred = Int.(test_pred .>= best_threshold);

#### Get prediction for ouvrage 4350

In [326]:
# Keep the predictions for structure 4350 in their own table. copy() leaves test_set
# itself untouched instead of adding the SURVERSE column to it through an alias.
pred_4350 = copy(test_set);
pred_4350[!, :SURVERSE] = test_pred;
# Columns shown in the preview below.
vis_ft = [:DATE, :SURVERSE, :McTavish_sum, :Bellevue_sum, :Assomption_sum, :Trudeau_sum, :StHubert_sum];
first(pred_4350[!, vis_ft], 10)

Unnamed: 0_level_0,DATE,SURVERSE,McTavish_sum,Bellevue_sum,Assomption_sum,Trudeau_sum,StHubert_sum
Unnamed: 0_level_1,Date,Int64,Float64,Float64,Float64,Float64,Float64
1,2019-05-03,0,34.0,27.0,34.0,31.0,44.0
2,2019-05-04,0,2.0,0.0,0.0,2.0,0.0
3,2019-05-07,0,3.0,0.0,0.0,0.0,0.0
4,2019-05-08,0,0.0,0.0,0.0,0.0,0.0
5,2019-05-10,1,385.0,265.0,286.0,285.0,316.0
6,2019-05-11,0,2.0,0.0,0.0,0.0,0.0
7,2019-05-12,0,0.0,0.0,0.0,0.0,0.0
8,2019-05-21,0,0.0,0.0,3.0,2.0,0.0
9,2019-05-22,0,0.0,0.0,0.0,2.0,0.0
10,2019-05-23,0,182.0,0.0,351.0,159.0,213.0


## Ouvrage 4380

In [327]:
# Load the parsed data for structure 4380 and hold out 20% of the rows for validation
# (partitionTrainTest defaults to an 80/20 split).
# DataFrame(CSV.File(path)) is used because CSV.read(path) without a sink type is
# rejected by recent CSV.jl releases.
data_set = DataFrame(CSV.File("data/parsed/ouvrage_4380.csv"));
train_set, val_set = partitionTrainTest(data_set);
val_labels = val_set[!, :SURVERSE];
size(data_set)

(1103, 17)

#### Train GLM

In [328]:
# Fit a logistic regression (Bernoulli family, logit link) of val_form on the training
# split and predict on the validation split.
val_model = glm(val_form, train_set, Bernoulli(), LogitLink())
val_pred_glm = GLM.predict(val_model, val_set);

# find_best_threshold is defined outside this cell (presumably utils/reg-log.jl); judging
# by the displayed result it returns a (threshold, F1 score) pair -- confirm there.
best_threshold_glm, f1_score_glm = find_best_threshold(val_pred_glm, val_labels)

(0.65, 0.6086956521739131)

In [329]:
# Score of the GLM predictions at a fixed 0.5 cut-off, for comparison with the tuned
# threshold. evaluate_threshold is defined elsewhere; presumably it returns F1 -- confirm.
evaluate_threshold(val_pred_glm, val_labels, 0.5) 

0.5384615384615384

#### Train Random Forest

In [330]:
# Search the params_rf grid using the train and validation splits. find_best_rf is
# defined elsewhere (presumably utils/random-forest.jl); per the displayed result it
# returns the chosen parameter vector and a score -- confirm there.
best_params, f1_score_rf = find_best_rf(train_set, val_set, names_ft, params_rf)

(Int8[3, 75, 90, 7], 0.7459770114942528)

#### Train ensemble model

Get probabilities for RF

In [331]:
# Random-forest output for the validation split, using best_params (get_rf_probas is
# defined elsewhere). Later cells read column 2 -- presumably P(SURVERSE = 1); confirm.
# The former stand-alone `val_pred_rf[:, 2];` statement was dropped: it only allocated
# a copy whose value was discarded.
val_pred_rf = get_rf_probas(train_set, val_set, names_ft, best_params);

Combine them

In [332]:
# Ensemble score: unweighted mean of the GLM prediction and column 2 of the RF output,
# then pick the decision threshold against the validation labels.
val_pred = (val_pred_glm + val_pred_rf[:, 2]) ./ 2;
best_threshold, f1_score = find_best_threshold(val_pred, val_labels)

(0.4, 0.6666666666666666)

#### Get full model

In [333]:
# Load the test rows for structure 4380. DataFrame(CSV.File(path)) is used because
# CSV.read(path) without a sink type is rejected by recent CSV.jl releases.
test_set = DataFrame(CSV.File("data/parsed/test_4380.csv"));
size(test_set)

(54, 16)

In [334]:
# Refit the logistic regression on the whole data_set (training and validation rows)
# and predict on the test rows.
test_model_glm = glm(val_form, data_set, Bernoulli(), LogitLink());
test_pred_glm = GLM.predict(test_model_glm, test_set);

In [335]:
# Random-forest output for the test rows, fitted on the whole data_set with best_params.
# The former stand-alone `test_pred_rf[:, 2];` statement was dropped: it only allocated
# a copy whose value was discarded.
test_pred_rf = get_rf_probas(data_set, test_set, names_ft, best_params);

In [336]:
# Ensemble score on the test rows: unweighted mean of the GLM prediction and column 2
# of the RF output.
test_pred = (test_pred_glm + test_pred_rf[:, 2]) ./ 2;

In [337]:
# Binarise the ensemble score in one pass: 1 when it reaches best_threshold, else 0.
test_pred = Int.(test_pred .>= best_threshold);

#### Get prediction for ouvrage 4380

In [338]:
# Keep the predictions for structure 4380 in their own table. copy() leaves test_set
# itself untouched instead of adding the SURVERSE column to it through an alias.
pred_4380 = copy(test_set);
pred_4380[!, :SURVERSE] = test_pred;
# Columns shown in the preview below.
vis_ft = [:DATE, :SURVERSE, :McTavish_sum, :Bellevue_sum, :Assomption_sum, :Trudeau_sum, :StHubert_sum];
first(pred_4380[!, vis_ft], 10)

Unnamed: 0_level_0,DATE,SURVERSE,McTavish_sum,Bellevue_sum,Assomption_sum,Trudeau_sum,StHubert_sum
Unnamed: 0_level_1,Date,Int64,Float64,Float64,Float64,Float64,Float64
1,2019-05-02,0,26.0,19.0,15.0,13.0,17.0
2,2019-05-04,0,2.0,0.0,0.0,2.0,0.0
3,2019-05-05,0,0.0,0.0,0.0,0.0,0.0
4,2019-05-06,0,0.0,0.0,0.0,0.0,0.0
5,2019-05-08,0,0.0,0.0,0.0,0.0,0.0
6,2019-05-09,0,89.0,67.0,77.0,86.0,67.0
7,2019-05-10,1,385.0,265.0,286.0,285.0,316.0
8,2019-05-17,0,5.0,11.0,0.0,3.0,5.0
9,2019-05-18,0,0.0,0.0,0.0,2.0,0.0
10,2019-05-28,0,3.0,0.0,0.0,15.0,10.0


# Prédiction

In [339]:
# Load the list of (structure, date) pairs to predict and add a placeholder SURVERSE
# column that the lookup loop below fills in.
# DataFrame(CSV.File(path)) is used because CSV.read(path) without a sink type is
# rejected by recent CSV.jl releases.
test_set = DataFrame(CSV.File("data/test.csv"));
test_set[!, :SURVERSE] = zeros(size(test_set, 1));
size(test_set)

(283, 3)

Pour chaque ligne de test_set

    - On vérifie l'ID de l'ouvrage pour savoir quelle prédiction utiliser
    - On va chercher la prédiction à cette date pour cet ouvrage
    - On l'écrit dans la ligne courante

In [340]:
# Per-structure prediction tables, keyed by the structure ID found in test_set.
preds_by_ouvrage = Dict(
    "3260-01D" => pred_3260,
    "3350-07D" => pred_3350,
    "4240-01D" => pred_4240,
    "4350-01D" => pred_4350,
    "4380-01D" => pred_4380,
);

# For every row of test_set, copy the label predicted for that structure on that date.
for i in 1:nrow(test_set)
    # string() normalises the ID to a plain String so the Dict lookup does not depend
    # on the string type the CSV parser chose for the column.
    curr_ouvrage = string(test_set[i, :NO_OUVRAGE]);
    haskey(preds_by_ouvrage, curr_ouvrage) ||
        error("no prediction table for ouvrage $(curr_ouvrage)");
    pred_to_use = preds_by_ouvrage[curr_ouvrage];

    curr_date = test_set[i, :DATE];
    # The first matching date wins, as before; scanning the DATE column avoids building
    # a filtered table for every row.
    match_idx = findfirst(isequal(curr_date), pred_to_use[!, :DATE]);
    match_idx === nothing &&
        error("no prediction for ouvrage $(curr_ouvrage) on $(curr_date)");

    test_set[i, :SURVERSE] = pred_to_use[match_idx, :SURVERSE];
end

In [341]:
# Store the labels as integers (the column was created with zeros(), i.e. as floats)
# and preview the first rows.
test_set[!, :SURVERSE] = Int.(test_set[!, :SURVERSE]);
first(test_set, 10)

Unnamed: 0_level_0,NO_OUVRAGE,DATE,SURVERSE
Unnamed: 0_level_1,String,Date,Int64
1,3260-01D,2019-05-02,0
2,3260-01D,2019-05-09,0
3,3260-01D,2019-05-10,1
4,3260-01D,2019-05-15,0
5,3260-01D,2019-05-20,0
6,3260-01D,2019-05-23,0
7,3260-01D,2019-05-24,0
8,3260-01D,2019-05-26,0
9,3260-01D,2019-05-30,0
10,3350-07D,2019-05-01,1


In [346]:
# Build the submission table and write it to disk. Each ID is "<NO_OUVRAGE>_<DATE>" and
# Surverse is the predicted label; the file name carries the no_soumission counter.
submission_path = "submissions/mc-submission-$(no_soumission).csv";
mkpath(dirname(submission_path));  # CSV.write does not create missing directories
ID = test_set[:,:NO_OUVRAGE].*"_".*string.(test_set[:,:DATE])
sampleSubmission = DataFrame(ID = ID, Surverse=test_set[:, :SURVERSE])
CSV.write(submission_path, sampleSubmission)

"submissions/mc-submission-19.csv"