In [None]:
using CSV, DataFrames, GLM, Statistics, Dates, Gadfly, Random, MLBase, DecisionTree;
include("utils/precipitation.jl");
include("utils/random-forest.jl");
include("utils/reg-log.jl");

In [None]:
# Submission number; interpolated into the name of the CSV written by the last cell.
no_soumission = 24;

In [None]:
"""
    partitionTrainTest(data, at = 0.8; rng = nothing)

Randomly split the rows of `data` into two parts and return them as a tuple
`(train, test)`.

# Arguments
- `data`: any table-like object supporting `size(data, 1)` and `data[rows, :]`
  (a `DataFrame` or a matrix, for instance).
- `at`: fraction of the rows placed in the first part; must lie in `[0, 1]`.
  The first part receives `floor(Int, at * n)` rows, the second part the rest.

# Keywords
- `rng`: optional random number generator used for the shuffle. When `nothing`
  (the default) the global generator is used, so successive calls differ.

# Throws
- `ArgumentError` if `at` is outside `[0, 1]`.
"""
function partitionTrainTest(data, at = 0.8; rng = nothing) # https://discourse.julialang.org/t/simple-tool-for-train-test-split/473/2
    0 <= at <= 1 || throw(ArgumentError("`at` must be in [0, 1], got $at"))
    n = size(data, 1)
    # Shuffle the row indices once; both parts are disjoint slices of this permutation.
    idx = rng === nothing ? shuffle(1:n) : shuffle(rng, 1:n)
    n_train = floor(Int, at * n)
    train_idx = view(idx, 1:n_train)
    test_idx = view(idx, (n_train + 1):n)
    return data[train_idx, :], data[test_idx, :]
end

In [None]:
# Model formula shared by every GLM fit in this notebook: SURVERSE is the
# response, explained by the six FS_* / SS_* columns.
val_form = @formula(SURVERSE ~ FS_sum + FS_max + FS_max3 +
                               SS_sum + SS_max + SS_max3);

In [None]:
# Feature columns fed to the tree-based models; same six columns as the
# right-hand side of `val_form`.
names_ft = [:FS_sum, :FS_max, :FS_max3,
            :SS_sum, :SS_max, :SS_max3];

## Ouvrage 3260

In [None]:
# Load the labelled data for ouvrage 3260. `DataFrame(CSV.File(path))` is used
# instead of the one-argument `CSV.read(path)`, which current CSV.jl releases
# reject because no sink type is given.
data_set_3260 = DataFrame(CSV.File("data/parsed/oversampled/ouvrage_3260.csv"));
# Random train/validation split with partitionTrainTest's default ratio.
train_set_3260, val_set_3260 = partitionTrainTest(data_set_3260);
# Validation targets used to score the models below.
val_labels_3260 = val_set_3260[!, :SURVERSE];
size(data_set_3260)

#### Train GLM

In [None]:
# Fit a logistic regression (Bernoulli family, logit link) of `val_form` on the
# training split, then predict on the validation split.
val_model_glm_3260 = glm(val_form, train_set_3260, Bernoulli(), LogitLink())
val_pred_glm_3260 = GLM.predict(val_model_glm_3260, val_set_3260);

# Score the predictions against the validation labels with a 0.5 cut-off.
# NOTE(review): evaluate_threshold is not defined in this notebook (presumably in
# one of the included utils files) — confirm which metric it reports.
evaluate_threshold(val_pred_glm_3260, val_labels_3260, 0.5) 

#### Train Decision Tree 

In [None]:
# Feature matrix and integer label vector passed to build_tree.
# `Matrix{Float64}(df)` replaces `convert(Matrix{Float64}, df)`: the `convert`
# method for data frames was deprecated and later removed from DataFrames.jl,
# while the constructor form is accepted by old and new releases alike.
train_features_3260 = Matrix{Float64}(train_set_3260[:, names_ft]);
train_labels_3260 = convert(Array{Int64}, train_set_3260[!,:SURVERSE]);

# Grow a classification tree, then prune it with a 0.9 purity threshold.
dt_model_3260 = build_tree(train_labels_3260, train_features_3260);
dt_model_3260 = prune_tree(dt_model_3260, 0.9)

In [None]:
# Validation features/labels in the same layout as the training ones.
# `Matrix{Float64}(df)` replaces `convert(Matrix{Float64}, df)`, which was
# deprecated and later removed from DataFrames.jl.
val_features_3260 = Matrix{Float64}(val_set_3260[:, names_ft]);
val_labels_3260 = convert(Array{Int64}, val_set_3260[!,:SURVERSE]);

# Classify the validation rows with the pruned tree and report the F1 score.
val_pred_dt_3260 = apply_tree(dt_model_3260, val_features_3260);
r = roc(val_labels_3260, val_pred_dt_3260);
f1score(r)

#### Train Random Forest

In [None]:
# Random-forest hyper-parameters handed to the get_rf_* helpers.
# NOTE(review): the meaning of each position is fixed by those helpers, which are
# not visible in this notebook — confirm before changing any value.
estimated_params_3260 = [5, 100, 80, 21];
# Train on the training split and evaluate on the validation split
# (helper presumably defined in utils/random-forest.jl — confirm).
get_rf_direct(train_set_3260, val_set_3260, names_ft, estimated_params_3260)

#### Train ensemble model

Combine them

In [None]:
# Random-forest output for the validation split.
val_pred_rf_3260 = get_rf_probas(train_set_3260, val_set_3260, names_ft, estimated_params_3260);
# Ensemble: average the GLM prediction with column 2 of the random-forest output
# (presumably the probability of the positive class — confirm against get_rf_probas).
val_pred_3260 = (val_pred_glm_3260 + val_pred_rf_3260[:, 2]) ./ 2;
# Score the averaged prediction with the same 0.5 cut-off as the GLM alone.
evaluate_threshold(val_pred_3260, val_labels_3260, 0.5) 

#### Get full model

In [None]:
# Load the test rows for ouvrage 3260. `DataFrame(CSV.File(path))` is used
# instead of the one-argument `CSV.read(path)`, which current CSV.jl releases
# reject because no sink type is given.
test_set_3260 = DataFrame(CSV.File("data/parsed/test_3260.csv"));
size(test_set_3260)

In [None]:
# Refit the logistic regression on the whole labelled set and predict the test rows.
test_model_glm_3260 = glm(val_form, data_set_3260, Bernoulli(), LogitLink());
test_pred_glm_3260 = GLM.predict(test_model_glm_3260, test_set_3260);
# Random-forest output for the test rows, trained on the whole labelled set.
test_pred_rf_3260 = get_rf_probas(data_set_3260, test_set_3260, names_ft, estimated_params_3260);
# Average both models, binarise at 0.5 (>= 0.5 becomes 1, below becomes 0) and
# store the result as integers.
test_pred_3260 = let avg = (test_pred_glm_3260 + test_pred_rf_3260[:, 2]) ./ 2
    avg[avg .>= 0.5] .= 1.0
    avg[avg .< 0.5] .= 0.0
    convert(Array{Int}, trunc.(avg))
end;

#### Get prediction for ouvrage 3260

In [None]:
# `pred_3260` is bound to the same DataFrame object as `test_set_3260` (no copy),
# so the SURVERSE column set below is also visible through `test_set_3260`.
pred_3260 = test_set_3260;
pred_3260[!, :SURVERSE] = test_pred_3260;
# Columns shown in the preview of the first 10 rows.
vis_ft = [:DATE, :SURVERSE, :FS_sum, :FS_max, :SS_sum, :SS_max];
first(pred_3260[!, vis_ft], 10)

## Ouvrage 3350

In [None]:
# Load the labelled data for ouvrage 3350. `DataFrame(CSV.File(path))` is used
# instead of the one-argument `CSV.read(path)`, which current CSV.jl releases
# reject because no sink type is given.
data_set_3350 = DataFrame(CSV.File("data/parsed/oversampled/ouvrage_3350.csv"));
# Random train/validation split with partitionTrainTest's default ratio.
train_set_3350, val_set_3350 = partitionTrainTest(data_set_3350);
# Validation targets used to score the models below.
val_labels_3350 = val_set_3350[!, :SURVERSE];
size(data_set_3350)

#### Train GLM

In [None]:
# Fit a logistic regression (Bernoulli family, logit link) of `val_form` on the
# training split, then predict on the validation split.
val_model_glm_3350 = glm(val_form, train_set_3350, Bernoulli(), LogitLink())
val_pred_glm_3350 = GLM.predict(val_model_glm_3350, val_set_3350);

# Score the predictions against the validation labels with a 0.5 cut-off.
# NOTE(review): evaluate_threshold is not defined in this notebook (presumably in
# one of the included utils files) — confirm which metric it reports.
evaluate_threshold(val_pred_glm_3350, val_labels_3350, 0.5) 

#### Train Decision Tree 

In [None]:
# Feature matrix and integer label vector passed to build_tree.
# `Matrix{Float64}(df)` replaces `convert(Matrix{Float64}, df)`: the `convert`
# method for data frames was deprecated and later removed from DataFrames.jl,
# while the constructor form is accepted by old and new releases alike.
train_features_3350 = Matrix{Float64}(train_set_3350[:, names_ft]);
train_labels_3350 = convert(Array{Int64}, train_set_3350[!,:SURVERSE]);

# Grow a classification tree, then prune it with a 0.9 purity threshold.
dt_model_3350 = build_tree(train_labels_3350, train_features_3350);
dt_model_3350 = prune_tree(dt_model_3350, 0.9)

In [None]:
# Validation features/labels in the same layout as the training ones.
# `Matrix{Float64}(df)` replaces `convert(Matrix{Float64}, df)`, which was
# deprecated and later removed from DataFrames.jl.
val_features_3350 = Matrix{Float64}(val_set_3350[:, names_ft]);
val_labels_3350 = convert(Array{Int64}, val_set_3350[!,:SURVERSE]);

# Classify the validation rows with the pruned tree and report the F1 score.
val_pred_dt_3350 = apply_tree(dt_model_3350, val_features_3350);
r = roc(val_labels_3350, val_pred_dt_3350);
f1score(r)

#### Train Random Forest

In [None]:
# Random-forest hyper-parameters handed to the get_rf_* helpers.
# NOTE(review): the meaning of each position is fixed by those helpers, which are
# not visible in this notebook — confirm before changing any value.
estimated_params_3350 = [5, 100, 80, 33];
# Train on the training split and evaluate on the validation split
# (helper presumably defined in utils/random-forest.jl — confirm).
get_rf_direct(train_set_3350, val_set_3350, names_ft, estimated_params_3350)

In [None]:
# Random-forest output for the validation split; its second column is averaged
# with the GLM prediction in the ensemble cell.
val_pred_rf_3350 = get_rf_probas(train_set_3350, val_set_3350, names_ft, estimated_params_3350);

#### Train ensemble model

Combine them

In [None]:
# Ensemble: average the GLM prediction with column 2 of the random-forest output
# (presumably the probability of the positive class — confirm against get_rf_probas).
val_pred_3350 = (val_pred_glm_3350 + val_pred_rf_3350[:, 2]) ./ 2;
# Score the averaged prediction with the same 0.5 cut-off as the GLM alone.
evaluate_threshold(val_pred_3350, val_labels_3350, 0.5) 

#### Get full model

In [None]:
# Load the test rows for ouvrage 3350. `DataFrame(CSV.File(path))` is used
# instead of the one-argument `CSV.read(path)`, which current CSV.jl releases
# reject because no sink type is given.
test_set_3350 = DataFrame(CSV.File("data/parsed/test_3350.csv"));
size(test_set_3350)

In [None]:
# Refit the logistic regression on the whole labelled set and predict the test rows.
test_model_glm_3350 = glm(val_form, data_set_3350, Bernoulli(), LogitLink());
test_pred_glm_3350 = GLM.predict(test_model_glm_3350, test_set_3350);
# Random-forest output for the test rows, trained on the whole labelled set.
test_pred_rf_3350 = get_rf_probas(data_set_3350, test_set_3350, names_ft, estimated_params_3350);
# Average both models, binarise at 0.5 (>= 0.5 becomes 1, below becomes 0) and
# store the result as integers.
test_pred_3350 = let avg = (test_pred_glm_3350 + test_pred_rf_3350[:, 2]) ./ 2
    avg[avg .>= 0.5] .= 1.0
    avg[avg .< 0.5] .= 0.0
    convert(Array{Int}, trunc.(avg))
end;

#### Get prediction for ouvrage 3350

In [None]:
# `pred_3350` is bound to the same DataFrame object as `test_set_3350` (no copy),
# so the SURVERSE column set below is also visible through `test_set_3350`.
pred_3350 = test_set_3350;
pred_3350[!, :SURVERSE] = test_pred_3350;
# Columns shown in the preview of the first 10 rows.
vis_ft = [:DATE, :SURVERSE, :FS_sum, :FS_max3, :SS_sum, :SS_max3];
first(pred_3350[!, vis_ft], 10)

## Ouvrage 4240

In [None]:
# Load the labelled data for ouvrage 4240. `DataFrame(CSV.File(path))` is used
# instead of the one-argument `CSV.read(path)`, which current CSV.jl releases
# reject because no sink type is given.
data_set_4240 = DataFrame(CSV.File("data/parsed/oversampled/ouvrage_4240.csv"));
# Random train/validation split with partitionTrainTest's default ratio.
train_set_4240, val_set_4240 = partitionTrainTest(data_set_4240);
# Validation targets used to score the models below.
val_labels_4240 = val_set_4240[!, :SURVERSE];
size(data_set_4240)

#### Train GLM

In [None]:
# Fit a logistic regression (Bernoulli family, logit link) of `val_form` on the
# training split, then predict on the validation split.
val_model_glm_4240 = glm(val_form, train_set_4240, Bernoulli(), LogitLink())
val_pred_glm_4240 = GLM.predict(val_model_glm_4240, val_set_4240);

# Score the predictions against the validation labels with a 0.5 cut-off.
# NOTE(review): evaluate_threshold is not defined in this notebook (presumably in
# one of the included utils files) — confirm which metric it reports.
evaluate_threshold(val_pred_glm_4240, val_labels_4240, 0.5) 

#### Train Decision Tree 

In [None]:
# Feature matrix and integer label vector passed to build_tree.
# `Matrix{Float64}(df)` replaces `convert(Matrix{Float64}, df)`: the `convert`
# method for data frames was deprecated and later removed from DataFrames.jl,
# while the constructor form is accepted by old and new releases alike.
train_features_4240 = Matrix{Float64}(train_set_4240[:, names_ft]);
train_labels_4240 = convert(Array{Int64}, train_set_4240[!,:SURVERSE]);

# Grow a classification tree, then prune it with a 0.9 purity threshold.
dt_model_4240 = build_tree(train_labels_4240, train_features_4240);
dt_model_4240 = prune_tree(dt_model_4240, 0.9)

In [None]:
# Validation features/labels in the same layout as the training ones.
# `Matrix{Float64}(df)` replaces `convert(Matrix{Float64}, df)`, which was
# deprecated and later removed from DataFrames.jl.
val_features_4240 = Matrix{Float64}(val_set_4240[:, names_ft]);
val_labels_4240 = convert(Array{Int64}, val_set_4240[!,:SURVERSE]);

# Classify the validation rows with the pruned tree and report the F1 score.
val_pred_dt_4240 = apply_tree(dt_model_4240, val_features_4240);
r = roc(val_labels_4240, val_pred_dt_4240);
f1score(r)

#### Train Random Forest

In [None]:
# Random-forest hyper-parameters handed to the get_rf_* helpers.
# NOTE(review): the meaning of each position is fixed by those helpers, which are
# not visible in this notebook — confirm before changing any value.
estimated_params_4240 = [5, 100, 80, 20];
# Train on the training split and evaluate on the validation split
# (helper presumably defined in utils/random-forest.jl — confirm).
get_rf_direct(train_set_4240, val_set_4240, names_ft, estimated_params_4240)

In [None]:
# Random-forest output for the validation split; its second column is averaged
# with the GLM prediction in the ensemble cell.
val_pred_rf_4240 = get_rf_probas(train_set_4240, val_set_4240, names_ft, estimated_params_4240);

#### Train ensemble model

Combine them

In [None]:
# Ensemble: average the GLM prediction with column 2 of the random-forest output
# (presumably the probability of the positive class — confirm against get_rf_probas).
val_pred_4240 = (val_pred_glm_4240 + val_pred_rf_4240[:, 2]) ./ 2;
# Score the averaged prediction with the same 0.5 cut-off as the GLM alone.
evaluate_threshold(val_pred_4240, val_labels_4240, 0.5) 

#### Get full model

In [None]:
# Load the test rows for ouvrage 4240. `DataFrame(CSV.File(path))` is used
# instead of the one-argument `CSV.read(path)`, which current CSV.jl releases
# reject because no sink type is given.
test_set_4240 = DataFrame(CSV.File("data/parsed/test_4240.csv"));
size(test_set_4240)

In [None]:
# Refit the logistic regression on the whole labelled set and predict the test
# rows. The ensemble average below needs `test_pred_glm_4240`, which is not
# defined anywhere else in the notebook, so the fit has to happen in this cell.
test_model_glm_4240 = glm(val_form, data_set_4240, Bernoulli(), LogitLink());
test_pred_glm_4240 = GLM.predict(test_model_glm_4240, test_set_4240);
# Random-forest output for the test rows, trained on the whole labelled set.
test_pred_rf_4240 = get_rf_probas(data_set_4240, test_set_4240, names_ft, estimated_params_4240);
# Average both models, then binarise at 0.5 and store the result as integers.
test_pred_4240 = (test_pred_glm_4240 + test_pred_rf_4240[:, 2]) ./ 2;
test_pred_4240[test_pred_4240 .>= 0.5] .= 1.0;
test_pred_4240[test_pred_4240 .< 0.5] .= 0.0;
test_pred_4240 = convert(Array{Int}, trunc.(test_pred_4240));

#### Get prediction for ouvrage 4240

In [None]:
# `pred_4240` is bound to the same DataFrame object as `test_set_4240` (no copy),
# so the SURVERSE column set below is also visible through `test_set_4240`.
pred_4240 = test_set_4240;
pred_4240[!, :SURVERSE] = test_pred_4240;
# Columns shown in the preview of the first 10 rows.
vis_ft = [:DATE, :SURVERSE, :FS_sum, :FS_max3, :SS_sum, :SS_max3];
first(pred_4240[!, vis_ft], 10)

## Ouvrage 4350

In [None]:
# Load the labelled data for ouvrage 4350. `DataFrame(CSV.File(path))` is used
# instead of the one-argument `CSV.read(path)`, which current CSV.jl releases
# reject because no sink type is given.
data_set_4350 = DataFrame(CSV.File("data/parsed/oversampled/ouvrage_4350.csv"));
# Random train/validation split with partitionTrainTest's default ratio.
train_set_4350, val_set_4350 = partitionTrainTest(data_set_4350);
# Validation targets used to score the models below.
val_labels_4350 = val_set_4350[!, :SURVERSE];
size(data_set_4350)

#### Train GLM

In [None]:
# Fit a logistic regression (Bernoulli family, logit link) of `val_form` on the
# training split, then predict on the validation split.
val_model_glm_4350 = glm(val_form, train_set_4350, Bernoulli(), LogitLink())
val_pred_glm_4350 = GLM.predict(val_model_glm_4350, val_set_4350);

# Score the predictions against the validation labels with a 0.5 cut-off.
# NOTE(review): evaluate_threshold is not defined in this notebook (presumably in
# one of the included utils files) — confirm which metric it reports.
evaluate_threshold(val_pred_glm_4350, val_labels_4350, 0.5) 

#### Train Decision Tree 

In [None]:
# Feature matrix and integer label vector passed to build_tree.
# `Matrix{Float64}(df)` replaces `convert(Matrix{Float64}, df)`: the `convert`
# method for data frames was deprecated and later removed from DataFrames.jl,
# while the constructor form is accepted by old and new releases alike.
train_features_4350 = Matrix{Float64}(train_set_4350[:, names_ft]);
train_labels_4350 = convert(Array{Int64}, train_set_4350[!,:SURVERSE]);

# Grow a classification tree, then prune it with a 0.9 purity threshold.
dt_model_4350 = build_tree(train_labels_4350, train_features_4350);
dt_model_4350 = prune_tree(dt_model_4350, 0.9)

In [None]:
# Validation features/labels in the same layout as the training ones.
# `Matrix{Float64}(df)` replaces `convert(Matrix{Float64}, df)`, which was
# deprecated and later removed from DataFrames.jl.
val_features_4350 = Matrix{Float64}(val_set_4350[:, names_ft]);
val_labels_4350 = convert(Array{Int64}, val_set_4350[!,:SURVERSE]);

# Classify the validation rows with the pruned tree and report the F1 score.
val_pred_dt_4350 = apply_tree(dt_model_4350, val_features_4350);
r = roc(val_labels_4350, val_pred_dt_4350);
f1score(r)

#### Train Random Forest

In [None]:
# Random-forest hyper-parameters handed to the get_rf_* helpers.
# NOTE(review): the meaning of each position is fixed by those helpers, which are
# not visible in this notebook — confirm before changing any value.
estimated_params_4350 = [5, 100, 80, 13];
# Train on the training split and evaluate on the validation split
# (helper presumably defined in utils/random-forest.jl — confirm).
get_rf_direct(train_set_4350, val_set_4350, names_ft, estimated_params_4350)

In [None]:
# Random-forest output for the validation split; its second column is averaged
# with the GLM prediction in the ensemble cell.
val_pred_rf_4350 = get_rf_probas(train_set_4350, val_set_4350, names_ft, estimated_params_4350);

#### Train ensemble model

Combine them

In [None]:
# Ensemble: average the GLM prediction with column 2 of the random-forest output
# (presumably the probability of the positive class — confirm against get_rf_probas).
val_pred_4350 = (val_pred_glm_4350 + val_pred_rf_4350[:, 2]) ./ 2;
# Score the averaged prediction with the same 0.5 cut-off as the GLM alone.
evaluate_threshold(val_pred_4350, val_labels_4350, 0.5) 

#### Get full model

In [None]:
# Load the test rows for ouvrage 4350. `DataFrame(CSV.File(path))` is used
# instead of the one-argument `CSV.read(path)`, which current CSV.jl releases
# reject because no sink type is given.
test_set_4350 = DataFrame(CSV.File("data/parsed/test_4350.csv"));
size(test_set_4350)

In [None]:
# Refit the logistic regression on the whole labelled set and predict the test rows.
test_model_glm_4350 = glm(val_form, data_set_4350, Bernoulli(), LogitLink());
test_pred_glm_4350 = GLM.predict(test_model_glm_4350, test_set_4350);
# Random-forest output for the test rows, trained on the whole labelled set.
test_pred_rf_4350 = get_rf_probas(data_set_4350, test_set_4350, names_ft, estimated_params_4350);
# Average both models, binarise at 0.5 (>= 0.5 becomes 1, below becomes 0) and
# store the result as integers.
test_pred_4350 = let avg = (test_pred_glm_4350 + test_pred_rf_4350[:, 2]) ./ 2
    avg[avg .>= 0.5] .= 1.0
    avg[avg .< 0.5] .= 0.0
    convert(Array{Int}, trunc.(avg))
end;

#### Get prediction for ouvrage 4350

In [None]:
# `pred_4350` is bound to the same DataFrame object as `test_set_4350` (no copy),
# so the SURVERSE column set below is also visible through `test_set_4350`.
pred_4350 = test_set_4350;
pred_4350[!, :SURVERSE] = test_pred_4350;
# Columns shown in the preview of the first 10 rows.
vis_ft = [:DATE, :SURVERSE, :FS_sum, :FS_max3, :SS_sum, :SS_max3];
first(pred_4350[!, vis_ft], 10)

## Ouvrage 4380

In [None]:
# Load the labelled data for ouvrage 4380. `DataFrame(CSV.File(path))` is used
# instead of the one-argument `CSV.read(path)`, which current CSV.jl releases
# reject because no sink type is given.
data_set_4380 = DataFrame(CSV.File("data/parsed/oversampled/ouvrage_4380.csv"));
# Random train/validation split with partitionTrainTest's default ratio.
train_set_4380, val_set_4380 = partitionTrainTest(data_set_4380);
# Validation targets used to score the models below.
val_labels_4380 = val_set_4380[!, :SURVERSE];
size(data_set_4380)

#### Train GLM

In [None]:
# Fit a logistic regression (Bernoulli family, logit link) of `val_form` on the
# training split, then predict on the validation split.
val_model_glm_4380 = glm(val_form, train_set_4380, Bernoulli(), LogitLink())
val_pred_glm_4380 = GLM.predict(val_model_glm_4380, val_set_4380);

# Score the predictions against the validation labels with a 0.5 cut-off.
# NOTE(review): evaluate_threshold is not defined in this notebook (presumably in
# one of the included utils files) — confirm which metric it reports.
evaluate_threshold(val_pred_glm_4380, val_labels_4380, 0.5) 

#### Train Decision Tree 

In [None]:
# Feature matrix and integer label vector passed to build_tree.
# `Matrix{Float64}(df)` replaces `convert(Matrix{Float64}, df)`: the `convert`
# method for data frames was deprecated and later removed from DataFrames.jl,
# while the constructor form is accepted by old and new releases alike.
train_features_4380 = Matrix{Float64}(train_set_4380[:, names_ft]);
train_labels_4380 = convert(Array{Int64}, train_set_4380[!,:SURVERSE]);

# Grow a classification tree, then prune it with a 0.9 purity threshold.
dt_model_4380 = build_tree(train_labels_4380, train_features_4380);
dt_model_4380 = prune_tree(dt_model_4380, 0.9)

In [None]:
# Validation features/labels in the same layout as the training ones.
# `Matrix{Float64}(df)` replaces `convert(Matrix{Float64}, df)`, which was
# deprecated and later removed from DataFrames.jl.
val_features_4380 = Matrix{Float64}(val_set_4380[:, names_ft]);
val_labels_4380 = convert(Array{Int64}, val_set_4380[!,:SURVERSE]);

# Classify the validation rows with the pruned tree and report the F1 score.
val_pred_dt_4380 = apply_tree(dt_model_4380, val_features_4380);
r = roc(val_labels_4380, val_pred_dt_4380);
f1score(r)

#### Train Random Forest

In [None]:
# Random-forest hyper-parameters handed to the get_rf_* helpers.
# NOTE(review): the meaning of each position is fixed by those helpers, which are
# not visible in this notebook — confirm before changing any value.
estimated_params_4380 = [5, 100, 80, 18];
# Train on the training split and evaluate on the validation split
# (helper presumably defined in utils/random-forest.jl — confirm).
get_rf_direct(train_set_4380, val_set_4380, names_ft, estimated_params_4380)

In [None]:
# Random-forest output for the validation split; its second column is averaged
# with the GLM prediction in the ensemble cell.
val_pred_rf_4380 = get_rf_probas(train_set_4380, val_set_4380, names_ft, estimated_params_4380);

#### Train ensemble model

Combine them

In [None]:
# Ensemble: average the GLM prediction with column 2 of the random-forest output
# (presumably the probability of the positive class — confirm against get_rf_probas).
val_pred_4380 = (val_pred_glm_4380 + val_pred_rf_4380[:, 2]) ./ 2;
# Score the averaged prediction with the same 0.5 cut-off as the GLM alone.
evaluate_threshold(val_pred_4380, val_labels_4380, 0.5) 

#### Get full model

In [None]:
# Load the test rows for ouvrage 4380. `DataFrame(CSV.File(path))` is used
# instead of the one-argument `CSV.read(path)`, which current CSV.jl releases
# reject because no sink type is given.
test_set_4380 = DataFrame(CSV.File("data/parsed/test_4380.csv"));
size(test_set_4380)

In [None]:
# Refit the logistic regression on the whole labelled set and predict the test
# rows. The ensemble average below needs `test_pred_glm_4380`, which is not
# defined anywhere else in the notebook, so the fit has to happen in this cell.
test_model_glm_4380 = glm(val_form, data_set_4380, Bernoulli(), LogitLink());
test_pred_glm_4380 = GLM.predict(test_model_glm_4380, test_set_4380);
# Random-forest output for the test rows, trained on the whole labelled set.
test_pred_rf_4380 = get_rf_probas(data_set_4380, test_set_4380, names_ft, estimated_params_4380);
# Average both models, then binarise at 0.5 and store the result as integers.
test_pred_4380 = (test_pred_glm_4380 + test_pred_rf_4380[:, 2]) ./ 2;
test_pred_4380[test_pred_4380 .>= 0.5] .= 1.0;
test_pred_4380[test_pred_4380 .< 0.5] .= 0.0;
test_pred_4380 = convert(Array{Int}, trunc.(test_pred_4380));

#### Get prediction for ouvrage 4380

In [None]:
# `pred_4380` is bound to the same DataFrame object as `test_set_4380` (no copy),
# so the SURVERSE column set below is also visible through `test_set_4380`.
pred_4380 = test_set_4380;
pred_4380[!, :SURVERSE] = test_pred_4380;
# Columns shown in the preview of the first 10 rows.
vis_ft = [:DATE, :SURVERSE, :FS_sum, :FS_max3, :SS_sum, :SS_max3];
first(pred_4380[!, vis_ft], 10)

# Prédiction

In [None]:
# Load the submission template. `DataFrame(CSV.File(path))` is used instead of
# the one-argument `CSV.read(path)`, which current CSV.jl releases reject
# because no sink type is given.
test_set_final = DataFrame(CSV.File("data/test.csv"));
# Placeholder SURVERSE column (all zeros) that the lookup loop fills row by row.
test_set_final[!, :SURVERSE] = zeros(size(test_set_final, 1));
size(test_set_final)

Pour chaque ligne de test_set_final :

    - On vérifie l'ID de l'ouvrage pour savoir quelle prédiction utiliser
    - On va chercher la prédiction à cette date pour cet ouvrage
    - On l'écrit à la ligne courante

In [None]:
# Prediction table of each ouvrage, keyed by the ID found in test_set_final.
pred_by_ouvrage = Dict(
    "3260-01D" => pred_3260,
    "3350-07D" => pred_3350,
    "4240-01D" => pred_4240,
    "4350-01D" => pred_4350,
    "4380-01D" => pred_4380,
);

# Build a DATE => SURVERSE dictionary for one prediction table.
# `get!` keeps the first row seen for a repeated date, i.e. the same row a
# `filter` on the date followed by taking row 1 would select.
function index_surverse_by_date(pred)
    dates = pred[!, :DATE]
    surverses = pred[!, :SURVERSE]
    by_date = Dict{eltype(dates), eltype(surverses)}()
    for (date, surverse) in zip(dates, surverses)
        get!(by_date, date, surverse)
    end
    return by_date
end

# Index every table once so each test row costs a dictionary lookup instead of
# a full pass over the prediction table.
surverse_by_date = Dict(ouvrage => index_surverse_by_date(pred)
                        for (ouvrage, pred) in pred_by_ouvrage);

# Copy the matching prediction into every row of the submission template.
# Unknown ouvrages and dates without a prediction stop the loop with an
# explicit message instead of an opaque MethodError / BoundsError.
for i in 1:size(test_set_final, 1)
    ouvrage = string(test_set_final[i, :NO_OUVRAGE])
    date = test_set_final[i, :DATE]

    by_date = get(surverse_by_date, ouvrage, nothing)
    by_date === nothing &&
        error("row $i: no prediction table for ouvrage $ouvrage")
    haskey(by_date, date) ||
        error("row $i: no prediction for ouvrage $ouvrage on $date")

    test_set_final[i, :SURVERSE] = by_date[date]
end

In [None]:
# Replace the floating-point SURVERSE column by an integer one, then preview
# the first 10 rows.
test_set_final[!, :SURVERSE] = map(Int, test_set_final[!, :SURVERSE]);
first(test_set_final, 10)

In [None]:
# Submission ID of each row: "<NO_OUVRAGE>_<DATE>".
ID = test_set_final[:,:NO_OUVRAGE].*"_".*string.(test_set_final[:,:DATE])
sampleSubmission = DataFrame(ID = ID, Surverse=test_set_final[:, :SURVERSE])
# Create the output directory if needed so the write cannot fail on a fresh
# checkout after all models have already been trained.
mkpath("submissions")
CSV.write("submissions/mc-submission-$(no_soumission).csv",sampleSubmission)