In [25]:
using CSV, DataFrames, GLM, Statistics, Dates, Gadfly, Random, MLBase, DecisionTree;
include("utils/precipitation.jl");
include("utils/random-forest.jl");
include("utils/reg-log.jl");

In [8]:
# Submission number; interpolated into the output CSV file name in the last cell.
no_soumission = 21;

In [9]:
"""
    partitionTrainTest(data, at = 0.8; rng = Random.GLOBAL_RNG)

Randomly split the rows of `data` into two disjoint subsets.

The row indices are shuffled with `rng`; the first `floor(Int, at * n)` shuffled
rows form the first subset and the remaining rows form the second.

# Arguments
- `data`: any two-dimensional container supporting `size(data, 1)` and
  `data[rows, :]` indexing (e.g. a `DataFrame` or a `Matrix`).
- `at`: fraction of rows assigned to the first subset; must lie in `[0, 1]`.

# Keywords
- `rng`: random number generator used for the shuffle; pass a seeded generator
  to obtain a reproducible split.

# Returns
A tuple `(train, test)` of row subsets of `data`.

# Throws
- `ArgumentError` if `at` is outside `[0, 1]`.
"""
function partitionTrainTest(data, at = 0.8; rng = Random.GLOBAL_RNG)
    # Adapted from https://discourse.julialang.org/t/simple-tool-for-train-test-split/473/2
    0 <= at <= 1 || throw(ArgumentError("at must be in [0, 1], got $at"))
    n = size(data, 1)
    idx = shuffle(rng, 1:n)
    ntrain = floor(Int, at * n)
    return data[idx[1:ntrain], :], data[idx[(ntrain + 1):n], :]
end

partitionTrainTest (generic function with 2 methods)

In [52]:
# Formula shared by every logistic regression fitted below: SURVERSE is the
# response, the six FS_*/SS_* columns are the predictors.
val_form = @formula(SURVERSE ~ FS_sum + FS_max + FS_max3 + 
                               SS_sum + SS_max + SS_max3);

In [11]:
# Feature columns given to the tree-based models (the same six predictors as `val_form`).
names_ft = [:FS_sum, :FS_max, :FS_max3,
            :SS_sum, :SS_max, :SS_max3];

In [12]:
# Search grid handed to `find_best_rf`: one row per hyper-parameter with its
# minimum, maximum and step. The Int8 columns cap every value at 127.
# presumably nft = number of features, ntrees = number of trees, podata = portion
# of data (%), maxd = maximum depth — TODO confirm in utils/random-forest.jl
params_rf = DataFrame(param=String[], min=Int8[], max=Int8[], step=Int8[]);

push!(params_rf, ["nft", 2, 6, 1]);
push!(params_rf, ["ntrees", 50, 100, 10]);
push!(params_rf, ["podata", 75, 95, 5]);
push!(params_rf, ["maxd", 15, 27, 2]);

params_rf

Unnamed: 0_level_0,param,min,max,step
Unnamed: 0_level_1,String,Int8,Int8,Int8
1,nft,2,6,1
2,ntrees,50,100,10
3,podata,75,95,5
4,maxd,15,27,2


## Ouvrage 3260

In [49]:
# Load the oversampled data for ouvrage 3260 and hold out 20 % of the rows for validation.
# `DataFrame(CSV.File(path))` is used because `CSV.read(path)` without a sink
# argument is rejected by recent CSV.jl releases.
# NOTE(review): the split is done after oversampling, so duplicated rows may end up
# in both subsets and inflate validation scores — confirm how the file was built.
data_set = DataFrame(CSV.File("data/parsed/oversampled/ouvrage_3260.csv"));
train_set, val_set = partitionTrainTest(data_set);
val_labels = val_set[!, :SURVERSE];
size(data_set)

(5000, 10)

#### Train GLM

In [53]:
# Fit a logistic regression (Bernoulli family, logit link) on the training split
# and predict on the validation split.
val_model = glm(val_form, train_set, Bernoulli(), LogitLink())
val_pred_glm = GLM.predict(val_model, val_set);

# `find_best_threshold` comes from the included utils; presumably it scans decision
# thresholds and returns the best (threshold, F1) pair — confirm there.
best_threshold_glm, f1_score_glm = find_best_threshold(val_pred_glm, val_labels)

(0.2, 0.8871331828442438)

In [54]:
# Score the GLM predictions at a fixed 0.5 cut-off (presumably F1 — confirm in utils).
evaluate_threshold(val_pred_glm, val_labels, 0.5) 

0.8660826032540676

#### Train Decision Tree 

In [16]:
# Build the feature matrix and integer label vector passed to `build_tree`.
# `Matrix{Float64}(df)` replaces `convert(Matrix{Float64}, df)`, which newer
# DataFrames.jl releases no longer provide.
train_features = Matrix{Float64}(train_set[:, names_ft]);
train_labels = convert(Array{Int64}, train_set[!,:SURVERSE]);

# Grow an unpruned decision tree on the training split.
dt_model = build_tree(train_labels, train_features)

Decision Tree
Leaves: 162
Depth:  16

In [18]:
# Prune the tree with DecisionTree.jl's `prune_tree`, using 0.9 as the purity threshold.
dt_model = prune_tree(dt_model, 0.9)

Decision Tree
Leaves: 157
Depth:  16

In [19]:
# Validation features/labels in the layout used for the tree.
# `Matrix{Float64}(df)` replaces `convert(Matrix{Float64}, df)`, which newer
# DataFrames.jl releases no longer provide.
val_features = Matrix{Float64}(val_set[:, names_ft]);
val_labels = convert(Array{Int64}, val_set[!,:SURVERSE]);

# Predict with the pruned tree and derive the F1 score from the ROC counts.
val_pred_dt = apply_tree(dt_model, val_features);
r = roc(val_labels, val_pred_dt);
f1score(r)

0.945679012345679

#### Train Random Forest [TODO: RUN LATER, TOO LONG]

In [None]:
# best_params, f1_score_rf = find_best_rf(train_set, val_set, names_ft, params_rf)

#### QUICK RANDOM FOREST ESTIMATED

In [26]:
# Hand-picked random-forest settings used in place of the slow grid search.
# presumably ordered like the rows of `params_rf` (nft, ntrees, podata, maxd) —
# TODO confirm in utils/random-forest.jl
estimated_params = [3, 40, 80, 20];
# `get_rf_direct` is a project helper; its scalar result is presumably the validation F1 — confirm.
get_rf_direct(train_set, val_set, names_ft, estimated_params)

0.9618696186961869

QUICK REPLACEMENT

In [27]:
# Random-forest output for the validation split; column 2 is what the ensemble
# averages below (presumably the positive-class probability — confirm in utils).
val_pred_rf = get_rf_probas(train_set, val_set, names_ft, estimated_params);

#### Train ensemble model

Get probabilities for RF

In [None]:
# val_pred_rf = get_rf_probas(train_set, val_set, names_ft, best_params);
# val_pred_rf[:, 2];

Combine them

In [28]:
# Ensemble score: unweighted mean of the GLM prediction and column 2 of the RF output.
val_pred = (val_pred_glm + val_pred_rf[:, 2]) ./ 2;
# Threshold selected on the validation split; reused later to binarise the test predictions.
best_threshold, f1_score = find_best_threshold(val_pred, val_labels)

(0.45, 0.9462102689486552)

In [29]:
# Score the ensemble at a fixed 0.5 cut-off for comparison (presumably F1 — confirm in utils).
evaluate_threshold(val_pred, val_labels, 0.5) 

0.9392812887236679

#### Get full model

In [55]:
# Load the rows to predict for ouvrage 3260.
# `DataFrame(CSV.File(path))` is used because `CSV.read(path)` without a sink
# argument is rejected by recent CSV.jl releases.
test_set = DataFrame(CSV.File("data/parsed/test_3260.csv"));
size(test_set)

(45, 9)

In [56]:
# Refit the logistic regression on the full data set (train + validation rows),
# then predict the test rows.
test_model_glm = glm(val_form, data_set, Bernoulli(), LogitLink());
test_pred_glm = GLM.predict(test_model_glm, test_set);

In [58]:
# test_pred_rf = get_rf_probas(data_set, test_set, names_ft, best_params);
# test_pred_rf[:, 2];

In [60]:
# Random-forest output for the test rows with the hand-picked parameters
# (presumably fitted on `data_set` — confirm in utils/random-forest.jl).
test_pred_rf = get_rf_probas(data_set, test_set, names_ft, estimated_params);

In [61]:
# Ensemble score for the test rows: unweighted mean of the GLM prediction and column 2 of the RF output.
test_pred = (test_pred_glm + test_pred_rf[:, 2]) ./ 2;

In [63]:
# Binarise the averaged scores in a single pass: 1 when the score reaches the
# validation-selected `best_threshold`, 0 otherwise. A single comparison avoids
# re-reading values that an earlier in-place pass has already overwritten.
# NaN scores are rejected explicitly instead of being silently mapped to class 0.
any(isnan, test_pred) && error("test_pred contains NaN; cannot apply threshold");
test_pred = Int.(test_pred .>= best_threshold);

#### Get prediction for ouvrage 3260

In [None]:
# Attach the binary predictions to a copy of the test rows, so `test_set` itself
# is left untouched (a plain assignment would only alias the same DataFrame).
pred_3260 = copy(test_set);
pred_3260[!, :SURVERSE] = test_pred;
# Columns shown in the preview.
vis_ft = [:DATE, :SURVERSE, :FS_sum, :FS_max3, :SS_sum, :SS_max3];
first(pred_3260[!, vis_ft], 10)

## Ouvrage 3350

In [83]:
# Load the oversampled data for ouvrage 3350 and hold out 20 % of the rows for validation.
# `DataFrame(CSV.File(path))` is used because `CSV.read(path)` without a sink
# argument is rejected by recent CSV.jl releases.
# NOTE(review): the split is done after oversampling, so duplicated rows may end up
# in both subsets and inflate validation scores — confirm how the file was built.
data_set = DataFrame(CSV.File("data/parsed/oversampled/ouvrage_3350.csv"));
train_set, val_set = partitionTrainTest(data_set);
val_labels = val_set[!, :SURVERSE];
size(data_set)

(5000, 10)

#### Train GLM

In [84]:
# Fit a logistic regression (Bernoulli family, logit link) on the training split
# and predict on the validation split.
val_model = glm(val_form, train_set, Bernoulli(), LogitLink())
val_pred_glm = GLM.predict(val_model, val_set);

# `find_best_threshold` comes from the included utils; presumably it scans decision
# thresholds and returns the best (threshold, F1) pair — confirm there.
best_threshold_glm, f1_score_glm = find_best_threshold(val_pred_glm, val_labels)

(0.2, 0.8723404255319149)

In [85]:
# Score the GLM predictions at a fixed 0.5 cut-off (presumably F1 — confirm in utils).
evaluate_threshold(val_pred_glm, val_labels, 0.5) 

0.8622327790973872

#### Train Decision Tree 

In [86]:
# Build the feature matrix and integer label vector passed to `build_tree`.
# `Matrix{Float64}(df)` replaces `convert(Matrix{Float64}, df)`, which newer
# DataFrames.jl releases no longer provide.
train_features = Matrix{Float64}(train_set[:, names_ft]);
train_labels = convert(Array{Int64}, train_set[!,:SURVERSE]);

# Grow an unpruned decision tree on the training split.
dt_model = build_tree(train_labels, train_features)

Decision Tree
Leaves: 140
Depth:  15

In [87]:
# Prune the tree with DecisionTree.jl's `prune_tree`, using 0.9 as the purity threshold.
dt_model = prune_tree(dt_model, 0.9)

Decision Tree
Leaves: 126
Depth:  15

In [88]:
# Validation features/labels in the layout used for the tree.
# `Matrix{Float64}(df)` replaces `convert(Matrix{Float64}, df)`, which newer
# DataFrames.jl releases no longer provide.
val_features = Matrix{Float64}(val_set[:, names_ft]);
val_labels = convert(Array{Int64}, val_set[!,:SURVERSE]);

# Predict with the pruned tree and derive the F1 score from the ROC counts.
val_pred_dt = apply_tree(dt_model, val_features);
r = roc(val_labels, val_pred_dt);
f1score(r)

0.9316909294512878

#### QUICK RANDOM FOREST ESTIMATED

In [89]:
# Hand-picked random-forest settings used in place of the slow grid search.
# presumably ordered like the rows of `params_rf` (nft, ntrees, podata, maxd) —
# TODO confirm in utils/random-forest.jl
estimated_params = [3, 40, 80, 20];
# `get_rf_direct` is a project helper; its scalar result is presumably the validation F1 — confirm.
get_rf_direct(train_set, val_set, names_ft, estimated_params)

0.9542920847268673

#### Train Random Forest

In [None]:
# NOTE(review): this grid search is left active here but is commented out for the
# other ouvrages, and `best_params` is only referenced from commented-out cells —
# confirm whether it is meant to run.
best_params, f1_score_rf = find_best_rf(train_set, val_set, names_ft, params_rf)

QUICK REPLACEMENT

In [90]:
# Random-forest output for the validation split; column 2 is what the ensemble
# averages below (presumably the positive-class probability — confirm in utils).
val_pred_rf = get_rf_probas(train_set, val_set, names_ft, estimated_params);

#### Train ensemble model

Get probabilities for RF

In [91]:
# val_pred_rf = get_rf_probas(train_set, val_set, names_ft, best_params);
# val_pred_rf[:, 2];

Combine them

In [92]:
# Ensemble score: unweighted mean of the GLM prediction and column 2 of the RF output.
val_pred = (val_pred_glm + val_pred_rf[:, 2]) ./ 2;
# Threshold selected on the validation split; reused later to binarise the test predictions.
best_threshold, f1_score = find_best_threshold(val_pred, val_labels)

(0.45, 0.9377777777777778)

In [93]:
# Score the ensemble at a fixed 0.5 cut-off for comparison (presumably F1 — confirm in utils).
evaluate_threshold(val_pred, val_labels, 0.5) 

0.9358830146231721

#### Get full model

In [94]:
# Load the rows to predict for ouvrage 3350.
# `DataFrame(CSV.File(path))` is used because `CSV.read(path)` without a sink
# argument is rejected by recent CSV.jl releases.
test_set = DataFrame(CSV.File("data/parsed/test_3350.csv"));
size(test_set)

(70, 9)

In [95]:
# Refit the logistic regression on the full data set (train + validation rows),
# then predict the test rows.
test_model_glm = glm(val_form, data_set, Bernoulli(), LogitLink());
test_pred_glm = GLM.predict(test_model_glm, test_set);

In [None]:
# test_pred_rf = get_rf_probas(data_set, test_set, names_ft, best_params);
# test_pred_rf[:, 2];

In [96]:
# Random-forest output for the test rows with the hand-picked parameters
# (presumably fitted on `data_set` — confirm in utils/random-forest.jl).
test_pred_rf = get_rf_probas(data_set, test_set, names_ft, estimated_params);

In [97]:
# Ensemble score for the test rows: unweighted mean of the GLM prediction and column 2 of the RF output.
test_pred = (test_pred_glm + test_pred_rf[:, 2]) ./ 2;

In [98]:
# Binarise the averaged scores in a single pass: 1 when the score reaches the
# validation-selected `best_threshold`, 0 otherwise. A single comparison avoids
# re-reading values that an earlier in-place pass has already overwritten.
# NaN scores are rejected explicitly instead of being silently mapped to class 0.
any(isnan, test_pred) && error("test_pred contains NaN; cannot apply threshold");
test_pred = Int.(test_pred .>= best_threshold);

#### Get prediction for ouvrage 3350

In [100]:
# Attach the binary predictions to a copy of the test rows, so `test_set` itself
# is left untouched (a plain assignment would only alias the same DataFrame).
pred_3350 = copy(test_set);
pred_3350[!, :SURVERSE] = test_pred;
# Columns shown in the preview.
vis_ft = [:DATE, :SURVERSE, :FS_sum, :FS_max3, :SS_sum, :SS_max3];
first(pred_3350[!, vis_ft], 10);

## Ouvrage 4240

In [117]:
# Load the oversampled data for ouvrage 4240 and hold out 20 % of the rows for validation.
# `DataFrame(CSV.File(path))` is used because `CSV.read(path)` without a sink
# argument is rejected by recent CSV.jl releases.
# NOTE(review): the split is done after oversampling, so duplicated rows may end up
# in both subsets and inflate validation scores — confirm how the file was built.
data_set = DataFrame(CSV.File("data/parsed/oversampled/ouvrage_4240.csv"));
train_set, val_set = partitionTrainTest(data_set);
val_labels = val_set[!, :SURVERSE];
size(data_set)

(5000, 10)

#### Train GLM

In [118]:
# Fit a logistic regression (Bernoulli family, logit link) on the training split
# and predict on the validation split.
val_model = glm(val_form, train_set, Bernoulli(), LogitLink())
val_pred_glm = GLM.predict(val_model, val_set);

# `find_best_threshold` comes from the included utils; presumably it scans decision
# thresholds and returns the best (threshold, F1) pair — confirm there.
best_threshold_glm, f1_score_glm = find_best_threshold(val_pred_glm, val_labels)

(0.15, 0.8366890380313199)

In [119]:
# Score the GLM predictions at a fixed 0.5 cut-off (presumably F1 — confirm in utils).
evaluate_threshold(val_pred_glm, val_labels, 0.5) 

0.803763440860215

#### Train Decision Tree 

In [120]:
# Build the feature matrix and integer label vector passed to `build_tree`.
# `Matrix{Float64}(df)` replaces `convert(Matrix{Float64}, df)`, which newer
# DataFrames.jl releases no longer provide.
train_features = Matrix{Float64}(train_set[:, names_ft]);
train_labels = convert(Array{Int64}, train_set[!,:SURVERSE]);

# Grow an unpruned decision tree on the training split.
dt_model = build_tree(train_labels, train_features)

Decision Tree
Leaves: 147
Depth:  21

In [121]:
# Prune the tree with DecisionTree.jl's `prune_tree`, using 0.9 as the purity threshold.
dt_model = prune_tree(dt_model, 0.9)

Decision Tree
Leaves: 140
Depth:  21

In [122]:
# Validation features/labels in the layout used for the tree.
# `Matrix{Float64}(df)` replaces `convert(Matrix{Float64}, df)`, which newer
# DataFrames.jl releases no longer provide.
val_features = Matrix{Float64}(val_set[:, names_ft]);
val_labels = convert(Array{Int64}, val_set[!,:SURVERSE]);

# Predict with the pruned tree and derive the F1 score from the ROC counts.
val_pred_dt = apply_tree(dt_model, val_features);
r = roc(val_labels, val_pred_dt);
f1score(r)

0.9535759096612296

#### QUICK RANDOM FOREST ESTIMATED

In [123]:
# Hand-picked random-forest settings used in place of the slow grid search.
# presumably ordered like the rows of `params_rf` (nft, ntrees, podata, maxd) —
# TODO confirm in utils/random-forest.jl
estimated_params = [3, 40, 80, 20];
# `get_rf_direct` is a project helper; its scalar result is presumably the validation F1 — confirm.
get_rf_direct(train_set, val_set, names_ft, estimated_params)

0.9612983770287141

In [125]:
# Random-forest output for the validation split; column 2 is what the ensemble
# averages below (presumably the positive-class probability — confirm in utils).
val_pred_rf = get_rf_probas(train_set, val_set, names_ft, estimated_params);

#### Train Random Forest

In [None]:
# best_params, f1_score_rf = find_best_rf(train_set, val_set, names_ft, params_rf)

#### Train ensemble model

Get probabilities for RF

In [124]:
# val_pred_rf = get_rf_probas(train_set, val_set, names_ft, best_params);
# val_pred_rf[:, 2];

Combine them

In [126]:
# Ensemble score: unweighted mean of the GLM prediction and column 2 of the RF output.
val_pred = (val_pred_glm + val_pred_rf[:, 2]) ./ 2;
# Threshold selected on the validation split; reused later to binarise the test predictions.
best_threshold, f1_score = find_best_threshold(val_pred, val_labels)

(0.45, 0.9387254901960784)

In [127]:
# Score the ensemble at a fixed 0.5 cut-off for comparison (presumably F1 — confirm in utils).
evaluate_threshold(val_pred, val_labels, 0.5) 

0.9368029739776952

#### Get full model

In [128]:
# Load the rows to predict for ouvrage 4240.
# `DataFrame(CSV.File(path))` is used because `CSV.read(path)` without a sink
# argument is rejected by recent CSV.jl releases.
test_set = DataFrame(CSV.File("data/parsed/test_4240.csv"));
size(test_set)

(49, 9)

In [129]:
# Refit the logistic regression on the full data set (train + validation rows),
# then predict the test rows.
test_model_glm = glm(val_form, data_set, Bernoulli(), LogitLink());
test_pred_glm = GLM.predict(test_model_glm, test_set);

In [None]:
# test_pred_rf = get_rf_probas(data_set, test_set, names_ft, best_params);
# test_pred_rf[:, 2];

In [130]:
# Random-forest output for the test rows with the hand-picked parameters
# (presumably fitted on `data_set` — confirm in utils/random-forest.jl).
test_pred_rf = get_rf_probas(data_set, test_set, names_ft, estimated_params);

In [131]:
# Ensemble score for the test rows: unweighted mean of the GLM prediction and column 2 of the RF output.
test_pred = (test_pred_glm + test_pred_rf[:, 2]) ./ 2;

In [132]:
# Binarise the averaged scores in a single pass: 1 when the score reaches the
# validation-selected `best_threshold`, 0 otherwise. A single comparison avoids
# re-reading values that an earlier in-place pass has already overwritten.
# NaN scores are rejected explicitly instead of being silently mapped to class 0.
any(isnan, test_pred) && error("test_pred contains NaN; cannot apply threshold");
test_pred = Int.(test_pred .>= best_threshold);

#### Get prediction for ouvrage 4240

In [135]:
# Attach the binary predictions to a copy of the test rows, so `test_set` itself
# is left untouched (a plain assignment would only alias the same DataFrame).
pred_4240 = copy(test_set);
pred_4240[!, :SURVERSE] = test_pred;
# Columns shown in the preview.
vis_ft = [:DATE, :SURVERSE, :FS_sum, :FS_max3, :SS_sum, :SS_max3];
first(pred_4240[!, vis_ft], 10);

## Ouvrage 4350

In [144]:
# Load the oversampled data for ouvrage 4350 and hold out 20 % of the rows for validation.
# `DataFrame(CSV.File(path))` is used because `CSV.read(path)` without a sink
# argument is rejected by recent CSV.jl releases.
# NOTE(review): the split is done after oversampling, so duplicated rows may end up
# in both subsets and inflate validation scores — confirm how the file was built.
data_set = DataFrame(CSV.File("data/parsed/oversampled/ouvrage_4350.csv"));
train_set, val_set = partitionTrainTest(data_set);
val_labels = val_set[!, :SURVERSE];
size(data_set)

(5000, 10)

#### Train GLM

In [145]:
# Fit a logistic regression (Bernoulli family, logit link) on the training split
# and predict on the validation split.
val_model = glm(val_form, train_set, Bernoulli(), LogitLink())
val_pred_glm = GLM.predict(val_model, val_set);

# `find_best_threshold` comes from the included utils; presumably it scans decision
# thresholds and returns the best (threshold, F1) pair — confirm there.
best_threshold_glm, f1_score_glm = find_best_threshold(val_pred_glm, val_labels)

(0.35, 0.9368295589988082)

In [146]:
# Score the GLM predictions at a fixed 0.5 cut-off (presumably F1 — confirm in utils).
evaluate_threshold(val_pred_glm, val_labels, 0.5) 

0.9320388349514563

#### Train Decision Tree 

In [147]:
# Build the feature matrix and integer label vector passed to `build_tree`.
# `Matrix{Float64}(df)` replaces `convert(Matrix{Float64}, df)`, which newer
# DataFrames.jl releases no longer provide.
train_features = Matrix{Float64}(train_set[:, names_ft]);
train_labels = convert(Array{Int64}, train_set[!,:SURVERSE]);

# Grow an unpruned decision tree on the training split.
dt_model = build_tree(train_labels, train_features)

Decision Tree
Leaves: 74
Depth:  14

In [148]:
# Prune the tree with DecisionTree.jl's `prune_tree`, using 0.9 as the purity threshold.
dt_model = prune_tree(dt_model, 0.9)

Decision Tree
Leaves: 69
Depth:  14

In [149]:
# Validation features/labels in the layout used for the tree.
# `Matrix{Float64}(df)` replaces `convert(Matrix{Float64}, df)`, which newer
# DataFrames.jl releases no longer provide.
val_features = Matrix{Float64}(val_set[:, names_ft]);
val_labels = convert(Array{Int64}, val_set[!,:SURVERSE]);

# Predict with the pruned tree and derive the F1 score from the ROC counts.
val_pred_dt = apply_tree(dt_model, val_features);
r = roc(val_labels, val_pred_dt);
f1score(r)

0.9671132764920828

#### QUICK RANDOM FOREST ESTIMATED

In [150]:
# Hand-picked random-forest settings used in place of the slow grid search.
# presumably ordered like the rows of `params_rf` (nft, ntrees, podata, maxd) —
# TODO confirm in utils/random-forest.jl
estimated_params = [3, 40, 80, 20];
# `get_rf_direct` is a project helper; its scalar result is presumably the validation F1 — confirm.
get_rf_direct(train_set, val_set, names_ft, estimated_params)

0.9796407185628743

In [151]:
# Random-forest output for the validation split; column 2 is what the ensemble
# averages below (presumably the positive-class probability — confirm in utils).
val_pred_rf = get_rf_probas(train_set, val_set, names_ft, estimated_params);

#### Train Random Forest

In [139]:
# best_params, f1_score_rf = find_best_rf(train_set, val_set, names_ft, params_rf)

#### Train ensemble model

Get probabilities for RF

In [152]:
# val_pred_rf = get_rf_probas(train_set, val_set, names_ft, best_params);
# val_pred_rf[:, 2];

Combine them

In [153]:
# Ensemble score: unweighted mean of the GLM prediction and column 2 of the RF output.
val_pred = (val_pred_glm + val_pred_rf[:, 2]) ./ 2;
# Threshold selected on the validation split; reused later to binarise the test predictions.
best_threshold, f1_score = find_best_threshold(val_pred, val_labels)

(0.45, 0.9691943127962085)

In [154]:
# Score the ensemble at a fixed 0.5 cut-off for comparison (presumably F1 — confirm in utils).
evaluate_threshold(val_pred, val_labels, 0.5) 

0.9679715302491103

#### Get full model

In [155]:
# Load the rows to predict for ouvrage 4350.
# `DataFrame(CSV.File(path))` is used because `CSV.read(path)` without a sink
# argument is rejected by recent CSV.jl releases.
test_set = DataFrame(CSV.File("data/parsed/test_4350.csv"));
size(test_set)

(65, 9)

In [156]:
# Refit the logistic regression on the full data set (train + validation rows),
# then predict the test rows.
test_model_glm = glm(val_form, data_set, Bernoulli(), LogitLink());
test_pred_glm = GLM.predict(test_model_glm, test_set);

In [157]:
# test_pred_rf = get_rf_probas(data_set, test_set, names_ft, best_params);
# test_pred_rf[:, 2];

In [158]:
# Random-forest output for the test rows with the hand-picked parameters
# (presumably fitted on `data_set` — confirm in utils/random-forest.jl).
test_pred_rf = get_rf_probas(data_set, test_set, names_ft, estimated_params);

In [159]:
# Ensemble score for the test rows: unweighted mean of the GLM prediction and column 2 of the RF output.
test_pred = (test_pred_glm + test_pred_rf[:, 2]) ./ 2;

In [160]:
# Binarise the averaged scores in a single pass: 1 when the score reaches the
# validation-selected `best_threshold`, 0 otherwise. A single comparison avoids
# re-reading values that an earlier in-place pass has already overwritten.
# NaN scores are rejected explicitly instead of being silently mapped to class 0.
any(isnan, test_pred) && error("test_pred contains NaN; cannot apply threshold");
test_pred = Int.(test_pred .>= best_threshold);

#### Get prediction for ouvrage 4350

In [161]:
# Attach the binary predictions to a copy of the test rows, so `test_set` itself
# is left untouched (a plain assignment would only alias the same DataFrame).
pred_4350 = copy(test_set);
pred_4350[!, :SURVERSE] = test_pred;
# Columns shown in the preview.
vis_ft = [:DATE, :SURVERSE, :FS_sum, :FS_max3, :SS_sum, :SS_max3];
first(pred_4350[!, vis_ft], 10);

## Ouvrage 4380

In [169]:
# Load the oversampled data for ouvrage 4380 and hold out 20 % of the rows for validation.
# `DataFrame(CSV.File(path))` is used because `CSV.read(path)` without a sink
# argument is rejected by recent CSV.jl releases.
# NOTE(review): the split is done after oversampling, so duplicated rows may end up
# in both subsets and inflate validation scores — confirm how the file was built.
data_set = DataFrame(CSV.File("data/parsed/oversampled/ouvrage_4380.csv"));
train_set, val_set = partitionTrainTest(data_set);
val_labels = val_set[!, :SURVERSE];
size(data_set)

(5000, 10)

#### Train GLM

In [170]:
# Fit a logistic regression (Bernoulli family, logit link) on the training split
# and predict on the validation split.
val_model = glm(val_form, train_set, Bernoulli(), LogitLink())
val_pred_glm = GLM.predict(val_model, val_set);

# `find_best_threshold` comes from the included utils; presumably it scans decision
# thresholds and returns the best (threshold, F1) pair — confirm there.
best_threshold_glm, f1_score_glm = find_best_threshold(val_pred_glm, val_labels)

(0.35, 0.9133489461358314)

In [171]:
# Score the GLM predictions at a fixed 0.5 cut-off (presumably F1 — confirm in utils).
evaluate_threshold(val_pred_glm, val_labels, 0.5) 

0.8943488943488943

#### Train Decision Tree 

In [172]:
# Build the feature matrix and integer label vector passed to `build_tree`.
# `Matrix{Float64}(df)` replaces `convert(Matrix{Float64}, df)`, which newer
# DataFrames.jl releases no longer provide.
train_features = Matrix{Float64}(train_set[:, names_ft]);
train_labels = convert(Array{Int64}, train_set[!,:SURVERSE]);

# Grow an unpruned decision tree on the training split.
dt_model = build_tree(train_labels, train_features)

Decision Tree
Leaves: 135
Depth:  17

In [173]:
# Prune the tree with DecisionTree.jl's `prune_tree`, using 0.9 as the purity threshold.
dt_model = prune_tree(dt_model, 0.9)

Decision Tree
Leaves: 128
Depth:  17

In [174]:
# Validation features/labels in the layout used for the tree.
# `Matrix{Float64}(df)` replaces `convert(Matrix{Float64}, df)`, which newer
# DataFrames.jl releases no longer provide.
val_features = Matrix{Float64}(val_set[:, names_ft]);
val_labels = convert(Array{Int64}, val_set[!,:SURVERSE]);

# Predict with the pruned tree and derive the F1 score from the ROC counts.
val_pred_dt = apply_tree(dt_model, val_features);
r = roc(val_labels, val_pred_dt);
f1score(r)

0.9566265060240964

#### QUICK RANDOM FOREST ESTIMATED

In [175]:
# Hand-picked random-forest settings used in place of the slow grid search.
# presumably ordered like the rows of `params_rf` (nft, ntrees, podata, maxd) —
# TODO confirm in utils/random-forest.jl
estimated_params = [3, 40, 80, 20];
# `get_rf_direct` is a project helper; its scalar result is presumably the validation F1 — confirm.
get_rf_direct(train_set, val_set, names_ft, estimated_params)

0.9702734839476813

In [176]:
# Random-forest output for the validation split; column 2 is what the ensemble
# averages below (presumably the positive-class probability — confirm in utils).
val_pred_rf = get_rf_probas(train_set, val_set, names_ft, estimated_params);

#### Train Random Forest

In [177]:
# best_params, f1_score_rf = find_best_rf(train_set, val_set, names_ft, params_rf)

#### Train ensemble model

Get probabilities for RF

In [178]:
# val_pred_rf = get_rf_probas(train_set, val_set, names_ft, best_params);
# val_pred_rf[:, 2];

Combine them

In [179]:
# Ensemble score: unweighted mean of the GLM prediction and column 2 of the RF output.
val_pred = (val_pred_glm + val_pred_rf[:, 2]) ./ 2;
# Threshold selected on the validation split; reused later to binarise the test predictions.
best_threshold, f1_score = find_best_threshold(val_pred, val_labels)

(0.45, 0.9499417927823051)

In [180]:
# Score the ensemble at a fixed 0.5 cut-off for comparison (presumably F1 — confirm in utils).
evaluate_threshold(val_pred, val_labels, 0.5) 

0.9493521790341578

#### Get full model

In [181]:
# Load the rows to predict for ouvrage 4380.
# `DataFrame(CSV.File(path))` is used because `CSV.read(path)` without a sink
# argument is rejected by recent CSV.jl releases.
test_set = DataFrame(CSV.File("data/parsed/test_4380.csv"));
size(test_set)

(54, 9)

In [182]:
# Refit the logistic regression on the full data set (train + validation rows),
# then predict the test rows.
test_model_glm = glm(val_form, data_set, Bernoulli(), LogitLink());
test_pred_glm = GLM.predict(test_model_glm, test_set);

In [183]:
# test_pred_rf = get_rf_probas(data_set, test_set, names_ft, best_params);
# test_pred_rf[:, 2];

In [184]:
# Random-forest output for the test rows with the hand-picked parameters
# (presumably fitted on `data_set` — confirm in utils/random-forest.jl).
test_pred_rf = get_rf_probas(data_set, test_set, names_ft, estimated_params);

In [185]:
# Ensemble score for the test rows: unweighted mean of the GLM prediction and column 2 of the RF output.
test_pred = (test_pred_glm + test_pred_rf[:, 2]) ./ 2;

In [186]:
# Binarise the averaged scores in a single pass: 1 when the score reaches the
# validation-selected `best_threshold`, 0 otherwise. A single comparison avoids
# re-reading values that an earlier in-place pass has already overwritten.
# NaN scores are rejected explicitly instead of being silently mapped to class 0.
any(isnan, test_pred) && error("test_pred contains NaN; cannot apply threshold");
test_pred = Int.(test_pred .>= best_threshold);

#### Get prediction for ouvrage 4380

In [187]:
# Attach the binary predictions to a copy of the test rows, so `test_set` itself
# is left untouched (a plain assignment would only alias the same DataFrame).
pred_4380 = copy(test_set);
pred_4380[!, :SURVERSE] = test_pred;
# Columns shown in the preview.
vis_ft = [:DATE, :SURVERSE, :FS_sum, :FS_max3, :SS_sum, :SS_max3];
first(pred_4380[!, vis_ft], 10);

# Prédiction

In [None]:
# Load the global test file and add a SURVERSE column of zeros, filled in by the loop below.
# `DataFrame(CSV.File(path))` is used because `CSV.read(path)` without a sink
# argument is rejected by recent CSV.jl releases.
test_set = DataFrame(CSV.File("data/test.csv"));
test_set[!, :SURVERSE] = zeros(size(test_set, 1));
size(test_set)

Pour chaque ligne de test_set :

    - On vérifie l'ID de l'ouvrage pour savoir quelle prédiction utiliser
    - On va chercher la prédiction à la date correspondante pour cet ouvrage
    - On l'écrit à la ligne courante

In [None]:
# Copy the per-ouvrage predictions into the global test set, one row at a time.
# Lookup table from ouvrage identifier to the prediction table built in its section.
preds_by_ouvrage = Dict(
    "3260-01D" => pred_3260,
    "3350-07D" => pred_3350,
    "4240-01D" => pred_4240,
    "4350-01D" => pred_4350,
    "4380-01D" => pred_4380,
);

for i in 1:size(test_set, 1)
    # Column 1 holds the ouvrage identifier (presumably :NO_OUVRAGE — confirm against data/test.csv).
    curr_ouvrage = string(test_set[i, 1]);
    # Fail with an explicit message instead of an obscure error on `nothing`.
    haskey(preds_by_ouvrage, curr_ouvrage) ||
        error("No prediction table for ouvrage $(curr_ouvrage) (test_set row $i)");
    pred_to_use = preds_by_ouvrage[curr_ouvrage];

    curr_date = test_set[i, :DATE];
    # Use the first prediction row matching the date; no filtered DataFrame is allocated.
    j = findfirst(d -> d == curr_date, pred_to_use[!, :DATE]);
    j === nothing &&
        error("No prediction for ouvrage $(curr_ouvrage) on $(curr_date) (test_set row $i)");

    test_set[i, :SURVERSE] = pred_to_use[j, :SURVERSE];
end

In [None]:
# The SURVERSE column was created with `zeros` (Float64); convert it to integers.
test_set[!, :SURVERSE] = convert(Array{Int}, test_set[!, :SURVERSE]);
first(test_set, 10)

In [None]:
# Build the submission table: ID is "<NO_OUVRAGE>_<DATE>", Surverse is the predicted label.
ID = test_set[:,:NO_OUVRAGE].*"_".*string.(test_set[:,:DATE])
sampleSubmission = DataFrame(ID = ID, Surverse=test_set[:, :SURVERSE])
# The file name carries the submission number `no_soumission`.
CSV.write("submissions/mc-submission-$(no_soumission).csv",sampleSubmission)