In [1]:
using CSV, DataFrames, GLM, Statistics, Dates, Gadfly, Random, MLBase, DecisionTree;
include("utils/precipitation.jl");

In [2]:
"""
    partitionTrainTest(data, at = 0.8; rng = Random.GLOBAL_RNG)

Randomly split the rows of `data` into two parts.

The row indices are shuffled; the first `floor(at * n)` shuffled rows form the first
(training) part and the remaining rows form the second part.

# Arguments
- `data`: a two-dimensional, row-indexable container (e.g. a `DataFrame` or a matrix).
- `at`: fraction of the rows assigned to the first part; must lie in `[0, 1]`.

# Keywords
- `rng`: random number generator driving the shuffle. Pass a seeded generator
  (e.g. `MersenneTwister(42)`) to obtain a reproducible split.

# Returns
A tuple `(train, test)` holding the two row subsets of `data`.

# Throws
- `ArgumentError` if `at` is outside `[0, 1]`.

Adapted from https://discourse.julialang.org/t/simple-tool-for-train-test-split/473/2
"""
function partitionTrainTest(data, at = 0.8; rng::Random.AbstractRNG = Random.GLOBAL_RNG)
    # Reject invalid fractions up front instead of failing later with a BoundsError.
    0 <= at <= 1 || throw(ArgumentError("`at` must be within [0, 1], got $at"))
    # `size(data, 1)` works for DataFrames and plain matrices alike.
    n = size(data, 1)
    idx = shuffle(rng, 1:n)
    ntrain = floor(Int, at * n)
    train_idx = view(idx, 1:ntrain)
    test_idx = view(idx, (ntrain + 1):n)
    return data[train_idx, :], data[test_idx, :]
end

partitionTrainTest (generic function with 2 methods)

In [3]:
# Model formula reused by every `glm` call below: SURVERSE is the response and the
# predictors are the `_sum`, `_max` and `_max3` columns of five stations
# (McTavish, Bellevue, Assomption, Trudeau, StHubert).
val_form = @formula(SURVERSE ~ McTavish_sum + McTavish_max + McTavish_max3 + 
                               Bellevue_sum + Bellevue_max + Bellevue_max3 + 
                               Assomption_sum + Assomption_max + Assomption_max3 + 
                               Trudeau_sum + Trudeau_max + Trudeau_max3 + 
                               StHubert_sum + StHubert_max + StHubert_max3);

## Ouvrage 3260

In [4]:
# Load the parsed records for structure 3260. `DataFrame(CSV.File(path))` works with
# every CSV.jl release, whereas the sink-less `CSV.read(path)` is rejected by current ones.
data_3260 = DataFrame(CSV.File("data/parsed/ouvrage_3260.csv"));
# Random split with the default fraction (`at = 0.8`) into training and validation rows.
train_set, val_set = partitionTrainTest(data_3260);

#### GLM Train / Val

In [5]:
# Fit the GLM (Bernoulli family, logit link) on the training split only.
val_model = glm(val_form, train_set, Bernoulli(), LogitLink())
val_labels = val_set[!, :SURVERSE];

# A single broadcast comparison turns the predicted values into 0/1 `Int` labels:
# 1 when the prediction reaches the threshold, 0 otherwise.
threshold = 0.5;
val_pred = Int.(GLM.predict(val_model, val_set) .>= threshold);

# F1 score of the thresholded predictions against the validation labels.
r = roc(val_labels, val_pred);
f1score(r)

0.5454545454545454

#### GLM Submission

In [6]:
# Submission model: same formula, fitted on every row of `data_3260` rather than on
# the training split only.
test_model = let family = Bernoulli(), link = LogitLink()
    glm(val_form, data_3260, family, link)
end;

In [7]:
# Load the rows to predict for structure 3260. `DataFrame(CSV.File(path))` works with
# every CSV.jl release, whereas the sink-less `CSV.read(path)` is rejected by current ones.
test_3260 = DataFrame(CSV.File("data/parsed/test_3260.csv"));
size(test_3260)

(45, 16)

In [8]:
# Predict with the submission model and binarise in one broadcast comparison:
# 1 when the predicted value reaches the threshold, 0 otherwise (as `Int`).
threshold = 0.5;
test_pred = Int.(GLM.predict(test_model, test_3260) .>= threshold);

In [9]:
# Work on a copy: plain assignment would only alias `test_3260`, so attaching the
# SURVERSE column would also modify the loaded test table.
pred_3260 = copy(test_3260);
pred_3260[!, :SURVERSE] = test_pred;

In [10]:
# Preview the first five rows: date, predicted SURVERSE label and the McTavish_sum column.
let cols = [:DATE, :SURVERSE, :McTavish_sum]
    first(pred_3260[!, cols], 5)
end

Unnamed: 0_level_0,DATE,SURVERSE,McTavish_sum
Unnamed: 0_level_1,Date,Int64,Float64
1,2019-05-02,0,26.0
2,2019-05-09,0,89.0
3,2019-05-10,1,385.0
4,2019-05-15,0,2.0
5,2019-05-20,0,46.0


#### Random Forest

## Ouvrage 3350

In [11]:
# Load the parsed records for structure 3350. `DataFrame(CSV.File(path))` works with
# every CSV.jl release, whereas the sink-less `CSV.read(path)` is rejected by current ones.
data_3350 = DataFrame(CSV.File("data/parsed/ouvrage_3350.csv"));
# Random split with the default fraction (`at = 0.8`) into training and validation rows.
train_set, val_set = partitionTrainTest(data_3350);

#### GLM Train / Val

In [12]:
# Fit the GLM (Bernoulli family, logit link) on the training split only.
val_model = glm(val_form, train_set, Bernoulli(), LogitLink())
val_labels = val_set[!, :SURVERSE];

# A single broadcast comparison turns the predicted values into 0/1 `Int` labels:
# 1 when the prediction reaches the threshold, 0 otherwise.
threshold = 0.5;
val_pred = Int.(GLM.predict(val_model, val_set) .>= threshold);

# F1 score of the thresholded predictions against the validation labels.
r = roc(val_labels, val_pred);
f1score(r)

0.7777777777777778

#### GLM Submission

In [13]:
# Submission model: same formula, fitted on every row of `data_3350` rather than on
# the training split only.
test_model = let family = Bernoulli(), link = LogitLink()
    glm(val_form, data_3350, family, link)
end;

In [14]:
# Load the rows to predict for structure 3350. `DataFrame(CSV.File(path))` works with
# every CSV.jl release, whereas the sink-less `CSV.read(path)` is rejected by current ones.
test_3350 = DataFrame(CSV.File("data/parsed/test_3350.csv"));
size(test_3350)

(70, 16)

In [15]:
# Predict with the submission model and binarise in one broadcast comparison:
# 1 when the predicted value reaches the threshold, 0 otherwise (as `Int`).
threshold = 0.5;
test_pred = Int.(GLM.predict(test_model, test_3350) .>= threshold);

In [16]:
# Work on a copy: plain assignment would only alias `test_3350`, so attaching the
# SURVERSE column would also modify the loaded test table.
pred_3350 = copy(test_3350);
pred_3350[!, :SURVERSE] = test_pred;

In [17]:
# Preview the first five rows: date, predicted SURVERSE label and the McTavish_sum column.
let cols = [:DATE, :SURVERSE, :McTavish_sum]
    first(pred_3350[!, cols], 5)
end

Unnamed: 0_level_0,DATE,SURVERSE,McTavish_sum
Unnamed: 0_level_1,Date,Int64,Float64
1,2019-05-01,1,79.0
2,2019-05-02,0,26.0
3,2019-05-08,0,0.0
4,2019-05-09,1,89.0
5,2019-05-10,1,385.0


#### Random Forest

## Ouvrage 4240

In [18]:
# Load the parsed records for structure 4240. `DataFrame(CSV.File(path))` works with
# every CSV.jl release, whereas the sink-less `CSV.read(path)` is rejected by current ones.
data_4240 = DataFrame(CSV.File("data/parsed/ouvrage_4240.csv"));
# Random split with the default fraction (`at = 0.8`) into training and validation rows.
train_set, val_set = partitionTrainTest(data_4240);

#### GLM Train / Val

In [19]:
# Fit the GLM (Bernoulli family, logit link) on the training split only.
val_model = glm(val_form, train_set, Bernoulli(), LogitLink())
val_labels = val_set[!, :SURVERSE];

# A single broadcast comparison turns the predicted values into 0/1 `Int` labels:
# 1 when the prediction reaches the threshold, 0 otherwise.
threshold = 0.5;
val_pred = Int.(GLM.predict(val_model, val_set) .>= threshold);

# F1 score of the thresholded predictions against the validation labels.
r = roc(val_labels, val_pred);
f1score(r)

0.7619047619047619

#### GLM Submission

In [20]:
# Submission model: same formula, fitted on every row of `data_4240` rather than on
# the training split only.
test_model = let family = Bernoulli(), link = LogitLink()
    glm(val_form, data_4240, family, link)
end;

In [21]:
# Load the rows to predict for structure 4240. `DataFrame(CSV.File(path))` works with
# every CSV.jl release, whereas the sink-less `CSV.read(path)` is rejected by current ones.
test_4240 = DataFrame(CSV.File("data/parsed/test_4240.csv"));
size(test_4240)

(49, 16)

In [22]:
# Predict with the submission model and binarise in one broadcast comparison:
# 1 when the predicted value reaches the threshold, 0 otherwise (as `Int`).
threshold = 0.5;
test_pred = Int.(GLM.predict(test_model, test_4240) .>= threshold);

In [23]:
# Work on a copy: plain assignment would only alias `test_4240`, so attaching the
# SURVERSE column would also modify the loaded test table.
pred_4240 = copy(test_4240);
pred_4240[!, :SURVERSE] = test_pred;

In [24]:
# Preview the first five rows: date, predicted SURVERSE label and the McTavish_sum column.
let cols = [:DATE, :SURVERSE, :McTavish_sum]
    first(pred_4240[!, cols], 5)
end

Unnamed: 0_level_0,DATE,SURVERSE,McTavish_sum
Unnamed: 0_level_1,Date,Int64,Float64
1,2019-05-07,0,3.0
2,2019-05-09,0,89.0
3,2019-05-10,1,385.0
4,2019-05-15,0,2.0
5,2019-05-21,0,0.0


#### Random Forest

## Ouvrage 4350

In [25]:
# Load the parsed records for structure 4350. `DataFrame(CSV.File(path))` works with
# every CSV.jl release, whereas the sink-less `CSV.read(path)` is rejected by current ones.
data_4350 = DataFrame(CSV.File("data/parsed/ouvrage_4350.csv"));
# Random split with the default fraction (`at = 0.8`) into training and validation rows.
train_set, val_set = partitionTrainTest(data_4350);

#### GLM Train / Val

In [26]:
# Fit the GLM (Bernoulli family, logit link) on the training split only.
val_model = glm(val_form, train_set, Bernoulli(), LogitLink())
val_labels = val_set[!, :SURVERSE];

# A single broadcast comparison turns the predicted values into 0/1 `Int` labels:
# 1 when the prediction reaches the threshold, 0 otherwise.
threshold = 0.5;
val_pred = Int.(GLM.predict(val_model, val_set) .>= threshold);

# F1 score of the thresholded predictions against the validation labels.
r = roc(val_labels, val_pred);
f1score(r)

0.6666666666666666

#### GLM Submission

In [27]:
# Submission model: same formula, fitted on every row of `data_4350` rather than on
# the training split only.
test_model = let family = Bernoulli(), link = LogitLink()
    glm(val_form, data_4350, family, link)
end;

In [28]:
# Load the rows to predict for structure 4350. `DataFrame(CSV.File(path))` works with
# every CSV.jl release, whereas the sink-less `CSV.read(path)` is rejected by current ones.
test_4350 = DataFrame(CSV.File("data/parsed/test_4350.csv"));
size(test_4350)

(65, 16)

In [29]:
# Predict with the submission model and binarise in one broadcast comparison:
# 1 when the predicted value reaches the threshold, 0 otherwise (as `Int`).
threshold = 0.5;
test_pred = Int.(GLM.predict(test_model, test_4350) .>= threshold);

In [30]:
# Work on a copy: plain assignment would only alias `test_4350`, so attaching the
# SURVERSE column would also modify the loaded test table.
pred_4350 = copy(test_4350);
pred_4350[!, :SURVERSE] = test_pred;

In [31]:
# Preview the first five rows: date, predicted SURVERSE label and the McTavish_sum column.
let cols = [:DATE, :SURVERSE, :McTavish_sum]
    first(pred_4350[!, cols], 5)
end

Unnamed: 0_level_0,DATE,SURVERSE,McTavish_sum
Unnamed: 0_level_1,Date,Int64,Float64
1,2019-05-03,0,34.0
2,2019-05-04,0,2.0
3,2019-05-07,0,3.0
4,2019-05-08,0,0.0
5,2019-05-10,0,385.0


#### Random Forest

## Ouvrage 4380

In [32]:
# Load the parsed records for structure 4380. `DataFrame(CSV.File(path))` works with
# every CSV.jl release, whereas the sink-less `CSV.read(path)` is rejected by current ones.
data_4380 = DataFrame(CSV.File("data/parsed/ouvrage_4380.csv"));
# Random split with the default fraction (`at = 0.8`) into training and validation rows.
train_set, val_set = partitionTrainTest(data_4380);

#### GLM Train / Val

In [33]:
# Fit the GLM (Bernoulli family, logit link) on the training split only.
val_model = glm(val_form, train_set, Bernoulli(), LogitLink())
val_labels = val_set[!, :SURVERSE];

# A single broadcast comparison turns the predicted values into 0/1 `Int` labels:
# 1 when the prediction reaches the threshold, 0 otherwise.
threshold = 0.5;
val_pred = Int.(GLM.predict(val_model, val_set) .>= threshold);

# F1 score of the thresholded predictions against the validation labels.
r = roc(val_labels, val_pred);
f1score(r)

0.7027027027027027

#### GLM Submission

In [34]:
# Submission model: same formula, fitted on every row of `data_4380` rather than on
# the training split only.
test_model = let family = Bernoulli(), link = LogitLink()
    glm(val_form, data_4380, family, link)
end;

In [35]:
# Load the rows to predict for structure 4380. `DataFrame(CSV.File(path))` works with
# every CSV.jl release, whereas the sink-less `CSV.read(path)` is rejected by current ones.
test_4380 = DataFrame(CSV.File("data/parsed/test_4380.csv"));
size(test_4380)

(54, 16)

In [36]:
# Predict with the submission model and binarise in one broadcast comparison:
# 1 when the predicted value reaches the threshold, 0 otherwise (as `Int`).
threshold = 0.5;
test_pred = Int.(GLM.predict(test_model, test_4380) .>= threshold);

In [37]:
# Work on a copy: plain assignment would only alias `test_4380`, so attaching the
# SURVERSE column would also modify the loaded test table.
pred_4380 = copy(test_4380);
pred_4380[!, :SURVERSE] = test_pred;

In [38]:
# Preview the first five rows: date, predicted SURVERSE label and the McTavish_sum column.
let cols = [:DATE, :SURVERSE, :McTavish_sum]
    first(pred_4380[!, cols], 5)
end

Unnamed: 0_level_0,DATE,SURVERSE,McTavish_sum
Unnamed: 0_level_1,Date,Int64,Float64
1,2019-05-02,0,26.0
2,2019-05-04,0,2.0
3,2019-05-05,0,0.0
4,2019-05-06,0,0.0
5,2019-05-08,0,0.0


#### Random Forest

# Prédiction

In [None]:
# Load the global test table. `DataFrame(CSV.File(path))` works with every CSV.jl
# release, whereas the sink-less `CSV.read(path)` is rejected by current ones.
test_set = DataFrame(CSV.File("data/test.csv"));

Pour chaque ligne de test_set :

    - On vérifie l'ID de l'ouvrage pour savoir quelles prédictions charger
    - On va chercher la prédiction à la date correspondante pour cet ouvrage
    - On l'inscrit à la ligne courante