# Data Processing

In [240]:
using Pkg;
Pkg.add("CSV");
Pkg.add("Random");
Pkg.add("DataStructures");
Pkg.add("BenchmarkTools");
Pkg.add("DataFrames");
Pkg.add("Statistics");
Pkg.add("Dates");
Pkg.add("Gadfly");
Pkg.add("MLBase");
Pkg.add("DecisionTree");
Pkg.add("IterTools");

using CSV, DataFrames,Distributions,DataStructures,BenchmarkTools,DecisionTree, Statistics, Dates, Gadfly, Random, MLBase, IterTools;
include("utils/precipitation.jl");

[32m[1m Resolving[22m[39m package versions...
[32m[1m  Updating[22m[39m `~/.julia/environments/v1.2/Project.toml`
[90m [no changes][39m
[32m[1m  Updating[22m[39m `~/.julia/environments/v1.2/Manifest.toml`
[90m [no changes][39m
[32m[1m Resolving[22m[39m package versions...
[32m[1m  Updating[22m[39m `~/.julia/environments/v1.2/Project.toml`
[90m [no changes][39m
[32m[1m  Updating[22m[39m `~/.julia/environments/v1.2/Manifest.toml`
[90m [no changes][39m
[32m[1m Resolving[22m[39m package versions...
[32m[1m  Updating[22m[39m `~/.julia/environments/v1.2/Project.toml`
[90m [no changes][39m
[32m[1m  Updating[22m[39m `~/.julia/environments/v1.2/Manifest.toml`
[90m [no changes][39m
[32m[1m Resolving[22m[39m package versions...
[32m[1m  Updating[22m[39m `~/.julia/environments/v1.2/Project.toml`
[90m [no changes][39m
[32m[1m  Updating[22m[39m `~/.julia/environments/v1.2/Manifest.toml`
[90m [no changes][39m
[32m[1m Resolving[22m[39

## Build features

### Get and filter the features

#### Latitude, Longitude, Height

In [174]:
# Load the overflow-structure table and give its 15 columns explicit names.
features = CSV.read("data/ouvrages-surverses.csv");
colnames = ["N_Env", "ID_SOMA", "ID_OUVRAGE", "NOM", "SOMA_SEC", "REGION", "TP_X", "TP_Y", "TP_Z", "TP_LAT", "TP_LNG", "EMI_X", "EMI_Y", "EMI_LNG", "EMI_LAT"];
names!(features, Symbol.(colnames));
# Keep only the structure id and the TP latitude / longitude / Z columns.
select!(features, [:ID_OUVRAGE, :TP_LAT, :TP_LNG, :TP_Z]);

#### Replace missing Z index with mean

In [175]:
# Impute missing TP_Z values with the mean of the observed TP_Z values.
# `skipmissing` looks at TP_Z only, so a missing value in another column can no
# longer exclude a row from the mean (the previous `completecases` filter did).
features.TP_Z = coalesce.(features.TP_Z, mean(skipmissing(features.TP_Z)));
first(shuffleDf(features), 10)

Unnamed: 0_level_0,ID_OUVRAGE,TP_LAT,TP_LNG,TP_Z
Unnamed: 0_level_1,String,Float64,Float64,Float64
1,4390-01D,45.4618,-73.5555,14.62
2,0801-06D,45.5187,-73.533,18.66
3,3370-01D,45.5653,-73.6631,18.21
4,3400-01D,45.5435,-73.6755,26.04
5,4280-01D,45.6009,-73.5111,12.53
6,3540-02D,45.4751,-73.8727,26.52
7,4620-05D,45.4254,-73.8902,32.55
8,4796-01D,45.45,-73.5686,15.747
9,3350-08D,45.5401,-73.7086,24.65
10,3240-04D,45.6531,-73.583,12.63


### Load dates and surverses

In [176]:
# Load the overflow records; "-99999" is passed to CSV.read as the missing-value marker.
surverses = CSV.read("data/surverses.csv",missingstring="-99999");

#### Filter months

In [177]:
# Keep only the records dated in months 5 through 10 (May to October inclusive).
surverses = filter(row -> 5 <= month(row.DATE) <= 10, surverses);

#### Filter non rain surverses

In [178]:
# Records without a recorded cause are labelled "Inconnue".
raison = coalesce.(surverses[:, :RAISON], "Inconnue");
surverses[!, :RAISON] = raison;

# Keep the rows whose cause code is one of the three retained values, then
# narrow the table to the columns used downstream.
# NOTE(review): the meaning of the codes "P" and "TS" is not visible here — confirm.
surverses = filter(row -> row.RAISON in ("P", "Inconnue", "TS"), surverses);
select!(surverses, [:NO_OUVRAGE, :DATE, :SURVERSE]);

#### Remove missing data and rename

In [179]:
# Drop every row that still has a missing value; `disallowmissing=true` is
# requested so the remaining columns no longer accept `missing`.
surverses = dropmissing(surverses, disallowmissing=true);
# Use the same key name as `features` so the two tables can be joined on it.
rename!(surverses, :NO_OUVRAGE => :ID_OUVRAGE);
first(shuffleDf(surverses),10)

Unnamed: 0_level_0,ID_OUVRAGE,DATE,SURVERSE
Unnamed: 0_level_1,String,Date,Int64
1,3410-01D,2016-06-20,0
2,3480-05D,2017-06-28,0
3,0801-08D,2013-08-03,0
4,4260-01D,2017-05-07,0
5,4350-01D,2013-05-02,0
6,3350-10D,2013-10-08,0
7,4430-06D,2013-05-22,0
8,4300-01D,2017-05-08,0
9,0672-02D,2013-06-14,0
10,3275-01D,2017-10-19,0


### Augment features with dates and label

In [180]:
# Attach the structure coordinates to every (structure, date) overflow record.
# NOTE(review): relies on the default `kind` of `join` — presumably an inner join,
# so structures absent from either table are dropped; confirm.
comb = join(features, surverses, on = :ID_OUVRAGE);
first(shuffleDf(comb), 10)

Unnamed: 0_level_0,ID_OUVRAGE,TP_LAT,TP_LNG,TP_Z,DATE,SURVERSE
Unnamed: 0_level_1,String,Float64,Float64,Float64,Date,Int64
1,3350-08D,45.5401,-73.7086,24.65,2018-06-06,0
2,0642-01D,45.6727,-73.5262,19.3526,2016-07-25,0
3,4270-01D,45.6105,-73.5087,11.17,2016-10-20,0
4,3350-08D,45.5401,-73.7086,24.65,2013-07-18,0
5,3350-09D,45.5371,-73.713,23.72,2013-06-22,0
6,4360-01D,45.4907,-73.5508,15.15,2013-05-28,0
7,3350-07D,45.5461,-73.6921,20.75,2018-07-01,0
8,4240-02D,45.6498,-73.4877,19.29,2017-06-12,0
9,4610-06D,45.4305,-73.8563,29.05,2018-06-19,0
10,3350-10D,45.5372,-73.7129,23.73,2013-09-22,0


### Load precipitation data

#### Load and filter months between May & October included

In [181]:
# Load the hourly precipitation table; "-99999" is the missing-value marker.
precipitation = CSV.read("data/precipitations.csv", missingstring="-99999");
# Give the St-Hubert column a name that is a plain identifier.
rename!(precipitation, Symbol("St-Hubert") => :StHubert);

# Keep only months 5 through 10 (May to October inclusive).
precipitation = filter(row -> 5 <= month(row.date) <= 10, precipitation);

#### Replace missing data by 0

In [182]:
# Replace missing readings by 0 in each of the five station columns.
for station in (:McTavish, :Bellevue, :Assomption, :Trudeau, :StHubert)
    precipitation[!, station] = coalesce.(precipitation[:, station], 0);
end

first(shuffleDf(precipitation), 5)

Unnamed: 0_level_0,date,heure,McTavish,Bellevue,Assomption,Trudeau,StHubert
Unnamed: 0_level_1,Date,Int64,Int64,Int64,Int64,Int64,Int64
1,2016-05-14,12,0,0,0,0,0
2,2018-10-08,1,0,0,0,0,0
3,2014-05-14,8,0,0,0,0,0
4,2015-07-02,15,0,0,0,0,0
5,2016-09-02,3,0,0,0,0,0


### Extract features from precipitation

#### Sum of precipitation for the day

In [183]:
# One row per date: for each station column, the sum of that date's readings.
pcp_sum = by(precipitation, :date,  McTavish = :McTavish=>sum, Bellevue = :Bellevue=>sum, 
   Assomption = :Assomption=>sum, Trudeau = :Trudeau=>sum, StHubert = :StHubert=>sum);
first(shuffleDf(pcp_sum), 5)

Unnamed: 0_level_0,date,McTavish,Bellevue,Assomption,Trudeau,StHubert
Unnamed: 0_level_1,Date,Int64,Int64,Int64,Int64,Int64
1,2014-09-29,0,0,0,0,0
2,2013-05-09,10,0,19,0,0
3,2017-05-13,0,38,0,9,15
4,2014-07-17,0,7,0,0,0
5,2017-08-14,0,0,0,0,0


#### Maximum precipitation in an hour for the day

In [184]:
# One row per date: for each station column, the largest single reading of that date.
pcp_max = by(precipitation, :date,  McTavish = :McTavish=>maximum, Bellevue = :Bellevue=>maximum, 
   Assomption = :Assomption=>maximum, Trudeau = :Trudeau=>maximum, StHubert = :StHubert=>maximum)
first(shuffleDf(pcp_max),5)

Unnamed: 0_level_0,date,McTavish,Bellevue,Assomption,Trudeau,StHubert
Unnamed: 0_level_1,Date,Int64,Int64,Int64,Int64,Int64
1,2019-08-26,0,0,0,2,0
2,2019-06-23,0,0,0,0,0
3,2017-06-23,31,28,20,24,25
4,2018-07-05,0,0,4,0,0
5,2017-08-13,15,7,10,6,8


#### Maximum precipitation during three consecutive hours in a day

In [185]:
# One row per date: `maximum3` applied to each station column.
# NOTE(review): `maximum3` is not defined in this notebook (presumably it comes from
# utils/precipitation.jl and returns the largest 3-consecutive-hour total) — confirm.
pcp_max3h = by(precipitation, :date,  McTavish = :McTavish=>maximum3, Bellevue = :Bellevue=>maximum3, 
   Assomption = :Assomption=>maximum3, Trudeau = :Trudeau=>maximum3, StHubert = :StHubert=>maximum3)
first(shuffleDf(pcp_max3h),5)

Unnamed: 0_level_0,date,McTavish,Bellevue,Assomption,Trudeau,StHubert
Unnamed: 0_level_1,Date,Int64,Int64,Int64,Int64,Int64
1,2018-08-19,0,0,7,0,0
2,2019-09-15,0,0,0,0,0
3,2015-07-03,0,0,0,0,0
4,2015-07-14,0,7,30,0,12
5,2018-05-26,38,125,45,66,36


### Add precipitation data to features

#### Get stations lat-lng

In [186]:
# Latitude / longitude of the five weather stations, one row per station.
station_df = DataFrame(
    STATION = ["McTavish", "Bellevue", "Assomption", "Trudeau", "StHubert"],
    LAT = [45.504742, 45.427222, 45.809444, 45.467778, 45.5175],
    LNG = [-73.579167, -73.929167, -73.434722, -73.741667, -73.416944],
);

station_df

Unnamed: 0_level_0,STATION,LAT,LNG
Unnamed: 0_level_1,String,Float64,Float64
1,McTavish,45.5047,-73.5792
2,Bellevue,45.4272,-73.9292
3,Assomption,45.8094,-73.4347
4,Trudeau,45.4678,-73.7417
5,StHubert,45.5175,-73.4169


### Standardize TP and station data

In [187]:
# Z-score the structure coordinates. The statistics are taken over `comb`, i.e. over
# every (structure, date) row, so each structure weighs in once per record it has.
# Each mean/std is read BEFORE the column is overwritten; keep that order.
meanlat = mean(comb.TP_LAT);
stdlat = std(comb.TP_LAT);
comb.TP_LAT = (comb.TP_LAT .- meanlat) ./ stdlat;
# Stations are rescaled with the structures' mean/std so both share one coordinate space.
station_df.LAT = (station_df.LAT .- meanlat) ./ stdlat;

# Longitude gets its own mean/std, so the two axes are scaled independently.
meanlng = mean(comb.TP_LNG);
stdlng = std(comb.TP_LNG);
comb.TP_LNG = (comb.TP_LNG .- meanlng) ./ stdlng;
station_df.LNG = (station_df.LNG .- meanlng) ./ stdlng;

# Z has no station counterpart; only the structures are standardized.
meanz = mean(comb.TP_Z);
stdz = std(comb.TP_Z);
comb.TP_Z = (comb.TP_Z .- meanz) ./ stdz;

In [188]:
station_df

Unnamed: 0_level_0,STATION,LAT,LNG
Unnamed: 0_level_1,String,Float64,Float64
1,McTavish,-0.399934,0.53979
2,Bellevue,-1.29892,-2.14237
3,Assomption,3.13364,1.64672
4,Trudeau,-0.828599,-0.705498
5,StHubert,-0.251981,1.78296


### Augment Features

#### Add the PCP_SUM, PCP_MAX and PCP_MAX3 columns

In [189]:
# Create the three precipitation feature columns, zero-filled for now.
for col in (:PCP_SUM, :PCP_MAX, :PCP_MAX3)
    comb[!, col] = zeros(size(comb, 1));
end
# Put the features first and the SURVERSE label last.
permutecols!(comb, [:ID_OUVRAGE, :TP_LAT, :TP_LNG, :TP_Z, :DATE, :PCP_SUM, :PCP_MAX, :PCP_MAX3, :SURVERSE]);

In [190]:
first(shuffleDf(comb), 10)

Unnamed: 0_level_0,ID_OUVRAGE,TP_LAT,TP_LNG,TP_Z,DATE,PCP_SUM,PCP_MAX,PCP_MAX3
Unnamed: 0_level_1,String,Float64,Float64,Float64,Date,Float64,Float64,Float64
1,4600-01D,-1.26773,-1.32401,0.56828,2017-10-10,0.0,0.0,0.0
2,3350-06D,0.183866,-0.182787,0.254423,2017-09-06,0.0,0.0,0.0
3,0801-08D,-0.230069,0.861407,-0.850227,2015-08-26,0.0,0.0,0.0
4,3350-10D,-0.0239445,-0.484678,0.626743,2013-07-07,0.0,0.0,0.0
5,3350-08D,0.0104279,-0.451902,0.768286,2013-06-10,0.0,0.0,0.0
6,0801-09D,-0.271792,0.873046,0.865212,2016-06-09,0.0,0.0,0.0
7,4340-01D,-0.150978,0.805845,-0.0502015,2013-07-12,0.0,0.0,0.0
8,3480-05D,-0.471255,-1.0262,0.799056,2015-10-13,0.0,0.0,0.0
9,4620-01D,-1.43344,-1.80389,0.809826,2014-05-11,0.0,0.0,0.0
10,4230-06D,1.65681,1.22251,-1.6441,2013-07-09,0.0,0.0,0.0


#### Find the closest station to each ouvrage and copy its pcp_sum, pcp_max and pcp_max3h values

In [191]:
# For every (structure, date) row of `comb`: find the nearest weather station, then
# copy that station's daily values for the row's date into PCP_SUM / PCP_MAX / PCP_MAX3.
# The values are copied as-is (no distance weighting).
for i in 1:size(comb, 1)
    # Nearest station. Both sides use the standardized coordinates, so the distance is
    # in standardized units. On ties the first station encountered is kept.
    closest_station = "McTavish"; # fallback if no distance compares below Inf
    shortest_dist = Inf;
    for j in 1:size(station_df, 1)
        dist = findDistance(comb[i, :TP_LAT], comb[i, :TP_LNG], station_df[j, :LAT], station_df[j, :LNG]);

        if dist < shortest_dist
            shortest_dist = dist;
            closest_station = station_df[j, :STATION];
        end
    end

    station_col = Symbol(closest_station);
    row_date = comb[i, :DATE];

    # Look the date up once per daily table and fail with a clear message when the
    # precipitation data has no record for it.
    for (target, daily) in ((:PCP_SUM, pcp_sum), (:PCP_MAX, pcp_max), (:PCP_MAX3, pcp_max3h))
        pcp_row = findfirst(isequal(row_date), daily.date);
        pcp_row === nothing && error("No precipitation record for $row_date (needed for $target)");
        comb[i, target] = daily[pcp_row, station_col];
    end
end

#### Cap outliers in PCP_SUM, PCP_MAX and PCP_MAX3 that cause compression

In [192]:
# Cap each precipitation feature at a fixed ceiling. `min.` replaces the deprecated
# `comb[:COL]` indexing and the scalar-to-many-rows assignment used before.
comb.PCP_SUM = min.(comb.PCP_SUM, 750);
comb.PCP_MAX = min.(comb.PCP_MAX, 500);
comb.PCP_MAX3 = min.(comb.PCP_MAX3, 750);

│   caller = top-level scope at In[192]:1
└ @ Core In[192]:1
│   caller = top-level scope at In[192]:2
└ @ Core In[192]:2
│   caller = top-level scope at In[192]:3
└ @ Core In[192]:3


In [193]:
first(shuffleDf(filter(row -> row.SURVERSE == 1, comb)), 10)

Unnamed: 0_level_0,ID_OUVRAGE,TP_LAT,TP_LNG,TP_Z,DATE,PCP_SUM,PCP_MAX,PCP_MAX3
Unnamed: 0_level_1,String,Float64,Float64,Float64,Date,Float64,Float64,Float64
1,4380-01D,-0.828933,0.658409,-0.0467296,2017-06-05,434.0,91.0,210.0
2,4430-05D,-1.33635,-0.0587449,1.69293,2018-09-03,39.0,33.0,33.0
3,4300-01D,0.3648,0.987515,-0.933306,2015-05-25,200.0,56.0,110.0
4,0801-05D,-0.254625,0.907701,-1.07023,2015-05-18,318.0,140.0,236.0
5,4330-01D,0.0527466,0.876211,-0.845611,2018-10-09,56.0,51.0,51.0
6,0672-03D,1.55381,0.838395,-1.48717,2016-08-21,230.0,70.0,160.0
7,4400-02D,-1.0264,0.591287,-0.536371,2016-06-22,56.0,27.0,38.0
8,4370-01D,-0.787544,0.689553,-0.0467296,2018-07-25,387.0,257.0,307.0
9,3270-01D,1.18312,0.417032,0.239038,2015-06-08,288.0,52.0,114.0
10,4370-01D,-0.787544,0.689553,-0.0467296,2013-08-09,72.0,72.0,72.0


### Split dates into months and days

In [194]:
# Derive the month and day-of-month of each record from its DATE.
comb[!, :MONTH] = map(month, comb.DATE);
comb[!, :DAY] = map(day, comb.DATE);
first(shuffleDf(comb[!, [:DATE, :MONTH, :DAY]]), 5)

Unnamed: 0_level_0,DATE,MONTH,DAY
Unnamed: 0_level_1,Date,Int64,Int64
1,2014-08-02,8,2
2,2018-09-24,9,24
3,2017-06-05,6,5
4,2015-09-23,9,23
5,2015-06-24,6,24


## Standardize the PCP and Date

In [195]:
# Z-score the precipitation and calendar features of `comb`.
# Each mean/std is read BEFORE its column is overwritten; keep that order.
# NOTE(review): the statistics use all of `comb`, and the train/validation split is
# drawn from `comb` afterwards, so validation rows contribute to them — confirm this
# is acceptable.
mean_pcpsum = mean(comb.PCP_SUM);
std_pcpsum = std(comb.PCP_SUM);
comb.PCP_SUM = (comb.PCP_SUM .- mean_pcpsum) ./ std_pcpsum;

mean_pcpmax = mean(comb.PCP_MAX);
std_pcpmax = std(comb.PCP_MAX);
comb.PCP_MAX = (comb.PCP_MAX .- mean_pcpmax) ./ std_pcpmax;

mean_pcpmax3 = mean(comb.PCP_MAX3);
std_pcpmax3 = std(comb.PCP_MAX3);
comb.PCP_MAX3 = (comb.PCP_MAX3 .- mean_pcpmax3) ./ std_pcpmax3;

# MONTH and DAY were integers; after this they are Float64 z-scores.
meanmonth = mean(comb.MONTH);
stdmonth = std(comb.MONTH);
comb.MONTH = (comb.MONTH .- meanmonth) ./ stdmonth;

meanday = mean(comb.DAY);
stdday = std(comb.DAY);
comb.DAY = (comb.DAY .- meanday) ./ stdday;

In [196]:
first(shuffleDf(filter(row -> row.SURVERSE == 1, comb)), 10)

Unnamed: 0_level_0,ID_OUVRAGE,TP_LAT,TP_LNG,TP_Z,DATE,PCP_SUM,PCP_MAX
Unnamed: 0_level_1,String,Float64,Float64,Float64,Date,Float64,Float64
1,0801-05D,-0.254625,0.907701,-1.07023,2013-07-16,-0.400087,-0.401707
2,4430-04D,-1.38578,0.00660048,1.82832,2018-09-02,0.903974,1.72099
3,3380-01D,0.163171,-0.158884,0.395966,2016-10-27,1.20282,0.551968
4,3310-01D,0.865632,0.140067,-0.0117388,2016-10-21,6.17456,1.53641
5,3350-05D,0.362175,-0.0977361,0.248269,2014-07-27,1.66468,1.19801
6,0801-05D,-0.254625,0.907701,-1.07023,2018-08-04,-0.345752,-0.278652
7,4290-01D,0.5267,1.08435,-1.19485,2014-09-13,1.28433,1.07495
8,3270-01D,1.18312,0.417032,0.239038,2015-06-13,-0.37292,-0.34018
9,4720-01D,1.86614,1.22986,-1.39332,2015-05-18,3.53926,3.90522
10,4370-02D,-0.787414,0.689638,-0.0467296,2017-06-20,1.96352,2.18245


# Validate model

### Split train and validation sets

In [197]:
# Random 80 % / 20 % split of the rows of `comb` (the shuffle is not seeded).
r_idx = shuffle(1:size(comb, 1));
train_ceil = floor(Int, length(r_idx) * 0.8);
train_set = comb[r_idx[1:train_ceil], :];
val_set = comb[r_idx[(train_ceil + 1):end], :];

### Train model on train set

In [233]:
"""
    fitBestDistribution(data::Array, verbose::Bool)

Fit each candidate distribution family to `data` and return the fitted distribution
with the highest log-likelihood on `data`.

# Arguments
- `data`: the observations to fit.
- `verbose`: when `true`, print each family as it is tried and whether it failed.

# Returns
The best fitted distribution, or `nothing` when no candidate family could be fitted
(a warning is logged in that case).

Selection is by raw log-likelihood only; no penalty for the number of parameters is
applied. The summary line at the end is printed regardless of `verbose`.
"""
function fitBestDistribution(data::Array, verbose::Bool)
    # Candidate families, each paired with its display name so the two cannot drift
    # apart. Order matters only for ties: the first family reaching the best score wins.
    candidates = [
        ("Beta", Beta), ("Binomial", Binomial),
        ("Categorical", Categorical), ("DiscreteUniform", DiscreteUniform),
        ("Exponential", Exponential), ("Normal", Normal), ("Gamma", Gamma),
        ("Geometric", Geometric), ("Laplace", Laplace), ("Pareto", Pareto),
        ("Poisson", Poisson), ("Uniform", Uniform), ("Multinomial", Multinomial),
        ("MvNormal", MvNormal), ("Dirichlet", Dirichlet), ("Weibull", Weibull),
    ];

    maxLikelihood = -Inf;
    distrName = nothing;
    finalfd = nothing;

    for (name, family) in candidates
        if verbose
            println("Trying model of type: ", name);
        end

        # Best-effort search: a family that cannot be fitted to this data, or whose
        # likelihood cannot be evaluated on it, is skipped rather than aborting.
        attempt = try
            fitted = fit(family, data);
            (fitted, loglikelihood(fitted, data))
        catch
            nothing
        end
        if attempt === nothing
            if verbose
                println("Invalid");
            end
            continue
        end

        fitDist, newLikelihood = attempt;
        # Strictly better likelihood: this family becomes the current best.
        if newLikelihood > maxLikelihood
            maxLikelihood = newLikelihood;
            distrName = name;
            finalfd = fitDist;
        end
    end

    if finalfd === nothing
        @warn "fitBestDistribution: no candidate distribution could be fitted"
    end
    println("The best distribution is of type ", distrName, " with a likelihood of ", maxLikelihood)
    return finalfd;
end

fitBestDistribution (generic function with 1 method)

In [234]:
"""
    getBestLikelihoodDistributions(train::DataFrame, variable::Symbol)

Return a 2-element array of fitted distributions for column `variable` of `train`:
element 1 is fitted on the rows where `SURVERSE == 0`, element 2 on the rows where
`SURVERSE == 1`. Each fit is delegated to `fitBestDistribution` with `verbose = false`.
"""
function getBestLikelihoodDistributions(train::DataFrame, variable::Symbol)
    labels = train[:, :SURVERSE];
    return Any[fitBestDistribution(train[labels .== class, variable], false) for class in 0:1];
end

getBestLikelihoodDistributions (generic function with 1 method)

In [235]:
"""
    getPrioris(trainSet::DataFrame)

Return the class priors as a 2-element `Float64` vector: the fraction of rows of
`trainSet` with `SURVERSE == 0`, then the fraction with `SURVERSE == 1`.
"""
function getPrioris(trainSet::DataFrame)
    labels = trainSet[:, :SURVERSE];
    classCounts = Float64[count(labels .== class) for class in 0:1];
    α = classCounts / size(trainSet, 1);

    # A Categorical is built from the priors and its mode computed, but the result is
    # discarded. Open question from the original author: Categorical over 2 classes,
    # or Bernoulli?
    # NOTE(review): this presumably only acts as a sanity check that α is a valid
    # probability vector — confirm before removing it.
    mode(Categorical(α));

    return α;
end

getPrioris (generic function with 1 method)

In [344]:
"""
    predictNaiveBayes(data::DataFrame, likelihoodDistrs::Array, prioris::Array, variables::Array)

Predict a 0/1 label for every row of `data` with a naive Bayes classifier.

# Arguments
- `data`: rows to classify; must contain every column listed in `variables`.
- `likelihoodDistrs`: `likelihoodDistrs[k][j]` is the distribution of `variables[k]`
  for class `j - 1`.
- `prioris`: `prioris[j]` is the prior probability of class `j - 1`.
- `variables`: column names used as explanatory variables (may be empty, in which
  case only the priors decide).

# Returns
A `Vector{Int64}` with one predicted label (0 or 1) per row. On a tie class 0 wins.

Scores are accumulated in log-space (log prior + sum of log-densities) so that a
product of many small densities cannot underflow to zero for both classes.
"""
function predictNaiveBayes(data::DataFrame, likelihoodDistrs::Array, prioris::Array, variables::Array)
    n_data = size(data, 1);
    y_m = Array{Int64}(undef, n_data);
    scores = Vector{Float64}(undef, 2);

    for i in 1:n_data
        for j in 1:2
            score = log(prioris[j]);
            for k in eachindex(variables)
                score += logpdf(likelihoodDistrs[k][j], data[i, variables[k]]);
            end
            scores[j] = score;
        end
        # findmax returns the first maximum, so index 1 (class 0) wins ties.
        _, ind = findmax(scores);
        y_m[i] = ind - 1;
    end

    return y_m;
end

predictNaiveBayes (generic function with 2 methods)

In [331]:
"""
    BIC(distributions::Dict, data::DataFrame)

Return `totalLogLikelihood - k * log(n) / 2`, where `n` is the number of rows of
`data`, `k` is the number of entries in `distributions`, and the total log-likelihood
sums, for every variable and both classes, the log-likelihood of that class's
distribution on the rows of that class. `distributions[variable][1]` is used for
`SURVERSE == 0` and `distributions[variable][2]` for `SURVERSE == 1`.

An empty `distributions` yields exactly `0.0`.
"""
function BIC(distributions::Dict, data::DataFrame)
    n = size(data, 1)
    # NOTE(review): k counts variables, not fitted parameters — confirm this is the
    # intended penalty term.
    k = length(distributions)
    labels = data[:, :SURVERSE]
    totalLogLikelihood = 0
    for (variable, classDistrs) in distributions
        for class in 0:1
            observed = data[labels .== class, variable]
            totalLogLikelihood += loglikelihood(classDistrs[class + 1], observed)
        end
    end
    return totalLogLikelihood - k * log(n) / 2
end

BIC (generic function with 5 methods)

In [328]:
"""
    findBestVariablesCombinationBIC(train::DataFrame, likelihoodDistrs::Dict, variables::Array{Symbol})

Score every subset of `variables` with `BIC` on `train` and return
`(bestCombination, bestBic)` for the subset with the highest score.

Subsets whose score is exactly `0.0` are skipped; that is the value `BIC` returns for
the empty subset. Throws an `ArgumentError` when no subset is left to compare.
"""
function findBestVariablesCombinationBIC(train::DataFrame, likelihoodDistrs::Dict, variables::Array{Symbol})
    bics = Float64[]
    combinations = []
    for combination in subsets(variables)
        modelLikelihoodDistrs = Dict()
        for variable in combination
            modelLikelihoodDistrs[variable] = likelihoodDistrs[variable]
        end

        bic = BIC(modelLikelihoodDistrs, train)
        if bic == 0.0
            continue
        end

        # Record the combination only together with its score. Recording it before
        # the skip above left `combinations` one entry ahead of `bics`, so the index
        # of the best score pointed at the wrong combination.
        push!(combinations, combination)
        push!(bics, bic)
    end

    if isempty(bics)
        throw(ArgumentError("no variable combination with a non-zero BIC to compare"))
    end
    _, indexMax = findmax(bics)
    println("Best combination: ", combinations[indexMax], " with bic: ", bics[indexMax])
    return (combinations[indexMax], bics[indexMax])
end

findBestVariablesCombinationBIC (generic function with 2 methods)

In [366]:
"""
    findBestVariablesCombinationF1Validation(train::DataFrame, validation::DataFrame, likelihoodDistrs::Dict, variables::Array{Symbol})

Try every subset of `variables` as the explanatory variables of the naive Bayes
classifier, score each on `validation` with the F1 score, and return
`(bestCombination, bestF1)`.

# Arguments
- `train`: used only to compute the class priors.
- `validation`: rows that are predicted and whose `SURVERSE` column is the ground truth.
- `likelihoodDistrs`: per-variable class distributions, as returned by `fitModel`.
- `variables`: the candidate explanatory variables.
"""
function findBestVariablesCombinationF1Validation(train::DataFrame, validation::DataFrame, likelihoodDistrs::Dict, variables::Array{Symbol})
    # The priors and the ground truth do not depend on the combination: compute once.
    prioris = getPrioris(train)
    # Score against the `validation` argument (previously the global `val_set` was
    # read here, silently ignoring the data actually passed in).
    truth = validation[:, :SURVERSE]

    f1scores = Float64[]
    combinations = []
    for combination in subsets(variables)
        modelVariables = Any[variable for variable in combination]
        modelLikelihoodDistrs = Any[likelihoodDistrs[variable] for variable in combination]

        y = predictNaiveBayes(validation, modelLikelihoodDistrs, prioris, modelVariables);
        r = roc(truth, y);

        push!(combinations, combination)
        push!(f1scores, f1score(r))
    end

    _, indexMax = findmax(f1scores)
    println("Best combination: ", combinations[indexMax], " with f1score: ", f1scores[indexMax])
    return (combinations[indexMax], f1scores[indexMax])
end

findBestVariablesCombinationF1Validation (generic function with 1 method)

In [336]:
"""
    fitModel(train::DataFrame, variables::Array{Symbol})

Return a `Dict` mapping each variable of `variables` to the pair of class-conditional
distributions produced by `getBestLikelihoodDistributions(train, variable)`.
"""
function fitModel(train::DataFrame, variables::Array{Symbol})
    return Dict{Any,Any}(
        variable => getBestLikelihoodDistributions(train, variable) for variable in variables
    )
end

fitModel (generic function with 1 method)

In [367]:
"""
    trainNaiveBayes(train::DataFrame, validation::DataFrame, variables::Array{Symbol})

Fit the class-conditional distributions of every variable on `train`, then pick the
subset of `variables` with the best F1 score on `validation`.

Returns `(likelihoodDistrs, bestVariables, bestF1)`; `likelihoodDistrs` still covers
all of `variables`, not only the selected ones.
"""
function trainNaiveBayes(train::DataFrame, validation::DataFrame, variables::Array{Symbol})
    likelihoodDistrs = fitModel(train, variables)
    bestVariables, bestF1 = findBestVariablesCombinationF1Validation(
        train, validation, likelihoodDistrs, variables
    )
    return (likelihoodDistrs, bestVariables, bestF1)
end

trainNaiveBayes (generic function with 1 method)

In [368]:
"""
    trainOnValidationAndPredictNaiveBayes(train::DataFrame, validation::DataFrame, test::DataFrame, variables::Array{Symbol})

Train on `train`, select the variable subset with the best F1 score on `validation`,
and return the labels predicted for `test` with that subset. The priors come from
`train`.
"""
function trainOnValidationAndPredictNaiveBayes(train::DataFrame, validation::DataFrame, test::DataFrame, variables::Array{Symbol})
    likelihoodDistrs, modelVariables, _ = trainNaiveBayes(train, validation, variables)
    prioris = getPrioris(train)

    # Distributions of the selected variables, in the same order as `modelVariables`.
    modelLikelihoodDistrs = Any[likelihoodDistrs[variable] for variable in modelVariables]

    return predictNaiveBayes(test, modelLikelihoodDistrs, prioris, modelVariables);
end

trainOnValidationAndPredictNaiveBayes (generic function with 1 method)

In [356]:
"""
    trainAndPredictNaiveBayes(train::DataFrame, test::DataFrame, variables::Array{Symbol})

Fit a naive Bayes model on `train` using all of `variables` (no variable selection)
and return the vector of labels predicted for `test`. The class priors computed from
`train` are printed as a side effect.
"""
function trainAndPredictNaiveBayes(train::DataFrame, test::DataFrame, variables::Array{Symbol})
    likelihoodDistrs = fitModel(train, variables)
    prioris = getPrioris(train)

    println(prioris)

    # Distributions in the same order as `variables`, as predictNaiveBayes expects.
    modelLikelihoodDistrs = []
    for variable in variables
        push!(modelLikelihoodDistrs, likelihoodDistrs[variable]);
    end

    y = predictNaiveBayes(test, modelLikelihoodDistrs, prioris, variables);
    return y;
end

trainAndPredictNaiveBayes (generic function with 1 method)

In [204]:
"""
    compareArr(a, b, verbose)

Return the percentage (0 to 100) of positions at which `a` and `b` are equal, relative
to the length of `a`. When `verbose` is true, also print both lengths and the result.
"""
function compareArr(a, b, verbose)
    matches = count(a .== b)
    total = size(a, 1)
    pourcent = matches / total * 100;

    if !verbose
        return pourcent;
    end

    println(total)
    println(size(b, 1))

    println("Le modéle est précis à $pourcent %");
    return pourcent;
end

compareArr (generic function with 1 method)

#### Naive bayes Params

In [205]:
# Columns of `comb` used as explanatory variables by the naive Bayes model.
names_ft = [:TP_LAT, :TP_LNG, :TP_Z, :MONTH, :DAY, :PCP_SUM, :PCP_MAX, :PCP_MAX3];

### Validate model on validation set

#### Single validation

In [357]:
# Train on the training split with every feature and predict the validation split.
y = trainAndPredictNaiveBayes(train_set, val_set, names_ft);

# Compare predictions with the ground truth. `val_set.SURVERSE` replaces the
# deprecated `val_set[:SURVERSE]` column access.
compareArr(y, val_set.SURVERSE, true);
r = roc(val_set.SURVERSE, y);
println(recall(r))
println(precision(r))
println(f1score(r))

# Shift the 0/1 labels to 1/2 before building the 2-class confusion matrix.
no_0_gt = val_set.SURVERSE .+ 1
no_0_pred = y .+ 1
confusmat(2, no_0_gt, no_0_pred)


The best model is of type Uniform with a BIC of -152541.51364510704
The best model is of type Uniform with a BIC of -5436.80430390067
The best model is of type Uniform with a BIC of -159558.3867488944
The best model is of type Normal with a BIC of -4467.086529136986
The best model is of type Normal with a BIC of -175192.39090162498
The best model is of type Normal with a BIC of -5572.954294527319
The best model is of type Uniform with a BIC of -132033.1741631602
The best model is of type Uniform with a BIC of -4705.856867383811
The best model is of type Uniform with a BIC of -150285.9631051927
The best model is of type Uniform with a BIC of -5356.413159286683
The best model is of type Normal with a BIC of -158350.57058899855
The best model is of type Normal with a BIC of -9306.943331354872
The best model is of type Normal with a BIC of -161360.19792138768
The best model is of type Laplace with a BIC of -8822.699148219668
The best model is of type Normal with a BIC of -159480.0137034237

│   caller = top-level scope at In[357]:2
└ @ Core In[357]:2
│   caller = top-level scope at In[357]:5
└ @ Core In[357]:5
│   caller = top-level scope at In[357]:9
└ @ Core In[357]:9


2×2 Array{Int64,2}:
 29382  1368
   557   569

In [369]:
# Train on the training split, select the feature subset on the validation split,
# then predict that same validation split.
# NOTE(review): `val_set` is passed both as the selection set and as the test set, so
# the scores below are measured on the data used to choose the features.
y = trainOnValidationAndPredictNaiveBayes(train_set, val_set, val_set, names_ft);

# Compare predictions with the ground truth. `val_set.SURVERSE` replaces the
# deprecated `val_set[:SURVERSE]` column access.
compareArr(y, val_set.SURVERSE, true);
r = roc(val_set.SURVERSE, y);
println(recall(r))
println(precision(r))
println(f1score(r))

# Shift the 0/1 labels to 1/2 before building the 2-class confusion matrix.
no_0_gt = val_set.SURVERSE .+ 1
no_0_pred = y .+ 1
confusmat(2, no_0_gt, no_0_pred)

The best model is of type Uniform with a BIC of -152541.51364510704
The best model is of type Uniform with a BIC of -5436.80430390067
The best model is of type Uniform with a BIC of -159558.3867488944
The best model is of type Normal with a BIC of -4467.086529136986
The best model is of type Normal with a BIC of -175192.39090162498
The best model is of type Normal with a BIC of -5572.954294527319
The best model is of type Uniform with a BIC of -132033.1741631602
The best model is of type Uniform with a BIC of -4705.856867383811
The best model is of type Uniform with a BIC of -150285.9631051927
The best model is of type Uniform with a BIC of -5356.413159286683
The best model is of type Normal with a BIC of -158350.57058899855
The best model is of type Normal with a BIC of -9306.943331354872
The best model is of type Normal with a BIC of -161360.19792138768
The best model is of type Laplace with a BIC of -8822.699148219668
The best model is of type Normal with a BIC of -159480.0137034237

│   caller = findBestVariablesCombinationF1Validation(::DataFrame, ::DataFrame, ::Dict{Any,Any}, ::Array{Symbol,1}) at In[366]:15
└ @ Main ./In[366]:15


Best combination: Symbol[:TP_LNG, :TP_Z, :PCP_SUM, :PCP_MAX] with f1score: 0.3772087991345114
31876
31876
Le modéle est précis à 94.58213075668215 %
0.4644760213143872
0.3175470552519733
0.3772087991345114


│   caller = top-level scope at In[369]:2
└ @ Core In[369]:2
│   caller = top-level scope at In[369]:5
└ @ Core In[369]:5
│   caller = top-level scope at In[369]:9
└ @ Core In[369]:9


MethodError: MethodError: no method matching *(::Array{Int64,2}, ::typeof(trainOnValidationAndPredictNaiveBayes))
Closest candidates are:
  *(::Any, ::Any, !Matched::Any, !Matched::Any...) at operators.jl:529
  *(::Union{DenseArray{T,2}, Base.ReinterpretArray{T,2,S,A} where S where A<:Union{SubArray{T,N,A,I,true} where I<:Union{Tuple{Vararg{Real,N} where N}, Tuple{AbstractUnitRange,Vararg{Any,N} where N}} where A<:DenseArray where N where T, DenseArray}, Base.ReshapedArray{T,2,A,MI} where MI<:Tuple{Vararg{Base.MultiplicativeInverses.SignedMultiplicativeInverse{Int64},N} where N} where A<:Union{Base.ReinterpretArray{T,N,S,A} where S where A<:Union{SubArray{T,N,A,I,true} where I<:Union{Tuple{Vararg{Real,N} where N}, Tuple{AbstractUnitRange,Vararg{Any,N} where N}} where A<:DenseArray where N where T, DenseArray} where N where T, SubArray{T,N,A,I,true} where I<:Union{Tuple{Vararg{Real,N} where N}, Tuple{AbstractUnitRange,Vararg{Any,N} where N}} where A<:DenseArray where N where T, DenseArray}, SubArray{T,2,A,I,L} where L where I<:Tuple{Vararg{Union{Int64, AbstractRange{Int64}, Base.AbstractCartesianIndex},N} where N} where A<:Union{Base.ReinterpretArray{T,N,S,A} where S where A<:Union{SubArray{T,N,A,I,true} where I<:Union{Tuple{Vararg{Real,N} where N}, Tuple{AbstractUnitRange,Vararg{Any,N} where N}} where A<:DenseArray where N where T, DenseArray} where N where T, Base.ReshapedArray{T,N,A,MI} where MI<:Tuple{Vararg{Base.MultiplicativeInverses.SignedMultiplicativeInverse{Int64},N} where N} where A<:Union{Base.ReinterpretArray{T,N,S,A} where S where A<:Union{SubArray{T,N,A,I,true} where I<:Union{Tuple{Vararg{Real,N} where N}, Tuple{AbstractUnitRange,Vararg{Any,N} where N}} where A<:DenseArray where N where T, DenseArray} where N where T, SubArray{T,N,A,I,true} where I<:Union{Tuple{Vararg{Real,N} where N}, Tuple{AbstractUnitRange,Vararg{Any,N} where N}} where A<:DenseArray where N where T, DenseArray} where N where T, DenseArray}} where T, !Matched::LinearAlgebra.AbstractQ) at /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.2/LinearAlgebra/src/qr.jl:668
  *(::Union{DenseArray{T,2}, Base.ReinterpretArray{T,2,S,A} where S where A<:Union{SubArray{T,N,A,I,true} where I<:Union{Tuple{Vararg{Real,N} where N}, Tuple{AbstractUnitRange,Vararg{Any,N} where N}} where A<:DenseArray where N where T, DenseArray}, Base.ReshapedArray{T,2,A,MI} where MI<:Tuple{Vararg{Base.MultiplicativeInverses.SignedMultiplicativeInverse{Int64},N} where N} where A<:Union{Base.ReinterpretArray{T,N,S,A} where S where A<:Union{SubArray{T,N,A,I,true} where I<:Union{Tuple{Vararg{Real,N} where N}, Tuple{AbstractUnitRange,Vararg{Any,N} where N}} where A<:DenseArray where N where T, DenseArray} where N where T, SubArray{T,N,A,I,true} where I<:Union{Tuple{Vararg{Real,N} where N}, Tuple{AbstractUnitRange,Vararg{Any,N} where N}} where A<:DenseArray where N where T, DenseArray}, SubArray{T,2,A,I,L} where L where I<:Tuple{Vararg{Union{Int64, AbstractRange{Int64}, Base.AbstractCartesianIndex},N} where N} where A<:Union{Base.ReinterpretArray{T,N,S,A} where S where A<:Union{SubArray{T,N,A,I,true} where I<:Union{Tuple{Vararg{Real,N} where N}, Tuple{AbstractUnitRange,Vararg{Any,N} where N}} where A<:DenseArray where N where T, DenseArray} where N where T, Base.ReshapedArray{T,N,A,MI} where MI<:Tuple{Vararg{Base.MultiplicativeInverses.SignedMultiplicativeInverse{Int64},N} where N} where A<:Union{Base.ReinterpretArray{T,N,S,A} where S where A<:Union{SubArray{T,N,A,I,true} where I<:Union{Tuple{Vararg{Real,N} where N}, Tuple{AbstractUnitRange,Vararg{Any,N} where N}} where A<:DenseArray where N where T, DenseArray} where N where T, SubArray{T,N,A,I,true} where I<:Union{Tuple{Vararg{Real,N} where N}, Tuple{AbstractUnitRange,Vararg{Any,N} where N}} where A<:DenseArray where N where T, DenseArray} where N where T, DenseArray}} where T, !Matched::LinearAlgebra.Adjoint{#s617,#s616} where #s616<:LinearAlgebra.AbstractQ where #s617) at /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.2/LinearAlgebra/src/qr.jl:708
  ...

# Submission model creation

### Separate features and labels

In [207]:
# Feature matrix over all of `comb`: one row per record, one column per entry of names_ft.
full_train_features = convert(Matrix{Float64},comb[:, names_ft]);

In [208]:
# Label vector over all of `comb` (a copy of the SURVERSE column).
full_train_labels = comb[:, :SURVERSE];

# Prediction

## Get the test data

In [209]:
# Load the (structure, date) pairs to predict and align the key name with `comb`.
test = CSV.read("data/test.csv");
rename!(test, :NO_OUVRAGE => :ID_OUVRAGE);
first(test, 10)

Unnamed: 0_level_0,ID_OUVRAGE,DATE
Unnamed: 0_level_1,String,Date
1,3260-01D,2019-05-02
2,3260-01D,2019-05-09
3,3260-01D,2019-05-10
4,3260-01D,2019-05-15
5,3260-01D,2019-05-20
6,3260-01D,2019-05-23
7,3260-01D,2019-05-24
8,3260-01D,2019-05-26
9,3260-01D,2019-05-30
10,3350-07D,2019-05-01


In [210]:
# One row per structure with its coordinates as stored in `comb` (already standardized
# at this point in the notebook), then attach them to the test rows.
to_merge = unique(comb[!, [:ID_OUVRAGE, :TP_LAT, :TP_LNG, :TP_Z]], :ID_OUVRAGE);
# NOTE(review): relies on the default `kind` of `join`; test structures that never
# appear in `comb` would presumably be dropped — confirm against the row count below.
test_comb = join(test, to_merge, on= [:ID_OUVRAGE]);
nrow(test_comb)

283

In [211]:
first(shuffleDf(test_comb), 10)

Unnamed: 0_level_0,ID_OUVRAGE,DATE,TP_LAT,TP_LNG,TP_Z
Unnamed: 0_level_1,String,Date,Float64,Float64,Float64
1,3260-01D,2019-08-28,1.29293,0.531312,0.0790333
2,4240-01D,2019-07-25,1.28137,1.24035,-1.19178
3,3260-01D,2019-06-09,1.29293,0.531312,0.0790333
4,4350-01D,2019-08-24,-0.465715,0.725081,-0.0467296
5,3350-07D,2019-06-26,0.0802391,-0.325532,0.168267
6,4240-01D,2019-06-30,1.28137,1.24035,-1.19178
7,3260-01D,2019-06-11,1.29293,0.531312,0.0790333
8,3260-01D,2019-07-08,1.29293,0.531312,0.0790333
9,4350-01D,2019-08-19,-0.465715,0.725081,-0.0467296
10,4380-01D,2019-09-30,-0.828933,0.658409,-0.0467296


### Add PCP_SUM, PCP_MAX and PCP_MAX3

#### Initialize default pcp

In [212]:
# Create the three precipitation feature columns on the test table, zero-filled for now.
for col in (:PCP_SUM, :PCP_MAX, :PCP_MAX3)
    test_comb[!, col] = zeros(size(test_comb, 1));
end
# Same column order as the training table, without the label.
permutecols!(test_comb, [:ID_OUVRAGE, :TP_LAT, :TP_LNG, :TP_Z, :DATE, :PCP_SUM, :PCP_MAX, :PCP_MAX3]);

In [213]:
# Preview 10 rows to check the new zero-filled PCP columns and the column order.
first(shuffleDf(test_comb), 10)

Unnamed: 0_level_0,ID_OUVRAGE,TP_LAT,TP_LNG,TP_Z,DATE,PCP_SUM,PCP_MAX,PCP_MAX3
Unnamed: 0_level_1,String,Float64,Float64,Float64,Date,Float64,Float64,Float64
1,3260-01D,1.29293,0.531312,0.0790333,2019-06-09,0.0,0.0,0.0
2,4240-01D,1.28137,1.24035,-1.19178,2019-07-25,0.0,0.0,0.0
3,4350-01D,-0.465715,0.725081,-0.0467296,2019-05-04,0.0,0.0,0.0
4,3350-07D,0.0802391,-0.325532,0.168267,2019-05-09,0.0,0.0,0.0
5,4240-01D,1.28137,1.24035,-1.19178,2019-08-02,0.0,0.0,0.0
6,3260-01D,1.29293,0.531312,0.0790333,2019-06-02,0.0,0.0,0.0
7,3350-07D,0.0802391,-0.325532,0.168267,2019-05-19,0.0,0.0,0.0
8,3350-07D,0.0802391,-0.325532,0.168267,2019-05-02,0.0,0.0,0.0
9,4380-01D,-0.828933,0.658409,-0.0467296,2019-08-27,0.0,0.0,0.0
10,4350-01D,-0.465715,0.725081,-0.0467296,2019-08-12,0.0,0.0,0.0


#### Populate pcp

In [214]:
# For every test row, copy the readings of the closest station on the row's
# date from pcp_sum, pcp_max and pcp_max3h into PCP_SUM, PCP_MAX and PCP_MAX3.
for i in 1:nrow(test_comb)
    closest_station = "McTavish";  # fallback, only kept when station_df has no rows
    shortest_dist = Inf;           # Float sentinel: any finite distance is smaller

    # Find closest station
    for j in 1:nrow(station_df)
        dist = findDistance(test_comb[i, :TP_LAT], test_comb[i, :TP_LNG],
                            station_df[j, :LAT], station_df[j, :LNG]);

        if dist < shortest_dist
            shortest_dist = dist;
            closest_station = station_df[j, :STATION];
        end
    end

    station_col = Symbol(closest_station);
    obs_date = test_comb[i, :DATE];

    # Copy the station readings as-is: the distance only selects the station,
    # it does not weight the values.
    for (readings, target) in ((pcp_sum, :PCP_SUM), (pcp_max, :PCP_MAX), (pcp_max3h, :PCP_MAX3))
        date_row = findfirst(isequal(obs_date), readings.date);
        # Fail with a readable message instead of a BoundsError on an empty match.
        date_row === nothing && error("No $(target) reading found for date $(obs_date)");
        test_comb[i, target] = readings[date_row, station_col];
    end
end

In [215]:
# Preview 10 rows to check the populated PCP columns.
first(shuffleDf(test_comb), 10)

Unnamed: 0_level_0,ID_OUVRAGE,TP_LAT,TP_LNG,TP_Z,DATE,PCP_SUM,PCP_MAX,PCP_MAX3
Unnamed: 0_level_1,String,Float64,Float64,Float64,Date,Float64,Float64,Float64
1,3350-07D,0.0802391,-0.325532,0.168267,2019-07-28,15.0,15.0,15.0
2,3260-01D,1.29293,0.531312,0.0790333,2019-08-01,0.0,0.0,0.0
3,3260-01D,1.29293,0.531312,0.0790333,2019-08-22,0.0,0.0,0.0
4,4240-01D,1.28137,1.24035,-1.19178,2019-09-28,39.0,15.0,25.0
5,3350-07D,0.0802391,-0.325532,0.168267,2019-08-14,0.0,0.0,0.0
6,3350-07D,0.0802391,-0.325532,0.168267,2019-07-13,0.0,0.0,0.0
7,4380-01D,-0.828933,0.658409,-0.0467296,2019-05-09,89.0,45.0,89.0
8,4240-01D,1.28137,1.24035,-1.19178,2019-06-08,0.0,0.0,0.0
9,3350-07D,0.0802391,-0.325532,0.168267,2019-05-31,5.0,5.0,5.0
10,4350-01D,-0.465715,0.725081,-0.0467296,2019-07-04,0.0,0.0,0.0


### Standardize PCP

In [216]:
# Standardize each precipitation column with its stored mean/std pair
# (presumably computed earlier on the training data; confirm where they are defined).
test_comb.PCP_SUM = @. (test_comb.PCP_SUM - mean_pcpsum) / std_pcpsum;
test_comb.PCP_MAX = @. (test_comb.PCP_MAX - mean_pcpmax) / std_pcpmax;
test_comb.PCP_MAX3 = @. (test_comb.PCP_MAX3 - mean_pcpmax3) / std_pcpmax3;

In [217]:
# Show the first 20 rows, in original order, after standardizing the PCP columns.
first(test_comb, 20)

Unnamed: 0_level_0,ID_OUVRAGE,TP_LAT,TP_LNG,TP_Z,DATE,PCP_SUM,PCP_MAX
Unnamed: 0_level_1,String,Float64,Float64,Float64,Date,Float64,Float64
1,3260-01D,1.29293,0.531312,0.0790333,2019-05-02,-0.0469042,-0.0325426
2,3260-01D,1.29293,0.531312,0.0790333,2019-05-09,0.808886,0.982661
3,3260-01D,1.29293,0.531312,0.0790333,2019-05-10,4.82974,1.6287
4,3260-01D,1.29293,0.531312,0.0790333,2019-05-15,-0.37292,-0.34018
5,3260-01D,1.29293,0.531312,0.0790333,2019-05-20,0.224775,0.828842
6,3260-01D,1.29293,0.531312,0.0790333,2019-05-23,1.97711,1.53641
7,3260-01D,1.29293,0.531312,0.0790333,2019-05-24,-0.223496,-0.124834
8,3260-01D,1.29293,0.531312,0.0790333,2019-05-26,-0.359336,-0.309416
9,3260-01D,1.29293,0.531312,0.0790333,2019-05-30,-0.305,-0.186361
10,3350-07D,0.0802391,-0.325532,0.168267,2019-05-01,0.238359,0.244331


#### Split dates into month and day

In [218]:
# Derive numeric MONTH and DAY columns from each row's DATE.
test_comb.MONTH = map(month, test_comb.DATE);
test_comb.DAY = map(day, test_comb.DATE);

# Preview 5 rows of the date columns to check the split.
first(shuffleDf(test_comb[!, [:DATE, :MONTH, :DAY]]), 5)

Unnamed: 0_level_0,DATE,MONTH,DAY
Unnamed: 0_level_1,Date,Int64,Int64
1,2019-09-13,9,13
2,2019-05-02,5,2
3,2019-08-06,8,6
4,2019-09-26,9,26
5,2019-06-30,6,30


#### Standardize months and days

In [219]:
# Standardize MONTH and DAY with their stored mean/std pairs
# (presumably computed earlier on the training data; confirm where they are defined).
test_comb.MONTH = @. (test_comb.MONTH - meanmonth) / stdmonth;
test_comb.DAY = @. (test_comb.DAY - meanday) / stdday;

In [220]:
# Preview 5 rows of the identifier plus the standardized feature columns.
first(shuffleDf(test_comb[!, [:ID_OUVRAGE, :TP_LAT, :TP_LNG, :TP_Z, :MONTH, :DAY, :PCP_SUM, :PCP_MAX, :PCP_MAX3]]), 5)

Unnamed: 0_level_0,ID_OUVRAGE,TP_LAT,TP_LNG,TP_Z,MONTH,DAY,PCP_SUM,PCP_MAX
Unnamed: 0_level_1,String,Float64,Float64,Float64,Float64,Float64,Float64,Float64
1,4240-01D,1.28137,1.24035,-1.19178,-0.875331,-0.88625,-0.400087,-0.401707
2,4240-01D,1.28137,1.24035,-1.19178,-1.45983,-0.999235,-0.400087,-0.401707
3,4380-01D,-0.828933,0.658409,-0.0467296,-0.875331,1.37345,-0.400087,-0.401707
4,4350-01D,-0.465715,0.725081,-0.0467296,0.293667,0.356584,-0.400087,-0.401707
5,3260-01D,1.29293,0.531312,0.0790333,0.878166,-1.2252,-0.400087,-0.401707


## Predict

In [370]:
# Project helper defined elsewhere; presumably fits naive Bayes from
# train_set/val_set on the `names_ft` features and predicts labels for
# `test_comb`. Confirm against its definition.
test_labels = trainOnValidationAndPredictNaiveBayes(train_set,val_set, test_comb, names_ft);
print(test_labels)

The best model is of type Uniform with a BIC of -152541.51364510704
The best model is of type Uniform with a BIC of -5436.80430390067
The best model is of type Uniform with a BIC of -159558.3867488944
The best model is of type Normal with a BIC of -4467.086529136986
The best model is of type Normal with a BIC of -175192.39090162498
The best model is of type Normal with a BIC of -5572.954294527319
The best model is of type Uniform with a BIC of -132033.1741631602
The best model is of type Uniform with a BIC of -4705.856867383811
The best model is of type Uniform with a BIC of -150285.9631051927
The best model is of type Uniform with a BIC of -5356.413159286683
The best model is of type Normal with a BIC of -158350.57058899855
The best model is of type Normal with a BIC of -9306.943331354872
The best model is of type Normal with a BIC of -161360.19792138768
The best model is of type Laplace with a BIC of -8822.699148219668
The best model is of type Normal with a BIC of -159480.0137034237

## Generate submission

In [371]:
# Submission key: "<ID_OUVRAGE>_<DATE>" for every test row.
ID = string.(test_comb[:, :ID_OUVRAGE], "_", test_comb[:, :DATE])
# Pair each key with its predicted label and write the submission file.
sampleSubmission = DataFrame(ID = ID, Surverse = test_labels)
CSV.write("submissions/mgh-submission-2.csv", sampleSubmission)

"submissions/mgh-submission-2.csv"

In [372]:
# Reload the submission written above and a second submission file, then
# compare their Surverse columns with compareArr (helper defined elsewhere).
# NOTE(review): the reference is another submission file, not ground truth, so
# the printed percentage looks like agreement between the two files. Confirm.
mine = CSV.read("submissions/mgh-submission-2.csv");
mc =CSV.read("submissions/mc-submission-10.csv");
compareArr(mine[:, :Surverse], mc[:, :Surverse], true)

283
283
Le modéle est précis à 89.04593639575971 %


89.04593639575971