# Forêt aléatoire - Projet final A19

#### Importer les librairies

In [1]:
using CSV, DataFrames, DecisionTree, Statistics, Dates, Gadfly, Random

### Importer les données

#### Importer le fichier contenant les labels

In [24]:
labels = CSV.read("data/surverses.csv",missingstring="-99999")
first(labels, 10)

Unnamed: 0_level_0,NO_OUVRAGE,DATE,SURVERSE,RAISON
Unnamed: 0_level_1,String,Date,Int64⍰,String⍰
1,0642-01D,2013-05-01,0,missing
2,0642-01D,2013-05-02,0,missing
3,0642-01D,2013-05-03,0,missing
4,0642-01D,2013-05-04,0,missing
5,0642-01D,2013-05-05,0,missing
6,0642-01D,2013-05-06,0,missing
7,0642-01D,2013-05-07,0,missing
8,0642-01D,2013-05-08,0,missing
9,0642-01D,2013-05-09,0,missing
10,0642-01D,2013-05-10,0,missing


### Filtrage des labels

#### Filtrer les mois (Mai à Octobre)

In [25]:
labels = filter(row -> month(row.DATE) > 4, labels)
labels = filter(row -> month(row.DATE) < 11, labels)
first(labels, 10)

Unnamed: 0_level_0,NO_OUVRAGE,DATE,SURVERSE,RAISON
Unnamed: 0_level_1,String,Date,Int64⍰,String⍰
1,0642-01D,2013-05-01,0,missing
2,0642-01D,2013-05-02,0,missing
3,0642-01D,2013-05-03,0,missing
4,0642-01D,2013-05-04,0,missing
5,0642-01D,2013-05-05,0,missing
6,0642-01D,2013-05-06,0,missing
7,0642-01D,2013-05-07,0,missing
8,0642-01D,2013-05-08,0,missing
9,0642-01D,2013-05-09,0,missing
10,0642-01D,2013-05-10,0,missing


#### Garder seulement les surverses dues à la pluie

In [26]:
raison = coalesce.(labels[:,:RAISON],"Inconnue")
labels[!,:RAISON] = raison

labels = filter(row -> row.RAISON ∈ ["P","Inconnue","TS"], labels)
select!(labels, [:NO_OUVRAGE, :DATE, :SURVERSE])
first(labels,10)

Unnamed: 0_level_0,NO_OUVRAGE,DATE,SURVERSE
Unnamed: 0_level_1,String,Date,Int64⍰
1,0642-01D,2013-05-01,0
2,0642-01D,2013-05-02,0
3,0642-01D,2013-05-03,0
4,0642-01D,2013-05-04,0
5,0642-01D,2013-05-05,0
6,0642-01D,2013-05-06,0
7,0642-01D,2013-05-07,0
8,0642-01D,2013-05-08,0
9,0642-01D,2013-05-09,0
10,0642-01D,2013-05-10,0


In [27]:
labels = dropmissing(labels, disallowmissing=true)
rename!(labels, :NO_OUVRAGE => :ID_OUVRAGE)
first(labels,10)

Unnamed: 0_level_0,ID_OUVRAGE,DATE,SURVERSE
Unnamed: 0_level_1,String,Date,Int64
1,0642-01D,2013-05-01,0
2,0642-01D,2013-05-02,0
3,0642-01D,2013-05-03,0
4,0642-01D,2013-05-04,0
5,0642-01D,2013-05-05,0
6,0642-01D,2013-05-06,0
7,0642-01D,2013-05-07,0
8,0642-01D,2013-05-08,0
9,0642-01D,2013-05-09,0
10,0642-01D,2013-05-10,0


### Load features

In [45]:
features = CSV.read("data/ouvrages-surverses.csv")
colnames = ["N_Env", "ID_SOMA", "ID_OUVRAGE", "NOM", "SOMA_SEC", "REGION", "TP_X", "TP_Y", "TP_Z", "TP_LAT", "TP_LNG", "EMI_X", "EMI_Y", "EMI_LNG", "EMI_LAT"]
names!(features, Symbol.(colnames))
select!(features, [:ID_OUVRAGE, :TP_LAT, :TP_LNG, :TP_Z])
first(features, 10)

Unnamed: 0_level_0,ID_OUVRAGE,TP_LAT,TP_LNG,TP_Z
Unnamed: 0_level_1,String,Float64,Float64,Float64⍰
1,0642-01D,45.6727,-73.5262,missing
2,0672-01D,45.6823,-73.531,9.12
3,0672-02D,45.6939,-73.5214,10.23
4,0672-03D,45.6732,-73.5402,9.99
5,0801-01D,45.519,-73.5275,13.71
6,0801-02D,45.5174,-73.5281,12.34
7,0801-03D,45.5081,-73.5273,11.94
8,0801-04D,45.5029,-73.5238,12.36
9,0801-05D,45.5173,-73.5312,12.7
10,0801-06D,45.5187,-73.533,18.66


In [86]:
comb = join(features, labels, on = :ID_OUVRAGE)
first(comb, 10)

Unnamed: 0_level_0,ID_OUVRAGE,TP_LAT,TP_LNG,TP_Z,DATE,SURVERSE
Unnamed: 0_level_1,String,Float64,Float64,Float64⍰,Date,Int64
1,0642-01D,45.6727,-73.5262,missing,2013-05-01,0
2,0642-01D,45.6727,-73.5262,missing,2013-05-02,0
3,0642-01D,45.6727,-73.5262,missing,2013-05-03,0
4,0642-01D,45.6727,-73.5262,missing,2013-05-04,0
5,0642-01D,45.6727,-73.5262,missing,2013-05-05,0
6,0642-01D,45.6727,-73.5262,missing,2013-05-06,0
7,0642-01D,45.6727,-73.5262,missing,2013-05-07,0
8,0642-01D,45.6727,-73.5262,missing,2013-05-08,0
9,0642-01D,45.6727,-73.5262,missing,2013-05-09,0
10,0642-01D,45.6727,-73.5262,missing,2013-05-10,0


#### Replace missing TP_Z with mean

In [101]:
comb.TP_Z = coalesce.(comb.TP_Z, mean(comb[completecases(comb), :].TP_Z))
first(comb, 10)

Unnamed: 0_level_0,ID_OUVRAGE,TP_LAT,TP_LNG,TP_Z,DATE,SURVERSE
Unnamed: 0_level_1,String,Float64,Float64,Float64,Date,Int64
1,0642-01D,45.6727,-73.5262,19.6862,2013-05-01,0
2,0642-01D,45.6727,-73.5262,19.6862,2013-05-02,0
3,0642-01D,45.6727,-73.5262,19.6862,2013-05-03,0
4,0642-01D,45.6727,-73.5262,19.6862,2013-05-04,0
5,0642-01D,45.6727,-73.5262,19.6862,2013-05-05,0
6,0642-01D,45.6727,-73.5262,19.6862,2013-05-06,0
7,0642-01D,45.6727,-73.5262,19.6862,2013-05-07,0
8,0642-01D,45.6727,-73.5262,19.6862,2013-05-08,0
9,0642-01D,45.6727,-73.5262,19.6862,2013-05-09,0
10,0642-01D,45.6727,-73.5262,19.6862,2013-05-10,0


#### Test avec un random sample

In [190]:
sample_test = comb[shuffle(1:size(comb, 1)),:]
sample_test = sample_test[!, [:ID_OUVRAGE, :DATE, :TP_LAT, :TP_LNG, :TP_Z, :SURVERSE]]
first(sample_test, 20)

Unnamed: 0_level_0,ID_OUVRAGE,DATE,TP_LAT,TP_LNG,TP_Z,SURVERSE
Unnamed: 0_level_1,String,Date,Float64,Float64,Float64,Int64
1,4310-02D,2018-06-22,45.6067,-73.5865,32.4,0
2,3275-02D,2014-05-25,45.637,-73.607,14.53,0
3,3250-02D,2014-06-06,45.6437,-73.5541,38.93,0
4,4240-01D,2015-08-01,45.6497,-73.4877,11.91,0
5,3280-01D,2015-06-04,45.634,-73.6059,23.86,0
6,4630-01D,2017-08-11,45.4915,-73.7707,26.59,0
7,4430-04D,2017-10-19,45.4197,-73.6487,31.54,0
8,3560-01D,2014-10-14,45.4655,-73.8933,25.32,0
9,4430-01D,2018-05-23,45.4324,-73.5885,19.36,0
10,4320-01D,2014-06-16,45.5488,-73.5307,13.68,0


### Distance to Stations

#### Stations coordinates

In [155]:
station_df = DataFrame(STATION = String[], LAT = Float64[], LNG = Float64[]);

push!(station_df, ["McTavish", 45.504742, -73.579167]);
push!(station_df, ["Bellevue", 45.427222, -73.929167]);
push!(station_df, ["Assomption", 45.809444, -73.434722]);
push!(station_df, ["Trudeau", 45.467778, -73.741667]);
push!(station_df, ["StHubert", 45.5175, -73.416944]);

station_df

Unnamed: 0_level_0,STATION,LAT,LNG
Unnamed: 0_level_1,String,Float64,Float64
1,McTavish,45.5047,-73.5792
2,Bellevue,45.4272,-73.9292
3,Assomption,45.8094,-73.4347
4,Trudeau,45.4678,-73.7417
5,StHubert,45.5175,-73.4169


#### Get the precipitation data

In [156]:
precipitation = CSV.read("data/precipitations.csv",missingstring="-99999")
rename!(precipitation, Symbol("St-Hubert")=>:StHubert)
precipitation = filter(row -> month(row.date) > 4, precipitation) 
precipitation = filter(row -> month(row.date) < 11, precipitation) 
first(precipitation, 5)

Unnamed: 0_level_0,date,heure,McTavish,Bellevue,Assomption,Trudeau,StHubert
Unnamed: 0_level_1,Date,Int64,Int64⍰,Int64⍰,Int64⍰,Int64⍰,Int64⍰
1,2013-05-01,0,0,0,0,0,missing
2,2013-05-01,1,0,0,0,0,missing
3,2013-05-01,2,0,0,0,0,missing
4,2013-05-01,3,0,0,0,0,missing
5,2013-05-01,4,0,0,0,0,missing


#### Remplacement des missing par 0 (Arbitraire)

In [165]:
precipitation[!,:McTavish] = coalesce.(precipitation[:,:McTavish], 0)
precipitation[!,:Bellevue] = coalesce.(precipitation[:,:Bellevue], 0)
precipitation[!,:Assomption] = coalesce.(precipitation[:,:Assomption], 0)
precipitation[!,:Trudeau] = coalesce.(precipitation[:,:Trudeau], 0)
precipitation[!,:StHubert] = coalesce.(precipitation[:,:StHubert], 0)

first(precipitation, 5)

Unnamed: 0_level_0,date,heure,McTavish,Bellevue,Assomption,Trudeau,StHubert
Unnamed: 0_level_1,Date,Int64,Int64,Int64,Int64,Int64,Int64
1,2013-05-01,0,0,0,0,0,0
2,2013-05-01,1,0,0,0,0,0
3,2013-05-01,2,0,0,0,0,0
4,2013-05-01,3,0,0,0,0,0
5,2013-05-01,4,0,0,0,0,0


#### Quantite journaliere des precipitations par station

In [167]:
pcp_sum = by(precipitation, :date,  McTavish = :McTavish=>sum, Bellevue = :Bellevue=>sum, 
   Assomption = :Assomption=>sum, Trudeau = :Trudeau=>sum, StHubert = :StHubert=>sum)
first(pcp_sum, 20)

Unnamed: 0_level_0,date,McTavish,Bellevue,Assomption,Trudeau,StHubert
Unnamed: 0_level_1,Date,Int64,Int64,Int64,Int64,Int64
1,2013-05-01,0,0,0,0,0
2,2013-05-02,0,0,0,0,0
3,2013-05-03,0,0,0,0,0
4,2013-05-04,0,0,0,0,0
5,2013-05-05,0,0,0,0,0
6,2013-05-06,0,0,0,0,0
7,2013-05-07,0,0,0,0,0
8,2013-05-08,0,0,0,0,0
9,2013-05-09,10,0,19,0,0
10,2013-05-10,0,4,20,0,0


#### Taux horaire max par station

In [171]:
pcp_max = by(precipitation, :date,  McTavish = :McTavish=>maximum, Bellevue = :Bellevue=>maximum, 
   Assomption = :Assomption=>maximum, Trudeau = :Trudeau=>maximum, StHubert = :StHubert=>maximum)
first(pcp_max,20)

Unnamed: 0_level_0,date,McTavish,Bellevue,Assomption,Trudeau,StHubert
Unnamed: 0_level_1,Date,Int64,Int64,Int64,Int64,Int64
1,2013-05-01,0,0,0,0,0
2,2013-05-02,0,0,0,0,0
3,2013-05-03,0,0,0,0,0
4,2013-05-04,0,0,0,0,0
5,2013-05-05,0,0,0,0,0
6,2013-05-06,0,0,0,0,0
7,2013-05-07,0,0,0,0,0
8,2013-05-08,0,0,0,0,0
9,2013-05-09,10,0,19,0,0
10,2013-05-10,0,4,20,0,0


#### Création des features et labels pour decision tree

In [56]:
function dateToDay(dt)::Float64 
    return year(dt) * 365 + month(dt) * 30 + day(dt);
end;    

In [103]:
dates = comb[:, [:DATE]];
n = nrow(dates)
days = zeros(n)
for i in 1:n
    days[i] = dateToDay(dates[i, 1])
end

In [191]:
lbl = comb[:, :SURVERSE];
ftr = convert(Matrix{Float64},comb[:, [:TP_LAT, :TP_LNG, :TP_Z]]);
ftr = hcat(days, ftr);

#### Test Decision Tree

In [105]:
model = build_tree(lbl,ftr)

Decision Tree
Leaves: 6118
Depth:  57

In [106]:
modelp = prune_tree(model,.80)

Decision Tree
Leaves: 5178
Depth:  57

In [107]:
print_tree(modelp,4)

Feature 3, Threshold -73.74178122771505
L-> Feature 2, Threshold 45.4737306942081
    L-> Feature 1, Threshold 736095.5
        L-> Feature 2, Threshold 45.4687971888686
            L-> 0 : 11706/11707
            R-> 
        R-> Feature 1, Threshold 736097.5
            L-> 
            R-> 
    R-> Feature 2, Threshold 45.48227864854675
        L-> Feature 1, Threshold 736796.5
            L-> 
            R-> 
        R-> Feature 3, Threshold -73.8338415477965
            L-> 
            R-> 
R-> Feature 2, Threshold 45.5734029500813
    L-> Feature 2, Threshold 45.5165318661206
        L-> Feature 4, Threshold 14.885
            L-> 
            R-> 
        R-> Feature 2, Threshold 45.51732069240945
            L-> 
            R-> 
    R-> Feature 4, Threshold 25.475
        L-> Feature 4, Threshold 19.84
            L-> 
            R-> 
        R-> Feature 1, Threshold 736804.5
            L-> 
            R-> 


### Prediction

#### Load data

In [108]:
test = CSV.read("data/test.csv")
rename!(test, :NO_OUVRAGE => :ID_OUVRAGE)
first(test, 10)

Unnamed: 0_level_0,ID_OUVRAGE,DATE
Unnamed: 0_level_1,String,Date
1,3260-01D,2019-05-02
2,3260-01D,2019-05-09
3,3260-01D,2019-05-10
4,3260-01D,2019-05-15
5,3260-01D,2019-05-20
6,3260-01D,2019-05-23
7,3260-01D,2019-05-24
8,3260-01D,2019-05-26
9,3260-01D,2019-05-30
10,3350-07D,2019-05-01


#### Augment with lat long z

In [116]:
test_comb = join(test, features, on = :ID_OUVRAGE)
test_comb.TP_Z = coalesce.(test_comb.TP_Z, mean(test_comb[completecases(test_comb), :].TP_Z))
first(test_comb, 10)

Unnamed: 0_level_0,ID_OUVRAGE,DATE,TP_LAT,TP_LNG,TP_Z
Unnamed: 0_level_1,String,Date,Float64,Float64,Float64
1,3260-01D,2019-05-02,45.6507,-73.5803,20.17
2,3260-01D,2019-05-09,45.6507,-73.5803,20.17
3,3260-01D,2019-05-10,45.6507,-73.5803,20.17
4,3260-01D,2019-05-15,45.6507,-73.5803,20.17
5,3260-01D,2019-05-20,45.6507,-73.5803,20.17
6,3260-01D,2019-05-23,45.6507,-73.5803,20.17
7,3260-01D,2019-05-24,45.6507,-73.5803,20.17
8,3260-01D,2019-05-26,45.6507,-73.5803,20.17
9,3260-01D,2019-05-30,45.6507,-73.5803,20.17
10,3350-07D,2019-05-01,45.5461,-73.6921,20.75


#### Get test_comb as array for prediction

In [117]:
dates_t = test_comb[:, [:DATE]];
first(dates_t, 5)
nt = nrow(dates_t)
days_t = zeros(nt)
for i in 1:nt
    days_t[i] = dateToDay(dates_t[i, 1])
end

days_t[1:20]

20-element Array{Float64,1}:
 737087.0
 737094.0
 737095.0
 737100.0
 737105.0
 737108.0
 737109.0
 737111.0
 737115.0
 737086.0
 737087.0
 737093.0
 737094.0
 737095.0
 737096.0
 737098.0
 737099.0
 737103.0
 737104.0
 737105.0

In [120]:
ftr_t = convert(Matrix{Float64},test_comb[:, [:TP_LAT, :TP_LNG, :TP_Z]]);
ftr_t = hcat(days_t, ftr_t);
size(ftr_t)

(283, 4)

#### Station la plus proche pour chaque ouvrage

In [186]:
ouvrages = unique(test_comb[!, [:ID_OUVRAGE]], :ID_OUVRAGE)

Unnamed: 0_level_0,ID_OUVRAGE
Unnamed: 0_level_1,String
1,3260-01D
2,3350-07D
3,4240-01D
4,4350-01D
5,4380-01D


#### Pour chaque ouvrage / date, trouver la station la plus proche (lat long) et obtenir sum precipitation et taux horaire

In [None]:
for i=1:size(test_comb)
    
end

### Forest

In [136]:
model_rf = build_forest(lbl,ftr, 3, 10, 0.5, 48)

Ensemble of Decision Trees
Trees:      10
Avg Leaves: 3626.0
Avg Depth:  47.4

### Submission

In [137]:
res_t = apply_forest(model_rf, ftr_t)

283-element Array{Int64,1}:
 0
 0
 0
 0
 0
 0
 0
 0
 0
 1
 1
 1
 1
 ⋮
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0

In [139]:
ID = test_comb[:,:ID_OUVRAGE].*"_".*string.(test_comb[:,:DATE])
sampleSubmission = DataFrame(ID = ID, Surverse=res_t)
CSV.write("submissions/mc-submission-3.csv",sampleSubmission)

"submissions/mc-submission-3.csv"