# Forêt aléatoire - Projet final A19

#### Importer les librairies

In [43]:
using CSV, DataFrames, DecisionTree, Statistics, Dates, Gadfly, Random

### Importer les données

#### Importer le fichier contenant les labels

In [17]:
labels = CSV.read("data/surverses.csv",missingstring="-99999")
first(labels, 10)

Unnamed: 0_level_0,NO_OUVRAGE,DATE,SURVERSE,RAISON
Unnamed: 0_level_1,String,Date,Int64⍰,String⍰
1,0642-01D,2013-05-01,0,missing
2,0642-01D,2013-05-02,0,missing
3,0642-01D,2013-05-03,0,missing
4,0642-01D,2013-05-04,0,missing
5,0642-01D,2013-05-05,0,missing
6,0642-01D,2013-05-06,0,missing
7,0642-01D,2013-05-07,0,missing
8,0642-01D,2013-05-08,0,missing
9,0642-01D,2013-05-09,0,missing
10,0642-01D,2013-05-10,0,missing


### Filtrage des labels

#### Filtrer les mois (Mai à Octobre)

In [19]:
labels = filter(row -> month(row.DATE) > 4, labels)
labels = filter(row -> month(row.DATE) < 11, labels)
first(labels, 10)

Unnamed: 0_level_0,NO_OUVRAGE,DATE,SURVERSE,RAISON
Unnamed: 0_level_1,String,Date,Int64⍰,String⍰
1,0642-01D,2013-05-01,0,missing
2,0642-01D,2013-05-02,0,missing
3,0642-01D,2013-05-03,0,missing
4,0642-01D,2013-05-04,0,missing
5,0642-01D,2013-05-05,0,missing
6,0642-01D,2013-05-06,0,missing
7,0642-01D,2013-05-07,0,missing
8,0642-01D,2013-05-08,0,missing
9,0642-01D,2013-05-09,0,missing
10,0642-01D,2013-05-10,0,missing


#### Garder seulement les surverses dues à la pluie

In [20]:
raison = coalesce.(labels[:,:RAISON],"Inconnue")
labels[!,:RAISON] = raison

labels = filter(row -> row.RAISON ∈ ["P","Inconnue","TS"], labels)
select!(labels, [:NO_OUVRAGE, :DATE, :SURVERSE])
first(labels,10)

Unnamed: 0_level_0,NO_OUVRAGE,DATE,SURVERSE
Unnamed: 0_level_1,String,Date,Int64⍰
1,0642-01D,2013-05-01,0
2,0642-01D,2013-05-02,0
3,0642-01D,2013-05-03,0
4,0642-01D,2013-05-04,0
5,0642-01D,2013-05-05,0
6,0642-01D,2013-05-06,0
7,0642-01D,2013-05-07,0
8,0642-01D,2013-05-08,0
9,0642-01D,2013-05-09,0
10,0642-01D,2013-05-10,0


In [32]:
labels = dropmissing(labels, disallowmissing=true)
rename!(labels, :NO_OUVRAGE => :ID_OUVRAGE)
first(labels,10)

Unnamed: 0_level_0,ID_OUVRAGE,DATE,SURVERSE
Unnamed: 0_level_1,String,Date,Int64
1,0642-01D,2013-05-01,0
2,0642-01D,2013-05-02,0
3,0642-01D,2013-05-03,0
4,0642-01D,2013-05-04,0
5,0642-01D,2013-05-05,0
6,0642-01D,2013-05-06,0
7,0642-01D,2013-05-07,0
8,0642-01D,2013-05-08,0
9,0642-01D,2013-05-09,0
10,0642-01D,2013-05-10,0


In [33]:
nrow(labels)

161098

### Load features

In [28]:
features = CSV.read("data/ouvrages-surverses.csv")
colnames = ["N_Env", "ID_SOMA", "ID_OUVRAGE", "NOM", "SOMA_SEC", "REGION", "TP_X", "TP_Y", "TP_Z", "TP_LAT", "TP_LNG", "EMI_X", "EMI_Y", "EMI_LNG", "EMI_LAT"]
names!(features, Symbol.(colnames))
select!(features, [:ID_OUVRAGE, :TP_LAT, :TP_LNG])
first(features, 10)

Unnamed: 0_level_0,ID_OUVRAGE,TP_LAT,TP_LNG
Unnamed: 0_level_1,String,Float64,Float64
1,0642-01D,45.6727,-73.5262
2,0672-01D,45.6823,-73.531
3,0672-02D,45.6939,-73.5214
4,0672-03D,45.6732,-73.5402
5,0801-01D,45.519,-73.5275
6,0801-02D,45.5174,-73.5281
7,0801-03D,45.5081,-73.5273
8,0801-04D,45.5029,-73.5238
9,0801-05D,45.5173,-73.5312
10,0801-06D,45.5187,-73.533


In [56]:
comb = join(features, labels, on = :ID_OUVRAGE)
first(comb, 10)

Unnamed: 0_level_0,ID_OUVRAGE,TP_LAT,TP_LNG,DATE,SURVERSE
Unnamed: 0_level_1,String,Float64,Float64,Date,Int64
1,0642-01D,45.6727,-73.5262,2013-05-01,0
2,0642-01D,45.6727,-73.5262,2013-05-02,0
3,0642-01D,45.6727,-73.5262,2013-05-03,0
4,0642-01D,45.6727,-73.5262,2013-05-04,0
5,0642-01D,45.6727,-73.5262,2013-05-05,0
6,0642-01D,45.6727,-73.5262,2013-05-06,0
7,0642-01D,45.6727,-73.5262,2013-05-07,0
8,0642-01D,45.6727,-73.5262,2013-05-08,0
9,0642-01D,45.6727,-73.5262,2013-05-09,0
10,0642-01D,45.6727,-73.5262,2013-05-10,0


#### Test avec un random sample

In [55]:
sample_test = comb[shuffle(1:size(comb, 1)),:]
sample_test = sample_test[!, [:ID_OUVRAGE, :DATE, :TP_LAT, :TP_LNG, :SURVERSE]]
first(sample_test, 20)

Unnamed: 0_level_0,ID_OUVRAGE,DATE,TP_LAT,TP_LNG,SURVERSE
Unnamed: 0_level_1,String,Date,Float64,Float64,Int64
1,4400-01D,2013-05-02,45.4586,-73.5621,0
2,4610-06D,2017-07-27,45.4305,-73.8563,0
3,3490-02D,2014-07-18,45.5054,-73.8061,0
4,4280-02D,2015-09-18,45.5986,-73.5102,0
5,4790-01D,2018-05-05,45.4891,-73.5419,0
6,3350-04D,2017-10-10,45.5761,-73.6603,0
7,0642-01D,2018-09-16,45.6727,-73.5262,0
8,3762-01D,2013-08-25,45.5764,-73.6619,0
9,4790-01D,2015-09-09,45.4891,-73.5419,0
10,4710-01D,2015-06-23,45.6976,-73.4968,0


#### Création des features et labels pour decision tree

In [110]:
function dateToDay(dt)::Float64 
    return year(dt) * 365 + month(dt) * 30 + day(dt);
end;    

In [115]:
dates = comb[:, [:DATE]];
# first(dates, 5);
n = nrow(dates)
days = zeros(n)
for i in 1:n
    days[i] = dateToDay(dates[i, 1])
end

In [117]:
lbl = comb[:, :SURVERSE];
ftr = convert(Matrix{Float64},comb[:, [:TP_LAT, :TP_LNG]]);
ftr = hcat(days, ftr)
ftr[10:30, :]

21×3 Array{Float64,2}:
 734905.0  45.6727  -73.5262
 734906.0  45.6727  -73.5262
 734907.0  45.6727  -73.5262
 734908.0  45.6727  -73.5262
 734909.0  45.6727  -73.5262
 734910.0  45.6727  -73.5262
 734911.0  45.6727  -73.5262
 734912.0  45.6727  -73.5262
 734913.0  45.6727  -73.5262
 734914.0  45.6727  -73.5262
 734915.0  45.6727  -73.5262
 734916.0  45.6727  -73.5262
 734917.0  45.6727  -73.5262
 734918.0  45.6727  -73.5262
 734919.0  45.6727  -73.5262
 734920.0  45.6727  -73.5262
 734921.0  45.6727  -73.5262
 734922.0  45.6727  -73.5262
 734923.0  45.6727  -73.5262
 734924.0  45.6727  -73.5262
 734925.0  45.6727  -73.5262

#### Test Decision Tree

In [118]:
model = build_tree(lbl,ftr)

Decision Tree
Leaves: 6398
Depth:  57

In [126]:
modelp = prune_tree(model,.80)

Decision Tree
Leaves: 5419
Depth:  57

In [121]:
print_tree(modelp,4)

Feature 3, Threshold -73.74178122771505
L-> Feature 2, Threshold 45.4737306942081
    L-> Feature 1, Threshold 736095.5
        L-> Feature 2, Threshold 45.4687971888686
            L-> 0 : 11706/11707
            R-> 
        R-> Feature 1, Threshold 736097.5
            L-> 
            R-> 
    R-> Feature 3, Threshold -73.8706960818827
        L-> Feature 1, Threshold 736796.5
            L-> 
            R-> 
        R-> Feature 3, Threshold -73.8338415477965
            L-> 
            R-> 
R-> Feature 2, Threshold 45.5734029500813
    L-> Feature 2, Threshold 45.5165318661206
        L-> Feature 2, Threshold 45.417339872290555
            L-> 0 : 2208/2208
            R-> 
        R-> Feature 2, Threshold 45.51732069240945
            L-> 
            R-> 
    R-> Feature 2, Threshold 45.69829916479875
        L-> Feature 2, Threshold 45.6589170358007
            L-> 
            R-> 
        R-> Feature 1, Threshold 736101.5
            L-> 
            R-> 


### Prediction

#### Load data

In [130]:
test = CSV.read("data/test.csv")
rename!(test, :NO_OUVRAGE => :ID_OUVRAGE)
first(test, 10)

Unnamed: 0_level_0,ID_OUVRAGE,DATE
Unnamed: 0_level_1,String,Date
1,3260-01D,2019-05-02
2,3260-01D,2019-05-09
3,3260-01D,2019-05-10
4,3260-01D,2019-05-15
5,3260-01D,2019-05-20
6,3260-01D,2019-05-23
7,3260-01D,2019-05-24
8,3260-01D,2019-05-26
9,3260-01D,2019-05-30
10,3350-07D,2019-05-01


#### Augment with lat long

In [142]:
test_comb = join(test, features, on = :ID_OUVRAGE)

first(test_comb, 10)

Unnamed: 0_level_0,ID_OUVRAGE,DATE,TP_LAT,TP_LNG
Unnamed: 0_level_1,String,Date,Float64,Float64
1,3260-01D,2019-05-02,45.6507,-73.5803
2,3260-01D,2019-05-09,45.6507,-73.5803
3,3260-01D,2019-05-10,45.6507,-73.5803
4,3260-01D,2019-05-15,45.6507,-73.5803
5,3260-01D,2019-05-20,45.6507,-73.5803
6,3260-01D,2019-05-23,45.6507,-73.5803
7,3260-01D,2019-05-24,45.6507,-73.5803
8,3260-01D,2019-05-26,45.6507,-73.5803
9,3260-01D,2019-05-30,45.6507,-73.5803
10,3350-07D,2019-05-01,45.5461,-73.6921


#### Get test_comb as array for prediction

In [154]:
dates_t = test_comb[:, [:DATE]];
first(dates_t, 5)
nt = nrow(dates_t)
days_t = zeros(nt)
for i in 1:nt
    days_t[i] = dateToDay(dates_t[i, 1])
end

days_t[1:20]

20-element Array{Float64,1}:
 737087.0
 737094.0
 737095.0
 737100.0
 737105.0
 737108.0
 737109.0
 737111.0
 737115.0
 737086.0
 737087.0
 737093.0
 737094.0
 737095.0
 737096.0
 737098.0
 737099.0
 737103.0
 737104.0
 737105.0

In [158]:
ftr_t = convert(Matrix{Float64},test_comb[:, [:TP_LAT, :TP_LNG]]);
ftr_t = hcat(days_t, ftr_t)
size(ftr_t)

(283, 3)

In [160]:
res_t = apply_tree(modelp, ftr_t)

283-element Array{Int64,1}:
 0
 0
 0
 0
 0
 0
 0
 0
 0
 1
 1
 1
 1
 ⋮
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0

### Prepare for submission

In [162]:
ID = test_comb[:,:ID_OUVRAGE].*"_".*string.(test_comb[:,:DATE])
sampleSubmission = DataFrame(ID = ID, Surverse=res_t)
CSV.write("sampleSubmission.csv",sampleSubmission)

"sampleSubmission.csv"