In [1]:
using CSV, DataFrames, GLM, Statistics, Dates, Gadfly, Random, MLBase, DecisionTree;
include("utils/precipitation.jl");

On garde les ouvrages d'intérêt.

In [11]:
# Load the overflow-structure table and replace its header with the names below.
ouvrages = CSV.read("data/ouvrages-surverses.csv");
colnames = ["N_Env", "ID_SOMA", "ID_OUVRAGE", "NOM", "SOMA_SEC", "REGION", "TP_X", "TP_Y", "TP_Z", "TP_LAT", "TP_LNG", "EMI_X", "EMI_Y", "EMI_LNG", "EMI_LAT"];
names!(ouvrages, map(Symbol, colnames));
# Keep only the structure id and its TP_* position columns
# (presumably latitude, longitude and elevation -- confirm against the data dictionary).
select!(ouvrages, [:ID_OUVRAGE, :TP_LAT, :TP_LNG, :TP_Z]);
# Impute missing TP_Z with the mean TP_Z taken over the rows that have no missing
# value in any of the four kept columns.
ouvrages.TP_Z = let complete = ouvrages[completecases(ouvrages), :]
    coalesce.(ouvrages.TP_Z, mean(complete.TP_Z))
end;

In [12]:
# Display the table dimensions as (rows, columns) before restricting the structures.
size(ouvrages)

(167, 4)

In [13]:
# The five overflow structures this notebook works on.
important_ouvrages = ["3260-01D", "3350-07D", "4240-01D", "4350-01D", "4380-01D"];
# Keep only the rows whose id is one of those structures.
ouvrages = ouvrages[in.(ouvrages.ID_OUVRAGE, Ref(important_ouvrages)), :];

In [14]:
# Display the first five rows of the filtered structure table.
first(ouvrages, 5)

Unnamed: 0_level_0,ID_OUVRAGE,TP_LAT,TP_LNG,TP_Z
Unnamed: 0_level_1,String,Float64,Float64,Float64
1,3260-01D,45.6507,-73.5803,20.17
2,3350-07D,45.5461,-73.6921,20.75
3,4240-01D,45.6497,-73.4877,11.91
4,4350-01D,45.4991,-73.555,19.3526
5,4380-01D,45.4677,-73.5637,19.3526


In [246]:
# Load the overflow records; the sentinel "-99999" is read as `missing`.
surverses = CSV.read("data/surverses.csv", missingstring="-99999");
# Keep only records dated May through October (months 5 to 10).
surverses = filter(row -> 4 < month(row.DATE) < 11, surverses);
# A missing overflow reason is treated as "Inconnue".
surverses.RAISON = coalesce.(surverses.RAISON, "Inconnue");

# Keep only the reasons "P", "Inconnue" and "TS".
surverses = filter(row -> in(row.RAISON, ["P","Inconnue","TS"]), surverses);
# Reduce to id / date / label and align the id column name with `ouvrages`.
rename!(select!(surverses, [:NO_OUVRAGE, :DATE, :SURVERSE]), :NO_OUVRAGE => :ID_OUVRAGE);

In [247]:
# Restrict the overflow records to the structures of interest, then drop every
# row that still contains a missing value.
surverses = dropmissing!(filter(row -> in(row.ID_OUVRAGE, important_ouvrages), surverses));

In [248]:
# Summary statistics of the SURVERSE label over all kept structures.
describe(surverses[!, :SURVERSE])

Summary Stats:
Length:         5129
Missing Count:  0
Mean:           0.085202
Minimum:        0.000000
1st Quartile:   0.000000
Median:         0.000000
3rd Quartile:   0.000000
Maximum:        1.000000
Type:           Int64


In [249]:
# Same summary, restricted to the third structure of `important_ouvrages`.
curr = surverses[surverses.ID_OUVRAGE .== important_ouvrages[3], :];
describe(curr.SURVERSE)

Summary Stats:
Length:         1100
Missing Count:  0
Mean:           0.062727
Minimum:        0.000000
1st Quartile:   0.000000
Median:         0.000000
3rd Quartile:   0.000000
Maximum:        1.000000
Type:           Int64


La moyenne correspond ici au taux de surverses : il y a beaucoup plus d'observations sans surverse que d'observations avec surverse.

### Beaucoup plus de 0 que de 1 -> problème de déséquilibre des classes (class imbalance)
On le résoudra plus tard en sur-échantillonnant les 1 et en sous-échantillonnant les 0.

## Précipitations

In [293]:
# Load the hourly precipitation table ("-99999" is read as `missing`) and give
# the "St-Hubert" column a name that is a valid identifier.
precipitations = rename!(
    CSV.read("data/precipitations.csv", missingstring="-99999"),
    Symbol("St-Hubert") => :StHubert,
);

# Keep only the readings dated May through October (months 5 to 10).
precipitations = filter(row -> 4 < month(row.date) < 11, precipitations);
names(precipitations)

7-element Array{Symbol,1}:
 :date      
 :heure     
 :McTavish  
 :Bellevue  
 :Assomption
 :Trudeau   
 :StHubert  

In [294]:
# Summary statistics of the StHubert readings before missing values are filled in.
describe(precipitations[!, :StHubert])

Summary Stats:
Length:         30912
Missing Count:  5206
Mean:           1.223683
Minimum:        0.000000
1st Quartile:   0.000000
Median:         0.000000
3rd Quartile:   0.000000
Maximum:        307.000000
Type:           Union{Missing, Int64}


In [296]:
# Per-day aggregate of each station's readings, computed with `mean_wo_missing`
# (not defined in this file; presumably the mean with missing values skipped,
# from utils/precipitation.jl -- confirm).
precipitation_by_day = by(precipitations, :date,
                            McTavish = :McTavish=>mean_wo_missing,
                            Bellevue = :Bellevue=>mean_wo_missing,
                            Assomption = :Assomption=>mean_wo_missing,
                            Trudeau = :Trudeau=>mean_wo_missing,
                            StHubert = :StHubert=>mean_wo_missing)

# Replace every missing reading with that station's daily value for the same date.
# The date -> row lookup is built once; re-filtering `precipitation_by_day` for each
# missing cell repeated a full table scan every time.
# `by` groups on :date, so each date maps to exactly one row.
let daily_row = Dict(d => k for (k, d) in enumerate(precipitation_by_day.date))
    for station in (:McTavish, :Bellevue, :Assomption, :Trudeau, :StHubert)
        hourly = precipitations[!, station]       # the stored column itself, filled in place
        daily = precipitation_by_day[!, station]
        for i in eachindex(hourly)
            if ismissing(hourly[i])
                hourly[i] = daily[daily_row[precipitations[i, :date]]]
            end
        end
    end
end

In [297]:
# Same summary after the fill-in step, to check that no missing value remains.
describe(precipitations[!, :StHubert])

Summary Stats:
Length:         30912
Missing Count:  0
Mean:           1.018957
Minimum:        0.000000
1st Quartile:   0.000000
Median:         0.000000
3rd Quartile:   0.000000
Maximum:        307.000000
Type:           Union{Missing, Int64}


In [298]:
# Preview ten rows after passing the table through `shuffleDf` (helper not defined
# in this file; presumably returns the rows in random order -- confirm).
first(shuffleDf(precipitations), 10)

Unnamed: 0_level_0,date,heure,McTavish,Bellevue,Assomption,Trudeau,StHubert
Unnamed: 0_level_1,Date,Int64,Int64⍰,Int64⍰,Int64⍰,Int64⍰,Int64⍰
1,2016-05-05,16,0,0,0,0,0
2,2014-05-04,21,0,12,20,0,0
3,2014-05-03,9,0,0,0,0,0
4,2015-06-03,19,0,0,0,0,0
5,2014-08-06,16,0,0,0,0,0
6,2016-05-13,20,0,0,0,0,0
7,2019-10-05,4,0,0,0,0,0
8,2013-10-08,5,0,0,0,0,0
9,2018-06-04,23,0,0,7,0,0
10,2014-07-03,15,0,0,0,0,0


In [299]:
# Daily total per station: group the readings by date and sum each station column.
# The result has one row per date and one column per station.
pcp_sum = by(precipitations, :date,  
            McTavish = :McTavish=>sum, 
            Bellevue = :Bellevue=>sum,
            Assomption = :Assomption=>sum, 
            Trudeau = :Trudeau=>sum, 
            StHubert = :StHubert=>sum);

In [300]:
# Preview five rows of the daily sums via `shuffleDf` (helper defined outside this file).
first(shuffleDf(pcp_sum), 5)

Unnamed: 0_level_0,date,McTavish,Bellevue,Assomption,Trudeau,StHubert
Unnamed: 0_level_1,Date,Int64,Int64,Int64,Int64,Int64
1,2014-08-16,182,134,132,118,77
2,2017-05-24,0,0,0,0,0
3,2015-06-02,48,41,22,13,13
4,2013-09-23,0,0,0,0,0
5,2016-05-23,0,0,0,0,0


In [301]:
# Daily maximum per station: group the readings by date and take the largest
# single reading of each station column for that date.
pcp_max = by(precipitations, :date,  
            McTavish = :McTavish=>maximum,
            Bellevue = :Bellevue=>maximum, 
            Assomption = :Assomption=>maximum,
            Trudeau = :Trudeau=>maximum,
            StHubert = :StHubert=>maximum);

In [302]:
# Preview five rows of the daily maxima via `shuffleDf` (helper defined outside this file).
first(shuffleDf(pcp_max), 5)

Unnamed: 0_level_0,date,McTavish,Bellevue,Assomption,Trudeau,StHubert
Unnamed: 0_level_1,Date,Int64,Int64,Int64,Int64,Int64
1,2019-08-05,0,0,0,0,0
2,2014-09-29,0,0,0,0,0
3,2017-08-12,45,25,110,31,25
4,2014-08-03,0,0,0,0,0
5,2015-07-06,0,0,0,0,0


In [303]:
# Per-date aggregate of each station column using `maximum3`.
# NOTE(review): `maximum3` is not defined in this file; a later comment describes this
# feature as the maximum precipitation over 3 h -- confirm in utils/precipitation.jl.
pcp_max3 = by(precipitations, :date,
                McTavish = :McTavish=>maximum3,
                Bellevue = :Bellevue=>maximum3,
                Assomption = :Assomption=>maximum3,
                Trudeau = :Trudeau=>maximum3,
                StHubert = :StHubert=>maximum3);

In [304]:
# Preview five rows of the `maximum3` aggregates via `shuffleDf` (helper defined outside this file).
first(shuffleDf(pcp_max3), 5)

Unnamed: 0_level_0,date,McTavish,Bellevue,Assomption,Trudeau,StHubert
Unnamed: 0_level_1,Date,Int64,Int64,Int64,Int64,Int64
1,2013-08-04,12,0,11,0,0
2,2016-10-14,0,0,0,2,0
3,2014-06-01,0,0,0,0,0
4,2016-07-09,103,66,190,71,78
5,2016-09-24,0,0,0,0,0


In [305]:
# Coordinates of the five weather stations, one row per station; the names match
# the station columns of the precipitation tables.
station_df = DataFrame(
    STATION = ["McTavish", "Bellevue", "Assomption", "Trudeau", "StHubert"],
    LAT = [45.504742, 45.427222, 45.809444, 45.467778, 45.5175],
    LNG = [-73.579167, -73.929167, -73.434722, -73.741667, -73.416944],
);

station_df

Unnamed: 0_level_0,STATION,LAT,LNG
Unnamed: 0_level_1,String,Float64,Float64
1,McTavish,45.5047,-73.5792
2,Bellevue,45.4272,-73.9292
3,Assomption,45.8094,-73.4347
4,Trudeau,45.4678,-73.7417
5,StHubert,45.5175,-73.4169


### On ajoute les colonnes de précipitations

In [306]:
stations_names = ["McTavish", "Bellevue", "Assomption", "Trudeau", "StHubert"];
# `train_data` is bound to the same DataFrame object as `surverses` (no copy is made),
# so the columns added below are visible through both names.
train_data = surverses;
# Create zero-filled <station>_sum, <station>_max and <station>_max3 columns, in that
# order for each station. The "_proxy" column is intentionally not created.
for station in stations_names, suffix in ("_sum", "_max", "_max3")
    train_data[!, Symbol(station, suffix)] = zeros(size(train_data, 1));
end

In [309]:
# Disabled preview of the per-station "_proxy" columns (those columns are not created):
# proxies = [:ID_OUVRAGE, :McTavish_proxy, :Bellevue_proxy, :Assomption_proxy, :Trudeau_proxy, :StHubert_proxy]; 
# first(shuffleDf(train_data[!, proxies]), 5)
# Preview five rows of the training table via `shuffleDf` (helper defined outside
# this file; presumably returns the rows in random order -- confirm).
first(shuffleDf(train_data), 5)

Unnamed: 0_level_0,ID_OUVRAGE,DATE,SURVERSE,McTavish_sum,McTavish_max,McTavish_max3,Bellevue_sum
Unnamed: 0_level_1,String,Date,Int64,Float64,Float64,Float64,Float64
1,3260-01D,2014-06-08,0,0.0,0.0,0.0,0.0
2,4240-01D,2015-08-22,0,0.0,0.0,0.0,0.0
3,4240-01D,2016-10-27,0,0.0,0.0,0.0,0.0
4,4350-01D,2017-05-20,0,0.0,0.0,0.0,0.0
5,4350-01D,2018-08-27,0,0.0,0.0,0.0,0.0


In [310]:
# Summary statistics of the SURVERSE label in the training table.
describe(train_data[!, :SURVERSE])

Summary Stats:
Length:         5129
Missing Count:  0
Mean:           0.085202
Minimum:        0.000000
1st Quartile:   0.000000
Median:         0.000000
3rd Quartile:   0.000000
Maximum:        1.000000
Type:           Int64


On remplit les colonnes de précipitations de chaque observation.

In [311]:
# Fill the precipitation feature columns of `train_data`: for every row, copy the
# same-date value of each station from the daily tables into
# <station>_sum (daily total), <station>_max (largest single reading) and
# <station>_max3 (`maximum3` aggregate, labelled "maximum over 3 h" in the original notes).
#
# Each daily table's dates are indexed once. The previous loop rebuilt a boolean mask
# over the whole table for every row, station and feature, and it also filtered
# `ouvrages` on every row for a value that was never used.
# The station-distance "_proxy" feature stays disabled.
let daily_tables = (("_sum", pcp_sum), ("_max", pcp_max), ("_max3", pcp_max3))
    for (suffix, daily) in daily_tables
        # The tables come from `by(..., :date, ...)`, so each date identifies one row.
        row_of = Dict(d => k for (k, d) in enumerate(daily.date))
        for i in 1:size(train_data, 1)
            # Throws a KeyError naming the date if it has no precipitation record.
            k = row_of[train_data[i, :DATE]]
            for station in station_df.STATION
                train_data[i, Symbol(station, suffix)] = daily[k, Symbol(station)]
            end
        end
    end
end

In [312]:
# Spot-check the filled McTavish features: ten rows of the selected columns, passed
# through `shuffleDf` (helper defined outside this file).
cols = [:ID_OUVRAGE, :DATE, :McTavish_sum, :McTavish_max, :McTavish_max3];
first(shuffleDf(train_data[!, cols]), 10)

Unnamed: 0_level_0,ID_OUVRAGE,DATE,McTavish_sum,McTavish_max,McTavish_max3
Unnamed: 0_level_1,String,Date,Float64,Float64,Float64
1,4240-01D,2016-07-04,0.0,0.0,0.0
2,4240-01D,2018-05-15,8.0,4.0,6.0
3,4350-01D,2016-09-25,0.0,0.0,0.0
4,3260-01D,2018-09-06,0.0,0.0,0.0
5,4240-01D,2016-10-17,3.0,3.0,3.0
6,3260-01D,2018-06-17,0.0,0.0,0.0
7,3350-07D,2016-07-23,183.0,128.0,157.0
8,4350-01D,2017-10-09,324.0,82.0,204.0
9,4240-01D,2014-05-05,11.0,11.0,11.0
10,4350-01D,2018-06-02,0.0,0.0,0.0


### Save dataframes in files per ouvrage

In [333]:
# Training rows of structure 3260-01D, without the now-constant id column,
# written to their own CSV file (the cell evaluates to the output path).
ouvrage_3260 = select!(
    filter(row -> row.ID_OUVRAGE == "3260-01D", train_data),
    Not(:ID_OUVRAGE),
);
CSV.write("data/parsed/ouvrage_3260.csv", ouvrage_3260)

"data/parsed/ouvrage_3260.csv"

In [334]:
# Training rows of structure 3350-07D, without the now-constant id column,
# written to their own CSV file (the cell evaluates to the output path).
ouvrage_3350 = select!(
    filter(row -> row.ID_OUVRAGE == "3350-07D", train_data),
    Not(:ID_OUVRAGE),
);
CSV.write("data/parsed/ouvrage_3350.csv", ouvrage_3350)

"data/parsed/ouvrage_3350.csv"

In [335]:
# Training rows of structure 4240-01D, without the now-constant id column,
# written to their own CSV file (the cell evaluates to the output path).
ouvrage_4240 = select!(
    filter(row -> row.ID_OUVRAGE == "4240-01D", train_data),
    Not(:ID_OUVRAGE),
);
CSV.write("data/parsed/ouvrage_4240.csv", ouvrage_4240)

"data/parsed/ouvrage_4240.csv"

In [336]:
# Training rows of structure 4350-01D, without the now-constant id column,
# written to their own CSV file (the cell evaluates to the output path).
ouvrage_4350 = select!(
    filter(row -> row.ID_OUVRAGE == "4350-01D", train_data),
    Not(:ID_OUVRAGE),
);
CSV.write("data/parsed/ouvrage_4350.csv", ouvrage_4350)

"data/parsed/ouvrage_4350.csv"

In [337]:
# Training rows of structure 4380-01D, without the now-constant id column,
# written to their own CSV file (the cell evaluates to the output path).
ouvrage_4380 = select!(
    filter(row -> row.ID_OUVRAGE == "4380-01D", train_data),
    Not(:ID_OUVRAGE),
);
CSV.write("data/parsed/ouvrage_4380.csv", ouvrage_4380)

"data/parsed/ouvrage_4380.csv"

### Tests

In [338]:
# Load the test set and rename its structure-id column to match the training table.
test_data = rename!(CSV.read("data/test.csv"), :NO_OUVRAGE => :ID_OUVRAGE);

In [339]:
# List the distinct structure ids present in the test set.
levels(test_data[:,:ID_OUVRAGE])

5-element Array{String,1}:
 "3260-01D"
 "3350-07D"
 "4240-01D"
 "4350-01D"
 "4380-01D"

In [340]:
# Create zero-filled <station>_sum, <station>_max and <station>_max3 columns on the
# test table, in that order for each station. The "_proxy" column is intentionally
# not created.
for station in stations_names, suffix in ("_sum", "_max", "_max3")
    test_data[!, Symbol(station, suffix)] = zeros(size(test_data, 1));
end

In [341]:
# Fill the precipitation feature columns of `test_data`: for every row, copy the
# same-date value of each station from the daily tables into
# <station>_sum (daily total), <station>_max (largest single reading) and
# <station>_max3 (`maximum3` aggregate, labelled "maximum over 3 h" in the original notes).
#
# Each daily table's dates are indexed once. The previous loop rebuilt a boolean mask
# over the whole table for every row, station and feature, and it also filtered
# `ouvrages` on every row for a value that was never used.
# The station-distance "_proxy" feature stays disabled.
let daily_tables = (("_sum", pcp_sum), ("_max", pcp_max), ("_max3", pcp_max3))
    for (suffix, daily) in daily_tables
        # The tables come from `by(..., :date, ...)`, so each date identifies one row.
        row_of = Dict(d => k for (k, d) in enumerate(daily.date))
        for i in 1:size(test_data, 1)
            # Throws a KeyError naming the date if it has no precipitation record.
            k = row_of[test_data[i, :DATE]]
            for station in station_df.STATION
                test_data[i, Symbol(station, suffix)] = daily[k, Symbol(station)]
            end
        end
    end
end

In [342]:
# Spot-check the filled McTavish features of the test table: ten rows of the selected
# columns, passed through `shuffleDf` (helper defined outside this file).
cols = [:ID_OUVRAGE, :DATE, :McTavish_sum, :McTavish_max, :McTavish_max3];
first(shuffleDf(test_data[!, cols]), 10)

Unnamed: 0_level_0,ID_OUVRAGE,DATE,McTavish_sum,McTavish_max,McTavish_max3
Unnamed: 0_level_1,String,Date,Float64,Float64,Float64
1,3350-07D,2019-08-19,0.0,0.0,0.0
2,4380-01D,2019-07-01,0.0,0.0,0.0
3,3350-07D,2019-08-28,158.0,41.0,92.0
4,4350-01D,2019-08-23,0.0,0.0,0.0
5,4350-01D,2019-06-25,153.0,65.0,121.0
6,4240-01D,2019-08-02,0.0,0.0,0.0
7,4240-01D,2019-05-24,13.0,9.0,11.0
8,3350-07D,2019-05-11,2.0,2.0,2.0
9,4350-01D,2019-07-17,0.0,0.0,0.0
10,3260-01D,2019-08-09,86.0,34.0,57.0


In [343]:
# Test rows of structure 3260-01D, without the now-constant id column,
# written to their own CSV file (the cell evaluates to the output path).
test_3260 = select!(
    filter(row -> row.ID_OUVRAGE == "3260-01D", test_data),
    Not(:ID_OUVRAGE),
);
CSV.write("data/parsed/test_3260.csv", test_3260)

"data/parsed/test_3260.csv"

In [344]:
# Test rows of structure 3350-07D, without the now-constant id column,
# written to their own CSV file (the cell evaluates to the output path).
test_3350 = select!(
    filter(row -> row.ID_OUVRAGE == "3350-07D", test_data),
    Not(:ID_OUVRAGE),
);
CSV.write("data/parsed/test_3350.csv", test_3350)

"data/parsed/test_3350.csv"

In [345]:
# Test rows of structure 4240-01D, without the now-constant id column,
# written to their own CSV file (the cell evaluates to the output path).
test_4240 = select!(
    filter(row -> row.ID_OUVRAGE == "4240-01D", test_data),
    Not(:ID_OUVRAGE),
);
CSV.write("data/parsed/test_4240.csv", test_4240)

"data/parsed/test_4240.csv"

In [346]:
# Test rows of structure 4350-01D, without the now-constant id column,
# written to their own CSV file (the cell evaluates to the output path).
test_4350 = select!(
    filter(row -> row.ID_OUVRAGE == "4350-01D", test_data),
    Not(:ID_OUVRAGE),
);
CSV.write("data/parsed/test_4350.csv", test_4350)

"data/parsed/test_4350.csv"

In [347]:
# Test rows of structure 4380-01D, without the now-constant id column,
# written to their own CSV file (the cell evaluates to the output path).
test_4380 = select!(
    filter(row -> row.ID_OUVRAGE == "4380-01D", test_data),
    Not(:ID_OUVRAGE),
);
CSV.write("data/parsed/test_4380.csv", test_4380)

"data/parsed/test_4380.csv"