In [1]:
using CSV, DataFrames, GLM, Statistics, Dates, Gadfly, Random, MLBase, DecisionTree;
include("utils/precipitation.jl");

On garde les ouvrages d'intérêt.

In [11]:
# Load the table of overflow structures ("ouvrages") and give its columns
# explicit names (the file is expected to have exactly these 15 columns).
ouvrages = CSV.read("data/ouvrages-surverses.csv");
colnames = ["N_Env", "ID_SOMA", "ID_OUVRAGE", "NOM", "SOMA_SEC", "REGION", "TP_X", "TP_Y", "TP_Z", "TP_LAT", "TP_LNG", "EMI_X", "EMI_Y", "EMI_LNG", "EMI_LAT"];
names!(ouvrages, Symbol.(colnames));
# Keep only the identifier and the TP_* position columns used later on.
select!(ouvrages, [:ID_OUVRAGE, :TP_LAT, :TP_LNG, :TP_Z]);
# Impute missing TP_Z with the mean of the observed TP_Z values only, so that a
# missing value in another column cannot exclude a valid TP_Z from the mean.
ouvrages.TP_Z = coalesce.(ouvrages.TP_Z, mean(skipmissing(ouvrages.TP_Z)));

In [12]:
# Shape of the table as (number of rows, number of columns).
(nrow(ouvrages), ncol(ouvrages))

(167, 4)

In [13]:
# Identifiers of the five structures the rest of the notebook focuses on.
important_ouvrages = ["3260-01D", "3350-07D", "4240-01D", "4350-01D", "4380-01D"];
# Restrict the table to those structures with a boolean row mask.
ouvrages = ouvrages[in.(ouvrages.ID_OUVRAGE, Ref(important_ouvrages)), :];

In [14]:
# Display the first five rows of the filtered structures table.
first(ouvrages, 5)

Unnamed: 0_level_0,ID_OUVRAGE,TP_LAT,TP_LNG,TP_Z
Unnamed: 0_level_1,String,Float64,Float64,Float64
1,3260-01D,45.6507,-73.5803,20.17
2,3350-07D,45.5461,-73.6921,20.75
3,4240-01D,45.6497,-73.4877,11.91
4,4350-01D,45.4991,-73.555,19.3526
5,4380-01D,45.4677,-73.5637,19.3526


In [197]:
# Read the overflow records; the sentinel "-99999" is parsed as `missing`.
surverses = CSV.read("data/surverses.csv", missingstring="-99999");
# Keep the months May through October (5 to 10) in a single pass.
surverses = filter(row -> 4 < month(row.DATE) < 11, surverses);
# Label an unrecorded cause as "Inconnue".
surverses.RAISON = coalesce.(surverses.RAISON, "Inconnue");

# Retain only the rows whose cause code is "P", "Inconnue" or "TS".
surverses = filter(row -> in(row.RAISON, ("P", "Inconnue", "TS")), surverses);
# Keep the three columns used downstream and align the identifier name with
# the one used in `ouvrages`.
select!(surverses, [:NO_OUVRAGE, :DATE, :SURVERSE]);
rename!(surverses, :NO_OUVRAGE => :ID_OUVRAGE);

In [198]:
# Keep the records of the structures of interest (boolean row mask), then
# drop every row that still contains a missing value.
surverses = surverses[in.(surverses.ID_OUVRAGE, Ref(important_ouvrages)), :];
dropmissing!(surverses);

In [199]:
# Summary statistics of the overflow indicator column.
describe(surverses.SURVERSE)

Summary Stats:
Length:         5129
Missing Count:  0
Mean:           0.085202
Minimum:        0.000000
1st Quartile:   0.000000
Median:         0.000000
3rd Quartile:   0.000000
Maximum:        1.000000
Type:           Int64


In [200]:
# Records of the third structure of interest only, then summary statistics of
# its overflow indicator.
curr = surverses[surverses.ID_OUVRAGE .== important_ouvrages[3], :];
describe(curr.SURVERSE)

Summary Stats:
Length:         1100
Missing Count:  0
Mean:           0.062727
Minimum:        0.000000
1st Quartile:   0.000000
Median:         0.000000
3rd Quartile:   0.000000
Maximum:        1.000000
Type:           Int64


La moyenne correspond ici au taux de surverses : il y a beaucoup plus d'observations sans surverse (0) que d'observations avec surverse (1).

### Beaucoup plus de 0 que de 1 -> Class imbalance problem
On réglera ce problème plus tard, par sur-échantillonnage des 1 (over-sampling) et sous-échantillonnage des 0 (under-sampling).

## Précipitations

In [84]:
# Read the hourly precipitation table; the sentinel "-99999" is parsed as
# `missing`. The "St-Hubert" column is renamed so it can be used as a plain
# identifier.
precipitations = CSV.read("data/precipitations.csv", missingstring="-99999");
rename!(precipitations, Symbol("St-Hubert") => :StHubert);

# Keep the months May through October (5 to 10) in a single pass.
precipitations = filter(row -> 4 < month(row.date) < 11, precipitations);
names(precipitations)

7-element Array{Symbol,1}:
 :date      
 :heure     
 :McTavish  
 :Bellevue  
 :Assomption
 :Trudeau   
 :StHubert  

In [88]:
# Summary statistics of the StHubert column before missing values are filled.
describe(precipitations.StHubert)

Summary Stats:
Length:         30912
Missing Count:  5206
Mean:           1.223683
Minimum:        0.000000
1st Quartile:   0.000000
Median:         0.000000
3rd Quartile:   0.000000
Maximum:        307.000000
Type:           Union{Missing, Int64}


#### On garde juste les précipitations pour les dates dont on a les données de surverses

In [161]:
# Dates for which overflow records exist (kept as a Vector for later cells).
dates_a_retenir = unique(surverses[!, :DATE]);
# Keep only the precipitation rows of those dates. Membership is tested against
# a Set so each row costs a hash lookup instead of a scan of the whole vector.
precipitations = let wanted = Set(dates_a_retenir)
    filter(row -> row.date in wanted, precipitations)
end;

In [162]:
# Per-day value of each station computed by `mean_wo_missing` (helper defined
# outside this cell; presumably a mean that ignores missing values -- confirm).
precipitation_by_day = by(precipitations, :date,  
                            McTavish = :McTavish=>mean_wo_missing, 
                            Bellevue = :Bellevue=>mean_wo_missing, 
                            Assomption = :Assomption=>mean_wo_missing,
                            Trudeau = :Trudeau=>mean_wo_missing,
                            StHubert = :StHubert=>mean_wo_missing)

"""
    fill_missing_with_daily!(hourly, daily, cols)

Replace every `missing` entry of the columns `cols` of `hourly` with the value
that `daily` holds, in the same column, for the row's `:date`.

`daily` must contain a row for every date that has a missing entry in `hourly`
(a `KeyError` is thrown otherwise). When `daily` holds several rows for one
date, the first one is used. Returns the mutated `hourly`.
"""
function fill_missing_with_daily!(hourly, daily, cols)
    # Build the date -> row lookup once instead of filtering the whole daily
    # table for every missing cell.
    row_of_date = Dict{Any,Int}()
    for (k, d) in enumerate(daily[!, :date])
        get!(row_of_date, d, k)  # keep the first row seen for a date
    end
    dates = hourly[!, :date]
    for col in cols
        target = hourly[!, col]  # the column itself, so writes update `hourly`
        source = daily[!, col]
        for i in eachindex(target)
            if ismissing(target[i])
                target[i] = source[row_of_date[dates[i]]]
            end
        end
    end
    return hourly
end

fill_missing_with_daily!(precipitations, precipitation_by_day,
                         [:McTavish, :Bellevue, :Assomption, :Trudeau, :StHubert]);

In [163]:
# Summary statistics of the StHubert column once missing values are filled.
describe(precipitations.StHubert)

Summary Stats:
Length:         26496
Missing Count:  0
Mean:           0.944067
Minimum:        0.000000
1st Quartile:   0.000000
Median:         0.000000
3rd Quartile:   0.000000
Maximum:        231.000000
Type:           Union{Missing, Int64}


In [164]:
# Preview ten rows of the hourly table after passing it through `shuffleDf`
# (helper not defined in this cell; presumably returns the rows in random
# order -- confirm).
first(shuffleDf(precipitations), 10)

Unnamed: 0_level_0,date,heure,McTavish,Bellevue,Assomption,Trudeau,StHubert
Unnamed: 0_level_1,Date,Int64,Int64⍰,Int64⍰,Int64⍰,Int64⍰,Int64⍰
1,2017-05-24,2,0,0,0,0,0
2,2016-05-15,13,0,0,0,0,0
3,2013-08-01,21,10,0,0,0,0
4,2017-10-05,0,0,0,0,0,0
5,2017-10-19,12,0,0,0,0,0
6,2013-08-20,16,0,0,0,0,0
7,2014-07-19,12,0,0,0,0,0
8,2018-07-06,11,0,0,0,0,0
9,2013-09-10,8,0,0,0,0,0
10,2014-07-18,6,0,0,0,0,0


In [165]:
# Per-date sum of the hourly values, one output column per station. The
# keyword pairs `Station = :Station => sum` are generated from the station list.
pcp_sum = by(precipitations, :date;
             (col => (col => sum)
              for col in (:McTavish, :Bellevue, :Assomption, :Trudeau, :StHubert))...);

In [166]:
# Preview five rows of the per-date sums after passing them through `shuffleDf`
# (helper not defined in this cell; presumably shuffles the rows -- confirm).
first(shuffleDf(pcp_sum), 5)

Unnamed: 0_level_0,date,McTavish,Bellevue,Assomption,Trudeau,StHubert
Unnamed: 0_level_1,Date,Int64,Int64,Int64,Int64,Int64
1,2018-08-20,2,0,16,0,0
2,2017-08-29,2,0,0,0,0
3,2017-09-04,16,22,30,15,19
4,2018-10-17,0,0,0,0,2
5,2013-09-11,52,98,80,53,0


In [167]:
# Per-date maximum of the hourly values, one output column per station. The
# keyword pairs `Station = :Station => maximum` are generated from the station
# list.
pcp_max = by(precipitations, :date;
             (col => (col => maximum)
              for col in (:McTavish, :Bellevue, :Assomption, :Trudeau, :StHubert))...);

In [168]:
# Preview five rows of the per-date maxima after passing them through
# `shuffleDf` (helper not defined in this cell; presumably shuffles the rows --
# confirm).
first(shuffleDf(pcp_max), 5)

Unnamed: 0_level_0,date,McTavish,Bellevue,Assomption,Trudeau,StHubert
Unnamed: 0_level_1,Date,Int64,Int64,Int64,Int64,Int64
1,2018-07-29,0,0,7,0,0
2,2018-05-28,6,0,36,9,58
3,2018-06-13,15,30,64,11,12
4,2015-06-03,0,2,0,0,0
5,2018-05-09,0,0,9,0,0


In [169]:
# Per-date value of `maximum3` for each station (helper defined outside this
# cell; presumably the largest total over three consecutive hours -- confirm).
# The keyword pairs `Station = :Station => maximum3` are generated from the
# station list.
pcp_max3 = by(precipitations, :date;
              (col => (col => maximum3)
               for col in (:McTavish, :Bellevue, :Assomption, :Trudeau, :StHubert))...);

In [170]:
# Preview five rows of the `maximum3` table after passing it through
# `shuffleDf` (helper not defined in this cell; presumably shuffles the rows --
# confirm).
first(shuffleDf(pcp_max3), 5)

Unnamed: 0_level_0,date,McTavish,Bellevue,Assomption,Trudeau,StHubert
Unnamed: 0_level_1,Date,Int64,Int64,Int64,Int64,Int64
1,2014-06-17,0,175,78,150,113
2,2018-08-11,0,0,10,0,0
3,2013-05-04,0,0,0,0,0
4,2013-06-22,44,49,40,52,0
5,2016-08-11,0,0,0,0,0


In [171]:
# Name and coordinates (LAT, LNG) of the five weather stations, built column by
# column; the row order matches the station columns of `precipitations`.
station_df = DataFrame(
    STATION = ["McTavish", "Bellevue", "Assomption", "Trudeau", "StHubert"],
    LAT = [45.504742, 45.427222, 45.809444, 45.467778, 45.5175],
    LNG = [-73.579167, -73.929167, -73.434722, -73.741667, -73.416944],
);

station_df

Unnamed: 0_level_0,STATION,LAT,LNG
Unnamed: 0_level_1,String,Float64,Float64
1,McTavish,45.5047,-73.5792
2,Bellevue,45.4272,-73.9292
3,Assomption,45.8094,-73.4347
4,Trudeau,45.4678,-73.7417
5,StHubert,45.5175,-73.4169


### On ajoute les colonnes de précipitations

In [205]:
stations_names = ["McTavish", "Bellevue", "Assomption", "Trudeau", "StHubert"];
# NOTE: this binds `train_data` to the same DataFrame as `surverses` (no copy),
# so the columns added below also appear on `surverses`.
train_data = surverses;
# For every station add four zero-initialised Float64 feature columns, in the
# order <station>_proxy, <station>_sum, <station>_max, <station>_max3.
for station in stations_names, suffix in ("_proxy", "_sum", "_max", "_max3")
    train_data[!, Symbol(station, suffix)] = zeros(nrow(train_data));
end

In [225]:
# Columns to preview: the structure identifier followed by the `_proxy` column
# of each station, in the order of `stations_names`.
proxies = vcat(:ID_OUVRAGE, [Symbol(station, "_proxy") for station in stations_names]);
first(shuffleDf(train_data[!, proxies]), 5)

Unnamed: 0_level_0,ID_OUVRAGE,McTavish_proxy,Bellevue_proxy,Assomption_proxy,Trudeau_proxy,StHubert_proxy
Unnamed: 0_level_1,String,Float64,Float64,Float64,Float64,Float64
1,4350-01D,0.0248354,0.381014,0.332861,0.189283,0.139269
2,3260-01D,0.145981,0.41434,0.215358,0.243957,0.210769
3,3350-07D,0.12027,0.265238,0.368185,0.0927373,0.276628
4,4240-01D,0.171395,0.494323,0.168295,0.312374,0.149987
5,3260-01D,0.145981,0.41434,0.215358,0.243957,0.210769


In [209]:
# Summary statistics of the overflow indicator in the training table.
describe(train_data.SURVERSE)

Summary Stats:
Length:         5129
Missing Count:  0
Mean:           0.085202
Minimum:        0.000000
1st Quartile:   0.000000
Median:         0.000000
3rd Quartile:   0.000000
Maximum:        1.000000
Type:           Int64


On remplit les champs (colonnes de précipitations) de chaque observation.

In [219]:
"""
    fill_station_proxies!(train_data, ouvrages, station_df)

For every row of `train_data`, write into each `<STATION>_proxy` column the
value returned by `findDistance` for the row's structure (`TP_LAT`, `TP_LNG`
looked up in `ouvrages` through `ID_OUVRAGE`) and the corresponding station of
`station_df` (`LAT`, `LNG`).

Throws an `ArgumentError` when a row refers to an `ID_OUVRAGE` that is absent
from `ouvrages`. Returns the mutated `train_data`.
"""
function fill_station_proxies!(train_data, ouvrages, station_df)
    proxy_cols = [Symbol(name, "_proxy") for name in station_df[!, :STATION]]
    # The distances depend only on the structure, not on the row: compute them
    # once per structure instead of once per row.
    distances = Dict{Any,Any}()
    for i in 1:nrow(train_data)
        # Read the identifier by name rather than by column position.
        id = train_data[i, :ID_OUVRAGE]
        dists = get!(distances, id) do
            k = findfirst(isequal(id), ouvrages[!, :ID_OUVRAGE])
            k === nothing &&
                throw(ArgumentError("ID_OUVRAGE $(id) not found in `ouvrages`"))
            [findDistance(ouvrages[k, :TP_LAT], ouvrages[k, :TP_LNG],
                          station_df[j, :LAT], station_df[j, :LNG])
             for j in 1:nrow(station_df)]
        end
        for (col, dist) in zip(proxy_cols, dists)
            train_data[i, col] = dist
        end
    end
    # TODO: also fill the per-station precipitation features that are still
    # zero: the sum (_sum), the maximum (_max) and the 3-hour maximum (_max3).
    return train_data
end

fill_station_proxies!(train_data, ouvrages, station_df);

In [215]:
filter(row -> row.ID_OUVRAGE == train_data[1, 1], ouvrages)[1, :TP_LAT]

45.6507187104077