# Data Processing

In [145]:
using CSV, DataFrames, DecisionTree, Statistics, Dates, Gadfly, Random;
include("utils/precipitation.jl");

## Build features

### Get and filter the features

#### Latitude, Longitude, Height

In [146]:
# Read data/ouvrages-surverses.csv and rename its columns positionally.
features = CSV.read("data/ouvrages-surverses.csv");
colnames = [
    "N_Env", "ID_SOMA", "ID_OUVRAGE", "NOM", "SOMA_SEC", "REGION",
    "TP_X", "TP_Y", "TP_Z", "TP_LAT", "TP_LNG",
    "EMI_X", "EMI_Y", "EMI_LNG", "EMI_LAT",
];
names!(features, map(Symbol, colnames));
# Keep only the identifier and the TP_LAT / TP_LNG / TP_Z columns.
select!(features, [:ID_OUVRAGE, :TP_LAT, :TP_LNG, :TP_Z]);

#### Replace missing Z values with the mean

In [147]:
# Fill missing TP_Z entries with the mean of the observed TP_Z values.
# skipmissing averages every available TP_Z without copying the table, and a
# valid TP_Z is no longer left out of the mean because another column of the
# same row happens to be missing.
features.TP_Z = coalesce.(features.TP_Z, mean(skipmissing(features.TP_Z)));
# Preview ten rows (shuffleDf is defined outside this cell).
first(shuffleDf(features), 10)

Unnamed: 0_level_0,ID_OUVRAGE,TP_LAT,TP_LNG,TP_Z
Unnamed: 0_level_1,String,Float64,Float64,Float64
1,3790-02D,45.4025,-73.9461,23.54
2,3480-05D,45.4986,-73.7835,24.85
3,4430-03D,45.4149,-73.6303,22.25
4,3480-04D,45.5001,-73.7873,24.6
5,4420-02D,45.443,-73.5764,17.08
6,3305-02D,45.6233,-73.628,12.51
7,4240-02D,45.6498,-73.4877,19.29
8,3500-03D,45.5104,-73.8385,26.02
9,4260-01D,45.6308,-73.495,12.21
10,3250-01D,45.6562,-73.5731,17.16


### Load dates and surverses

In [148]:
# Cells containing "-99999" are parsed as missing.
surverses = CSV.read("data/surverses.csv"; missingstring="-99999");

#### Filter months

In [149]:
# Keep only the rows whose month is strictly between 4 and 11 (May to October).
surverses = filter(row -> 4 < month(row.DATE) < 11, surverses);

#### Filter out non-rain surverses

In [150]:
# A missing RAISON is relabelled "Inconnue" so the row passes the filter below.
raison = coalesce.(surverses[:, :RAISON], "Inconnue");
surverses[!, :RAISON] = raison;

# Keep rows whose RAISON is "P", "Inconnue" or "TS", then keep three columns.
surverses = filter(row -> row.RAISON in ("P", "Inconnue", "TS"), surverses);
select!(surverses, [:NO_OUVRAGE, :DATE, :SURVERSE]);

#### Remove missing data and rename

In [151]:
# Drop every row containing a missing value and tighten the column types.
surverses = dropmissing(surverses; disallowmissing=true);
# Use the same key name as the features table.
rename!(surverses, :NO_OUVRAGE => :ID_OUVRAGE);
first(shuffleDf(surverses), 10)

Unnamed: 0_level_0,ID_OUVRAGE,DATE,SURVERSE
Unnamed: 0_level_1,String,Date,Int64
1,4420-01D,2014-10-12,0
2,4560-03D,2013-07-28,0
3,3480-01D,2017-05-08,0
4,4600-01D,2014-07-29,0
5,4560-01D,2014-06-17,0
6,3305-02D,2018-06-16,0
7,3500-01D,2014-10-24,0
8,3410-01D,2015-09-13,1
9,3350-03D,2015-08-25,0
10,0801-04D,2013-05-08,0


### Augment features with dates and label

In [182]:
# Inner join: one row per (ouvrage, date) pair present in both tables.
comb = join(features, surverses; on=:ID_OUVRAGE, kind=:inner);
first(shuffleDf(comb), 10)

Unnamed: 0_level_0,ID_OUVRAGE,TP_LAT,TP_LNG,TP_Z,DATE,SURVERSE
Unnamed: 0_level_1,String,Float64,Float64,Float64,Date,Int64
1,4430-03D,45.4149,-73.6303,22.25,2015-07-13,0
2,4770-01D,45.6574,-73.4902,10.77,2014-07-15,0
3,3370-01D,45.5653,-73.6631,18.21,2017-05-09,0
4,0672-03D,45.6732,-73.5402,9.99,2013-07-25,0
5,4265-01D,45.626,-73.4991,11.78,2013-10-17,1
6,3540-02D,45.4751,-73.8727,26.52,2016-08-17,0
7,3380-01D,45.5533,-73.6703,22.23,2017-06-15,0
8,3350-03D,45.5785,-73.658,17.84,2015-06-18,0
9,3480-01D,45.5087,-73.7834,21.54,2015-07-29,0
10,3790-02D,45.4025,-73.9461,23.54,2014-10-25,0


### Load precipitation data

#### Load and keep only the months from May to October (inclusive)

In [160]:
# Cells containing "-99999" are parsed as missing.
precipitation = CSV.read("data/precipitations.csv"; missingstring="-99999");
# "St-Hubert" is renamed so the column can be addressed as a plain symbol.
rename!(precipitation, Symbol("St-Hubert") => :StHubert);

# Keep only the rows whose month is strictly between 4 and 11 (May to October).
precipitation = filter(row -> 4 < month(row.date) < 11, precipitation);

#### Replace missing data by 0

In [161]:
# Replace missing readings with 0 in each of the five station columns.
for station in (:McTavish, :Bellevue, :Assomption, :Trudeau, :StHubert)
    precipitation[!, station] = coalesce.(precipitation[:, station], 0)
end

first(shuffleDf(precipitation), 5)

Unnamed: 0_level_0,date,heure,McTavish,Bellevue,Assomption,Trudeau,StHubert
Unnamed: 0_level_1,Date,Int64,Int64,Int64,Int64,Int64,Int64
1,2015-08-30,14,0,0,0,0,0
2,2019-06-27,10,0,0,0,0,0
3,2014-06-22,11,0,0,0,0,0
4,2016-06-15,14,0,0,0,0,0
5,2019-10-31,7,0,0,0,0,0


### Extract features from precipitation

#### Sum of precipitation for the day

In [224]:
# One row per date; each station column holds the sum of that date's readings.
pcp_sum = by(
    precipitation, :date;
    McTavish = :McTavish => sum,
    Bellevue = :Bellevue => sum,
    Assomption = :Assomption => sum,
    Trudeau = :Trudeau => sum,
    StHubert = :StHubert => sum,
);
first(shuffleDf(pcp_sum), 5)

Unnamed: 0_level_0,date,McTavish,Bellevue,Assomption,Trudeau,StHubert
Unnamed: 0_level_1,Date,Int64,Int64,Int64,Int64,Int64
1,2016-06-16,0,0,0,0,0
2,2014-06-21,0,0,0,0,0
3,2015-05-02,0,0,0,2,0
4,2019-10-03,26,0,0,30,17
5,2013-10-17,114,78,80,113,0


#### Maximum precipitation in an hour for the day

In [255]:
# One row per date; each station column holds the largest reading of that date.
pcp_max = by(
    precipitation, :date;
    McTavish = :McTavish => maximum,
    Bellevue = :Bellevue => maximum,
    Assomption = :Assomption => maximum,
    Trudeau = :Trudeau => maximum,
    StHubert = :StHubert => maximum,
);
first(shuffleDf(pcp_max), 5)

Unnamed: 0_level_0,date,McTavish,Bellevue,Assomption,Trudeau,StHubert
Unnamed: 0_level_1,Date,Int64,Int64,Int64,Int64,Int64
1,2014-07-08,0,96,136,51,15
2,2018-10-26,0,0,0,0,0
3,2017-09-27,42,36,50,50,10
4,2019-05-19,23,32,16,27,25
5,2017-10-18,0,0,0,0,0


### Add precipitation data to features

#### Get stations lat-lng

In [183]:
# Station names with their latitude / longitude, built column by column.
station_df = DataFrame(
    STATION = ["McTavish", "Bellevue", "Assomption", "Trudeau", "StHubert"],
    LAT = [45.504742, 45.427222, 45.809444, 45.467778, 45.5175],
    LNG = [-73.579167, -73.929167, -73.434722, -73.741667, -73.416944],
);

station_df

Unnamed: 0_level_0,STATION,LAT,LNG
Unnamed: 0_level_1,String,Float64,Float64
1,McTavish,45.5047,-73.5792
2,Bellevue,45.4272,-73.9292
3,Assomption,45.8094,-73.4347
4,Trudeau,45.4678,-73.7417
5,StHubert,45.5175,-73.4169


### Normalize elements

#### Get extreme values

In [184]:
# Latitude / longitude bounds are taken over the ouvrages AND the stations, so
# both tables can later be scaled with the same bounds.
min_lat = mapreduce(minimum, min, (comb.TP_LAT, station_df.LAT));
max_lat = mapreduce(maximum, max, (comb.TP_LAT, station_df.LAT));

min_lng = mapreduce(minimum, min, (comb.TP_LNG, station_df.LNG));
max_lng = mapreduce(maximum, max, (comb.TP_LNG, station_df.LNG));

# Height bounds come from comb only.
min_z, max_z = minimum(comb.TP_Z), maximum(comb.TP_Z);

#### Normalize the TP_LAT, TP_LNG and TP_Z columns of comb between 0 and 1

In [185]:
# Rescale each coordinate column with its own (min, max) pair.
# NOTE(review): normalize(value, min, max) is defined outside this cell,
# presumably min-max scaling — confirm.
for (col, lo, hi) in (
    (:TP_LAT, min_lat, max_lat),
    (:TP_LNG, min_lng, max_lng),
    (:TP_Z, min_z, max_z),
)
    comb[!, col] = normalize.(comb[!, col], lo, hi)
end

first(shuffleDf(comb), 10)

Unnamed: 0_level_0,ID_OUVRAGE,TP_LAT,TP_LNG,TP_Z,DATE,SURVERSE
Unnamed: 0_level_1,String,Float64,Float64,Float64,Date,Int64
1,3350-10D,0.33098,0.45193,0.502992,2014-10-07,0
2,3768-01D,0.349823,0.473192,0.452637,2014-07-26,0
3,3350-05D,0.412789,0.545451,0.422555,2013-07-26,0
4,3790-02D,0.0,0.0198891,0.496779,2016-07-13,0
5,4260-01D,0.561116,0.855507,0.126312,2014-10-05,0
6,3480-02D,0.236212,0.307589,0.558578,2014-07-17,0
7,4360-01D,0.21681,0.752166,0.222444,2015-09-10,0
8,4230-01D,0.637935,0.849315,0.112579,2017-08-27,0
9,4520-01D,0.0934202,0.422034,0.505935,2015-08-07,0
10,0801-04D,0.246824,0.802071,0.131217,2013-07-31,0


#### Normalize every value of station_df between 0 and 1

In [186]:
# Stations are rescaled with the same latitude / longitude bounds as comb.
station_df[!, :LAT] = map(v -> normalize(v, min_lat, max_lat), station_df[!, :LAT]);
station_df[!, :LNG] = map(v -> normalize(v, min_lng, max_lng), station_df[!, :LNG]);

station_df

Unnamed: 0_level_0,STATION,LAT,LNG
Unnamed: 0_level_1,String,Float64,Float64
1,McTavish,0.251316,0.699536
2,Bellevue,0.0608415,0.0512791
3,Assomption,1.0,0.967072
4,Trudeau,0.160492,0.39856
5,StHubert,0.282664,1.0


### Augment Features

#### Add pcp_sum and pcp_max columns

In [190]:
# Zero-filled placeholder columns; they are filled in by the closest-station loop.
comb[!, :PCP_MAX] = zeros(Float64, nrow(comb));
comb[!, :PCP_SUM] = zeros(Float64, nrow(comb));
# Reorder so that the label column SURVERSE comes last.
permutecols!(
    comb,
    [:ID_OUVRAGE, :TP_LAT, :TP_LNG, :TP_Z, :DATE, :PCP_SUM, :PCP_MAX, :SURVERSE],
);

In [191]:
shuffleDf(comb) |> df -> first(df, 10)  # preview ten rows of the shuffled table

Unnamed: 0_level_0,ID_OUVRAGE,TP_LAT,TP_LNG,TP_Z,DATE,PCP_SUM,PCP_MAX,SURVERSE
Unnamed: 0_level_1,String,Float64,Float64,Float64,Date,Float64,Float64,Int64
1,3530-01D,0.253467,0.174368,0.611549,2015-05-31,0.0,0.0,0
2,4230-07D,0.703498,0.87159,0.0494719,2015-06-12,0.0,0.0,0
3,0672-03D,0.66527,0.771707,0.0537227,2013-09-06,0.0,0.0,0
4,3350-09D,0.330728,0.451592,0.502665,2013-10-05,0.0,0.0,0
5,4600-03D,0.0832998,0.248675,0.590949,2014-06-29,0.0,0.0,0
6,4240-01D,0.607546,0.868856,0.116503,2018-10-16,0.0,0.0,0
7,3275-02D,0.5763,0.647929,0.202171,2017-08-30,0.0,0.0,0
8,3305-03D,0.533562,0.600092,0.111598,2018-08-27,0.0,0.0,0
9,4270-01D,0.511065,0.830105,0.0923062,2018-06-12,0.0,0.0,0
10,3350-08D,0.338262,0.459852,0.533074,2013-09-16,0.0,0.0,0


#### Find the station closest to each ouvrage and add its distance-weighted pcp_sum and pcp_max

In [215]:
# For every row of comb: pick the nearest station, then store that station's
# daily precipitation total and hourly maximum for the row's date, both scaled
# by (1 - distance to the station).
# NOTE(review): findDistance is defined outside this cell; the weight becomes
# negative if it can return more than 1 on the normalized coordinates — confirm.
let
    # pcp_sum and pcp_max are grouped by :date (one row per date), so the
    # date -> row-index maps are built once here instead of scanning both
    # tables again for every row of comb.
    sum_row = Dict(d => r for (r, d) in enumerate(pcp_sum.date))
    max_row = Dict(d => r for (r, d) in enumerate(pcp_max.date))

    for i in 1:size(comb, 1)
        closest_station = "McTavish"  # initial value, replaced by the first station tested
        shortest_dist = Inf           # any finite distance is smaller

        # Find closest station
        for j in 1:size(station_df, 1)
            dist = findDistance(
                comb[i, :TP_LAT], comb[i, :TP_LNG],
                station_df[j, :LAT], station_df[j, :LNG],
            )

            if dist < shortest_dist
                shortest_dist = dist
                closest_station = station_df[j, :STATION]
            end
        end

        day = comb[i, :DATE]
        station_col = Symbol(closest_station)
        weight = 1 - shortest_dist

        # Augment comb with a weighted p_sum, based on the distance to the station
        comb[i, :PCP_SUM] = pcp_sum[sum_row[day], station_col] * weight
        # Augment comb with a weighted p_max, based on the distance to the station
        comb[i, :PCP_MAX] = pcp_max[max_row[day], station_col] * weight
    end
end

In [242]:
# Preview ten shuffled rows among those with SURVERSE == 1.
filter(row -> row.SURVERSE == 1, comb) |> shuffleDf |> df -> first(df, 10)

Unnamed: 0_level_0,ID_OUVRAGE,TP_LAT,TP_LNG,TP_Z,DATE,PCP_SUM,PCP_MAX,SURVERSE
Unnamed: 0_level_1,String,Float64,Float64,Float64,Date,Float64,Float64,Int64
1,0801-05D,0.282104,0.788458,0.142334,2018-06-13,35.3301,13.5885,1
2,3400-01D,0.346518,0.521076,0.578524,2014-06-14,0.0,0.0,1
3,3350-06D,0.37501,0.524895,0.423863,2015-05-25,157.198,44.0155,1
4,3380-01D,0.370625,0.530672,0.453945,2015-07-27,157.062,135.644,1
5,3350-09D,0.330728,0.451592,0.502665,2016-09-08,88.743,57.5186,1
6,4370-01D,0.16919,0.735733,0.359859,2013-08-09,65.5381,65.5381,1
7,4370-02D,0.169218,0.735754,0.359859,2016-06-28,191.156,96.4884,1
8,4320-01D,0.359606,0.789317,0.174378,2013-07-17,206.24,206.24,1
9,3410-02D,0.319462,0.483178,0.359859,2016-06-05,232.035,68.8725,1
10,4320-01D,0.359606,0.789317,0.174378,2013-07-28,80.7773,42.9666,1


#### Normalize pcp_sum and pcp_max

In [257]:
# Bounds of the two weighted precipitation columns, used for the scaling below.
min_pcp_sum, max_pcp_sum = minimum(comb.PCP_SUM), maximum(comb.PCP_SUM);

min_pcp_max, max_pcp_max = minimum(comb.PCP_MAX), maximum(comb.PCP_MAX);

In [258]:
# Rescale both precipitation columns with their own (min, max) bounds.
# NOTE(review): normalize(value, min, max) is defined outside this cell — confirm
# how it behaves when min == max.
comb[!, :PCP_SUM] = map(v -> normalize(v, min_pcp_sum, max_pcp_sum), comb[!, :PCP_SUM]);
comb[!, :PCP_MAX] = map(v -> normalize(v, min_pcp_max, max_pcp_max), comb[!, :PCP_MAX]);

Unnamed: 0_level_0,ID_OUVRAGE,TP_LAT,TP_LNG,TP_Z,DATE,PCP_SUM,PCP_MAX,SURVERSE
Unnamed: 0_level_1,String,Float64,Float64,Float64,Date,Float64,Float64,Int64
1,4430-05D,0.0529114,0.554875,0.729588,2018-10-08,0.0502229,0.0314403,1
2,3480-02D,0.236212,0.307589,0.558578,2016-10-21,0.16128,0.0217705,1
3,4370-05D,0.0917291,0.555449,0.537652,2016-07-14,0.0233867,0.0233867,1
4,0801-05D,0.282104,0.788458,0.142334,2016-06-05,0.256565,0.0561522,1
5,4330-02D,0.33464,0.773192,0.359859,2014-06-17,0.0,0.0,1
6,3350-11D,0.320282,0.463777,0.553674,2015-06-08,0.107578,0.0237673,1
7,3410-02D,0.319462,0.483178,0.359859,2016-07-18,0.0661102,0.0528882,1
8,4340-01D,0.304064,0.76384,0.359121,2014-07-28,0.060988,0.014785,1
9,4720-01D,0.731445,0.86632,0.0736684,2015-09-07,0.0862551,0.0636131,1
10,3350-06D,0.37501,0.524895,0.423863,2017-07-08,0.129523,0.0871409,1


### TODO: Remove the outliers in PCP_SUM and PCP_MAX that compress the normalized range