# Data Processing

In [13]:
using CSV, DataFrames, DecisionTree, Statistics, Dates, Gadfly, Random;
include("utils/precipitation.jl");

## Build features

### Get and filter the features

#### Latitude, Longitude, Height

In [14]:
# Read the structures file and overwrite its header with explicit column names.
features = CSV.read("data/ouvrages-surverses.csv");
# NOTE(review): the EMI pair is listed as LNG, LAT while the TP pair is LAT, LNG —
# confirm against the CSV header (the EMI columns are dropped below either way).
colnames = [
    "N_Env", "ID_SOMA", "ID_OUVRAGE", "NOM", "SOMA_SEC", "REGION",
    "TP_X", "TP_Y", "TP_Z", "TP_LAT", "TP_LNG",
    "EMI_X", "EMI_Y", "EMI_LNG", "EMI_LAT",
];
names!(features, map(Symbol, colnames));

# Keep only the structure identifier and its TP latitude / longitude / Z columns.
select!(features, [:ID_OUVRAGE, :TP_LAT, :TP_LNG, :TP_Z]);


Unnamed: 0_level_0,ID_OUVRAGE,TP_LAT,TP_LNG,TP_Z
Unnamed: 0_level_1,String,Float64,Float64,Float64⍰
1,0642-01D,45.6727,-73.5262,missing
2,0672-01D,45.6823,-73.531,9.12
3,0672-02D,45.6939,-73.5214,10.23
4,0672-03D,45.6732,-73.5402,9.99
5,0801-01D,45.519,-73.5275,13.71
6,0801-02D,45.5174,-73.5281,12.34
7,0801-03D,45.5081,-73.5273,11.94
8,0801-04D,45.5029,-73.5238,12.36
9,0801-05D,45.5173,-73.5312,12.7
10,0801-06D,45.5187,-73.533,18.66


#### Replace missing Z values with the column mean

In [34]:
# Impute missing TP_Z values with the mean of the observed TP_Z values.
# `skipmissing` averages exactly the non-missing entries of this column; the
# previous `completecases(features)` mask also discarded rows that were missing
# in any *other* column and materialized a copy of the whole frame.
tp_z_mean = mean(skipmissing(features.TP_Z));
features.TP_Z = coalesce.(features.TP_Z, tp_z_mean);
first(features, 10)

Unnamed: 0_level_0,ID_OUVRAGE,TP_LAT,TP_LNG,TP_Z
Unnamed: 0_level_1,String,Float64,Float64,Float64
1,0642-01D,45.6727,-73.5262,19.3526
2,0672-01D,45.6823,-73.531,9.12
3,0672-02D,45.6939,-73.5214,10.23
4,0672-03D,45.6732,-73.5402,9.99
5,0801-01D,45.519,-73.5275,13.71
6,0801-02D,45.5174,-73.5281,12.34
7,0801-03D,45.5081,-73.5273,11.94
8,0801-04D,45.5029,-73.5238,12.36
9,0801-05D,45.5173,-73.5312,12.7
10,0801-06D,45.5187,-73.533,18.66


### Load dates and surverses

In [22]:
# Read the surverses file; cells containing "-99999" are loaded as `missing`.
surverses = CSV.read("data/surverses.csv"; missingstring = "-99999");

#### Keep only the months from May to October

In [24]:
# Keep only the rows whose DATE falls in months 5 through 10 (May to October).
surverses = filter(row -> 4 < month(row.DATE) < 11, surverses);

#### Filter out non-rain surverses

In [25]:
# Give rows without a recorded RAISON the explicit label "Inconnue".
raison = coalesce.(surverses.RAISON, "Inconnue");
surverses.RAISON = raison;

# Keep only the rows whose RAISON is one of the accepted codes, then drop the
# RAISON column along with everything except the key, the date and the label.
surverses = filter(row -> row.RAISON in ("P", "Inconnue", "TS"), surverses);
select!(surverses, [:NO_OUVRAGE, :DATE, :SURVERSE]);

#### Remove missing data and rename

In [31]:
# Drop every row that still holds a missing value (narrowing the column types
# so they no longer allow `missing`), then rename the key column so it matches
# the `ID_OUVRAGE` column of `features`.
surverses = rename!(
    dropmissing(surverses, disallowmissing = true),
    :NO_OUVRAGE => :ID_OUVRAGE,
);
first(surverses, 10)

Unnamed: 0_level_0,ID_OUVRAGE,DATE,SURVERSE
Unnamed: 0_level_1,String,Date,Int64
1,0642-01D,2013-05-01,0
2,0642-01D,2013-05-02,0
3,0642-01D,2013-05-03,0
4,0642-01D,2013-05-04,0
5,0642-01D,2013-05-05,0
6,0642-01D,2013-05-06,0
7,0642-01D,2013-05-07,0
8,0642-01D,2013-05-08,0
9,0642-01D,2013-05-09,0
10,0642-01D,2013-05-10,0


### Augment features with dates and label

In [32]:
# Inner join: attach each structure's features to each of its dated surverse
# records, matching rows on ID_OUVRAGE.
# NOTE(review): the captured output below still shows missing TP_Z values, which
# suggests this cell was last run before the TP_Z imputation cell — re-run in order.
comb = join(features, surverses, on = :ID_OUVRAGE, kind = :inner);
first(comb, 10)

Unnamed: 0_level_0,ID_OUVRAGE,TP_LAT,TP_LNG,TP_Z,DATE,SURVERSE
Unnamed: 0_level_1,String,Float64,Float64,Float64⍰,Date,Int64
1,0642-01D,45.6727,-73.5262,missing,2013-05-01,0
2,0642-01D,45.6727,-73.5262,missing,2013-05-02,0
3,0642-01D,45.6727,-73.5262,missing,2013-05-03,0
4,0642-01D,45.6727,-73.5262,missing,2013-05-04,0
5,0642-01D,45.6727,-73.5262,missing,2013-05-05,0
6,0642-01D,45.6727,-73.5262,missing,2013-05-06,0
7,0642-01D,45.6727,-73.5262,missing,2013-05-07,0
8,0642-01D,45.6727,-73.5262,missing,2013-05-08,0
9,0642-01D,45.6727,-73.5262,missing,2013-05-09,0
10,0642-01D,45.6727,-73.5262,missing,2013-05-10,0


### Load precipitation data

#### Load and keep only the months from May to October (inclusive)

In [28]:
# Read the precipitation file; cells containing "-99999" are loaded as `missing`.
precipitation = CSV.read("data/precipitations.csv"; missingstring = "-99999");

# "St-Hubert" is not a valid identifier, so give the column a plain name.
rename!(precipitation, Symbol("St-Hubert") => :StHubert);

# Keep only the rows whose date falls in months 5 through 10 (May to October).
precipitation = filter(row -> 4 < month(row.date) < 11, precipitation);

#### Replace missing data with 0

In [29]:
# Replace missing readings with 0 in every station column.
for station in (:McTavish, :Bellevue, :Assomption, :Trudeau, :StHubert)
    precipitation[!, station] = coalesce.(precipitation[:, station], 0)
end

first(precipitation, 5)

Unnamed: 0_level_0,date,heure,McTavish,Bellevue,Assomption,Trudeau,StHubert
Unnamed: 0_level_1,Date,Int64,Int64,Int64,Int64,Int64,Int64
1,2013-05-01,0,0,0,0,0,0
2,2013-05-01,1,0,0,0,0,0
3,2013-05-01,2,0,0,0,0,0
4,2013-05-01,3,0,0,0,0,0
5,2013-05-01,4,0,0,0,0,0


### Extract features from precipitation

#### Sum of precipitation for the day

In [18]:
# One row per date: the sum of each station's readings over that date's rows.
pcp_sum = by(
    precipitation,
    :date,
    McTavish = :McTavish => sum,
    Bellevue = :Bellevue => sum,
    Assomption = :Assomption => sum,
    Trudeau = :Trudeau => sum,
    StHubert = :StHubert => sum,
);
first(pcp_sum, 5)

Unnamed: 0_level_0,date,McTavish,Bellevue,Assomption,Trudeau,StHubert
Unnamed: 0_level_1,Date,Int64,Int64,Int64,Int64,Int64
1,2013-05-01,0,0,0,0,0
2,2013-05-02,0,0,0,0,0
3,2013-05-03,0,0,0,0,0
4,2013-05-04,0,0,0,0,0
5,2013-05-05,0,0,0,0,0


#### Maximum precipitation in an hour for the day

In [20]:
# One row per date: the largest single reading of each station among that
# date's rows.
pcp_max = by(
    precipitation,
    :date,
    McTavish = :McTavish => maximum,
    Bellevue = :Bellevue => maximum,
    Assomption = :Assomption => maximum,
    Trudeau = :Trudeau => maximum,
    StHubert = :StHubert => maximum,
);
first(pcp_max, 5)

Unnamed: 0_level_0,date,McTavish,Bellevue,Assomption,Trudeau,StHubert
Unnamed: 0_level_1,Date,Int64,Int64,Int64,Int64,Int64
1,2013-05-01,0,0,0,0,0
2,2013-05-02,0,0,0,0,0
3,2013-05-03,0,0,0,0,0
4,2013-05-04,0,0,0,0,0
5,2013-05-05,0,0,0,0,0
