# Data Processing

In [208]:
using CSV, DataFrames, DecisionTree, Statistics, Dates, Gadfly, Random;
include("utils/precipitation.jl");

## Build features

### Get and filter the features

#### Latitude, Longitude, Height

In [209]:
features = CSV.read("data/ouvrages-surverses.csv");
colnames = ["N_Env", "ID_SOMA", "ID_OUVRAGE", "NOM", "SOMA_SEC", "REGION", "TP_X", "TP_Y", "TP_Z", "TP_LAT", "TP_LNG", "EMI_X", "EMI_Y", "EMI_LNG", "EMI_LAT"];
names!(features, Symbol.(colnames));
select!(features, [:ID_OUVRAGE, :TP_LAT, :TP_LNG, :TP_Z]);

#### Replace missing Z index with mean

In [4]:
features.TP_Z = coalesce.(features.TP_Z, mean(features[completecases(features), :].TP_Z));
first(shuffleDf(features), 10)

Unnamed: 0_level_0,ID_OUVRAGE,TP_LAT,TP_LNG,TP_Z
Unnamed: 0_level_1,String,Float64,Float64,Float64
1,3782-01D,45.4721,-73.8778,24.14
2,4770-01D,45.6574,-73.4902,10.77
3,0801-06D,45.5187,-73.533,18.66
4,4310-02D,45.6067,-73.5865,32.4
5,3305-03D,45.6196,-73.6329,11.76
6,4430-06D,45.4304,-73.6655,23.67
7,3400-01D,45.5435,-73.6755,26.04
8,4430-03D,45.4149,-73.6303,22.25
9,4230-05D,45.6728,-73.4915,11.55
10,3350-08D,45.5401,-73.7086,24.65


### Load dates and surverses

In [5]:
surverses = CSV.read("data/surverses.csv",missingstring="-99999");

#### Filter months

In [6]:
surverses = filter(row -> month(row.DATE) > 4, surverses);
surverses = filter(row -> month(row.DATE) < 11, surverses);

#### Filter non rain surverses

In [7]:
raison = coalesce.(surverses[:,:RAISON],"Inconnue");
surverses[!,:RAISON] = raison;

surverses = filter(row -> row.RAISON ∈ ["P","Inconnue","TS"], surverses);
select!(surverses, [:NO_OUVRAGE, :DATE, :SURVERSE]);

#### Remove missing data and rename

In [8]:
surverses = dropmissing(surverses, disallowmissing=true);
rename!(surverses, :NO_OUVRAGE => :ID_OUVRAGE);
first(shuffleDf(surverses),10)

Unnamed: 0_level_0,ID_OUVRAGE,DATE,SURVERSE
Unnamed: 0_level_1,String,Date,Int64
1,4380-01D,2015-06-28,0
2,3768-01D,2018-06-06,0
3,0642-01D,2016-10-09,0
4,3782-01D,2014-06-06,0
5,4620-01D,2016-08-05,0
6,4430-04D,2014-05-26,0
7,4430-01D,2013-05-14,0
8,3480-03D,2014-06-22,0
9,3560-01D,2018-09-14,0
10,3500-02D,2013-09-17,0


### Augment features with dates and label

In [12]:
comb = join(features, surverses, on = :ID_OUVRAGE);
first(shuffleDf(comb), 10)

Unnamed: 0_level_0,ID_OUVRAGE,TP_LAT,TP_LNG,TP_Z,DATE,SURVERSE
Unnamed: 0_level_1,String,Float64,Float64,Float64,Date,Int64
1,3350-03D,45.5785,-73.658,17.84,2013-09-08,0
2,4280-02D,45.5986,-73.5102,11.534,2015-05-16,0
3,4390-01D,45.4618,-73.5555,14.62,2014-09-12,0
4,4610-09D,45.4372,-73.8477,19.3526,2017-05-07,0
5,3400-01D,45.5435,-73.6755,26.04,2016-09-12,0
6,4710-01D,45.6976,-73.4968,12.12,2013-10-10,0
7,3250-02D,45.6437,-73.5541,38.93,2015-09-26,0
8,0672-03D,45.6732,-73.5402,9.99,2013-08-01,1
9,3250-01D,45.6562,-73.5731,17.16,2018-06-09,0
10,3400-01D,45.5435,-73.6755,26.04,2014-05-06,0


### Load precipitation data

#### Load and filter months between May & October included

In [13]:
precipitation = CSV.read("data/precipitations.csv",missingstring="-99999");
rename!(precipitation, Symbol("St-Hubert")=>:StHubert);

precipitation = filter(row -> month(row.date) > 4, precipitation);
precipitation = filter(row -> month(row.date) < 11, precipitation); 

#### Replace missing data by 0

In [14]:
precipitation[!,:McTavish] = coalesce.(precipitation[:,:McTavish], 0);
precipitation[!,:Bellevue] = coalesce.(precipitation[:,:Bellevue], 0);
precipitation[!,:Assomption] = coalesce.(precipitation[:,:Assomption], 0);
precipitation[!,:Trudeau] = coalesce.(precipitation[:,:Trudeau], 0);
precipitation[!,:StHubert] = coalesce.(precipitation[:,:StHubert], 0);

first(shuffleDf(precipitation), 5)

Unnamed: 0_level_0,date,heure,McTavish,Bellevue,Assomption,Trudeau,StHubert
Unnamed: 0_level_1,Date,Int64,Int64,Int64,Int64,Int64,Int64
1,2014-05-24,19,0,0,0,0,0
2,2015-08-10,8,0,0,0,0,0
3,2019-08-23,16,0,0,0,0,0
4,2017-10-29,15,0,0,0,0,0
5,2019-08-18,23,0,0,0,0,0


### Extract features from precipitation

#### Sum of precipitation for the day

In [15]:
pcp_sum = by(precipitation, :date,  McTavish = :McTavish=>sum, Bellevue = :Bellevue=>sum, 
   Assomption = :Assomption=>sum, Trudeau = :Trudeau=>sum, StHubert = :StHubert=>sum);
first(shuffleDf(pcp_sum), 5)

Unnamed: 0_level_0,date,McTavish,Bellevue,Assomption,Trudeau,StHubert
Unnamed: 0_level_1,Date,Int64,Int64,Int64,Int64,Int64
1,2018-08-05,0,0,14,0,0
2,2013-07-15,0,0,0,0,0
3,2014-07-19,0,0,0,0,0
4,2017-05-19,2,0,0,2,2
5,2015-10-22,29,29,36,30,29


#### Maximum precipitation in an hour for the day

In [16]:
pcp_max = by(precipitation, :date,  McTavish = :McTavish=>maximum, Bellevue = :Bellevue=>maximum, 
   Assomption = :Assomption=>maximum, Trudeau = :Trudeau=>maximum, StHubert = :StHubert=>maximum)
first(shuffleDf(pcp_max),5)

Unnamed: 0_level_0,date,McTavish,Bellevue,Assomption,Trudeau,StHubert
Unnamed: 0_level_1,Date,Int64,Int64,Int64,Int64,Int64
1,2019-09-27,0,0,0,2,2
2,2018-07-24,12,0,3,0,10
3,2015-09-23,0,0,0,0,0
4,2019-10-21,0,0,0,0,0
5,2016-05-20,0,0,0,0,0


### Add precipitation data to features

#### Get stations lat-lng

In [17]:
station_df = DataFrame(STATION = String[], LAT = Float64[], LNG = Float64[]);

push!(station_df, ["McTavish", 45.504742, -73.579167]);
push!(station_df, ["Bellevue", 45.427222, -73.929167]);
push!(station_df, ["Assomption", 45.809444, -73.434722]);
push!(station_df, ["Trudeau", 45.467778, -73.741667]);
push!(station_df, ["StHubert", 45.5175, -73.416944]);

station_df

Unnamed: 0_level_0,STATION,LAT,LNG
Unnamed: 0_level_1,String,Float64,Float64
1,McTavish,45.5047,-73.5792
2,Bellevue,45.4272,-73.9292
3,Assomption,45.8094,-73.4347
4,Trudeau,45.4678,-73.7417
5,StHubert,45.5175,-73.4169


### Normalize elements

#### Get extreme values

In [18]:
min_lat = min(minimum(comb.TP_LAT), minimum(station_df.LAT));
max_lat = max(maximum(comb.TP_LAT), maximum(station_df.LAT));

min_lng = min(minimum(comb.TP_LNG), minimum(station_df.LNG));
max_lng = max(maximum(comb.TP_LNG), maximum(station_df.LNG));

min_z = minimum(comb.TP_Z);
max_z = maximum(comb.TP_Z);

#### Normalize every value of comb between 0 and 1

In [19]:
comb.TP_LAT = normalize.(comb.TP_LAT, min_lat, max_lat);
comb.TP_LNG = normalize.(comb.TP_LNG, min_lng, max_lng);
comb.TP_Z = normalize.(comb.TP_Z, min_z, max_z);

first(shuffleDf(comb), 10)

Unnamed: 0_level_0,ID_OUVRAGE,TP_LAT,TP_LNG,TP_Z,DATE,SURVERSE
Unnamed: 0_level_1,String,Float64,Float64,Float64,Date,Int64
1,3530-01D,0.253467,0.174368,0.611549,2014-05-23,0
2,3350-08D,0.338262,0.459852,0.533074,2018-09-17,0
3,3230-01D,0.64771,0.75928,0.177321,2013-09-09,0
4,4420-01D,0.100805,0.704213,0.278357,2017-06-25,0
5,4280-02D,0.48184,0.827282,0.104208,2016-07-17,0
6,4520-01D,0.0934202,0.422034,0.505935,2015-09-10,0
7,4600-01D,0.0674511,0.249071,0.490567,2016-05-24,0
8,4610-06D,0.0690096,0.18617,0.676945,2015-07-28,0
9,4250-01D,0.582967,0.868125,0.160318,2018-06-07,0
10,4560-03D,0.121422,0.322806,0.589314,2014-10-17,0


#### Normalize every value of station_df between 0 and 1

In [20]:
station_df.LAT = normalize.(station_df.LAT, min_lat, max_lat);
station_df.LNG = normalize.(station_df.LNG, min_lng, max_lng);

station_df

Unnamed: 0_level_0,STATION,LAT,LNG
Unnamed: 0_level_1,String,Float64,Float64
1,McTavish,0.251316,0.699536
2,Bellevue,0.0608415,0.0512791
3,Assomption,1.0,0.967072
4,Trudeau,0.160492,0.39856
5,StHubert,0.282664,1.0


### Augment Features

#### Add pcp_sum and pcp_max columns

In [21]:
comb.PCP_MAX = zeros(size(comb, 1));
comb.PCP_SUM = zeros(size(comb, 1));
permutecols!(comb, [:ID_OUVRAGE, :TP_LAT, :TP_LNG, :TP_Z, :DATE, :PCP_SUM, :PCP_MAX, :SURVERSE]);

In [22]:
first(shuffleDf(comb), 10)

Unnamed: 0_level_0,ID_OUVRAGE,TP_LAT,TP_LNG,TP_Z,DATE,PCP_SUM,PCP_MAX,SURVERSE
Unnamed: 0_level_1,String,Float64,Float64,Float64,Date,Float64,Float64,Int64
1,4230-09D,0.728544,0.88331,0.0,2015-07-20,0.0,0.0,0
2,4420-02D,0.0997269,0.704644,0.285551,2014-07-05,0.0,0.0,0
3,4230-07D,0.703498,0.87159,0.0494719,2013-09-29,0.0,0.0,0
4,4270-01D,0.511065,0.830105,0.0923062,2013-09-30,0.0,0.0,0
5,4790-01D,0.212882,0.768473,0.192035,2015-05-05,0.0,0.0,0
6,4610-06D,0.0690096,0.18617,0.676945,2014-08-02,0.0,0.0,0
7,4530-02D,0.0940842,0.343984,0.359859,2017-07-02,0.0,0.0,0
8,3400-01D,0.346518,0.521076,0.578524,2013-08-08,0.0,0.0,0
9,4320-01D,0.359606,0.789317,0.174378,2013-06-22,0.0,0.0,0
10,4430-04D,0.0424383,0.570669,0.758362,2013-05-25,0.0,0.0,0


#### Find closest station to each ouvrage and add pcp_sum and pcp_max to it

In [39]:
for i=1:size(comb, 1)
    id_ouvrage = comb[i, 1]; 
    closest_station = "McTavish"; # initial value
    shortest_dist = -1;
    
    # Find closest station
    for j=1:size(station_df, 1)
        dist = findDistance(comb[i, :TP_LAT], comb[i, :TP_LNG], station_df[j, :LAT], station_df[j, :LNG]);
        
        if shortest_dist == -1 || dist < shortest_dist
            shortest_dist = dist;
            closest_station = station_df[j, :STATION];
        end
    end
    
    # Augment comb with a weighted p_sum, based on the distance to the station
    p_sum = pcp_sum[∈([comb[i, :DATE]]).(pcp_sum.date), Symbol(closest_station)];
    comb[i, :PCP_SUM] = p_sum[1] * (1 - shortest_dist); 
    # Augment comb with a weighted p_max, based on the distance to the station
    p_max = pcp_max[∈([comb[i, :DATE]]).(pcp_max.date), Symbol(closest_station)]
    comb[i, :PCP_MAX] = p_max[1] * (1 - shortest_dist);
end

In [41]:
first(shuffleDf(filter(row -> row.SURVERSE == 1, comb)), 10)

Unnamed: 0_level_0,ID_OUVRAGE,TP_LAT,TP_LNG,TP_Z,DATE,PCP_SUM,PCP_MAX,SURVERSE
Unnamed: 0_level_1,String,Float64,Float64,Float64,Date,Float64,Float64,Int64
1,4720-01D,0.731445,0.86632,0.0736684,2014-06-14,221.082,106.975,1
2,3350-07D,0.353054,0.490395,0.405552,2015-08-19,1.57332,1.57332,1
3,4370-04D,0.0842328,0.458181,0.584737,2018-05-01,6.3224,4.516,1
4,3400-01D,0.346518,0.521076,0.578524,2013-07-08,7.97734,7.97734,1
5,3410-02D,0.319462,0.483178,0.359859,2013-05-26,141.025,19.6779,1
6,3290-01D,0.543825,0.620308,0.376124,2015-08-21,73.8769,33.4537,1
7,3310-01D,0.51946,0.602926,0.367296,2017-07-07,52.9087,29.3143,1
8,4230-03D,0.658594,0.861384,0.0246215,2013-06-28,160.652,66.8313,1
9,3410-02D,0.319462,0.483178,0.359859,2018-07-17,262.372,140.205,1
10,4300-01D,0.413346,0.807748,0.171435,2015-09-13,184.381,57.1662,1


#### Normalize pcp_sum and pcp_max

In [25]:
min_pcp_sum = minimum(comb.PCP_SUM);
max_pcp_sum = maximum(comb.PCP_SUM);

min_pcp_max = minimum(comb.PCP_MAX);
max_pcp_max = maximum(comb.PCP_MAX);

In [42]:
comb.PCP_SUM = normalize.(comb.PCP_SUM, min_pcp_sum, max_pcp_sum);
comb.PCP_MAX = normalize.(comb.PCP_MAX, min_pcp_max, max_pcp_max);

In [46]:
first(shuffleDf(filter(row -> row.SURVERSE == 1, comb)), 10)

Unnamed: 0_level_0,ID_OUVRAGE,TP_LAT,TP_LNG,TP_Z,DATE,PCP_SUM,PCP_MAX,SURVERSE
Unnamed: 0_level_1,String,Float64,Float64,Float64,Date,Float64,Float64,Int64
1,4520-01D,0.0934202,0.422034,0.505935,2017-06-05,0.172273,0.028088,1
2,3260-01D,0.609996,0.697487,0.386587,2013-07-19,0.0452461,0.0187448,1
3,3350-01D,0.459837,0.564893,0.220482,2016-10-08,0.0753927,0.0246258,1
4,4300-01D,0.413346,0.807748,0.171435,2014-05-27,0.0348949,0.0178532,1
5,4795-01D,0.116487,0.719818,0.241964,2018-10-08,0.0644145,0.0443938,1
6,3380-01D,0.370625,0.530672,0.453945,2017-08-05,0.0591627,0.0227856,1
7,4280-01D,0.487687,0.8256,0.136775,2017-06-06,0.0708372,0.019554,1
8,4380-01D,0.160421,0.728206,0.359859,2015-07-21,0.0492386,0.0132215,1
9,3260-01D,0.609996,0.697487,0.386587,2015-06-12,0.032965,0.00904921,1
10,3410-02D,0.319462,0.483178,0.359859,2017-10-15,0.0384266,0.0123957,1


#### Normalize dates

In [56]:
dates = dateToDay.(comb.DATE);
# dates = normalize.(dates, minimum(dates), maximum(dates));

### TODO: Remove outlier in PCP_SUM and PCP_MAX that cause compression

# Model creation

### Separate features and labels

In [94]:
train_features = convert(Matrix{Float64},comb[:, [:TP_LAT, :TP_LNG, :TP_Z, :PCP_SUM, :PCP_MAX]]);
train_features = hcat(dates, train_features);

In [123]:
train_labels = comb[:, :SURVERSE];

### Build Model

#### Test with tree first

In [145]:
model_tree = build_tree(train_labels, train_features)

Decision Tree
Leaves: 3936
Depth:  36

In [146]:
model_tree = prune_tree(model_tree, 0.90)

Decision Tree
Leaves: 3770
Depth:  36

In [149]:
model = build_forest(train_labels, train_features, 4, 21, 0.7, 47)

Ensemble of Decision Trees
Trees:      21
Avg Leaves: 2550.904761904762
Avg Depth:  32.904761904761905

# Prediction

## Get the test data

In [179]:
test = CSV.read("data/test.csv");
rename!(test, :NO_OUVRAGE => :ID_OUVRAGE);
first(test, 10)

Unnamed: 0_level_0,ID_OUVRAGE,DATE
Unnamed: 0_level_1,String,Date
1,3260-01D,2019-05-02
2,3260-01D,2019-05-09
3,3260-01D,2019-05-10
4,3260-01D,2019-05-15
5,3260-01D,2019-05-20
6,3260-01D,2019-05-23
7,3260-01D,2019-05-24
8,3260-01D,2019-05-26
9,3260-01D,2019-05-30
10,3350-07D,2019-05-01


In [180]:
to_merge = unique(comb[!, [:ID_OUVRAGE, :TP_LAT, :TP_LNG, :TP_Z]], :ID_OUVRAGE);
test_comb = join(test, to_merge, on= [:ID_OUVRAGE]);
nrow(test_comb)

283

In [186]:
first(shuffleDf(test_comb), 10);

### Add PCP_SUM and PCP_MAX

#### Initialize default pcp

In [184]:
test_comb.PCP_MAX = zeros(size(test_comb, 1));
test_comb.PCP_SUM = zeros(size(test_comb, 1));
permutecols!(test_comb, [:ID_OUVRAGE, :TP_LAT, :TP_LNG, :TP_Z, :DATE, :PCP_SUM, :PCP_MAX]);

In [189]:
first(shuffleDf(test_comb), 10);

#### Populate pcp

In [187]:
for i=1:size(test_comb, 1)
    id_ouvrage = test_comb[i, 1]; 
    closest_station = "McTavish"; # initial value
    shortest_dist = -1;
    
    # Find closest station
    for j=1:size(station_df, 1)
        dist = findDistance(test_comb[i, :TP_LAT], test_comb[i, :TP_LNG], station_df[j, :LAT], station_df[j, :LNG]);
        
        if shortest_dist == -1 || dist < shortest_dist
            shortest_dist = dist;
            closest_station = station_df[j, :STATION];
        end
    end
    
    # Augment comb with a weighted p_sum, based on the distance to the station
    p_sum = pcp_sum[∈([test_comb[i, :DATE]]).(pcp_sum.date), Symbol(closest_station)];
    test_comb[i, :PCP_SUM] = p_sum[1] * (1 - shortest_dist); 
    # Augment comb with a weighted p_max, based on the distance to the station
    p_max = pcp_max[∈([test_comb[i, :DATE]]).(pcp_max.date), Symbol(closest_station)]
    test_comb[i, :PCP_MAX] = p_max[1] * (1 - shortest_dist);
end

In [200]:
first(shuffleDf(test_comb), 10)

Unnamed: 0_level_0,ID_OUVRAGE,TP_LAT,TP_LNG,TP_Z,DATE,PCP_SUM,PCP_MAX
Unnamed: 0_level_1,String,Float64,Float64,Float64,Date,Float64,Float64
1,4380-01D,0.160421,0.728206,0.359859,2019-08-07,0.00493097,0.00917112
2,3260-01D,0.663919,0.797687,0.359859,2019-08-22,0.0,0.0
3,3350-07D,0.353054,0.490395,0.405552,2019-06-15,0.113623,0.0837334
4,4240-01D,0.607546,0.868856,0.116503,2019-05-22,0.0,0.0
5,3260-01D,0.663919,0.797687,0.359859,2019-08-25,0.0,0.0
6,4380-01D,0.160421,0.728206,0.359859,2019-09-22,0.0,0.0
7,4350-01D,0.237379,0.74432,0.359859,2019-07-17,0.0,0.0
8,4350-01D,0.237379,0.74432,0.359859,2019-07-23,0.0,0.0
9,4240-01D,0.607546,0.868856,0.116503,2019-09-07,0.318678,0.312819
10,3350-07D,0.353054,0.490395,0.405552,2019-07-13,0.0,0.0


#### Normalize pcp

In [193]:
min_pcp_sum = minimum(test_comb.PCP_SUM);
max_pcp_sum = maximum(test_comb.PCP_SUM);

min_pcp_max = minimum(test_comb.PCP_MAX);
max_pcp_max = maximum(test_comb.PCP_MAX);

In [196]:
test_comb.PCP_SUM = normalize.(test_comb.PCP_SUM, min_pcp_sum, max_pcp_sum);
test_comb.PCP_MAX = normalize.(test_comb.PCP_MAX, min_pcp_max, max_pcp_max);

#### Normalize dates

In [203]:
test_dates = dateToDay.(test_comb.DATE);
test_dates = normalize.(test_dates, minimum(test_dates), maximum(test_dates));

### Create Test features

In [205]:
test_features = convert(Matrix{Float64}, test_comb[:, [:TP_LAT, :TP_LNG, :TP_Z, :PCP_SUM, :PCP_MAX]]);
test_features = hcat(test_dates, test_features);

test_features

283×6 Array{Float64,2}:
 0.00671141  0.663919  0.797687  0.359859  0.0254936   0.0221273 
 0.0536913   0.663919  0.797687  0.359859  0.130867    0.173858  
 0.0604027   0.663919  0.797687  0.359859  0.486079    0.158052  
 0.0939597   0.663919  0.797687  0.359859  0.0         0.0       
 0.127517    0.663919  0.797687  0.359859  0.0832792   0.0853482 
 0.147651    0.663919  0.797687  0.359859  0.596551    0.587955  
 0.154362    0.663919  0.797687  0.359859  0.0373907   0.0379326 
 0.167785    0.663919  0.797687  0.359859  0.0356911   0.066382  
 0.194631    0.663919  0.797687  0.359859  0.0220945   0.0316105 
 0.0         0.353054  0.490395  0.405552  0.10076     0.0837334 
 0.00671141  0.353054  0.490395  0.405552  0.0278697   0.0318984 
 0.0469799   0.353054  0.490395  0.405552  0.0         0.0       
 0.0536913   0.353054  0.490395  0.405552  0.184369    0.171454  
 ⋮                                                     ⋮         
 0.805369    0.160421  0.728206  0.359859  0.0      

## Predict

In [206]:
test_labels = apply_forest(model, test_features)

283-element Array{Int64,1}:
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
 ⋮
 0
 1
 0
 0
 0
 0
 0
 0
 1
 1
 0
 0

## Generate submission

In [207]:
ID = test_comb[:,:ID_OUVRAGE].*"_".*string.(test_comb[:,:DATE])
sampleSubmission = DataFrame(ID = ID, Surverse=test_labels)
CSV.write("submissions/mc-submission-4.csv",sampleSubmission)

"submissions/mc-submission-4.csv"