# Data Processing

In [1]:
using CSV, DataFrames, LIBSVM, Statistics, Dates, Gadfly, Random, MLBase;
include("utils/precipitation.jl");

## Build features

### Get and filter the features

#### Latitude, Longitude, Height

In [80]:
features = CSV.read("data/ouvrages-surverses.csv");
colnames = ["N_Env", "ID_SOMA", "ID_OUVRAGE", "NOM", "SOMA_SEC", "REGION", "TP_X", "TP_Y", "TP_Z", "TP_LAT", "TP_LNG", "EMI_X", "EMI_Y", "EMI_LNG", "EMI_LAT"];
names!(features, Symbol.(colnames));
select!(features, [:ID_OUVRAGE, :TP_LAT, :TP_LNG, :TP_Z]);

#### Replace missing Z index with mean

In [81]:
features.TP_Z = coalesce.(features.TP_Z, mean(features[completecases(features), :].TP_Z));
first(shuffleDf(features), 10)

Unnamed: 0_level_0,ID_OUVRAGE,TP_LAT,TP_LNG,TP_Z
Unnamed: 0_level_1,String,Float64,Float64,Float64
1,4300-01D,45.5707,-73.5207,13.59
2,4530-02D,45.4408,-73.7711,19.3526
3,0801-04D,45.5029,-73.5238,12.36
4,4270-02D,45.6112,-73.5084,19.3526
5,3305-02D,45.6233,-73.628,12.51
6,3350-11D,45.5328,-73.7065,25.28
7,3480-05D,45.4986,-73.7835,24.85
8,4620-08D,45.4342,-73.8715,19.3526
9,4370-04D,45.4367,-73.7095,26.23
10,4230-04D,45.6713,-73.4917,8.96


### Load dates and surverses

In [82]:
surverses = CSV.read("data/surverses.csv",missingstring="-99999");

#### Filter months

In [83]:
surverses = filter(row -> month(row.DATE) > 4, surverses);
surverses = filter(row -> month(row.DATE) < 11, surverses);

#### Filter non rain surverses

In [84]:
raison = coalesce.(surverses[:,:RAISON],"Inconnue");
surverses[!,:RAISON] = raison;

surverses = filter(row -> row.RAISON ∈ ["P","Inconnue","TS"], surverses);
select!(surverses, [:NO_OUVRAGE, :DATE, :SURVERSE]);

#### Remove missing data and rename

In [85]:
surverses = dropmissing(surverses, disallowmissing=true);
rename!(surverses, :NO_OUVRAGE => :ID_OUVRAGE);
first(shuffleDf(surverses),10)

Unnamed: 0_level_0,ID_OUVRAGE,DATE,SURVERSE
Unnamed: 0_level_1,String,Date,Int64
1,3480-01D,2016-08-11,0
2,3782-01D,2013-09-19,0
3,3500-03D,2016-06-20,0
4,3350-01D,2018-09-27,0
5,0672-01D,2015-08-24,0
6,3350-03D,2013-05-23,0
7,4370-02D,2014-10-22,0
8,3580-01D,2015-05-22,0
9,4230-03D,2018-05-21,0
10,4330-02D,2014-10-21,0


### Augment features with dates and label

In [86]:
comb = join(features, surverses, on = :ID_OUVRAGE);
first(shuffleDf(comb), 10)

Unnamed: 0_level_0,ID_OUVRAGE,TP_LAT,TP_LNG,TP_Z,DATE,SURVERSE
Unnamed: 0_level_1,String,Float64,Float64,Float64,Date,Int64
1,3310-01D,45.6139,-73.6313,19.58,2016-10-13,1
2,3767-01D,45.5456,-73.701,20.51,2013-09-28,0
3,3290-01D,45.6238,-73.6219,19.85,2016-10-27,1
4,4230-03D,45.6705,-73.4918,9.1,2014-07-09,0
5,4230-08D,45.6968,-73.4809,10.11,2016-05-06,0
6,4390-01D,45.4618,-73.5555,14.62,2013-09-24,0
7,4720-01D,45.7001,-73.4891,10.6,2017-06-04,0
8,4430-02D,45.4344,-73.5869,18.56,2017-08-21,0
9,3310-01D,45.6139,-73.6313,19.58,2016-08-13,1
10,3240-05D,45.6604,-73.5607,27.09,2018-05-30,0


### Load precipitation data

#### Load and filter months between May & October included

In [87]:
precipitation = CSV.read("data/precipitations.csv",missingstring="-99999");
rename!(precipitation, Symbol("St-Hubert")=>:StHubert);

precipitation = filter(row -> month(row.date) > 4, precipitation);
precipitation = filter(row -> month(row.date) < 11, precipitation); 

#### Replace missing data by 0

In [88]:
precipitation[!,:McTavish] = coalesce.(precipitation[:,:McTavish], 0);
precipitation[!,:Bellevue] = coalesce.(precipitation[:,:Bellevue], 0);
precipitation[!,:Assomption] = coalesce.(precipitation[:,:Assomption], 0);
precipitation[!,:Trudeau] = coalesce.(precipitation[:,:Trudeau], 0);
precipitation[!,:StHubert] = coalesce.(precipitation[:,:StHubert], 0);

first(shuffleDf(precipitation), 5)

Unnamed: 0_level_0,date,heure,McTavish,Bellevue,Assomption,Trudeau,StHubert
Unnamed: 0_level_1,Date,Int64,Int64,Int64,Int64,Int64,Int64
1,2016-06-28,5,0,0,0,0,0
2,2014-06-18,12,0,0,0,0,0
3,2014-09-05,8,0,0,0,0,0
4,2014-06-20,9,0,0,0,0,0
5,2017-09-05,14,0,0,0,0,0


### Extract features from precipitation

#### Sum of precipitation for the day

In [89]:
pcp_sum = by(precipitation, :date,  McTavish = :McTavish=>sum, Bellevue = :Bellevue=>sum, 
   Assomption = :Assomption=>sum, Trudeau = :Trudeau=>sum, StHubert = :StHubert=>sum);
first(shuffleDf(pcp_sum), 5)

Unnamed: 0_level_0,date,McTavish,Bellevue,Assomption,Trudeau,StHubert
Unnamed: 0_level_1,Date,Int64,Int64,Int64,Int64,Int64
1,2016-09-11,60,38,30,54,76
2,2019-09-25,18,0,0,2,0
3,2015-06-28,164,108,190,113,119
4,2016-07-25,41,7,100,49,35
5,2013-07-28,94,206,120,110,0


#### Maximum precipitation in an hour for the day

In [90]:
pcp_max = by(precipitation, :date,  McTavish = :McTavish=>maximum, Bellevue = :Bellevue=>maximum, 
   Assomption = :Assomption=>maximum, Trudeau = :Trudeau=>maximum, StHubert = :StHubert=>maximum)
first(shuffleDf(pcp_max),5)

Unnamed: 0_level_0,date,McTavish,Bellevue,Assomption,Trudeau,StHubert
Unnamed: 0_level_1,Date,Int64,Int64,Int64,Int64,Int64
1,2013-07-15,0,0,0,0,0
2,2017-07-14,27,50,0,43,30
3,2018-05-18,0,0,5,0,0
4,2015-08-03,0,10,20,2,0
5,2017-09-27,42,36,50,50,10


#### Maximum precipitation during three consecutive hours in a day

In [91]:
pcp_max3h = by(precipitation, :date,  McTavish = :McTavish=>maximum3, Bellevue = :Bellevue=>maximum3, 
   Assomption = :Assomption=>maximum3, Trudeau = :Trudeau=>maximum3, StHubert = :StHubert=>maximum3)
first(shuffleDf(pcp_max3h),5)

Unnamed: 0_level_0,date,McTavish,Bellevue,Assomption,Trudeau,StHubert
Unnamed: 0_level_1,Date,Int64,Int64,Int64,Int64,Int64
1,2019-08-14,0,0,0,0,0
2,2018-09-03,59,81,123,33,112
3,2019-10-27,210,0,0,197,171
4,2015-10-26,2,0,10,0,0
5,2016-10-19,0,0,0,0,0


### Add precipitation data to features

#### Get stations lat-lng

In [92]:
station_df = DataFrame(STATION = String[], LAT = Float64[], LNG = Float64[]);

push!(station_df, ["McTavish", 45.504742, -73.579167]);
push!(station_df, ["Bellevue", 45.427222, -73.929167]);
push!(station_df, ["Assomption", 45.809444, -73.434722]);
push!(station_df, ["Trudeau", 45.467778, -73.741667]);
push!(station_df, ["StHubert", 45.5175, -73.416944]);

station_df

Unnamed: 0_level_0,STATION,LAT,LNG
Unnamed: 0_level_1,String,Float64,Float64
1,McTavish,45.5047,-73.5792
2,Bellevue,45.4272,-73.9292
3,Assomption,45.8094,-73.4347
4,Trudeau,45.4678,-73.7417
5,StHubert,45.5175,-73.4169


### Normalize elements

#### Get extreme values

In [18]:
min_lat = min(minimum(comb.TP_LAT), minimum(station_df.LAT));
max_lat = max(maximum(comb.TP_LAT), maximum(station_df.LAT));

min_lng = min(minimum(comb.TP_LNG), minimum(station_df.LNG));
max_lng = max(maximum(comb.TP_LNG), maximum(station_df.LNG));

min_z = minimum(comb.TP_Z);
max_z = maximum(comb.TP_Z);

#### Normalize every value of comb between 0 and 1

In [19]:
comb.TP_LAT = normalize.(comb.TP_LAT, min_lat, max_lat);
comb.TP_LNG = normalize.(comb.TP_LNG, min_lng, max_lng);
comb.TP_Z = normalize.(comb.TP_Z, min_z, max_z);

first(shuffleDf(comb), 10)

Unnamed: 0_level_0,ID_OUVRAGE,TP_LAT,TP_LNG,TP_Z,DATE,SURVERSE
Unnamed: 0_level_1,String,Float64,Float64,Float64,Date,Int64
1,4340-01D,0.304064,0.76384,0.359121,2016-10-27,0
2,0801-02D,0.282343,0.794159,0.130563,2014-05-22,0
3,0801-03D,0.259517,0.795544,0.117484,2017-09-14,0
4,3305-02D,0.54252,0.609008,0.136121,2015-08-27,0
5,3350-03D,0.432488,0.553455,0.310401,2018-06-07,0
6,4310-01D,0.398976,0.803102,0.182552,2018-05-26,0
7,4430-02D,0.078588,0.685158,0.333944,2018-10-13,0
8,4230-03D,0.658594,0.861384,0.0246215,2013-05-18,0
9,4265-01D,0.549187,0.847845,0.112252,2013-07-04,0
10,4300-01D,0.413346,0.807748,0.171435,2016-07-30,0


#### Normalize every value of station_df between 0 and 1

In [20]:
station_df.LAT = normalize.(station_df.LAT, min_lat, max_lat);
station_df.LNG = normalize.(station_df.LNG, min_lng, max_lng);

station_df

Unnamed: 0_level_0,STATION,LAT,LNG
Unnamed: 0_level_1,String,Float64,Float64
1,McTavish,0.251316,0.699536
2,Bellevue,0.0608415,0.0512791
3,Assomption,1.0,0.967072
4,Trudeau,0.160492,0.39856
5,StHubert,0.282664,1.0


### Augment Features

#### Add pcp_sum and pcp_max columns

In [94]:
comb.PCP_SUM = zeros(size(comb, 1));
comb.PCP_MAX = zeros(size(comb, 1));
comb.PCP_MAX3 = zeros(size(comb, 1));
permutecols!(comb, [:ID_OUVRAGE, :TP_LAT, :TP_LNG, :TP_Z, :DATE, :PCP_SUM, :PCP_MAX, :PCP_MAX3, :SURVERSE]);

In [95]:
first(shuffleDf(comb), 10)

Unnamed: 0_level_0,ID_OUVRAGE,TP_LAT,TP_LNG,TP_Z,DATE,PCP_SUM,PCP_MAX,PCP_MAX3
Unnamed: 0_level_1,String,Float64,Float64,Float64,Date,Float64,Float64,Float64
1,3305-05D,45.6166,-73.6349,12.12,2017-10-22,0.0,0.0,0.0
2,3350-11D,45.5328,-73.7065,25.28,2013-06-12,0.0,0.0,0.0
3,0801-07D,45.5191,-73.5341,33.58,2013-05-18,0.0,0.0,0.0
4,4330-01D,45.5438,-73.5353,14.16,2017-06-19,0.0,0.0,0.0
5,4560-02D,45.451,-73.7841,26.18,2014-09-22,0.0,0.0,0.0
6,3240-02D,45.6644,-73.5605,10.11,2015-06-15,0.0,0.0,0.0
7,0672-01D,45.6823,-73.531,9.12,2014-05-16,0.0,0.0,0.0
8,3350-11D,45.5328,-73.7065,25.28,2016-07-27,0.0,0.0,0.0
9,3230-01D,45.6661,-73.5469,13.77,2015-06-01,0.0,0.0,0.0
10,3305-06D,45.6136,-73.6363,12.25,2018-10-11,0.0,0.0,0.0


#### Find closest station to each ouvrage and add pcp_sum and pcp_max to it

In [98]:
for i=1:size(comb, 1)
    id_ouvrage = comb[i, 1]; 
    closest_station = "McTavish"; # initial value
    shortest_dist = -1;
    
    # Find closest station
    for j=1:size(station_df, 1)
        dist = findDistance(comb[i, :TP_LAT], comb[i, :TP_LNG], station_df[j, :LAT], station_df[j, :LNG]);
        
        if shortest_dist == -1 || dist < shortest_dist
            shortest_dist = dist;
            closest_station = station_df[j, :STATION];
        end
    end
    
    # Augment comb with a weighted p_sum, based on the distance to the station
    p_sum = pcp_sum[∈([comb[i, :DATE]]).(pcp_sum.date), Symbol(closest_station)];
    comb[i, :PCP_SUM] = p_sum[1] * (1 - shortest_dist); 
    # Augment comb with a weighted p_max, based on the distance to the station
    p_max = pcp_max[∈([comb[i, :DATE]]).(pcp_max.date), Symbol(closest_station)]
    comb[i, :PCP_MAX] = p_max[1] * (1 - shortest_dist);
    # Augment comb with a weighted p_max3h, based on the distance to the station
    p_max3 = pcp_max3h[∈([comb[i, :DATE]]).(pcp_max3h.date), Symbol(closest_station)]
    comb[i, :PCP_MAX3] = p_max3[1] * (1 - shortest_dist);
end

### TODO: Remove outlier in PCP_SUM and PCP_MAX AND PCP_MAX3 that cause compression

In [99]:
comb[comb[:PCP_SUM] .> 750, :PCP_SUM] = 750;
comb[comb[:PCP_MAX] .> 500, :PCP_MAX] = 500;
comb[comb[:PCP_MAX3] .> 750, :PCP_MAX3] = 750;

│   caller = top-level scope at In[99]:1
└ @ Core In[99]:1
│   caller = top-level scope at In[99]:2
└ @ Core In[99]:2
│   caller = top-level scope at In[99]:3
└ @ Core In[99]:3


In [100]:
first(shuffleDf(filter(row -> row.SURVERSE == 1, comb)), 10)

Unnamed: 0_level_0,ID_OUVRAGE,TP_LAT,TP_LNG,TP_Z,DATE,PCP_SUM,PCP_MAX,PCP_MAX3
Unnamed: 0_level_1,String,Float64,Float64,Float64,Date,Float64,Float64,Float64
1,4340-01D,45.5262,-73.5444,19.33,2014-06-17,0.0,0.0,0.0
2,0801-05D,45.5173,-73.5312,12.7,2018-10-27,95.9886,38.0153,59.8741
3,4270-01D,45.6105,-73.5087,11.17,2013-05-29,127.449,65.4703,89.9126
4,3350-01D,45.5896,-73.6519,15.09,2016-08-16,682.181,195.416,526.736
5,3350-07D,45.5461,-73.6921,20.75,2015-07-07,1.81453,1.81453,1.81453
6,3290-01D,45.6238,-73.6219,19.85,2013-05-26,101.326,21.8375,35.8136
7,4230-09D,45.699,-73.4799,8.347,2013-06-07,132.094,27.2994,61.6437
8,0801-05D,45.5173,-73.5312,12.7,2013-07-06,0.0,0.0,0.0
9,3380-01D,45.5533,-73.6703,22.23,2013-08-22,190.101,81.6001,105.811
10,4240-01D,45.6497,-73.4877,11.91,2014-07-31,101.152,55.2509,99.4516


#### Normalize pcp_sum and pcp_max and pcp_max3

In [26]:
min_pcp_sum = minimum(comb.PCP_SUM);
max_pcp_sum = maximum(comb.PCP_SUM);

min_pcp_max = minimum(comb.PCP_MAX);
max_pcp_max = maximum(comb.PCP_MAX);

min_pcp_max3 = minimum(comb.PCP_MAX3);
max_pcp_max3 = maximum(comb.PCP_MAX3);

In [27]:
# comb.PCP_SUM = normalize.(comb.PCP_SUM, min_pcp_sum, max_pcp_sum);
# comb.PCP_MAX = normalize.(comb.PCP_MAX, min_pcp_max, max_pcp_max);
# comb.PCP_MAX3 = normalize.(comb.PCP_MAX3, min_pcp_max3, max_pcp_max3);

In [101]:
first(shuffleDf(filter(row -> row.SURVERSE == 1, comb)), 10)

Unnamed: 0_level_0,ID_OUVRAGE,TP_LAT,TP_LNG,TP_Z,DATE,PCP_SUM,PCP_MAX,PCP_MAX3
Unnamed: 0_level_1,String,Float64,Float64,Float64,Date,Float64,Float64,Float64
1,4720-01D,45.7001,-73.4891,10.6,2013-06-02,122.908,70.2331,96.5706
2,4430-05D,45.424,-73.6573,30.66,2015-07-27,9.04923,7.23938,9.04923
3,4230-09D,45.699,-73.4799,8.347,2013-09-21,140.9,44.0312,107.436
4,4230-09D,45.699,-73.4799,8.347,2013-08-13,246.575,122.407,220.156
5,4340-02D,45.5262,-73.5444,19.33,2015-08-18,188.954,188.954,188.954
6,4230-03D,45.6705,-73.4918,9.1,2016-07-14,118.971,62.8847,118.971
7,3350-11D,45.5328,-73.7065,25.28,2014-09-06,33.3377,13.8907,17.5949
8,3350-07D,45.5461,-73.6921,20.75,2016-08-16,635.084,166.029,453.631
9,3380-01D,45.5533,-73.6703,22.23,2014-05-04,189.205,32.2814,66.3561
10,3350-05D,45.5705,-73.6624,21.27,2017-07-08,292.332,196.676,238.693


### Split dates into months and days

In [102]:
comb.MONTH = month.(comb.DATE);
comb.DAY = day.(comb.DATE);
first(shuffleDf(comb[!, [:DATE, :MONTH, :DAY]]), 5)

Unnamed: 0_level_0,DATE,MONTH,DAY
Unnamed: 0_level_1,Date,Int64,Int64
1,2015-08-25,8,25
2,2016-09-20,9,20
3,2016-05-22,5,22
4,2018-06-11,6,11
5,2018-07-08,7,8


#### Normalize the months and days

In [30]:
# comb.MONTH = normalize.(comb.MONTH, 5, 10);
# comb.DAY = normalize.(comb.DAY, 1, 31);
first(shuffleDf(comb[!, [:DATE, :MONTH, :DAY]]), 5)

Unnamed: 0_level_0,DATE,MONTH,DAY
Unnamed: 0_level_1,Date,Float64,Float64
1,2018-07-14,0.4,0.433333
2,2017-07-06,0.4,0.166667
3,2018-06-04,0.2,0.1
4,2017-09-06,0.8,0.166667
5,2013-06-29,0.2,0.933333


# Validate model

### Split train and validation sets

In [103]:
r_idx = shuffle(1:size(comb, 1));
train_ceil = floor(Int, size(r_idx, 1) * 0.8);
train_set = comb[r_idx[1:train_ceil], :];
val_set = comb[r_idx[train_ceil+1:size(r_idx, 1)], :];

### Train model on train set

#### Build the features and labels

In [104]:
names_ft = [:TP_LAT, :TP_LNG, :TP_Z, :MONTH, :DAY, :PCP_SUM, :PCP_MAX, :PCP_MAX3];
train_features = convert(Matrix{Float64},train_set[:, names_ft]);
train_labels = convert(Vector, train_set[:, :SURVERSE]);

#### Build the model N features to use is log_2(N + 1)

In [None]:
val_model = LIBSVM.fit!(SVC(), train_features, train_labels);

### Validate model on validation set

#### Single validation

In [78]:
val_features = convert(Matrix{Float64},val_set[:, names_ft]);
val_labels = val_set[!, :SURVERSE];
val_pred = LIBSVM.predict(val_model, val_features);

# r = roc(val_labels, val_pred);
# f1score(r)
val_pred

31876-element Array{Int64,1}:
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
 ⋮
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0

#### Batch validation for F1Score

In [410]:
niter = 10;
batch_score = 0;

for i=1:niter
    # Split train and val sets
    r_idx = shuffle(1:size(comb, 1));
    train_ceil = floor(Int, size(r_idx, 1) * 0.8);
    train_set = comb[r_idx[1:train_ceil], :];
    val_set = comb[r_idx[train_ceil+1:size(r_idx, 1)], :];
    
    # Build features and labels
    train_features = convert(Matrix{Float64},train_set[:, names_ft]);
    train_labels = train_set[:, :SURVERSE];
    
    # Build model
    val_model = build_forest(train_labels, train_features, 4, 100, 0.7, 40);
    
    # Validate model
    val_features = convert(Matrix{Float64},val_set[:, names_ft]);
    val_labels = val_set[!, :SURVERSE];
    val_pred = apply_forest(val_model, val_features);

    r = roc(val_labels, val_pred);
    batch_score += f1score(r);
end

batch_score = batch_score / niter

0.6169089771652957

# Submission model creation

### Separate features and labels

In [383]:
full_train_features = convert(Matrix{Float64},comb[:, names_ft]);
# train_features = hcat(dates, train_features);

In [384]:
full_train_labels = comb[:, :SURVERSE];

### Build Model

#### Test with tree first

In [385]:
model_tree = build_tree(full_train_labels, full_train_features)

Decision Tree
Leaves: 4072
Depth:  35

In [386]:
model_tree = prune_tree(model_tree, 0.90)

Decision Tree
Leaves: 3930
Depth:  35

In [387]:
model = build_forest(full_train_labels, full_train_features, 4, 100, 0.7, 40)

Ensemble of Decision Trees
Trees:      100
Avg Leaves: 2921.1
Avg Depth:  31.62

# Prediction

## Get the test data

In [389]:
test = CSV.read("data/test.csv");
rename!(test, :NO_OUVRAGE => :ID_OUVRAGE);
first(test, 10)

Unnamed: 0_level_0,ID_OUVRAGE,DATE
Unnamed: 0_level_1,String,Date
1,3260-01D,2019-05-02
2,3260-01D,2019-05-09
3,3260-01D,2019-05-10
4,3260-01D,2019-05-15
5,3260-01D,2019-05-20
6,3260-01D,2019-05-23
7,3260-01D,2019-05-24
8,3260-01D,2019-05-26
9,3260-01D,2019-05-30
10,3350-07D,2019-05-01


In [390]:
to_merge = unique(comb[!, [:ID_OUVRAGE, :TP_LAT, :TP_LNG, :TP_Z]], :ID_OUVRAGE);
test_comb = join(test, to_merge, on= [:ID_OUVRAGE]);
nrow(test_comb)

283

In [391]:
first(shuffleDf(test_comb), 10)

Unnamed: 0_level_0,ID_OUVRAGE,DATE,TP_LAT,TP_LNG,TP_Z
Unnamed: 0_level_1,String,Date,Float64,Float64,Float64
1,4380-01D,2019-06-30,0.160421,0.728206,0.359859
2,4350-01D,2019-07-08,0.237379,0.74432,0.359859
3,4240-01D,2019-05-23,0.607546,0.868856,0.116503
4,4350-01D,2019-08-06,0.237379,0.74432,0.359859
5,4240-01D,2019-07-14,0.607546,0.868856,0.116503
6,4350-01D,2019-09-05,0.237379,0.74432,0.359859
7,3260-01D,2019-08-28,0.609996,0.697487,0.386587
8,3350-07D,2019-09-02,0.353054,0.490395,0.405552
9,4240-01D,2019-05-22,0.607546,0.868856,0.116503
10,3260-01D,2019-05-23,0.609996,0.697487,0.386587


### Add PCP_SUM and PCP_MAX

#### Initialize default pcp

In [399]:
test_comb.PCP_SUM = zeros(size(test_comb, 1));
test_comb.PCP_MAX = zeros(size(test_comb, 1));
test_comb.PCP_MAX3 = zeros(size(test_comb, 1));
permutecols!(test_comb, [:ID_OUVRAGE, :TP_LAT, :TP_LNG, :TP_Z, :DATE, :PCP_SUM, :PCP_MAX, :PCP_MAX3]);

In [400]:
first(shuffleDf(test_comb), 10)

Unnamed: 0_level_0,ID_OUVRAGE,TP_LAT,TP_LNG,TP_Z,DATE,PCP_SUM,PCP_MAX,PCP_MAX3
Unnamed: 0_level_1,String,Float64,Float64,Float64,Date,Float64,Float64,Float64
1,4380-01D,0.160421,0.728206,0.359859,2019-08-17,0.0,0.0,0.0
2,4350-01D,0.237379,0.74432,0.359859,2019-06-14,0.0,0.0,0.0
3,4240-01D,0.607546,0.868856,0.116503,2019-08-13,0.0,0.0,0.0
4,4350-01D,0.237379,0.74432,0.359859,2019-09-10,0.0,0.0,0.0
5,4350-01D,0.237379,0.74432,0.359859,2019-05-30,0.0,0.0,0.0
6,3260-01D,0.609996,0.697487,0.386587,2019-08-01,0.0,0.0,0.0
7,3350-07D,0.353054,0.490395,0.405552,2019-05-11,0.0,0.0,0.0
8,3350-07D,0.353054,0.490395,0.405552,2019-05-13,0.0,0.0,0.0
9,4380-01D,0.160421,0.728206,0.359859,2019-05-28,0.0,0.0,0.0
10,3350-07D,0.353054,0.490395,0.405552,2019-09-23,0.0,0.0,0.0


#### Populate pcp

In [401]:
for i=1:size(test_comb, 1)
    id_ouvrage = test_comb[i, 1]; 
    closest_station = "McTavish"; # initial value
    shortest_dist = -1;
    
    # Find closest station
    for j=1:size(station_df, 1)
        dist = findDistance(test_comb[i, :TP_LAT], test_comb[i, :TP_LNG], station_df[j, :LAT], station_df[j, :LNG]);
        
        if shortest_dist == -1 || dist < shortest_dist
            shortest_dist = dist;
            closest_station = station_df[j, :STATION];
        end
    end
    
    # Augment comb with a weighted p_sum, based on the distance to the station
    p_sum = pcp_sum[∈([test_comb[i, :DATE]]).(pcp_sum.date), Symbol(closest_station)];
    test_comb[i, :PCP_SUM] = p_sum[1] * (1 - shortest_dist); 
    # Augment comb with a weighted p_max, based on the distance to the station
    p_max = pcp_max[∈([test_comb[i, :DATE]]).(pcp_max.date), Symbol(closest_station)]
    test_comb[i, :PCP_MAX] = p_max[1] * (1 - shortest_dist);
    # Augment comb with a weighted p_max3, based on the distance to the station
    p_max3 = pcp_max3h[∈([test_comb[i, :DATE]]).(pcp_max3h.date), Symbol(closest_station)]
    test_comb[i, :PCP_MAX3] = p_max3[1] * (1 - shortest_dist);
end

In [402]:
first(shuffleDf(test_comb), 10)

Unnamed: 0_level_0,ID_OUVRAGE,TP_LAT,TP_LNG,TP_Z,DATE,PCP_SUM,PCP_MAX,PCP_MAX3
Unnamed: 0_level_1,String,Float64,Float64,Float64,Date,Float64,Float64,Float64
1,4380-01D,0.160421,0.728206,0.359859,2019-06-24,0.0,0.0,0.0
2,3260-01D,0.609996,0.697487,0.386587,2019-08-09,55.153,21.8047,36.5549
3,3350-07D,0.353054,0.490395,0.405552,2019-07-23,0.0,0.0,0.0
4,4240-01D,0.607546,0.868856,0.116503,2019-06-13,5.19718,5.19718,5.19718
5,4350-01D,0.237379,0.74432,0.359859,2019-06-19,0.0,0.0,0.0
6,3350-07D,0.353054,0.490395,0.405552,2019-06-14,45.6263,14.9465,40.9063
7,3260-01D,0.609996,0.697487,0.386587,2019-07-31,0.0,0.0,0.0
8,3350-07D,0.353054,0.490395,0.405552,2019-06-30,24.3865,14.9465,24.3865
9,3350-07D,0.353054,0.490395,0.405552,2019-07-29,0.0,0.0,0.0
10,4380-01D,0.160421,0.728206,0.359859,2019-06-30,113.086,104.039,113.086


#### Normalize pcp based on comb (not test_comb)

In [403]:
test_comb.PCP_SUM = normalize.(test_comb.PCP_SUM, min_pcp_sum, max_pcp_sum);
test_comb.PCP_MAX = normalize.(test_comb.PCP_MAX, min_pcp_max, max_pcp_max);
test_comb.PCP_MAX3 = normalize.(test_comb.PCP_MAX3, min_pcp_max3, max_pcp_max3);

In [404]:
first(test_comb, 20)

Unnamed: 0_level_0,ID_OUVRAGE,TP_LAT,TP_LNG,TP_Z,DATE,PCP_SUM,PCP_MAX,PCP_MAX3
Unnamed: 0_level_1,String,Float64,Float64,Float64,Date,Float64,Float64,Float64
1,3260-01D,0.609996,0.697487,0.386587,2019-05-02,0.0222322,0.0153915,0.0188119
2,3260-01D,0.609996,0.697487,0.386587,2019-05-09,0.0761026,0.0577183,0.0761026
3,3260-01D,0.609996,0.697487,0.386587,2019-05-10,0.329208,0.0846535,0.132538
4,3260-01D,0.609996,0.697487,0.386587,2019-05-15,0.00171017,0.00256526,0.00171017
5,3260-01D,0.609996,0.697487,0.386587,2019-05-20,0.039334,0.0513052,0.0376238
6,3260-01D,0.609996,0.697487,0.386587,2019-05-23,0.14964,0.0808056,0.122277
7,3260-01D,0.609996,0.697487,0.386587,2019-05-24,0.0111161,0.0115437,0.00940595
8,3260-01D,0.609996,0.697487,0.386587,2019-05-26,0.00256526,0.00384789,0.00256526
9,3260-01D,0.609996,0.697487,0.386587,2019-05-30,0.0059856,0.0089784,0.0059856
10,3350-07D,0.353054,0.490395,0.405552,2019-05-01,0.0492974,0.0330397,0.0430041


#### Split dates into month and day

In [405]:
test_comb.MONTH = month.(test_comb.DATE);
test_comb.DAY = day.(test_comb.DATE);

first(shuffleDf(test_comb[!, [:DATE, :MONTH, :DAY]]), 5)

Unnamed: 0_level_0,DATE,MONTH,DAY
Unnamed: 0_level_1,Date,Int64,Int64
1,2019-05-24,5,24
2,2019-07-11,7,11
3,2019-09-13,9,13
4,2019-05-06,5,6
5,2019-06-28,6,28


#### Normalize months and days

In [406]:
test_comb.MONTH = normalize.(test_comb.MONTH, 5, 10);
test_comb.DAY = normalize.(test_comb.DAY, 1, 31);
first(shuffleDf(test_comb[!, [:DATE, :MONTH, :DAY]]), 5)

Unnamed: 0_level_0,DATE,MONTH,DAY
Unnamed: 0_level_1,Date,Float64,Float64
1,2019-08-27,0.6,0.866667
2,2019-07-07,0.4,0.2
3,2019-07-05,0.4,0.133333
4,2019-09-28,0.8,0.9
5,2019-07-31,0.4,1.0


### Create Test features

In [407]:
test_features = convert(Matrix{Float64}, test_comb[:, names_ft]);
# test_features = hcat(test_dates, test_features);

test_features

283×8 Array{Float64,2}:
 0.609996  0.697487  0.386587  0.0  …  0.0222322   0.0153915   0.0188119 
 0.609996  0.697487  0.386587  0.0     0.0761026   0.0577183   0.0761026 
 0.609996  0.697487  0.386587  0.0     0.329208    0.0846535   0.132538  
 0.609996  0.697487  0.386587  0.0     0.00171017  0.00256526  0.00171017
 0.609996  0.697487  0.386587  0.0     0.039334    0.0513052   0.0376238 
 0.609996  0.697487  0.386587  0.0  …  0.14964     0.0808056   0.122277  
 0.609996  0.697487  0.386587  0.0     0.0111161   0.0115437   0.00940595
 0.609996  0.697487  0.386587  0.0     0.00256526  0.00384789  0.00256526
 0.609996  0.697487  0.386587  0.0     0.0059856   0.0089784   0.0059856 
 0.353054  0.490395  0.405552  0.0     0.0492974   0.0330397   0.0430041 
 0.353054  0.490395  0.405552  0.0  …  0.0136354   0.0125866   0.0115377 
 0.353054  0.490395  0.405552  0.0     0.0         0.0         0.0       
 0.353054  0.490395  0.405552  0.0     0.0902037   0.0676528   0.0902037 
 ⋮            

## Predict

In [408]:
test_labels = apply_forest(model, test_features)

283-element Array{Int64,1}:
 0
 0
 1
 0
 0
 1
 0
 0
 0
 1
 0
 0
 1
 ⋮
 0
 1
 1
 0
 0
 0
 0
 0
 0
 0
 0
 0

## Generate submission

In [409]:
ID = test_comb[:,:ID_OUVRAGE].*"_".*string.(test_comb[:,:DATE])
sampleSubmission = DataFrame(ID = ID, Surverse=test_labels)
CSV.write("submissions/mc-submission-7.csv",sampleSubmission)

"submissions/mc-submission-7.csv"