# Data Processing

In [78]:
using CSV, DataFrames, DecisionTree, Statistics, Dates, Gadfly, Random, MLBase;
include("utils/precipitation.jl");

## Build features

### Get and filter the features

#### Latitude, Longitude, Height

In [79]:
# Read the ouvrage table, give its columns explicit names, and keep only the
# identifier plus the TP latitude / longitude / height columns.
features = CSV.read("data/ouvrages-surverses.csv");
colnames = [
    "N_Env", "ID_SOMA", "ID_OUVRAGE", "NOM", "SOMA_SEC", "REGION",
    "TP_X", "TP_Y", "TP_Z", "TP_LAT", "TP_LNG",
    "EMI_X", "EMI_Y", "EMI_LNG", "EMI_LAT",
];
names!(features, map(Symbol, colnames));
select!(features, [:ID_OUVRAGE, :TP_LAT, :TP_LNG, :TP_Z]);

#### Replace missing Z index with mean

In [80]:
# Fill missing TP_Z values with the mean of the observed ones. `skipmissing` looks
# at TP_Z only, so a row is never left out of the mean because some other column
# is missing, and no filtered copy of the frame is built.
features.TP_Z = coalesce.(features.TP_Z, mean(skipmissing(features.TP_Z)));
first(shuffleDf(features), 10)

Unnamed: 0_level_0,ID_OUVRAGE,TP_LAT,TP_LNG,TP_Z
Unnamed: 0_level_1,String,Float64,Float64,Float64
1,3540-02D,45.4751,-73.8727,26.52
2,4340-02D,45.5262,-73.5444,19.33
3,4420-02D,45.443,-73.5764,17.08
4,4560-01D,45.4509,-73.7843,26.37
5,3305-03D,45.6196,-73.6329,11.76
6,3781-01D,45.4723,-73.8773,24.7
7,4430-06D,45.4304,-73.6655,23.67
8,4430-04D,45.4197,-73.6487,31.54
9,4710-01D,45.6976,-73.4968,12.12
10,3490-02D,45.5054,-73.8061,24.49


### Load dates and surverses

In [81]:
# Load the surverse records; the sentinel string "-99999" is read as `missing`.
surverses = CSV.read("data/surverses.csv"; missingstring = "-99999");

#### Filter months

In [82]:
# Keep only the records dated May through October (months 5 to 10), in one pass.
surverses = filter(row -> month(row.DATE) in 5:10, surverses);

#### Filter out non-rain surverses

In [83]:
# A missing RAISON is treated as the explicit category "Inconnue".
raison = coalesce.(surverses[:, :RAISON], "Inconnue");
surverses[!, :RAISON] = raison;

# Keep only the rows whose reason is one of the retained codes, then drop
# every column except the ouvrage id, the date and the label.
surverses = filter(row -> in(row.RAISON, ("P", "Inconnue", "TS")), surverses);
select!(surverses, [:NO_OUVRAGE, :DATE, :SURVERSE]);

#### Remove missing data and rename

In [84]:
# Discard rows with any missing value (tightening the column types), then
# rename the id column so it matches the one used in `features`.
surverses = dropmissing(surverses; disallowmissing = true);
rename!(surverses, :NO_OUVRAGE => :ID_OUVRAGE);
first(shuffleDf(surverses), 10)

Unnamed: 0_level_0,ID_OUVRAGE,DATE,SURVERSE
Unnamed: 0_level_1,String,Date,Int64
1,4270-02D,2018-05-29,0
2,4560-03D,2013-07-10,0
3,4610-09D,2015-09-16,0
4,4230-06D,2016-08-15,0
5,3305-01D,2015-09-24,0
6,4280-02D,2017-10-21,0
7,4430-06D,2017-05-10,0
8,3305-02D,2017-09-07,0
9,3580-01D,2016-06-29,0
10,0672-03D,2017-05-18,0


### Augment features with dates and label

In [85]:
# Inner join: one row per (ouvrage, date) record, carrying the ouvrage's
# position columns alongside the SURVERSE label.
comb = join(features, surverses; on = :ID_OUVRAGE, kind = :inner);
first(shuffleDf(comb), 10)

Unnamed: 0_level_0,ID_OUVRAGE,TP_LAT,TP_LNG,TP_Z,DATE,SURVERSE
Unnamed: 0_level_1,String,Float64,Float64,Float64,Date,Int64
1,3540-02D,45.4751,-73.8727,26.52,2014-06-04,0
2,0801-09D,45.5158,-73.5357,25.28,2013-10-31,0
3,4230-09D,45.699,-73.4799,8.347,2013-05-06,0
4,4230-04D,45.6713,-73.4917,8.96,2016-07-03,0
5,4620-01D,45.4156,-73.885,24.92,2016-10-18,0
6,3400-01D,45.5435,-73.6755,26.04,2014-07-02,0
7,3350-09D,45.5371,-73.713,23.72,2015-06-25,0
8,3290-01D,45.6238,-73.6219,19.85,2014-09-04,0
9,3767-01D,45.5456,-73.701,20.51,2017-06-21,0
10,4600-03D,45.4364,-73.8226,26.42,2017-05-31,0


### Load precipitation data

#### Load and filter months between May & October included

In [86]:
# Load the precipitation table ("-99999" is read as `missing`) and rename the
# hyphenated station column to a plain identifier.
precipitation = CSV.read("data/precipitations.csv"; missingstring = "-99999");
rename!(precipitation, Symbol("St-Hubert") => :StHubert);

# Keep May through October only (months 5 to 10).
precipitation = filter(row -> month(row.date) in 5:10, precipitation);

#### Replace missing data by 0

In [87]:
# Replace missing readings with 0 in every station column.
for station in (:McTavish, :Bellevue, :Assomption, :Trudeau, :StHubert)
    precipitation[!, station] = coalesce.(precipitation[:, station], 0);
end

first(shuffleDf(precipitation), 5)

Unnamed: 0_level_0,date,heure,McTavish,Bellevue,Assomption,Trudeau,StHubert
Unnamed: 0_level_1,Date,Int64,Int64,Int64,Int64,Int64,Int64
1,2013-10-29,15,0,0,0,0,0
2,2018-07-01,8,0,0,0,0,0
3,2015-08-16,16,0,0,0,0,0
4,2017-05-08,18,0,0,0,0,0
5,2019-09-14,8,2,0,0,0,8


### Extract features from precipitation

#### Sum of precipitation for the day

In [88]:
# Daily total per station: group the precipitation rows by date and sum each
# station column within the group.
pcp_sum = by(precipitation, :date,  McTavish = :McTavish=>sum, Bellevue = :Bellevue=>sum, 
   Assomption = :Assomption=>sum, Trudeau = :Trudeau=>sum, StHubert = :StHubert=>sum);
first(shuffleDf(pcp_sum), 5)

Unnamed: 0_level_0,date,McTavish,Bellevue,Assomption,Trudeau,StHubert
Unnamed: 0_level_1,Date,Int64,Int64,Int64,Int64,Int64
1,2016-08-27,0,0,0,0,0
2,2016-05-19,0,0,101,0,8
3,2014-10-14,0,0,10,0,0
4,2019-06-18,0,0,0,0,0
5,2019-06-29,31,0,64,138,186


#### Maximum precipitation in an hour for the day

In [89]:
# Daily maximum per station: group the precipitation rows by date and take the
# largest single reading of each station column within the group.
pcp_max = by(precipitation, :date,  McTavish = :McTavish=>maximum, Bellevue = :Bellevue=>maximum, 
   Assomption = :Assomption=>maximum, Trudeau = :Trudeau=>maximum, StHubert = :StHubert=>maximum)
first(shuffleDf(pcp_max),5)

Unnamed: 0_level_0,date,McTavish,Bellevue,Assomption,Trudeau,StHubert
Unnamed: 0_level_1,Date,Int64,Int64,Int64,Int64,Int64
1,2015-06-16,232,74,62,118,126
2,2017-05-06,34,26,40,47,12
3,2018-05-02,0,0,17,0,0
4,2018-05-20,68,71,35,69,63
5,2018-09-18,0,0,0,2,0


#### Maximum precipitation during three consecutive hours in a day

In [90]:
# Per-date, per-station aggregate computed with `maximum3`. That helper is not
# defined in this section (presumably it comes from utils/precipitation.jl);
# going by the heading above, it yields the largest total over three
# consecutive hours of the day — TODO confirm against its definition.
pcp_max3h = by(precipitation, :date,  McTavish = :McTavish=>maximum3, Bellevue = :Bellevue=>maximum3, 
   Assomption = :Assomption=>maximum3, Trudeau = :Trudeau=>maximum3, StHubert = :StHubert=>maximum3)
first(shuffleDf(pcp_max3h),5)

Unnamed: 0_level_0,date,McTavish,Bellevue,Assomption,Trudeau,StHubert
Unnamed: 0_level_1,Date,Int64,Int64,Int64,Int64,Int64
1,2018-05-11,0,0,0,0,0
2,2019-08-23,0,0,0,0,0
3,2014-09-14,0,0,0,0,0
4,2018-06-10,0,0,2,0,0
5,2015-06-24,6,0,0,29,2


### Add precipitation data to features

#### Get stations lat-lng

In [91]:
# Latitude / longitude of the five weather stations, one row per station, in
# the same order as the station columns of the precipitation table.
station_df = DataFrame(
    STATION = ["McTavish", "Bellevue", "Assomption", "Trudeau", "StHubert"],
    LAT = [45.504742, 45.427222, 45.809444, 45.467778, 45.5175],
    LNG = [-73.579167, -73.929167, -73.434722, -73.741667, -73.416944],
);

station_df

Unnamed: 0_level_0,STATION,LAT,LNG
Unnamed: 0_level_1,String,Float64,Float64
1,McTavish,45.5047,-73.5792
2,Bellevue,45.4272,-73.9292
3,Assomption,45.8094,-73.4347
4,Trudeau,45.4678,-73.7417
5,StHubert,45.5175,-73.4169


### Standardize TP and station data

In [92]:
# z-score the ouvrage coordinates. The stations are rescaled with the *same*
# mean / std (taken from comb) so both live in one coordinate system.
meanlat, stdlat = mean(comb.TP_LAT), std(comb.TP_LAT);
comb.TP_LAT = (comb.TP_LAT .- meanlat) ./ stdlat;
station_df.LAT = (station_df.LAT .- meanlat) ./ stdlat;

meanlng, stdlng = mean(comb.TP_LNG), std(comb.TP_LNG);
comb.TP_LNG = (comb.TP_LNG .- meanlng) ./ stdlng;
station_df.LNG = (station_df.LNG .- meanlng) ./ stdlng;

# Height only exists on the ouvrage side.
meanz, stdz = mean(comb.TP_Z), std(comb.TP_Z);
comb.TP_Z = (comb.TP_Z .- meanz) ./ stdz;

In [93]:
# Display the station table (LAT / LNG are now in standardized units).
station_df

Unnamed: 0_level_0,STATION,LAT,LNG
Unnamed: 0_level_1,String,Float64,Float64
1,McTavish,-0.399934,0.53979
2,Bellevue,-1.29892,-2.14237
3,Assomption,3.13364,1.64672
4,Trudeau,-0.828599,-0.705498
5,StHubert,-0.251981,1.78296


### Augment Features

#### Add pcp_sum, pcp_max and pcp_max3 columns

In [94]:
# Add three zero-filled precipitation feature columns, then reorder the
# columns so that the SURVERSE label comes last.
comb.PCP_SUM = zeros(nrow(comb));
comb.PCP_MAX = zeros(nrow(comb));
comb.PCP_MAX3 = zeros(nrow(comb));
permutecols!(
    comb,
    [:ID_OUVRAGE, :TP_LAT, :TP_LNG, :TP_Z, :DATE, :PCP_SUM, :PCP_MAX, :PCP_MAX3, :SURVERSE],
);

In [95]:
# Preview 10 rows of comb; shuffleDf is a helper defined outside this section.
first(shuffleDf(comb), 10)

Unnamed: 0_level_0,ID_OUVRAGE,TP_LAT,TP_LNG,TP_Z,DATE,PCP_SUM,PCP_MAX,PCP_MAX3
Unnamed: 0_level_1,String,Float64,Float64,Float64,Date,Float64,Float64,Float64
1,0801-05D,-0.254625,0.907701,-1.07023,2017-10-12,0.0,0.0,0.0
2,4270-02D,0.834128,1.08217,-0.0467296,2018-07-17,0.0,0.0,0.0
3,4610-06D,-1.26037,-1.58426,1.44523,2013-06-04,0.0,0.0,0.0
4,4360-01D,-0.562793,0.757544,-0.693299,2016-09-01,0.0,0.0,0.0
5,3240-03D,1.45111,0.682728,-1.46871,2018-10-20,0.0,0.0,0.0
6,3480-04D,-0.454006,-1.05544,0.760593,2017-06-24,0.0,0.0,0.0
7,3275-01D,1.1339,0.326263,-0.788686,2015-07-07,0.0,0.0,0.0
8,0801-06D,-0.238094,0.893818,-0.153282,2013-10-01,0.0,0.0,0.0
9,3767-01D,0.0734405,-0.394057,0.131343,2017-05-31,0.0,0.0,0.0
10,0801-06D,-0.238094,0.893818,-0.153282,2014-07-10,0.0,0.0,0.0


#### Find the closest station to each ouvrage and copy its pcp_sum, pcp_max and pcp_max3

In [96]:
# Row index of each date in the three daily aggregates, built once so that every
# lookup inside the loop is a hash probe instead of a scan over all dates.
# (date is the grouping key of those tables, so each date appears once.)
sum_rows = Dict(zip(pcp_sum.date, 1:nrow(pcp_sum)));
max_rows = Dict(zip(pcp_max.date, 1:nrow(pcp_max)));
max3_rows = Dict(zip(pcp_max3h.date, 1:nrow(pcp_max3h)));

for i in 1:nrow(comb)
    # Distance from this ouvrage to every station (both in standardized lat/lng).
    dists = map(1:nrow(station_df)) do j
        findDistance(comb[i, :TP_LAT], comb[i, :TP_LNG], station_df[j, :LAT], station_df[j, :LNG])
    end;

    # argmin returns the first minimum, so ties go to the earlier station row.
    station = Symbol(station_df[argmin(dists), :STATION]);
    date = comb[i, :DATE];

    # Copy the closest station's daily values as-is (no distance weighting).
    # A date absent from the aggregates raises a KeyError.
    comb[i, :PCP_SUM] = pcp_sum[sum_rows[date], station];
    comb[i, :PCP_MAX] = pcp_max[max_rows[date], station];
    comb[i, :PCP_MAX3] = pcp_max3h[max3_rows[date], station];
end

#### Cap outliers in PCP_SUM, PCP_MAX and PCP_MAX3 that cause compression

In [97]:
# Cap extreme values so a few very wet days do not compress the rest of the
# range once the columns are standardized. Written as a broadcast `min`
# instead of the deprecated `comb[:COL]` indexing, which emitted a
# deprecation warning for each of the three statements.
comb.PCP_SUM = min.(comb.PCP_SUM, 750.0);
comb.PCP_MAX = min.(comb.PCP_MAX, 500.0);
comb.PCP_MAX3 = min.(comb.PCP_MAX3, 750.0);

│   caller = top-level scope at In[97]:1
└ @ Core In[97]:1
│   caller = top-level scope at In[97]:2
└ @ Core In[97]:2
│   caller = top-level scope at In[97]:3
└ @ Core In[97]:3


In [98]:
# Preview 10 rows whose SURVERSE label is 1.
first(shuffleDf(filter(row -> row.SURVERSE == 1, comb)), 10)

Unnamed: 0_level_0,ID_OUVRAGE,TP_LAT,TP_LNG,TP_Z,DATE,PCP_SUM,PCP_MAX,PCP_MAX3
Unnamed: 0_level_1,String,Float64,Float64,Float64,Date,Float64,Float64,Float64
1,4360-01D,-0.562793,0.757544,-0.693299,2013-09-12,362.0,99.0,138.0
2,4430-01D,-1.23852,0.468524,-0.045586,2016-07-23,183.0,128.0,157.0
3,3400-01D,0.0493937,-0.198587,0.982139,2017-08-18,189.0,69.0,172.0
4,3260-01D,1.29293,0.531312,0.0790333,2015-05-30,38.0,28.0,36.0
5,3480-02D,-0.471222,-1.08189,0.88829,2014-06-03,209.0,120.0,198.0
6,3350-11D,-0.074433,-0.435663,0.865212,2013-06-12,37.0,16.0,37.0
7,4360-01D,-0.562793,0.757544,-0.693299,2014-08-13,618.0,132.0,262.0
8,4400-02D,-1.0264,0.591287,-0.536371,2015-08-24,0.0,0.0,0.0
9,3310-01D,0.865632,0.140067,-0.0117388,2014-05-17,300.0,90.0,203.0
10,4430-05D,-1.33635,-0.0587449,1.69293,2018-09-21,329.0,143.0,148.0


### Split dates into months and days

In [99]:
# Derive the calendar month and day-of-month of each record as features.
comb.MONTH = map(month, comb.DATE);
comb.DAY = map(day, comb.DATE);
first(shuffleDf(comb[!, [:DATE, :MONTH, :DAY]]), 5)

Unnamed: 0_level_0,DATE,MONTH,DAY
Unnamed: 0_level_1,Date,Int64,Int64
1,2014-08-26,8,26
2,2016-06-02,6,2
3,2014-08-16,8,16
4,2017-06-28,6,28
5,2018-09-28,9,28


## Standardize the PCP and Date

In [100]:
# z-score the precipitation and calendar features. Each mean / std is kept in a
# global because the test set is later scaled with these same statistics.
mean_pcpsum, std_pcpsum = mean(comb.PCP_SUM), std(comb.PCP_SUM);
comb.PCP_SUM = (comb.PCP_SUM .- mean_pcpsum) ./ std_pcpsum;

mean_pcpmax, std_pcpmax = mean(comb.PCP_MAX), std(comb.PCP_MAX);
comb.PCP_MAX = (comb.PCP_MAX .- mean_pcpmax) ./ std_pcpmax;

mean_pcpmax3, std_pcpmax3 = mean(comb.PCP_MAX3), std(comb.PCP_MAX3);
comb.PCP_MAX3 = (comb.PCP_MAX3 .- mean_pcpmax3) ./ std_pcpmax3;

meanmonth, stdmonth = mean(comb.MONTH), std(comb.MONTH);
comb.MONTH = (comb.MONTH .- meanmonth) ./ stdmonth;

meanday, stdday = mean(comb.DAY), std(comb.DAY);
comb.DAY = (comb.DAY .- meanday) ./ stdday;

In [101]:
# Preview 10 rows whose SURVERSE label is 1, now with standardized features.
first(shuffleDf(filter(row -> row.SURVERSE == 1, comb)), 10)

Unnamed: 0_level_0,ID_OUVRAGE,TP_LAT,TP_LNG,TP_Z,DATE,PCP_SUM,PCP_MAX
Unnamed: 0_level_1,String,Float64,Float64,Float64,Date,Float64,Float64
1,3400-01D,0.0493937,-0.198587,0.982139,2015-07-27,2.28954,4.85889
2,4330-01D,0.0527466,0.876211,-0.845611,2014-06-17,-0.400087,-0.401707
3,3400-01D,0.0493937,-0.198587,0.982139,2015-06-16,5.11501,6.73548
4,3380-01D,0.163171,-0.158884,0.395966,2016-08-14,1.80052,0.767314
5,4230-09D,1.85245,1.30015,-1.73995,2013-07-28,1.22999,1.75175
6,3250-01D,1.3565,0.586367,-0.384058,2018-06-04,3.52568,0.64426
7,4430-05D,-1.33635,-0.0587449,1.69293,2016-08-13,3.15891,2.52085
8,4265-01D,1.00593,1.15342,-1.21178,2014-07-05,0.00743168,0.521205
9,3250-01D,1.3565,0.586367,-0.384058,2015-07-07,0.523623,0.951897
10,4265-01D,1.00593,1.15342,-1.21178,2014-10-04,2.00428,0.582732


# Validate model

### Split train and validation sets

In [102]:
# Random 80/20 split of comb's rows: the first 80% of a shuffled index go to
# the training set, the remainder to the validation set.
r_idx = shuffle(1:nrow(comb));
train_ceil = floor(Int, length(r_idx) * 0.8);
train_set = comb[r_idx[1:train_ceil], :];
val_set = comb[r_idx[(train_ceil + 1):end], :];

### Train model on train set

#### Random Forest Params

In [103]:
# Feature columns fed to the model, in matrix-column order.
names_ft = [:TP_LAT, :TP_LNG, :TP_Z, :MONTH, :DAY, :PCP_SUM, :PCP_MAX, :PCP_MAX3];

# Values passed positionally to build_forest after the labels and features.
# NOTE(review): the meanings below follow DecisionTree.jl's documented argument
# order — confirm against the installed version.
nft = 3;        # features considered per split
ntrees = 100;   # trees in the forest
podata = 0.8;   # portion of the samples drawn for each tree
maxd = 25;      # maximum tree depth

#### Build the features and labels

In [104]:
# Dense Float64 feature matrix and the matching label vector for the training rows.
train_features = convert(Matrix{Float64}, train_set[:, names_ft]);
train_labels = train_set[:, :SURVERSE];

#### Build the model (the number of features to use is log_2(N + 1))

In [105]:
# Fit a random forest on the training split using the parameters set above.
val_model = build_forest(train_labels, train_features, nft, ntrees, podata, maxd)

Ensemble of Decision Trees
Trees:      100
Avg Leaves: 2988.55
Avg Depth:  25.0

### Validate model on validation set

#### Single validation

In [106]:
# Re-fit the forest on the training split, predict the validation rows and
# report the F1 score of those predictions.
val_model = build_forest(train_labels, train_features, nft, ntrees, podata, maxd);

val_features = convert(Matrix{Float64}, val_set[:, names_ft]);
val_labels = val_set[!, :SURVERSE];
val_pred = apply_forest(val_model, val_features);

r = roc(val_labels, val_pred);
f1score(r)

0.6528384279475983

#### Batch validation for F1Score

In [107]:
# Repeat the random 80/20 split / fit / score cycle `niter` times and report
# the average F1 score, to smooth out the variance of a single split.
niter = 10;
batch_score = 0;

for iter in 1:niter
    # Fresh random 80/20 split.
    r_idx = shuffle(1:nrow(comb));
    train_ceil = floor(Int, length(r_idx) * 0.8);
    train_set = comb[r_idx[1:train_ceil], :];
    val_set = comb[r_idx[(train_ceil + 1):end], :];

    # Fit on the training part.
    train_features = convert(Matrix{Float64}, train_set[:, names_ft]);
    train_labels = train_set[:, :SURVERSE];
    val_model = build_forest(train_labels, train_features, nft, ntrees, podata, maxd);

    # Score on the held-out part.
    val_features = convert(Matrix{Float64}, val_set[:, names_ft]);
    val_labels = val_set[!, :SURVERSE];
    val_pred = apply_forest(val_model, val_features);

    r = roc(val_labels, val_pred);
    batch_score += f1score(r);
end

batch_score = batch_score / niter

0.6427118767207884

# Submission model creation

### Separate features and labels

In [108]:
# Feature matrix built from every row of comb (no hold-out) for the submission model.
full_train_features = convert(Matrix{Float64},comb[:, names_ft]);

In [109]:
# Labels for every row of comb, aligned with full_train_features.
full_train_labels = comb[:, :SURVERSE];

### Build Model

#### Test with tree first

In [110]:
# Fit a single decision tree on the full data with the library's default settings.
model_tree = build_tree(full_train_labels, full_train_features)

Decision Tree
Leaves: 4709
Depth:  33

In [111]:
# Prune the tree with a threshold of 0.90 and keep the pruned result.
# NOTE(review): presumably 0.90 is a leaf-purity threshold — confirm in the
# DecisionTree.jl documentation.
model_tree = prune_tree(model_tree, 0.90)

Decision Tree
Leaves: 4588
Depth:  33

In [112]:
# Final random forest, fit on the full data with the same parameters used for
# validation; this is the model applied to the test features below.
model = build_forest(full_train_labels, full_train_features, nft, ntrees, podata, maxd)

Ensemble of Decision Trees
Trees:      100
Avg Leaves: 3568.16
Avg Depth:  25.0

# Prediction

## Get the test data

In [113]:
# Load the (ouvrage, date) pairs to predict and align the id column name
# with the one used in the training data.
test = CSV.read("data/test.csv");
rename!(test, :NO_OUVRAGE => :ID_OUVRAGE);
first(test, 10)

Unnamed: 0_level_0,ID_OUVRAGE,DATE
Unnamed: 0_level_1,String,Date
1,3260-01D,2019-05-02
2,3260-01D,2019-05-09
3,3260-01D,2019-05-10
4,3260-01D,2019-05-15
5,3260-01D,2019-05-20
6,3260-01D,2019-05-23
7,3260-01D,2019-05-24
8,3260-01D,2019-05-26
9,3260-01D,2019-05-30
10,3350-07D,2019-05-01


In [114]:
# One row per ouvrage with its already-standardized position, taken from comb,
# then attached to every test record through an inner join on the ouvrage id.
to_merge = unique(comb[!, [:ID_OUVRAGE, :TP_LAT, :TP_LNG, :TP_Z]], :ID_OUVRAGE);
test_comb = join(test, to_merge; on = :ID_OUVRAGE, kind = :inner);
size(test_comb, 1)

283

In [115]:
# Preview 10 rows of the joined test table.
first(shuffleDf(test_comb), 10)

Unnamed: 0_level_0,ID_OUVRAGE,DATE,TP_LAT,TP_LNG,TP_Z
Unnamed: 0_level_1,String,Date,Float64,Float64,Float64
1,3260-01D,2019-05-23,1.29293,0.531312,0.0790333
2,4380-01D,2019-09-28,-0.828933,0.658409,-0.0467296
3,3350-07D,2019-07-11,0.0802391,-0.325532,0.168267
4,3350-07D,2019-06-17,0.0802391,-0.325532,0.168267
5,4380-01D,2019-06-28,-0.828933,0.658409,-0.0467296
6,4240-01D,2019-09-28,1.28137,1.24035,-1.19178
7,4240-01D,2019-06-28,1.28137,1.24035,-1.19178
8,4350-01D,2019-08-17,-0.465715,0.725081,-0.0467296
9,4380-01D,2019-09-29,-0.828933,0.658409,-0.0467296
10,4380-01D,2019-09-01,-0.828933,0.658409,-0.0467296


### Add PCP_SUM, PCP_MAX and PCP_MAX3

#### Initialize default pcp

In [116]:
# Add three zero-filled precipitation feature columns and put the columns in
# the same order as in the training table (minus the label).
test_comb.PCP_SUM = zeros(nrow(test_comb));
test_comb.PCP_MAX = zeros(nrow(test_comb));
test_comb.PCP_MAX3 = zeros(nrow(test_comb));
permutecols!(
    test_comb,
    [:ID_OUVRAGE, :TP_LAT, :TP_LNG, :TP_Z, :DATE, :PCP_SUM, :PCP_MAX, :PCP_MAX3],
);

In [117]:
# Preview 10 rows of test_comb after adding the zero-initialized precipitation columns.
first(shuffleDf(test_comb), 10)

Unnamed: 0_level_0,ID_OUVRAGE,TP_LAT,TP_LNG,TP_Z,DATE,PCP_SUM,PCP_MAX,PCP_MAX3
Unnamed: 0_level_1,String,Float64,Float64,Float64,Date,Float64,Float64,Float64
1,3260-01D,1.29293,0.531312,0.0790333,2019-08-21,0.0,0.0,0.0
2,4350-01D,-0.465715,0.725081,-0.0467296,2019-07-06,0.0,0.0,0.0
3,3260-01D,1.29293,0.531312,0.0790333,2019-09-17,0.0,0.0,0.0
4,4380-01D,-0.828933,0.658409,-0.0467296,2019-06-20,0.0,0.0,0.0
5,3260-01D,1.29293,0.531312,0.0790333,2019-05-24,0.0,0.0,0.0
6,4240-01D,1.28137,1.24035,-1.19178,2019-05-22,0.0,0.0,0.0
7,3260-01D,1.29293,0.531312,0.0790333,2019-08-28,0.0,0.0,0.0
8,4240-01D,1.28137,1.24035,-1.19178,2019-08-17,0.0,0.0,0.0
9,3260-01D,1.29293,0.531312,0.0790333,2019-05-09,0.0,0.0,0.0
10,4380-01D,-0.828933,0.658409,-0.0467296,2019-09-05,0.0,0.0,0.0


#### Populate pcp

In [118]:
# Row index of each date in the three daily aggregates, built once so that every
# lookup inside the loop is a hash probe instead of a scan over all dates.
# (date is the grouping key of those tables, so each date appears once.)
sum_rows = Dict(zip(pcp_sum.date, 1:nrow(pcp_sum)));
max_rows = Dict(zip(pcp_max.date, 1:nrow(pcp_max)));
max3_rows = Dict(zip(pcp_max3h.date, 1:nrow(pcp_max3h)));

for i in 1:nrow(test_comb)
    # Distance from this ouvrage to every station (both in standardized lat/lng).
    dists = map(1:nrow(station_df)) do j
        findDistance(test_comb[i, :TP_LAT], test_comb[i, :TP_LNG], station_df[j, :LAT], station_df[j, :LNG])
    end;

    # argmin returns the first minimum, so ties go to the earlier station row.
    station = Symbol(station_df[argmin(dists), :STATION]);
    date = test_comb[i, :DATE];

    # Copy the closest station's daily values as-is (no distance weighting).
    # A date absent from the aggregates raises a KeyError.
    test_comb[i, :PCP_SUM] = pcp_sum[sum_rows[date], station];
    test_comb[i, :PCP_MAX] = pcp_max[max_rows[date], station];
    test_comb[i, :PCP_MAX3] = pcp_max3h[max3_rows[date], station];
end

In [119]:
# Preview 10 rows of test_comb with the precipitation columns filled in.
first(shuffleDf(test_comb), 10)

Unnamed: 0_level_0,ID_OUVRAGE,TP_LAT,TP_LNG,TP_Z,DATE,PCP_SUM,PCP_MAX,PCP_MAX3
Unnamed: 0_level_1,String,Float64,Float64,Float64,Date,Float64,Float64,Float64
1,3350-07D,0.0802391,-0.325532,0.168267,2019-07-05,27.0,25.0,27.0
2,3260-01D,1.29293,0.531312,0.0790333,2019-07-02,0.0,0.0,0.0
3,3350-07D,0.0802391,-0.325532,0.168267,2019-07-04,0.0,0.0,0.0
4,4380-01D,-0.828933,0.658409,-0.0467296,2019-08-12,20.0,10.0,12.0
5,3260-01D,1.29293,0.531312,0.0790333,2019-07-11,377.0,207.0,279.0
6,4350-01D,-0.465715,0.725081,-0.0467296,2019-05-07,3.0,3.0,3.0
7,4240-01D,1.28137,1.24035,-1.19178,2019-08-21,52.0,32.0,52.0
8,3260-01D,1.29293,0.531312,0.0790333,2019-08-26,0.0,0.0,0.0
9,4240-01D,1.28137,1.24035,-1.19178,2019-09-15,0.0,0.0,0.0
10,4240-01D,1.28137,1.24035,-1.19178,2019-06-11,175.0,52.0,108.0


### Standardize PCP

In [120]:
# Scale the test precipitation features with the statistics computed on the
# training data, so both sets share one scale.
test_comb.PCP_SUM = (test_comb.PCP_SUM .- mean_pcpsum) ./ std_pcpsum;
test_comb.PCP_MAX = (test_comb.PCP_MAX .- mean_pcpmax) ./ std_pcpmax;
test_comb.PCP_MAX3 = (test_comb.PCP_MAX3 .- mean_pcpmax3) ./ std_pcpmax3;

In [121]:
# Show the first 20 rows of test_comb (in table order, not shuffled).
first(test_comb, 20)

Unnamed: 0_level_0,ID_OUVRAGE,TP_LAT,TP_LNG,TP_Z,DATE,PCP_SUM,PCP_MAX
Unnamed: 0_level_1,String,Float64,Float64,Float64,Date,Float64,Float64
1,3260-01D,1.29293,0.531312,0.0790333,2019-05-02,-0.0469042,-0.0325426
2,3260-01D,1.29293,0.531312,0.0790333,2019-05-09,0.808886,0.982661
3,3260-01D,1.29293,0.531312,0.0790333,2019-05-10,4.82974,1.6287
4,3260-01D,1.29293,0.531312,0.0790333,2019-05-15,-0.37292,-0.34018
5,3260-01D,1.29293,0.531312,0.0790333,2019-05-20,0.224775,0.828842
6,3260-01D,1.29293,0.531312,0.0790333,2019-05-23,1.97711,1.53641
7,3260-01D,1.29293,0.531312,0.0790333,2019-05-24,-0.223496,-0.124834
8,3260-01D,1.29293,0.531312,0.0790333,2019-05-26,-0.359336,-0.309416
9,3260-01D,1.29293,0.531312,0.0790333,2019-05-30,-0.305,-0.186361
10,3350-07D,0.0802391,-0.325532,0.168267,2019-05-01,0.238359,0.244331


#### Split dates into month and day

In [122]:
# Derive the calendar month and day-of-month of each test record.
test_comb.MONTH = map(month, test_comb.DATE);
test_comb.DAY = map(day, test_comb.DATE);

first(shuffleDf(test_comb[!, [:DATE, :MONTH, :DAY]]), 5)

Unnamed: 0_level_0,DATE,MONTH,DAY
Unnamed: 0_level_1,Date,Int64,Int64
1,2019-08-17,8,17
2,2019-06-05,6,5
3,2019-07-19,7,19
4,2019-08-02,8,2
5,2019-08-22,8,22


#### Standardize months and days

In [123]:
# Scale month and day with the training-set mean / std.
test_comb.MONTH = (test_comb.MONTH .- meanmonth) ./ stdmonth;
test_comb.DAY = (test_comb.DAY .- meanday) ./ stdday;

In [124]:
# Preview 5 rows of the id plus the eight model feature columns.
first(shuffleDf(test_comb[!, [:ID_OUVRAGE, :TP_LAT, :TP_LNG, :TP_Z, :MONTH, :DAY, :PCP_SUM, :PCP_MAX, :PCP_MAX3]]), 5)

Unnamed: 0_level_0,ID_OUVRAGE,TP_LAT,TP_LNG,TP_Z,MONTH,DAY,PCP_SUM,PCP_MAX
Unnamed: 0_level_1,String,Float64,Float64,Float64,Float64,Float64,Float64,Float64
1,4350-01D,-0.465715,0.725081,-0.0467296,-0.875331,0.0176291,-0.400087,-0.401707
2,4240-01D,1.28137,1.24035,-1.19178,-0.875331,1.03449,0.75455,0.459677
3,4380-01D,-0.828933,0.658409,-0.0467296,0.293667,1.26046,-0.400087,-0.401707
4,3260-01D,1.29293,0.531312,0.0790333,0.878166,0.921508,-0.400087,-0.401707
5,4380-01D,-0.828933,0.658409,-0.0467296,0.878166,0.695538,-0.400087,-0.401707


### Create Test features

In [125]:
# Test feature matrix, with columns in the same names_ft order used for training.
test_features = convert(Matrix{Float64}, test_comb[:, names_ft])

283×8 Array{Float64,2}:
  1.29293     0.531312   0.0790333  …  -0.0469042  -0.0325426   0.0430963
  1.29293     0.531312   0.0790333      0.808886    0.982661    1.41166  
  1.29293     0.531312   0.0790333      4.82974     1.6287      2.75979  
  1.29293     0.531312   0.0790333     -0.37292    -0.34018    -0.365429 
  1.29293     0.531312   0.0790333      0.224775    0.828842    0.492474 
  1.29293     0.531312   0.0790333  …   1.97711     1.53641     2.51467  
  1.29293     0.531312   0.0790333     -0.223496   -0.124834   -0.181592 
  1.29293     0.531312   0.0790333     -0.359336   -0.309416   -0.345002 
  1.29293     0.531312   0.0790333     -0.305      -0.186361   -0.263297 
  0.0802391  -0.325532   0.168267       0.238359    0.244331    0.431195 
  0.0802391  -0.325532   0.168267   …  -0.223496   -0.155598   -0.181592 
  0.0802391  -0.325532   0.168267      -0.400087   -0.401707   -0.406281 
  0.0802391  -0.325532   0.168267       0.768134    0.921133    1.35038  
  ⋮           

## Predict

In [126]:
# Predict one label per test row with the forest fit on the full training data.
test_labels = apply_forest(model, test_features)

283-element Array{Int64,1}:
 0
 0
 1
 0
 0
 1
 0
 0
 0
 1
 0
 0
 1
 ⋮
 0
 1
 1
 0
 0
 0
 0
 0
 0
 0
 0
 0

## Generate submission

In [127]:
# Submission key is "<ouvrage id>_<date>"; pair it with the predicted label and
# write the result to disk.
ID = string.(test_comb[:, :ID_OUVRAGE], "_", test_comb[:, :DATE]);
sampleSubmission = DataFrame(ID = ID, Surverse = test_labels);
CSV.write("submissions/mc-submission-10.csv", sampleSubmission)

"submissions/mc-submission-10.csv"