# Data Processing

In [1]:
using CSV, DataFrames, DecisionTree, Statistics, Dates, Gadfly, Random, MLBase;
include("utils/precipitation.jl");

## Build features

### Get and filter the features

#### Latitude, Longitude, Height

In [2]:
features = CSV.read("data/ouvrages-surverses.csv");
colnames = ["N_Env", "ID_SOMA", "ID_OUVRAGE", "NOM", "SOMA_SEC", "REGION", "TP_X", "TP_Y", "TP_Z", "TP_LAT", "TP_LNG", "EMI_X", "EMI_Y", "EMI_LNG", "EMI_LAT"];
names!(features, Symbol.(colnames));
select!(features, [:ID_OUVRAGE, :TP_LAT, :TP_LNG, :TP_Z]);

#### Replace missing Z index with mean

In [3]:
features.TP_Z = coalesce.(features.TP_Z, mean(features[completecases(features), :].TP_Z));
first(shuffleDf(features), 10)

Unnamed: 0_level_0,ID_OUVRAGE,TP_LAT,TP_LNG,TP_Z
Unnamed: 0_level_1,String,Float64,Float64,Float64
1,4230-03D,45.6705,-73.4918,9.1
2,3350-04D,45.5761,-73.6603,9.43
3,3260-01D,45.6507,-73.5803,20.17
4,3305-04D,45.6196,-73.6329,11.76
5,4420-02D,45.443,-73.5764,17.08
6,4265-01D,45.626,-73.4991,11.78
7,0672-03D,45.6732,-73.5402,9.99
8,4610-08D,45.4267,-73.8538,19.3526
9,3305-01D,45.6257,-73.6241,12.7
10,3790-01D,45.4027,-73.9466,23.6


### Load dates and surverses

In [4]:
surverses = CSV.read("data/surverses.csv",missingstring="-99999");

#### Filter months

In [5]:
surverses = filter(row -> month(row.DATE) > 4, surverses);
surverses = filter(row -> month(row.DATE) < 11, surverses);

#### Filter non rain surverses

In [6]:
raison = coalesce.(surverses[:,:RAISON],"Inconnue");
surverses[!,:RAISON] = raison;

surverses = filter(row -> row.RAISON ∈ ["P","Inconnue","TS"], surverses);
select!(surverses, [:NO_OUVRAGE, :DATE, :SURVERSE]);

#### Remove missing data and rename

In [7]:
surverses = dropmissing(surverses, disallowmissing=true);
rename!(surverses, :NO_OUVRAGE => :ID_OUVRAGE);
first(shuffleDf(surverses),10)

Unnamed: 0_level_0,ID_OUVRAGE,DATE,SURVERSE
Unnamed: 0_level_1,String,Date,Int64
1,3530-01D,2015-10-11,0
2,4620-03D,2015-09-28,0
3,4620-05D,2015-06-11,0
4,3280-02D,2013-06-09,0
5,3240-05D,2014-07-17,0
6,4600-03D,2014-06-20,0
7,3450-01D,2016-08-30,0
8,3781-01D,2016-07-22,0
9,4710-02D,2014-08-23,0
10,3350-01D,2017-07-18,0


### Augment features with dates and label

In [8]:
comb = join(features, surverses, on = :ID_OUVRAGE);
first(shuffleDf(comb), 10)

Unnamed: 0_level_0,ID_OUVRAGE,TP_LAT,TP_LNG,TP_Z,DATE,SURVERSE
Unnamed: 0_level_1,String,Float64,Float64,Float64,Date,Int64
1,4380-01D,45.4677,-73.5637,19.3526,2013-06-15,0
2,0672-02D,45.6939,-73.5214,10.23,2015-10-04,0
3,4340-01D,45.5262,-73.5444,19.33,2014-09-19,0
4,4620-07D,45.4163,-73.8825,19.3526,2016-09-26,0
5,4230-03D,45.6705,-73.4918,9.1,2017-10-27,0
6,3240-05D,45.6604,-73.5607,27.09,2013-07-16,0
7,4610-09D,45.4372,-73.8477,19.3526,2017-05-07,0
8,4560-02D,45.451,-73.7841,26.18,2018-05-22,0
9,3350-03D,45.5785,-73.658,17.84,2013-08-02,0
10,3540-02D,45.4751,-73.8727,26.52,2013-06-04,0


### Load precipitation data

#### Load and filter months between May & October included

In [9]:
precipitation = CSV.read("data/precipitations.csv",missingstring="-99999");
rename!(precipitation, Symbol("St-Hubert")=>:StHubert);

precipitation = filter(row -> month(row.date) > 4, precipitation);
precipitation = filter(row -> month(row.date) < 11, precipitation); 

#### Replace missing data by 0

In [10]:
precipitation[!,:McTavish] = coalesce.(precipitation[:,:McTavish], 0);
precipitation[!,:Bellevue] = coalesce.(precipitation[:,:Bellevue], 0);
precipitation[!,:Assomption] = coalesce.(precipitation[:,:Assomption], 0);
precipitation[!,:Trudeau] = coalesce.(precipitation[:,:Trudeau], 0);
precipitation[!,:StHubert] = coalesce.(precipitation[:,:StHubert], 0);

first(shuffleDf(precipitation), 5)

Unnamed: 0_level_0,date,heure,McTavish,Bellevue,Assomption,Trudeau,StHubert
Unnamed: 0_level_1,Date,Int64,Int64,Int64,Int64,Int64,Int64
1,2017-08-09,18,0,0,0,0,0
2,2013-05-24,14,0,0,0,3,0
3,2018-10-25,9,0,0,0,0,0
4,2015-06-10,3,0,0,0,0,0
5,2016-07-04,3,0,0,0,0,0


### Extract features from precipitation

#### Sum of precipitation for the day

In [11]:
pcp_sum = by(precipitation, :date,  McTavish = :McTavish=>sum, Bellevue = :Bellevue=>sum, 
   Assomption = :Assomption=>sum, Trudeau = :Trudeau=>sum, StHubert = :StHubert=>sum);
first(shuffleDf(pcp_sum), 5)

Unnamed: 0_level_0,date,McTavish,Bellevue,Assomption,Trudeau,StHubert
Unnamed: 0_level_1,Date,Int64,Int64,Int64,Int64,Int64
1,2013-10-17,114,78,80,113,0
2,2015-06-25,2,2,0,2,0
3,2016-07-26,0,0,0,0,18
4,2018-10-03,0,0,0,0,0
5,2016-09-04,0,0,0,0,0


#### Maximum precipitation in an hour for the day

In [12]:
pcp_max = by(precipitation, :date,  McTavish = :McTavish=>maximum, Bellevue = :Bellevue=>maximum, 
   Assomption = :Assomption=>maximum, Trudeau = :Trudeau=>maximum, StHubert = :StHubert=>maximum)
first(shuffleDf(pcp_max),5)

Unnamed: 0_level_0,date,McTavish,Bellevue,Assomption,Trudeau,StHubert
Unnamed: 0_level_1,Date,Int64,Int64,Int64,Int64,Int64
1,2015-08-13,0,0,0,0,0
2,2014-10-29,8,7,10,7,5
3,2014-06-13,0,44,80,68,45
4,2018-07-02,0,0,13,0,0
5,2013-06-11,50,35,50,50,0


#### Maximum precipitation during three consecutive hours in a day

In [13]:
pcp_max3h = by(precipitation, :date,  McTavish = :McTavish=>maximum3, Bellevue = :Bellevue=>maximum3, 
   Assomption = :Assomption=>maximum3, Trudeau = :Trudeau=>maximum3, StHubert = :StHubert=>maximum3)
first(shuffleDf(pcp_max3h),5)

Unnamed: 0_level_0,date,McTavish,Bellevue,Assomption,Trudeau,StHubert
Unnamed: 0_level_1,Date,Int64,Int64,Int64,Int64,Int64
1,2015-08-03,0,10,20,2,0
2,2016-08-13,115,83,430,118,200
3,2015-08-08,0,0,0,3,0
4,2016-05-15,7,3,0,2,10
5,2013-09-28,0,0,0,0,0


### Add precipitation data to features

#### Get stations lat-lng

In [14]:
station_df = DataFrame(STATION = String[], LAT = Float64[], LNG = Float64[]);

push!(station_df, ["McTavish", 45.504742, -73.579167]);
push!(station_df, ["Bellevue", 45.427222, -73.929167]);
push!(station_df, ["Assomption", 45.809444, -73.434722]);
push!(station_df, ["Trudeau", 45.467778, -73.741667]);
push!(station_df, ["StHubert", 45.5175, -73.416944]);

station_df

Unnamed: 0_level_0,STATION,LAT,LNG
Unnamed: 0_level_1,String,Float64,Float64
1,McTavish,45.5047,-73.5792
2,Bellevue,45.4272,-73.9292
3,Assomption,45.8094,-73.4347
4,Trudeau,45.4678,-73.7417
5,StHubert,45.5175,-73.4169


### Normalize elements

#### Get extreme values

In [332]:
# min_lat = min(minimum(comb.TP_LAT), minimum(station_df.LAT));
# max_lat = max(maximum(comb.TP_LAT), maximum(station_df.LAT));

# min_lng = min(minimum(comb.TP_LNG), minimum(station_df.LNG));
# max_lng = max(maximum(comb.TP_LNG), maximum(station_df.LNG));

# min_z = minimum(comb.TP_Z);
# max_z = maximum(comb.TP_Z);

#### Normalize every value of comb between 0 and 1

In [15]:
# comb.TP_LAT = normalize.(comb.TP_LAT, min_lat, max_lat);
# comb.TP_LNG = normalize.(comb.TP_LNG, min_lng, max_lng);
# comb.TP_Z = normalize.(comb.TP_Z, min_z, max_z);

# first(shuffleDf(comb), 10)

#### Normalize every value of station_df between 0 and 1

In [16]:
# station_df.LAT = normalize.(station_df.LAT, min_lat, max_lat);
# station_df.LNG = normalize.(station_df.LNG, min_lng, max_lng);

# station_df

### Augment Features

#### Add pcp_sum and pcp_max columns

In [17]:
comb.PCP_SUM = zeros(size(comb, 1));
comb.PCP_MAX = zeros(size(comb, 1));
comb.PCP_MAX3 = zeros(size(comb, 1));
permutecols!(comb, [:ID_OUVRAGE, :TP_LAT, :TP_LNG, :TP_Z, :DATE, :PCP_SUM, :PCP_MAX, :PCP_MAX3, :SURVERSE]);

In [18]:
first(shuffleDf(comb), 10)

Unnamed: 0_level_0,ID_OUVRAGE,TP_LAT,TP_LNG,TP_Z,DATE,PCP_SUM,PCP_MAX,PCP_MAX3
Unnamed: 0_level_1,String,Float64,Float64,Float64,Date,Float64,Float64,Float64
1,4370-03D,45.4367,-73.7095,26.23,2018-07-10,0.0,0.0,0.0
2,4280-02D,45.5986,-73.5102,11.534,2013-07-17,0.0,0.0,0.0
3,4270-02D,45.6112,-73.5084,19.3526,2013-10-14,0.0,0.0,0.0
4,4310-02D,45.6067,-73.5865,32.4,2015-08-01,0.0,0.0,0.0
5,4280-01D,45.6009,-73.5111,12.53,2017-10-06,0.0,0.0,0.0
6,4430-01D,45.4324,-73.5885,19.36,2013-07-10,0.0,0.0,0.0
7,4360-01D,45.4907,-73.5508,15.15,2014-07-23,0.0,0.0,0.0
8,0801-03D,45.5081,-73.5273,11.94,2015-10-02,0.0,0.0,0.0
9,4330-01D,45.5438,-73.5353,14.16,2014-10-14,0.0,0.0,0.0
10,4600-03D,45.4364,-73.8226,26.42,2013-10-15,0.0,0.0,0.0


#### Find closest station to each ouvrage and add pcp_sum and pcp_max to it

In [19]:
for i=1:size(comb, 1)
    id_ouvrage = comb[i, 1]; 
    closest_station = "McTavish"; # initial value
    shortest_dist = -1;
    
    # Find closest station
    for j=1:size(station_df, 1)
        dist = findDistance(comb[i, :TP_LAT], comb[i, :TP_LNG], station_df[j, :LAT], station_df[j, :LNG]);
        
        if shortest_dist == -1 || dist < shortest_dist
            shortest_dist = dist;
            closest_station = station_df[j, :STATION];
        end
    end
    
    # Augment comb with a weighted p_sum, based on the distance to the station
    p_sum = pcp_sum[∈([comb[i, :DATE]]).(pcp_sum.date), Symbol(closest_station)];
    comb[i, :PCP_SUM] = p_sum[1] * (1 - shortest_dist); 
    # Augment comb with a weighted p_max, based on the distance to the station
    p_max = pcp_max[∈([comb[i, :DATE]]).(pcp_max.date), Symbol(closest_station)]
    comb[i, :PCP_MAX] = p_max[1] * (1 - shortest_dist);
    # Augment comb with a weighted p_max3h, based on the distance to the station
    p_max3 = pcp_max3h[∈([comb[i, :DATE]]).(pcp_max3h.date), Symbol(closest_station)]
    comb[i, :PCP_MAX3] = p_max3[1] * (1 - shortest_dist);
end

### TODO: Remove outlier in PCP_SUM and PCP_MAX AND PCP_MAX3 that cause compression

In [20]:
comb[comb[:PCP_SUM] .> 750, :PCP_SUM] = 750;
comb[comb[:PCP_MAX] .> 500, :PCP_MAX] = 500;
comb[comb[:PCP_MAX3] .> 750, :PCP_MAX3] = 750;

│   caller = top-level scope at In[20]:1
└ @ Core In[20]:1
│   caller = setindex!(::DataFrame, ::Int64, ::BitArray{1}, ::Symbol) at deprecated.jl:1490
└ @ DataFrames /home/chaime/.julia/packages/DataFrames/yH0f6/src/deprecated.jl:1490
│   caller = top-level scope at In[20]:2
└ @ Core In[20]:2
│   caller = top-level scope at In[20]:3
└ @ Core In[20]:3


In [21]:
first(shuffleDf(filter(row -> row.SURVERSE == 1, comb)), 10)

Unnamed: 0_level_0,ID_OUVRAGE,TP_LAT,TP_LNG,TP_Z,DATE,PCP_SUM,PCP_MAX,PCP_MAX3
Unnamed: 0_level_1,String,Float64,Float64,Float64,Date,Float64,Float64,Float64
1,4400-01D,45.4586,-73.5621,16.8,2014-08-13,587.611,125.509,249.117
2,3350-01D,45.5896,-73.6519,15.09,2016-10-27,104.814,27.536,62.178
3,3350-11D,45.5328,-73.7065,25.28,2013-09-12,296.335,72.2317,113.904
4,4290-01D,45.5846,-73.5081,11.89,2015-10-28,262.562,65.194,138.426
5,3400-01D,45.5435,-73.6755,26.04,2015-07-27,8.99464,7.19571,8.99464
6,3350-09D,45.5371,-73.713,23.72,2015-06-05,13.8755,13.8755,13.8755
7,0801-05D,45.5173,-73.5312,12.7,2016-05-06,0.0,0.0,0.0
8,4340-02D,45.5262,-73.5444,19.33,2018-05-03,168.812,44.1212,115.099
9,3310-01D,45.6139,-73.6313,19.58,2016-08-14,142.405,33.4037,80.8721
10,3260-01D,45.6507,-73.5803,20.17,2014-08-13,527.784,112.731,223.753


#### Normalize pcp_sum and pcp_max and pcp_max3

In [22]:
# min_pcp_sum = minimum(comb.PCP_SUM);
# max_pcp_sum = maximum(comb.PCP_SUM);

# min_pcp_max = minimum(comb.PCP_MAX);
# max_pcp_max = maximum(comb.PCP_MAX);

# min_pcp_max3 = minimum(comb.PCP_MAX3);
# max_pcp_max3 = maximum(comb.PCP_MAX3);

In [354]:
# comb.PCP_SUM = normalize.(comb.PCP_SUM, min_pcp_sum, max_pcp_sum);
# comb.PCP_MAX = normalize.(comb.PCP_MAX, min_pcp_max, max_pcp_max);
# comb.PCP_MAX3 = normalize.(comb.PCP_MAX3, min_pcp_max3, max_pcp_max3);

In [23]:
first(shuffleDf(filter(row -> row.SURVERSE == 1, comb)), 10)

Unnamed: 0_level_0,ID_OUVRAGE,TP_LAT,TP_LNG,TP_Z,DATE,PCP_SUM,PCP_MAX,PCP_MAX3
Unnamed: 0_level_1,String,Float64,Float64,Float64,Date,Float64,Float64,Float64
1,3350-11D,45.5328,-73.7065,25.28,2017-07-08,191.692,146.315,163.91
2,3250-01D,45.6562,-73.5731,17.16,2018-07-06,27.1495,27.1495,27.1495
3,3280-01D,45.634,-73.6059,23.86,2016-10-28,83.3252,18.2274,43.3986
4,0801-05D,45.5173,-73.5312,12.7,2017-07-14,53.2214,25.6603,45.6184
5,4430-04D,45.4197,-73.6487,31.54,2014-06-24,389.495,148.635,265.931
6,3260-01D,45.6507,-73.5803,20.17,2014-08-13,527.784,112.731,223.753
7,4400-01D,45.4586,-73.5621,16.8,2014-07-23,0.0,0.0,0.0
8,3350-05D,45.5705,-73.6624,21.27,2013-07-28,84.0344,44.6991,44.6991
9,3370-01D,45.5653,-73.6631,18.21,2017-07-31,90.5477,90.5477,90.5477
10,4230-09D,45.699,-73.4799,8.347,2014-10-07,79.2562,35.225,35.225


### Split dates into months and days

In [24]:
comb.MONTH = month.(comb.DATE);
comb.DAY = day.(comb.DATE);
first(shuffleDf(comb[!, [:DATE, :MONTH, :DAY]]), 5)

Unnamed: 0_level_0,DATE,MONTH,DAY
Unnamed: 0_level_1,Date,Int64,Int64
1,2016-06-13,6,13
2,2014-07-05,7,5
3,2015-09-24,9,24
4,2014-05-07,5,7
5,2018-07-26,7,26


#### Normalize the months and days

In [25]:
# comb.MONTH = normalize.(comb.MONTH, 5, 10);
# comb.DAY = normalize.(comb.DAY, 1, 31);
# first(shuffleDf(comb[!, [:DATE, :MONTH, :DAY]]), 5)

# Validate model

### Split train and validation sets

In [26]:
r_idx = shuffle(1:size(comb, 1));
train_ceil = floor(Int, size(r_idx, 1) * 0.8);
train_set = comb[r_idx[1:train_ceil], :];
val_set = comb[r_idx[train_ceil+1:size(r_idx, 1)], :];

### Train model on train set

#### Random Forest Params

In [90]:
names_ft = [:TP_LAT, :TP_LNG, :TP_Z, :MONTH, :PCP_SUM, :PCP_MAX, :PCP_MAX3];
nft = 3;
ntrees = 100;
podata = 0.7;
maxd = 30;

#### Build the features and labels

In [85]:
train_features = convert(Matrix{Float64},train_set[:, names_ft]);
train_labels = train_set[:, :SURVERSE];

#### Build the model N features to use is log_2(N + 1)

In [91]:
val_model = build_forest(train_labels, train_features, nft, ntrees, podata, maxd)

Ensemble of Decision Trees
Trees:      100
Avg Leaves: 2102.38
Avg Depth:  29.47

### Validate model on validation set

#### Single validation

In [92]:
val_features = convert(Matrix{Float64},val_set[:, names_ft]);
val_labels = val_set[!, :SURVERSE];
val_pred = apply_forest(val_model, val_features);

r = roc(val_labels, val_pred);
f1score(r)

0.6117517847336629

#### Batch validation for F1Score

In [93]:
niter = 10;
batch_score = 0;

for i=1:niter
    # Split train and val sets
    r_idx = shuffle(1:size(comb, 1));
    train_ceil = floor(Int, size(r_idx, 1) * 0.8);
    train_set = comb[r_idx[1:train_ceil], :];
    val_set = comb[r_idx[train_ceil+1:size(r_idx, 1)], :];
    
    # Build features and labels
    train_features = convert(Matrix{Float64},train_set[:, names_ft]);
    train_labels = train_set[:, :SURVERSE];
    
    # Build model
    val_model = build_forest(train_labels, train_features, nft, ntrees, podata, maxd);
    
    # Validate model
    val_features = convert(Matrix{Float64},val_set[:, names_ft]);
    val_labels = val_set[!, :SURVERSE];
    val_pred = apply_forest(val_model, val_features);

    r = roc(val_labels, val_pred);
    batch_score += f1score(r);
end

batch_score = batch_score / niter

0.6226479666429657

# Submission model creation

### Separate features and labels

In [94]:
full_train_features = convert(Matrix{Float64},comb[:, names_ft]);

In [95]:
full_train_labels = comb[:, :SURVERSE];

### Build Model

#### Test with tree first

In [96]:
model_tree = build_tree(full_train_labels, full_train_features)

Decision Tree
Leaves: 3470
Depth:  34

In [97]:
model_tree = prune_tree(model_tree, 0.90)

Decision Tree
Leaves: 3046
Depth:  34

In [98]:
model = build_forest(full_train_labels, full_train_features, nft, ntrees, podata, maxd)

Ensemble of Decision Trees
Trees:      100
Avg Leaves: 2516.86
Avg Depth:  29.63

# Prediction

## Get the test data

In [99]:
test = CSV.read("data/test.csv");
rename!(test, :NO_OUVRAGE => :ID_OUVRAGE);
first(test, 10)

Unnamed: 0_level_0,ID_OUVRAGE,DATE
Unnamed: 0_level_1,String,Date
1,3260-01D,2019-05-02
2,3260-01D,2019-05-09
3,3260-01D,2019-05-10
4,3260-01D,2019-05-15
5,3260-01D,2019-05-20
6,3260-01D,2019-05-23
7,3260-01D,2019-05-24
8,3260-01D,2019-05-26
9,3260-01D,2019-05-30
10,3350-07D,2019-05-01


In [100]:
to_merge = unique(comb[!, [:ID_OUVRAGE, :TP_LAT, :TP_LNG, :TP_Z]], :ID_OUVRAGE);
test_comb = join(test, to_merge, on= [:ID_OUVRAGE]);
nrow(test_comb)

283

In [101]:
first(shuffleDf(test_comb), 10)

Unnamed: 0_level_0,ID_OUVRAGE,DATE,TP_LAT,TP_LNG,TP_Z
Unnamed: 0_level_1,String,Date,Float64,Float64,Float64
1,4350-01D,2019-07-31,45.4991,-73.555,19.3526
2,4240-01D,2019-09-04,45.6497,-73.4877,11.91
3,4350-01D,2019-06-16,45.4991,-73.555,19.3526
4,4350-01D,2019-08-11,45.4991,-73.555,19.3526
5,3260-01D,2019-07-05,45.6507,-73.5803,20.17
6,4240-01D,2019-05-25,45.6497,-73.4877,11.91
7,4380-01D,2019-08-27,45.4677,-73.5637,19.3526
8,3350-07D,2019-06-04,45.5461,-73.6921,20.75
9,3260-01D,2019-09-17,45.6507,-73.5803,20.17
10,4240-01D,2019-07-03,45.6497,-73.4877,11.91


### Add PCP_SUM and PCP_MAX

#### Initialize default pcp

In [102]:
test_comb.PCP_SUM = zeros(size(test_comb, 1));
test_comb.PCP_MAX = zeros(size(test_comb, 1));
test_comb.PCP_MAX3 = zeros(size(test_comb, 1));
permutecols!(test_comb, [:ID_OUVRAGE, :TP_LAT, :TP_LNG, :TP_Z, :DATE, :PCP_SUM, :PCP_MAX, :PCP_MAX3]);

In [103]:
first(shuffleDf(test_comb), 10)

Unnamed: 0_level_0,ID_OUVRAGE,TP_LAT,TP_LNG,TP_Z,DATE,PCP_SUM,PCP_MAX,PCP_MAX3
Unnamed: 0_level_1,String,Float64,Float64,Float64,Date,Float64,Float64,Float64
1,3350-07D,45.5461,-73.6921,20.75,2019-05-14,0.0,0.0,0.0
2,4240-01D,45.6497,-73.4877,11.91,2019-06-12,0.0,0.0,0.0
3,4350-01D,45.4991,-73.555,19.3526,2019-09-04,0.0,0.0,0.0
4,3260-01D,45.6507,-73.5803,20.17,2019-08-22,0.0,0.0,0.0
5,3350-07D,45.5461,-73.6921,20.75,2019-07-27,0.0,0.0,0.0
6,3260-01D,45.6507,-73.5803,20.17,2019-05-26,0.0,0.0,0.0
7,3350-07D,45.5461,-73.6921,20.75,2019-09-02,0.0,0.0,0.0
8,3350-07D,45.5461,-73.6921,20.75,2019-07-08,0.0,0.0,0.0
9,4350-01D,45.4991,-73.555,19.3526,2019-08-01,0.0,0.0,0.0
10,4350-01D,45.4991,-73.555,19.3526,2019-09-07,0.0,0.0,0.0


#### Populate pcp

In [104]:
for i=1:size(test_comb, 1)
    id_ouvrage = test_comb[i, 1]; 
    closest_station = "McTavish"; # initial value
    shortest_dist = -1;
    
    # Find closest station
    for j=1:size(station_df, 1)
        dist = findDistance(test_comb[i, :TP_LAT], test_comb[i, :TP_LNG], station_df[j, :LAT], station_df[j, :LNG]);
        
        if shortest_dist == -1 || dist < shortest_dist
            shortest_dist = dist;
            closest_station = station_df[j, :STATION];
        end
    end
    
    # Augment comb with a weighted p_sum, based on the distance to the station
    p_sum = pcp_sum[∈([test_comb[i, :DATE]]).(pcp_sum.date), Symbol(closest_station)];
    test_comb[i, :PCP_SUM] = p_sum[1] * (1 - shortest_dist); 
    # Augment comb with a weighted p_max, based on the distance to the station
    p_max = pcp_max[∈([test_comb[i, :DATE]]).(pcp_max.date), Symbol(closest_station)]
    test_comb[i, :PCP_MAX] = p_max[1] * (1 - shortest_dist);
    # Augment comb with a weighted p_max3, based on the distance to the station
    p_max3 = pcp_max3h[∈([test_comb[i, :DATE]]).(pcp_max3h.date), Symbol(closest_station)]
    test_comb[i, :PCP_MAX3] = p_max3[1] * (1 - shortest_dist);
end

In [106]:
first(shuffleDf(test_comb), 10)

Unnamed: 0_level_0,ID_OUVRAGE,TP_LAT,TP_LNG,TP_Z,DATE,PCP_SUM,PCP_MAX,PCP_MAX3
Unnamed: 0_level_1,String,Float64,Float64,Float64,Date,Float64,Float64,Float64
1,4350-01D,45.4991,-73.555,19.3526,2019-08-18,0.0,0.0,0.0
2,4380-01D,45.4677,-73.5637,19.3526,2019-09-01,0.0,0.0,0.0
3,4380-01D,45.4677,-73.5637,19.3526,2019-08-07,1.9198,1.9198,1.9198
4,3350-07D,45.5461,-73.6921,20.75,2019-08-24,0.0,0.0,0.0
5,4380-01D,45.4677,-73.5637,19.3526,2019-07-17,0.0,0.0,0.0
6,4350-01D,45.4991,-73.555,19.3526,2019-06-23,0.0,0.0,0.0
7,4380-01D,45.4677,-73.5637,19.3526,2019-09-04,217.897,120.947,184.301
8,4240-01D,45.6497,-73.4877,11.91,2019-06-25,72.2511,23.8004,56.1009
9,3260-01D,45.6507,-73.5803,20.17,2019-07-10,0.0,0.0,0.0
10,3260-01D,45.6507,-73.5803,20.17,2019-05-20,39.2849,34.1608,37.5768


#### Normalize pcp based on comb (not test_comb)

In [59]:
# test_comb.PCP_SUM = normalize.(test_comb.PCP_SUM, min_pcp_sum, max_pcp_sum);
# test_comb.PCP_MAX = normalize.(test_comb.PCP_MAX, min_pcp_max, max_pcp_max);
# test_comb.PCP_MAX3 = normalize.(test_comb.PCP_MAX3, min_pcp_max3, max_pcp_max3);

In [107]:
first(test_comb, 20)

Unnamed: 0_level_0,ID_OUVRAGE,TP_LAT,TP_LNG,TP_Z,DATE,PCP_SUM,PCP_MAX,PCP_MAX3
Unnamed: 0_level_1,String,Float64,Float64,Float64,Date,Float64,Float64,Float64
1,3260-01D,45.6507,-73.5803,20.17,2019-05-02,22.2045,10.2482,18.7884
2,3260-01D,45.6507,-73.5803,20.17,2019-05-09,76.0077,38.4309,76.0077
3,3260-01D,45.6507,-73.5803,20.17,2019-05-10,328.797,56.3653,132.373
4,3260-01D,45.6507,-73.5803,20.17,2019-05-15,1.70804,1.70804,1.70804
5,3260-01D,45.6507,-73.5803,20.17,2019-05-20,39.2849,34.1608,37.5768
6,3260-01D,45.6507,-73.5803,20.17,2019-05-23,149.453,53.8032,122.125
7,3260-01D,45.6507,-73.5803,20.17,2019-05-24,11.1022,7.68617,9.39421
8,3260-01D,45.6507,-73.5803,20.17,2019-05-26,2.56206,2.56206,2.56206
9,3260-01D,45.6507,-73.5803,20.17,2019-05-30,5.97813,5.97813,5.97813
10,3350-07D,45.5461,-73.6921,20.75,2019-05-01,42.6413,19.0525,37.1978


#### Split dates into month and day

In [108]:
test_comb.MONTH = month.(test_comb.DATE);
test_comb.DAY = day.(test_comb.DATE);

first(shuffleDf(test_comb[!, [:DATE, :MONTH, :DAY]]), 5)

Unnamed: 0_level_0,DATE,MONTH,DAY
Unnamed: 0_level_1,Date,Int64,Int64
1,2019-08-02,8,2
2,2019-05-24,5,24
3,2019-06-11,6,11
4,2019-08-24,8,24
5,2019-07-21,7,21


#### Normalize months and days

In [62]:
# test_comb.MONTH = normalize.(test_comb.MONTH, 5, 10);
# test_comb.DAY = normalize.(test_comb.DAY, 1, 31);
# first(shuffleDf(test_comb[!, [:DATE, :MONTH, :DAY]]), 5)

In [109]:
first(shuffleDf(test_comb[!, [:ID_OUVRAGE, :TP_LAT, :TP_LNG, :TP_Z, :MONTH, :DAY, :PCP_SUM, :PCP_MAX, :PCP_MAX3]]), 5)

Unnamed: 0_level_0,ID_OUVRAGE,TP_LAT,TP_LNG,TP_Z,MONTH,DAY,PCP_SUM,PCP_MAX,PCP_MAX3
Unnamed: 0_level_1,String,Float64,Float64,Float64,Int64,Int64,Float64,Float64,Float64
1,3260-01D,45.6507,-73.5803,20.17,5,2,22.2045,10.2482,18.7884
2,4240-01D,45.6497,-73.4877,11.91,7,14,0.0,0.0,0.0
3,3260-01D,45.6507,-73.5803,20.17,9,5,0.0,0.0,0.0
4,4380-01D,45.4677,-73.5637,19.3526,6,26,0.0,0.0,0.0
5,4240-01D,45.6497,-73.4877,11.91,7,4,0.0,0.0,0.0


### Create Test features

In [110]:
test_features = convert(Matrix{Float64}, test_comb[:, names_ft]);
# test_features = hcat(test_dates, test_features);

test_features

283×7 Array{Float64,2}:
 45.6507  -73.5803  20.17    5.0   22.2045    10.2482    18.7884 
 45.6507  -73.5803  20.17    5.0   76.0077    38.4309    76.0077 
 45.6507  -73.5803  20.17    5.0  328.797     56.3653   132.373  
 45.6507  -73.5803  20.17    5.0    1.70804    1.70804    1.70804
 45.6507  -73.5803  20.17    5.0   39.2849    34.1608    37.5768 
 45.6507  -73.5803  20.17    5.0  149.453     53.8032   122.125  
 45.6507  -73.5803  20.17    5.0   11.1022     7.68617    9.39421
 45.6507  -73.5803  20.17    5.0    2.56206    2.56206    2.56206
 45.6507  -73.5803  20.17    5.0    5.97813    5.97813    5.97813
 45.5461  -73.6921  20.75    5.0   42.6413    19.0525    37.1978 
 45.5461  -73.6921  20.75    5.0   11.7944     7.2581     9.97989
 45.5461  -73.6921  20.75    5.0    0.0        0.0        0.0    
 45.5461  -73.6921  20.75    5.0   78.0246    39.0123    78.0246 
  ⋮                                            ⋮                 
 45.4677  -73.5637  19.3526  9.0    0.0        0.0  

## Predict

In [111]:
test_labels = apply_forest(model, test_features)

283-element Array{Int64,1}:
 0
 0
 1
 0
 0
 1
 0
 0
 0
 1
 0
 0
 1
 ⋮
 0
 1
 1
 0
 0
 0
 0
 0
 0
 0
 0
 0

## Generate submission

In [112]:
ID = test_comb[:,:ID_OUVRAGE].*"_".*string.(test_comb[:,:DATE])
sampleSubmission = DataFrame(ID = ID, Surverse=test_labels)
CSV.write("submissions/mc-submission-9.csv",sampleSubmission)

"submissions/mc-submission-9.csv"