# Data Processing

In [31]:
using CSV, DataFrames, LIBSVM, Statistics, Dates, Gadfly, Random, MLBase;
include("utils/precipitation.jl");

## Build features

### Get and filter the features

#### Latitude, Longitude, Height

In [2]:
features = CSV.read("data/ouvrages-surverses.csv");
colnames = ["N_Env", "ID_SOMA", "ID_OUVRAGE", "NOM", "SOMA_SEC", "REGION", "TP_X", "TP_Y", "TP_Z", "TP_LAT", "TP_LNG", "EMI_X", "EMI_Y", "EMI_LNG", "EMI_LAT"];
names!(features, Symbol.(colnames));
select!(features, [:ID_OUVRAGE, :TP_LAT, :TP_LNG, :TP_Z]);

#### Replace missing Z index with mean

In [3]:
features.TP_Z = coalesce.(features.TP_Z, mean(features[completecases(features), :].TP_Z));
first(shuffleDf(features), 10)

Unnamed: 0_level_0,ID_OUVRAGE,TP_LAT,TP_LNG,TP_Z
Unnamed: 0_level_1,String,Float64,Float64,Float64
1,4370-02D,45.4713,-73.5596,19.3526
2,0801-01D,45.519,-73.5275,13.71
3,4430-02D,45.4344,-73.5869,18.56
4,3230-01D,45.6661,-73.5469,13.77
5,4315-01D,45.5628,-73.5243,14.39
6,0672-03D,45.6732,-73.5402,9.99
7,0801-07D,45.5191,-73.5341,33.58
8,4330-02D,45.5387,-73.5394,19.3526
9,4420-04D,45.4485,-73.5697,16.0
10,3580-01D,45.4085,-73.9569,25.48


### Load dates and surverses

In [4]:
surverses = CSV.read("data/surverses.csv",missingstring="-99999");

#### Filter months

In [5]:
surverses = filter(row -> month(row.DATE) > 4, surverses);
surverses = filter(row -> month(row.DATE) < 11, surverses);

#### Filter non rain surverses

In [6]:
raison = coalesce.(surverses[:,:RAISON],"Inconnue");
surverses[!,:RAISON] = raison;

surverses = filter(row -> row.RAISON ∈ ["P","Inconnue","TS"], surverses);
select!(surverses, [:NO_OUVRAGE, :DATE, :SURVERSE]);

#### Remove missing data and rename

In [7]:
surverses = dropmissing(surverses, disallowmissing=true);
rename!(surverses, :NO_OUVRAGE => :ID_OUVRAGE);
first(shuffleDf(surverses),10)

Unnamed: 0_level_0,ID_OUVRAGE,DATE,SURVERSE
Unnamed: 0_level_1,String,Date,Int64
1,4600-02D,2014-10-28,0
2,4210-01D,2016-08-28,0
3,4330-01D,2014-05-15,0
4,0672-01D,2017-08-16,0
5,4265-01D,2013-08-07,0
6,3765-01D,2017-05-02,0
7,4620-03D,2013-08-09,0
8,3767-01D,2015-08-19,0
9,3275-01D,2016-09-12,0
10,0672-02D,2018-07-31,0


### Augment features with dates and label

In [10]:
comb = join(features, surverses, on = :ID_OUVRAGE);
first(shuffleDf(comb), 10)

Unnamed: 0_level_0,ID_OUVRAGE,TP_LAT,TP_LNG,TP_Z,DATE,SURVERSE
Unnamed: 0_level_1,String,Float64,Float64,Float64,Date,Int64
1,3780-01D,45.4856,-73.8687,24.16,2016-10-04,0
2,4790-01D,45.4891,-73.5419,14.22,2016-10-01,0
3,4430-04D,45.4197,-73.6487,31.54,2016-10-11,0
4,4370-04D,45.4367,-73.7095,26.23,2015-08-11,0
5,4340-01D,45.5262,-73.5444,19.33,2015-07-17,0
6,3305-03D,45.6196,-73.6329,11.76,2016-10-18,0
7,4230-09D,45.699,-73.4799,8.347,2016-09-30,0
8,4265-01D,45.626,-73.4991,11.78,2013-10-20,1
9,3240-01D,45.6644,-73.5605,9.97,2018-07-12,0
10,3480-02D,45.4986,-73.7908,25.43,2018-10-23,0


### Load precipitation data

#### Load and filter months between May & October included

In [11]:
precipitation = CSV.read("data/precipitations.csv",missingstring="-99999");
rename!(precipitation, Symbol("St-Hubert")=>:StHubert);

precipitation = filter(row -> month(row.date) > 4, precipitation);
precipitation = filter(row -> month(row.date) < 11, precipitation); 

#### Replace missing data by 0

In [12]:
precipitation[!,:McTavish] = coalesce.(precipitation[:,:McTavish], 0);
precipitation[!,:Bellevue] = coalesce.(precipitation[:,:Bellevue], 0);
precipitation[!,:Assomption] = coalesce.(precipitation[:,:Assomption], 0);
precipitation[!,:Trudeau] = coalesce.(precipitation[:,:Trudeau], 0);
precipitation[!,:StHubert] = coalesce.(precipitation[:,:StHubert], 0);

first(shuffleDf(precipitation), 5)

Unnamed: 0_level_0,date,heure,McTavish,Bellevue,Assomption,Trudeau,StHubert
Unnamed: 0_level_1,Date,Int64,Int64,Int64,Int64,Int64,Int64
1,2019-10-14,9,0,0,0,0,0
2,2013-08-19,22,0,0,0,0,0
3,2017-07-03,21,0,0,0,0,0
4,2015-05-21,7,0,0,0,0,0
5,2014-07-06,3,0,0,0,0,0


### Extract features from precipitation

#### Sum of precipitation for the day

In [13]:
pcp_sum = by(precipitation, :date,  McTavish = :McTavish=>sum, Bellevue = :Bellevue=>sum, 
   Assomption = :Assomption=>sum, Trudeau = :Trudeau=>sum, StHubert = :StHubert=>sum);
first(shuffleDf(pcp_sum), 5)

Unnamed: 0_level_0,date,McTavish,Bellevue,Assomption,Trudeau,StHubert
Unnamed: 0_level_1,Date,Int64,Int64,Int64,Int64,Int64
1,2017-08-12,110,110,170,55,63
2,2017-10-18,0,0,0,0,0
3,2015-08-20,0,0,0,0,0
4,2014-08-20,0,0,0,0,0
5,2018-10-09,56,46,5,57,66


#### Maximum precipitation in an hour for the day

In [14]:
pcp_max = by(precipitation, :date,  McTavish = :McTavish=>maximum, Bellevue = :Bellevue=>maximum, 
   Assomption = :Assomption=>maximum, Trudeau = :Trudeau=>maximum, StHubert = :StHubert=>maximum)
first(shuffleDf(pcp_max),5)

Unnamed: 0_level_0,date,McTavish,Bellevue,Assomption,Trudeau,StHubert
Unnamed: 0_level_1,Date,Int64,Int64,Int64,Int64,Int64
1,2017-08-18,69,67,60,75,60
2,2014-10-02,0,0,0,0,0
3,2017-09-06,0,0,0,0,2
4,2014-10-19,0,0,0,0,0
5,2018-08-17,21,9,33,10,28


#### Maximum precipitation during three consecutive hours in a day

In [15]:
pcp_max3h = by(precipitation, :date,  McTavish = :McTavish=>maximum3, Bellevue = :Bellevue=>maximum3, 
   Assomption = :Assomption=>maximum3, Trudeau = :Trudeau=>maximum3, StHubert = :StHubert=>maximum3)
first(shuffleDf(pcp_max3h),5)

Unnamed: 0_level_0,date,McTavish,Bellevue,Assomption,Trudeau,StHubert
Unnamed: 0_level_1,Date,Int64,Int64,Int64,Int64,Int64
1,2014-05-08,0,0,0,0,0
2,2014-06-11,0,14,0,9,10
3,2015-10-16,7,36,0,20,2
4,2017-06-26,55,8,0,15,38
5,2014-10-24,6,4,0,7,4


### Add precipitation data to features

#### Get stations lat-lng

In [16]:
station_df = DataFrame(STATION = String[], LAT = Float64[], LNG = Float64[]);

push!(station_df, ["McTavish", 45.504742, -73.579167]);
push!(station_df, ["Bellevue", 45.427222, -73.929167]);
push!(station_df, ["Assomption", 45.809444, -73.434722]);
push!(station_df, ["Trudeau", 45.467778, -73.741667]);
push!(station_df, ["StHubert", 45.5175, -73.416944]);

station_df

Unnamed: 0_level_0,STATION,LAT,LNG
Unnamed: 0_level_1,String,Float64,Float64
1,McTavish,45.5047,-73.5792
2,Bellevue,45.4272,-73.9292
3,Assomption,45.8094,-73.4347
4,Trudeau,45.4678,-73.7417
5,StHubert,45.5175,-73.4169


### Standardize TP and station data

In [17]:
meanlat = mean(comb.TP_LAT);
stdlat = std(comb.TP_LAT);
comb.TP_LAT = (comb.TP_LAT .- meanlat) ./ stdlat;
station_df.LAT = (station_df.LAT .- meanlat) ./ stdlat;

meanlng = mean(comb.TP_LNG);
stdlng = std(comb.TP_LNG);
comb.TP_LNG = (comb.TP_LNG .- meanlng) ./ stdlng;
station_df.LNG = (station_df.LNG .- meanlng) ./ stdlng;

meanz = mean(comb.TP_Z);
stdz = std(comb.TP_Z);
comb.TP_Z = (comb.TP_Z .- meanz) ./ stdz;

In [18]:
station_df

Unnamed: 0_level_0,STATION,LAT,LNG
Unnamed: 0_level_1,String,Float64,Float64
1,McTavish,-0.399934,0.53979
2,Bellevue,-1.29892,-2.14237
3,Assomption,3.13364,1.64672
4,Trudeau,-0.828599,-0.705498
5,StHubert,-0.251981,1.78296


### Augment Features

#### Add pcp_sum and pcp_max columns

In [19]:
comb.PCP_SUM = zeros(size(comb, 1));
comb.PCP_MAX = zeros(size(comb, 1));
comb.PCP_MAX3 = zeros(size(comb, 1));
permutecols!(comb, [:ID_OUVRAGE, :TP_LAT, :TP_LNG, :TP_Z, :DATE, :PCP_SUM, :PCP_MAX, :PCP_MAX3, :SURVERSE]);

In [20]:
first(shuffleDf(comb), 10)

Unnamed: 0_level_0,ID_OUVRAGE,TP_LAT,TP_LNG,TP_Z,DATE,PCP_SUM,PCP_MAX,PCP_MAX3
Unnamed: 0_level_1,String,Float64,Float64,Float64,Date,Float64,Float64,Float64
1,3350-09D,-0.0251308,-0.486079,0.625204,2013-10-11,0.0,0.0,0.0
2,0642-01D,1.54744,0.945888,-0.0467296,2015-05-20,0.0,0.0,0.0
3,3762-01D,0.431147,-0.0943688,-0.942537,2014-10-25,0.0,0.0,0.0
4,3310-01D,0.865632,0.140067,-0.0117388,2017-06-22,0.0,0.0,0.0
5,0801-06D,-0.238094,0.893818,-0.153282,2018-06-03,0.0,0.0,0.0
6,3767-01D,0.0734405,-0.394057,0.131343,2013-08-18,0.0,0.0,0.0
7,4270-02D,0.834128,1.08217,-0.0467296,2017-07-10,0.0,0.0,0.0
8,4370-04D,-1.18852,-0.458815,1.01137,2014-05-06,0.0,0.0,0.0
9,4270-02D,0.834128,1.08217,-0.0467296,2013-09-26,0.0,0.0,0.0
10,4380-01D,-0.828933,0.658409,-0.0467296,2014-06-06,0.0,0.0,0.0


#### Find closest station to each ouvrage and add pcp_sum and pcp_max to it

In [21]:
for i=1:size(comb, 1)
    id_ouvrage = comb[i, 1]; 
    closest_station = "McTavish"; # initial value
    shortest_dist = -1;
    
    # Find closest station
    for j=1:size(station_df, 1)
        dist = findDistance(comb[i, :TP_LAT], comb[i, :TP_LNG], station_df[j, :LAT], station_df[j, :LNG]);
        
        if shortest_dist == -1 || dist < shortest_dist
            shortest_dist = dist;
            closest_station = station_df[j, :STATION];
        end
    end
    
    # Augment comb with a weighted p_sum, based on the distance to the station
    p_sum = pcp_sum[∈([comb[i, :DATE]]).(pcp_sum.date), Symbol(closest_station)];
#     comb[i, :PCP_SUM] = p_sum[1] * (1 - shortest_dist);
    comb[i, :PCP_SUM] = p_sum[1]; 
    
    # Augment comb with a weighted p_max, based on the distance to the station
    p_max = pcp_max[∈([comb[i, :DATE]]).(pcp_max.date), Symbol(closest_station)]
#     comb[i, :PCP_MAX] = p_max[1] * (1 - shortest_dist);
    comb[i, :PCP_MAX] = p_max[1];
    
    # Augment comb with a weighted p_max3h, based on the distance to the station
    p_max3 = pcp_max3h[∈([comb[i, :DATE]]).(pcp_max3h.date), Symbol(closest_station)]
#     comb[i, :PCP_MAX3] = p_max3[1] * (1 - shortest_dist);
    comb[i, :PCP_MAX3] = p_max3[1]; 
end

#### Remove outlier in PCP_SUM and PCP_MAX AND PCP_MAX3 that cause compression

In [22]:
comb[comb[:PCP_SUM] .> 750, :PCP_SUM] = 750;
comb[comb[:PCP_MAX] .> 500, :PCP_MAX] = 500;
comb[comb[:PCP_MAX3] .> 750, :PCP_MAX3] = 750;

│   caller = top-level scope at In[22]:1
└ @ Core In[22]:1
│   caller = setindex!(::DataFrame, ::Int64, ::BitArray{1}, ::Symbol) at deprecated.jl:1490
└ @ DataFrames /home/chaime/.julia/packages/DataFrames/yH0f6/src/deprecated.jl:1490
│   caller = top-level scope at In[22]:2
└ @ Core In[22]:2
│   caller = top-level scope at In[22]:3
└ @ Core In[22]:3


In [23]:
first(shuffleDf(filter(row -> row.SURVERSE == 1, comb)), 10)

Unnamed: 0_level_0,ID_OUVRAGE,TP_LAT,TP_LNG,TP_Z,DATE,PCP_SUM,PCP_MAX,PCP_MAX3
Unnamed: 0_level_1,String,Float64,Float64,Float64,Date,Float64,Float64,Float64
1,4300-01D,0.3648,0.987515,-0.933306,2015-07-19,110.0,8.0,21.0
2,4380-01D,-0.828933,0.658409,-0.0467296,2017-06-20,174.0,84.0,174.0
3,4430-02D,-1.21516,0.4803,-0.168667,2016-08-14,162.0,38.0,92.0
4,4230-03D,1.5223,1.20943,-1.6241,2017-08-22,260.0,170.0,170.0
5,3350-09D,-0.0251308,-0.486079,0.625204,2015-06-16,242.0,118.0,227.0
6,4370-02D,-0.787414,0.689638,-0.0467296,2013-05-29,146.0,75.0,103.0
7,3540-02D,-0.743445,-1.70934,1.05599,2017-06-20,96.0,76.0,96.0
8,3270-01D,1.18312,0.417032,0.239038,2014-07-28,132.0,32.0,72.0
9,3350-07D,0.0802391,-0.325532,0.168267,2017-07-31,366.0,366.0,366.0
10,4720-01D,1.86614,1.22986,-1.39332,2014-06-12,270.0,50.0,100.0


### Split dates into months and days

In [24]:
comb.MONTH = month.(comb.DATE);
comb.DAY = day.(comb.DATE);
first(shuffleDf(comb[!, [:DATE, :MONTH, :DAY]]), 5)

Unnamed: 0_level_0,DATE,MONTH,DAY
Unnamed: 0_level_1,Date,Int64,Int64
1,2013-09-10,9,10
2,2015-05-20,5,20
3,2014-06-17,6,17
4,2015-10-19,10,19
5,2013-10-03,10,3


## Standardize the PCP and Date

In [25]:
mean_pcpsum = mean(comb.PCP_SUM);
std_pcpsum = std(comb.PCP_SUM);
comb.PCP_SUM = (comb.PCP_SUM .- mean_pcpsum) ./ std_pcpsum;

mean_pcpmax = mean(comb.PCP_MAX);
std_pcpmax = std(comb.PCP_MAX);
comb.PCP_MAX = (comb.PCP_MAX .- mean_pcpmax) ./ std_pcpmax;

mean_pcpmax3 = mean(comb.PCP_MAX3);
std_pcpmax3 = std(comb.PCP_MAX3);
comb.PCP_MAX3 = (comb.PCP_MAX3 .- mean_pcpmax3) ./ std_pcpmax3;

meanmonth = mean(comb.MONTH);
stdmonth = std(comb.MONTH);
comb.MONTH = (comb.MONTH .- meanmonth) ./ stdmonth;

meanday = mean(comb.DAY);
stdday = std(comb.DAY);
comb.DAY = (comb.DAY .- meanday) ./ stdday;

In [26]:
first(shuffleDf(filter(row -> row.SURVERSE == 1, comb)), 10)

Unnamed: 0_level_0,ID_OUVRAGE,TP_LAT,TP_LNG,TP_Z,DATE,PCP_SUM,PCP_MAX
Unnamed: 0_level_1,String,Float64,Float64,Float64,Date,Float64,Float64
1,3350-09D,-0.0251308,-0.486079,0.625204,2017-10-09,3.38984,1.59794
2,3350-09D,-0.0251308,-0.486079,0.625204,2015-07-01,0.89039,0.551968
3,4430-05D,-1.33635,-0.0587449,1.69293,2015-09-19,0.971894,1.99786
4,3380-01D,0.163171,-0.158884,0.395966,2016-09-08,0.645878,0.767314
5,3350-07D,0.0802391,-0.325532,0.168267,2016-08-14,2.62914,1.25953
6,4265-01D,1.00593,1.15342,-1.21178,2015-07-19,1.74618,0.982661
7,0801-05D,-0.254625,0.907701,-1.07023,2018-08-13,-0.400087,-0.401707
8,3310-01D,0.865632,0.140067,-0.0117388,2017-09-03,3.33551,0.951897
9,3350-06D,0.183866,-0.182787,0.254423,2013-07-19,1.50167,1.38259
10,4795-01D,-1.03629,0.623706,-0.60145,2018-07-17,3.49851,3.65911


# Validate model

### Split train and validation sets

In [27]:
r_idx = shuffle(1:size(comb, 1));
train_ceil = floor(Int, size(r_idx, 1) * 0.8);
train_set = comb[r_idx[1:train_ceil], :];
val_set = comb[r_idx[train_ceil+1:size(r_idx, 1)], :];

### Train model on train set

#### Random Forest Params

In [32]:
names_ft = [:TP_LAT, :TP_LNG, :TP_Z, :MONTH, :DAY, :PCP_SUM, :PCP_MAX, :PCP_MAX3];

#### Build the features and labels

In [33]:
train_features = convert(Matrix{Float64},train_set[:, names_ft]);
train_labels = train_set[:, :SURVERSE];

#### Build the model N features to use is log_2(N + 1)

In [34]:
val_model = LIBSVM.fit!(SVC(), train_features, train_labels);

### Validate model on validation set

#### Single validation

In [37]:
val_features = convert(Matrix{Float64},val_set[:, names_ft]);
val_labels = val_set[!, :SURVERSE];
val_pred = LIBSVM.predict(val_model, val_features);

r = roc(val_labels, val_pred);
f1score(r)

0.2109375

#### Batch validation for F1Score

In [None]:
niter = 10;
batch_score = 0;

for i=1:niter
    # Split train and val sets
    r_idx = shuffle(1:size(comb, 1));
    train_ceil = floor(Int, size(r_idx, 1) * 0.8);
    train_set = comb[r_idx[1:train_ceil], :];
    val_set = comb[r_idx[train_ceil+1:size(r_idx, 1)], :];
    
    # Build features and labels
    train_features = convert(Matrix{Float64},train_set[:, names_ft]);
    train_labels = train_set[:, :SURVERSE];
    
    # Build model
    val_model = LIBSVM.fit!(SVC(), train_features, train_labels);
    
    # Validate model
    val_features = convert(Matrix{Float64},val_set[:, names_ft]);
    val_labels = val_set[!, :SURVERSE];
    val_pred = LIBSVM.predict(val_model, val_features);

    r = roc(val_labels, val_pred);
    batch_score += f1score(r);
end

batch_score = batch_score / niter

# Submission model creation

### Separate features and labels

In [38]:
full_train_features = convert(Matrix{Float64},comb[:, names_ft]);

In [39]:
full_train_labels = comb[:, :SURVERSE];

### Build Model

In [40]:
model = LIBSVM.fit!(SVC(), full_train_features, full_train_labels);

# Prediction

## Get the test data

In [41]:
test = CSV.read("data/test.csv");
rename!(test, :NO_OUVRAGE => :ID_OUVRAGE);
first(test, 10)

Unnamed: 0_level_0,ID_OUVRAGE,DATE
Unnamed: 0_level_1,String,Date
1,3260-01D,2019-05-02
2,3260-01D,2019-05-09
3,3260-01D,2019-05-10
4,3260-01D,2019-05-15
5,3260-01D,2019-05-20
6,3260-01D,2019-05-23
7,3260-01D,2019-05-24
8,3260-01D,2019-05-26
9,3260-01D,2019-05-30
10,3350-07D,2019-05-01


In [42]:
to_merge = unique(comb[!, [:ID_OUVRAGE, :TP_LAT, :TP_LNG, :TP_Z]], :ID_OUVRAGE);
test_comb = join(test, to_merge, on= [:ID_OUVRAGE]);
nrow(test_comb)

283

In [43]:
first(shuffleDf(test_comb), 10)

Unnamed: 0_level_0,ID_OUVRAGE,DATE,TP_LAT,TP_LNG,TP_Z
Unnamed: 0_level_1,String,Date,Float64,Float64,Float64
1,4350-01D,2019-07-11,-0.465715,0.725081,-0.0467296
2,3350-07D,2019-05-14,0.0802391,-0.325532,0.168267
3,4380-01D,2019-05-04,-0.828933,0.658409,-0.0467296
4,4380-01D,2019-05-06,-0.828933,0.658409,-0.0467296
5,3350-07D,2019-09-30,0.0802391,-0.325532,0.168267
6,4380-01D,2019-08-10,-0.828933,0.658409,-0.0467296
7,4380-01D,2019-08-14,-0.828933,0.658409,-0.0467296
8,3260-01D,2019-05-15,1.29293,0.531312,0.0790333
9,4380-01D,2019-09-26,-0.828933,0.658409,-0.0467296
10,4350-01D,2019-07-17,-0.465715,0.725081,-0.0467296


### Add PCP_SUM and PCP_MAX

#### Initialize default pcp

In [44]:
test_comb.PCP_SUM = zeros(size(test_comb, 1));
test_comb.PCP_MAX = zeros(size(test_comb, 1));
test_comb.PCP_MAX3 = zeros(size(test_comb, 1));
permutecols!(test_comb, [:ID_OUVRAGE, :TP_LAT, :TP_LNG, :TP_Z, :DATE, :PCP_SUM, :PCP_MAX, :PCP_MAX3]);

In [45]:
first(shuffleDf(test_comb), 10)

Unnamed: 0_level_0,ID_OUVRAGE,TP_LAT,TP_LNG,TP_Z,DATE,PCP_SUM,PCP_MAX,PCP_MAX3
Unnamed: 0_level_1,String,Float64,Float64,Float64,Date,Float64,Float64,Float64
1,3350-07D,0.0802391,-0.325532,0.168267,2019-08-26,0.0,0.0,0.0
2,3350-07D,0.0802391,-0.325532,0.168267,2019-09-01,0.0,0.0,0.0
3,3260-01D,1.29293,0.531312,0.0790333,2019-07-21,0.0,0.0,0.0
4,4240-01D,1.28137,1.24035,-1.19178,2019-06-20,0.0,0.0,0.0
5,3350-07D,0.0802391,-0.325532,0.168267,2019-09-13,0.0,0.0,0.0
6,4380-01D,-0.828933,0.658409,-0.0467296,2019-09-12,0.0,0.0,0.0
7,3350-07D,0.0802391,-0.325532,0.168267,2019-09-24,0.0,0.0,0.0
8,4240-01D,1.28137,1.24035,-1.19178,2019-09-28,0.0,0.0,0.0
9,4350-01D,-0.465715,0.725081,-0.0467296,2019-05-22,0.0,0.0,0.0
10,4350-01D,-0.465715,0.725081,-0.0467296,2019-09-04,0.0,0.0,0.0


#### Populate pcp

In [46]:
for i=1:size(test_comb, 1)
    id_ouvrage = test_comb[i, 1]; 
    closest_station = "McTavish"; # initial value
    shortest_dist = -1;
    
    # Find closest station
    for j=1:size(station_df, 1)
        dist = findDistance(test_comb[i, :TP_LAT], test_comb[i, :TP_LNG], station_df[j, :LAT], station_df[j, :LNG]);
        
        if shortest_dist == -1 || dist < shortest_dist
            shortest_dist = dist;
            closest_station = station_df[j, :STATION];
        end
    end
    
    # Augment comb with a weighted p_sum, based on the distance to the station
    p_sum = pcp_sum[∈([test_comb[i, :DATE]]).(pcp_sum.date), Symbol(closest_station)];
#     test_comb[i, :PCP_SUM] = p_sum[1] * (1 - shortest_dist); 
    test_comb[i, :PCP_SUM] = p_sum[1]; 
    # Augment comb with a weighted p_max, based on the distance to the station
    p_max = pcp_max[∈([test_comb[i, :DATE]]).(pcp_max.date), Symbol(closest_station)]
#     test_comb[i, :PCP_MAX] = p_max[1] * (1 - shortest_dist);
    test_comb[i, :PCP_MAX] = p_max[1];
    # Augment comb with a weighted p_max3, based on the distance to the station
    p_max3 = pcp_max3h[∈([test_comb[i, :DATE]]).(pcp_max3h.date), Symbol(closest_station)]
#     test_comb[i, :PCP_MAX3] = p_max3[1] * (1 - shortest_dist);
    test_comb[i, :PCP_MAX3] = p_max3[1];
end

In [47]:
first(shuffleDf(test_comb), 10)

Unnamed: 0_level_0,ID_OUVRAGE,TP_LAT,TP_LNG,TP_Z,DATE,PCP_SUM,PCP_MAX,PCP_MAX3
Unnamed: 0_level_1,String,Float64,Float64,Float64,Date,Float64,Float64,Float64
1,4350-01D,-0.465715,0.725081,-0.0467296,2019-05-29,0.0,0.0,0.0
2,3350-07D,0.0802391,-0.325532,0.168267,2019-08-28,157.0,53.0,86.0
3,3260-01D,1.29293,0.531312,0.0790333,2019-07-28,0.0,0.0,0.0
4,4240-01D,1.28137,1.24035,-1.19178,2019-06-28,0.0,0.0,0.0
5,4380-01D,-0.828933,0.658409,-0.0467296,2019-05-08,0.0,0.0,0.0
6,4240-01D,1.28137,1.24035,-1.19178,2019-08-15,0.0,0.0,0.0
7,4240-01D,1.28137,1.24035,-1.19178,2019-06-01,0.0,0.0,0.0
8,3350-07D,0.0802391,-0.325532,0.168267,2019-05-27,0.0,0.0,0.0
9,4350-01D,-0.465715,0.725081,-0.0467296,2019-08-17,194.0,174.0,189.0
10,3350-07D,0.0802391,-0.325532,0.168267,2019-06-16,0.0,0.0,0.0


### Standardize PCP

In [48]:
test_comb.PCP_SUM = (test_comb.PCP_SUM .- mean_pcpsum) ./ std_pcpsum;
test_comb.PCP_MAX = (test_comb.PCP_MAX .- mean_pcpmax) ./ std_pcpmax;
test_comb.PCP_MAX3 = (test_comb.PCP_MAX3 .- mean_pcpmax3) ./ std_pcpmax3;

In [49]:
first(test_comb, 20)

Unnamed: 0_level_0,ID_OUVRAGE,TP_LAT,TP_LNG,TP_Z,DATE,PCP_SUM,PCP_MAX
Unnamed: 0_level_1,String,Float64,Float64,Float64,Date,Float64,Float64
1,3260-01D,1.29293,0.531312,0.0790333,2019-05-02,-0.0469042,-0.0325426
2,3260-01D,1.29293,0.531312,0.0790333,2019-05-09,0.808886,0.982661
3,3260-01D,1.29293,0.531312,0.0790333,2019-05-10,4.82974,1.6287
4,3260-01D,1.29293,0.531312,0.0790333,2019-05-15,-0.37292,-0.34018
5,3260-01D,1.29293,0.531312,0.0790333,2019-05-20,0.224775,0.828842
6,3260-01D,1.29293,0.531312,0.0790333,2019-05-23,1.97711,1.53641
7,3260-01D,1.29293,0.531312,0.0790333,2019-05-24,-0.223496,-0.124834
8,3260-01D,1.29293,0.531312,0.0790333,2019-05-26,-0.359336,-0.309416
9,3260-01D,1.29293,0.531312,0.0790333,2019-05-30,-0.305,-0.186361
10,3350-07D,0.0802391,-0.325532,0.168267,2019-05-01,0.238359,0.244331


#### Split dates into month and day

In [50]:
test_comb.MONTH = month.(test_comb.DATE);
test_comb.DAY = day.(test_comb.DATE);

first(shuffleDf(test_comb[!, [:DATE, :MONTH, :DAY]]), 5)

Unnamed: 0_level_0,DATE,MONTH,DAY
Unnamed: 0_level_1,Date,Int64,Int64
1,2019-07-30,7,30
2,2019-07-10,7,10
3,2019-08-12,8,12
4,2019-07-10,7,10
5,2019-07-23,7,23


#### Standardize months and days

In [51]:
test_comb.MONTH = (test_comb.MONTH .- meanmonth) ./ stdmonth;
test_comb.DAY = (test_comb.DAY .- meanday) ./ stdday;

In [52]:
first(shuffleDf(test_comb[!, [:ID_OUVRAGE, :TP_LAT, :TP_LNG, :TP_Z, :MONTH, :DAY, :PCP_SUM, :PCP_MAX, :PCP_MAX3]]), 5)

Unnamed: 0_level_0,ID_OUVRAGE,TP_LAT,TP_LNG,TP_Z,MONTH,DAY,PCP_SUM,PCP_MAX
Unnamed: 0_level_1,String,Float64,Float64,Float64,Float64,Float64,Float64,Float64
1,4350-01D,-0.465715,0.725081,-0.0467296,0.293667,0.356584,-0.400087,-0.401707
2,3350-07D,0.0802391,-0.325532,0.168267,-0.875331,-1.33819,-0.400087,-0.401707
3,4350-01D,-0.465715,0.725081,-0.0467296,0.878166,-1.11222,-0.400087,-0.401707
4,4350-01D,-0.465715,0.725081,-0.0467296,-0.290832,-1.11222,-0.264248,-0.155598
5,3350-07D,0.0802391,-0.325532,0.168267,-0.290832,-0.88625,-0.400087,-0.401707


### Create Test features

In [53]:
test_features = convert(Matrix{Float64}, test_comb[:, names_ft])

283×8 Array{Float64,2}:
  1.29293     0.531312   0.0790333  …  -0.0469042  -0.0325426   0.0430963
  1.29293     0.531312   0.0790333      0.808886    0.982661    1.41166  
  1.29293     0.531312   0.0790333      4.82974     1.6287      2.75979  
  1.29293     0.531312   0.0790333     -0.37292    -0.34018    -0.365429 
  1.29293     0.531312   0.0790333      0.224775    0.828842    0.492474 
  1.29293     0.531312   0.0790333  …   1.97711     1.53641     2.51467  
  1.29293     0.531312   0.0790333     -0.223496   -0.124834   -0.181592 
  1.29293     0.531312   0.0790333     -0.359336   -0.309416   -0.345002 
  1.29293     0.531312   0.0790333     -0.305      -0.186361   -0.263297 
  0.0802391  -0.325532   0.168267       0.238359    0.244331    0.431195 
  0.0802391  -0.325532   0.168267   …  -0.223496   -0.155598   -0.181592 
  0.0802391  -0.325532   0.168267      -0.400087   -0.401707   -0.406281 
  0.0802391  -0.325532   0.168267       0.768134    0.921133    1.35038  
  ⋮           

## Predict

In [54]:
test_labels = LIBSVM.predict(val_model, test_features)

283-element Array{Int64,1}:
 0
 0
 1
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
 ⋮
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0

## Generate submission

In [55]:
ID = test_comb[:,:ID_OUVRAGE].*"_".*string.(test_comb[:,:DATE])
sampleSubmission = DataFrame(ID = ID, Surverse=test_labels)
CSV.write("submissions/mc-submission-11.csv",sampleSubmission)

"submissions/mc-submission-11.csv"