# Data Processing

In [125]:
using CSV, DataFrames, DecisionTree, Statistics, Dates, Gadfly, Random, MLBase;
include("utils/precipitation.jl");

## Build features

### Get and filter the features

#### Latitude, Longitude, Height

In [2]:
features = CSV.read("data/ouvrages-surverses.csv");
colnames = ["N_Env", "ID_SOMA", "ID_OUVRAGE", "NOM", "SOMA_SEC", "REGION", "TP_X", "TP_Y", "TP_Z", "TP_LAT", "TP_LNG", "EMI_X", "EMI_Y", "EMI_LNG", "EMI_LAT"];
names!(features, Symbol.(colnames));
select!(features, [:ID_OUVRAGE, :TP_LAT, :TP_LNG, :TP_Z]);

#### Replace missing Z index with mean

In [3]:
features.TP_Z = coalesce.(features.TP_Z, mean(features[completecases(features), :].TP_Z));
first(shuffleDf(features), 10)

Unnamed: 0_level_0,ID_OUVRAGE,TP_LAT,TP_LNG,TP_Z
Unnamed: 0_level_1,String,Float64,Float64,Float64
1,3350-06D,45.5551,-73.6735,21.31
2,3480-03D,45.4972,-73.7953,25.51
3,4240-02D,45.6498,-73.4877,19.29
4,3490-02D,45.5054,-73.8061,24.49
5,4210-01D,45.6669,-73.5075,11.59
6,4230-08D,45.6968,-73.4809,10.11
7,3350-11D,45.5328,-73.7065,25.28
8,3350-02D,45.5783,-73.6585,19.3526
9,3782-01D,45.4721,-73.8778,24.14
10,3275-02D,45.637,-73.607,14.53


### Load dates and surverses

In [5]:
surverses = CSV.read("data/surverses.csv",missingstring="-99999");

#### Filter months

In [6]:
surverses = filter(row -> month(row.DATE) > 4, surverses);
surverses = filter(row -> month(row.DATE) < 11, surverses);

#### Filter non rain surverses

In [7]:
raison = coalesce.(surverses[:,:RAISON],"Inconnue");
surverses[!,:RAISON] = raison;

surverses = filter(row -> row.RAISON ∈ ["P","Inconnue","TS"], surverses);
select!(surverses, [:NO_OUVRAGE, :DATE, :SURVERSE]);

#### Remove missing data and rename

In [8]:
surverses = dropmissing(surverses, disallowmissing=true);
rename!(surverses, :NO_OUVRAGE => :ID_OUVRAGE);
first(shuffleDf(surverses),10)

Unnamed: 0_level_0,ID_OUVRAGE,DATE,SURVERSE
Unnamed: 0_level_1,String,Date,Int64
1,4710-01D,2013-08-19,0
2,3330-01D,2015-09-03,0
3,3240-04D,2015-09-19,0
4,4620-08D,2016-09-03,0
5,3250-01D,2018-08-27,0
6,0801-03D,2016-07-19,0
7,3480-05D,2013-07-28,0
8,0642-01D,2018-08-23,0
9,3350-06D,2015-06-14,0
10,4420-01D,2013-05-25,0


### Augment features with dates and label

In [9]:
comb = join(features, surverses, on = :ID_OUVRAGE);
first(shuffleDf(comb), 10)

Unnamed: 0_level_0,ID_OUVRAGE,TP_LAT,TP_LNG,TP_Z,DATE,SURVERSE
Unnamed: 0_level_1,String,Float64,Float64,Float64,Date,Int64
1,3275-02D,45.637,-73.607,14.53,2013-10-08,0
2,4370-02D,45.4713,-73.5596,19.3526,2013-07-28,0
3,3280-01D,45.634,-73.6059,23.86,2015-07-04,0
4,4230-03D,45.6705,-73.4918,9.1,2017-06-19,0
5,3305-03D,45.6196,-73.6329,11.76,2018-05-22,0
6,3380-01D,45.5533,-73.6703,22.23,2016-07-16,0
7,4600-03D,45.4364,-73.8226,26.42,2013-06-03,0
8,0801-03D,45.5081,-73.5273,11.94,2018-08-10,0
9,4430-02D,45.4344,-73.5869,18.56,2018-06-17,0
10,4600-03D,45.4364,-73.8226,26.42,2014-07-25,0


### Load precipitation data

#### Load and filter months between May & October included

In [10]:
precipitation = CSV.read("data/precipitations.csv",missingstring="-99999");
rename!(precipitation, Symbol("St-Hubert")=>:StHubert);

precipitation = filter(row -> month(row.date) > 4, precipitation);
precipitation = filter(row -> month(row.date) < 11, precipitation); 

#### Replace missing data by 0

In [11]:
precipitation[!,:McTavish] = coalesce.(precipitation[:,:McTavish], 0);
precipitation[!,:Bellevue] = coalesce.(precipitation[:,:Bellevue], 0);
precipitation[!,:Assomption] = coalesce.(precipitation[:,:Assomption], 0);
precipitation[!,:Trudeau] = coalesce.(precipitation[:,:Trudeau], 0);
precipitation[!,:StHubert] = coalesce.(precipitation[:,:StHubert], 0);

first(shuffleDf(precipitation), 5)

Unnamed: 0_level_0,date,heure,McTavish,Bellevue,Assomption,Trudeau,StHubert
Unnamed: 0_level_1,Date,Int64,Int64,Int64,Int64,Int64,Int64
1,2018-10-22,20,0,0,0,0,0
2,2014-05-04,10,14,2,0,6,0
3,2015-09-20,5,0,0,0,0,0
4,2016-09-28,18,0,0,0,0,0
5,2018-06-03,9,0,0,0,0,0


### Extract features from precipitation

#### Sum of precipitation for the day

In [13]:
pcp_sum = by(precipitation, :date,  McTavish = :McTavish=>sum, Bellevue = :Bellevue=>sum, 
   Assomption = :Assomption=>sum, Trudeau = :Trudeau=>sum, StHubert = :StHubert=>sum);
first(shuffleDf(pcp_sum), 5)

Unnamed: 0_level_0,date,McTavish,Bellevue,Assomption,Trudeau,StHubert
Unnamed: 0_level_1,Date,Int64,Int64,Int64,Int64,Int64
1,2014-09-25,0,0,0,0,0
2,2015-05-26,52,64,10,47,48
3,2014-08-30,0,0,0,0,0
4,2017-06-21,0,0,30,0,0
5,2018-08-24,0,0,4,0,0


#### Maximum precipitation in an hour for the day

In [14]:
pcp_max = by(precipitation, :date,  McTavish = :McTavish=>maximum, Bellevue = :Bellevue=>maximum, 
   Assomption = :Assomption=>maximum, Trudeau = :Trudeau=>maximum, StHubert = :StHubert=>maximum)
first(shuffleDf(pcp_max),5)

Unnamed: 0_level_0,date,McTavish,Bellevue,Assomption,Trudeau,StHubert
Unnamed: 0_level_1,Date,Int64,Int64,Int64,Int64,Int64
1,2014-05-29,0,0,0,0,0
2,2014-05-27,44,10,40,39,0
3,2015-08-18,197,0,0,44,0
4,2013-10-16,12,8,0,7,0
5,2018-07-15,0,0,4,0,0


### Add precipitation data to features

#### Get stations lat-lng

In [15]:
station_df = DataFrame(STATION = String[], LAT = Float64[], LNG = Float64[]);

push!(station_df, ["McTavish", 45.504742, -73.579167]);
push!(station_df, ["Bellevue", 45.427222, -73.929167]);
push!(station_df, ["Assomption", 45.809444, -73.434722]);
push!(station_df, ["Trudeau", 45.467778, -73.741667]);
push!(station_df, ["StHubert", 45.5175, -73.416944]);

station_df

Unnamed: 0_level_0,STATION,LAT,LNG
Unnamed: 0_level_1,String,Float64,Float64
1,McTavish,45.5047,-73.5792
2,Bellevue,45.4272,-73.9292
3,Assomption,45.8094,-73.4347
4,Trudeau,45.4678,-73.7417
5,StHubert,45.5175,-73.4169


### Normalize elements

#### Get extreme values

In [16]:
min_lat = min(minimum(comb.TP_LAT), minimum(station_df.LAT));
max_lat = max(maximum(comb.TP_LAT), maximum(station_df.LAT));

min_lng = min(minimum(comb.TP_LNG), minimum(station_df.LNG));
max_lng = max(maximum(comb.TP_LNG), maximum(station_df.LNG));

min_z = minimum(comb.TP_Z);
max_z = maximum(comb.TP_Z);

#### Normalize every value of comb between 0 and 1

In [17]:
comb.TP_LAT = normalize.(comb.TP_LAT, min_lat, max_lat);
comb.TP_LNG = normalize.(comb.TP_LNG, min_lng, max_lng);
comb.TP_Z = normalize.(comb.TP_Z, min_z, max_z);

first(shuffleDf(comb), 10)

Unnamed: 0_level_0,ID_OUVRAGE,TP_LAT,TP_LNG,TP_Z,DATE,SURVERSE
Unnamed: 0_level_1,String,Float64,Float64,Float64,Date,Int64
1,3350-05D,0.412789,0.545451,0.422555,2016-10-05,0
2,4350-01D,0.237379,0.74432,0.359859,2013-07-18,0
3,4230-06D,0.687093,0.864544,0.0203708,2015-09-26,0
4,3260-01D,0.609996,0.697487,0.386587,2016-08-24,0
5,4710-01D,0.725267,0.852146,0.123369,2013-09-10,0
6,4610-09D,0.0853384,0.202151,0.359859,2015-09-08,0
7,4430-04D,0.0424383,0.570669,0.758362,2014-08-31,0
8,3460-01D,0.254612,0.368257,0.568388,2017-10-18,0
9,4300-01D,0.413346,0.807748,0.171435,2017-09-12,0
10,4620-06D,0.0789395,0.164927,0.774384,2016-10-15,0


#### Normalize every value of station_df between 0 and 1

In [18]:
station_df.LAT = normalize.(station_df.LAT, min_lat, max_lat);
station_df.LNG = normalize.(station_df.LNG, min_lng, max_lng);

station_df

Unnamed: 0_level_0,STATION,LAT,LNG
Unnamed: 0_level_1,String,Float64,Float64
1,McTavish,0.251316,0.699536
2,Bellevue,0.0608415,0.0512791
3,Assomption,1.0,0.967072
4,Trudeau,0.160492,0.39856
5,StHubert,0.282664,1.0


### Augment Features

#### Add pcp_sum and pcp_max columns

In [57]:
comb.PCP_MAX = zeros(size(comb, 1));
comb.PCP_SUM = zeros(size(comb, 1));
permutecols!(comb, [:ID_OUVRAGE, :TP_LAT, :TP_LNG, :TP_Z, :DATE, :PCP_SUM, :PCP_MAX, :SURVERSE]);

In [58]:
first(shuffleDf(comb), 10)

Unnamed: 0_level_0,ID_OUVRAGE,TP_LAT,TP_LNG,TP_Z,DATE,PCP_SUM,PCP_MAX,SURVERSE
Unnamed: 0_level_1,String,Float64,Float64,Float64,Date,Float64,Float64,Int64
1,0672-01D,0.687609,0.788765,0.0252755,2018-05-24,0.0,0.0,0
2,4610-09D,0.0853384,0.202151,0.359859,2015-10-14,0.0,0.0,0
3,3390-01D,0.363893,0.528042,0.442501,2018-05-15,0.0,0.0,0
4,3270-01D,0.586728,0.669867,0.420593,2018-07-20,0.0,0.0,0
5,3450-01D,0.267984,0.374662,0.572965,2014-10-14,0.0,0.0,0
6,3310-01D,0.51946,0.602926,0.367296,2014-10-01,0.0,0.0,0
7,3290-01D,0.543825,0.620308,0.376124,2014-10-27,0.0,0.0,0
8,3340-01D,0.471794,0.572824,0.37547,2017-07-16,0.0,0.0,0
9,3500-01D,0.26828,0.202171,0.528823,2017-09-07,0.0,0.0,0
10,4530-02D,0.0940842,0.343984,0.359859,2013-05-14,0.0,0.0,0


#### Find closest station to each ouvrage and add pcp_sum and pcp_max to it

In [59]:
for i=1:size(comb, 1)
    id_ouvrage = comb[i, 1]; 
    closest_station = "McTavish"; # initial value
    shortest_dist = -1;
    
    # Find closest station
    for j=1:size(station_df, 1)
        dist = findDistance(comb[i, :TP_LAT], comb[i, :TP_LNG], station_df[j, :LAT], station_df[j, :LNG]);
        
        if shortest_dist == -1 || dist < shortest_dist
            shortest_dist = dist;
            closest_station = station_df[j, :STATION];
        end
    end
    
    # Augment comb with a weighted p_sum, based on the distance to the station
    p_sum = pcp_sum[∈([comb[i, :DATE]]).(pcp_sum.date), Symbol(closest_station)];
    comb[i, :PCP_SUM] = p_sum[1] * (1 - shortest_dist); 
    # Augment comb with a weighted p_max, based on the distance to the station
    p_max = pcp_max[∈([comb[i, :DATE]]).(pcp_max.date), Symbol(closest_station)]
    comb[i, :PCP_MAX] = p_max[1] * (1 - shortest_dist);
end

### TODO: Remove outlier in PCP_SUM and PCP_MAX that cause compression

In [60]:
comb[comb[:PCP_SUM] .> 500, :PCP_SUM] = 500;
comb[comb[:PCP_MAX] .> 500, :PCP_MAX] = 500;

│   caller = top-level scope at In[60]:1
└ @ Core In[60]:1
│   caller = top-level scope at In[60]:2
└ @ Core In[60]:2


In [61]:
first(shuffleDf(filter(row -> row.SURVERSE == 1, comb)), 10)

Unnamed: 0_level_0,ID_OUVRAGE,TP_LAT,TP_LNG,TP_Z,DATE,PCP_SUM,PCP_MAX,SURVERSE
Unnamed: 0_level_1,String,Float64,Float64,Float64,Date,Float64,Float64,Int64
1,4230-05D,0.664256,0.861884,0.104731,2013-06-02,90.7429,51.8531,1
2,3410-02D,0.319462,0.483178,0.359859,2014-06-03,171.361,98.3893,1
3,3260-01D,0.609996,0.697487,0.386587,2013-06-02,43.6094,16.0329,1
4,4290-01D,0.447648,0.831152,0.115849,2017-06-12,40.4882,40.4882,1
5,3350-01D,0.459837,0.564893,0.220482,2018-07-17,215.763,99.2358,1
6,4330-01D,0.347229,0.780847,0.190073,2015-09-19,68.1923,48.0843,1
7,4230-09D,0.728544,0.88331,0.0,2015-07-31,50.1141,42.9549,1
8,3350-01D,0.459837,0.564893,0.220482,2017-06-20,130.811,63.1501,1
9,3290-01D,0.543825,0.620308,0.376124,2017-07-08,227.903,153.329,1
10,4270-02D,0.512785,0.830625,0.359859,2014-06-03,195.709,88.569,1


#### Normalize pcp_sum and pcp_max

In [62]:
min_pcp_sum = minimum(comb.PCP_SUM);
max_pcp_sum = maximum(comb.PCP_SUM);

min_pcp_max = minimum(comb.PCP_MAX);
max_pcp_max = maximum(comb.PCP_MAX);

In [63]:
comb.PCP_SUM = normalize.(comb.PCP_SUM, min_pcp_sum, max_pcp_sum);
comb.PCP_MAX = normalize.(comb.PCP_MAX, min_pcp_max, max_pcp_max);

In [64]:
first(shuffleDf(filter(row -> row.SURVERSE == 1, comb)), 10)

Unnamed: 0_level_0,ID_OUVRAGE,TP_LAT,TP_LNG,TP_Z,DATE,PCP_SUM,PCP_MAX,SURVERSE
Unnamed: 0_level_1,String,Float64,Float64,Float64,Date,Float64,Float64,Int64
1,0672-03D,0.66527,0.771707,0.0537227,2013-08-13,0.34296,0.170255,1
2,3340-01D,0.471794,0.572824,0.37547,2018-07-17,0.428034,0.196866,1
3,4230-09D,0.728544,0.88331,0.0,2014-06-24,1.0,0.286366,1
4,3350-06D,0.37501,0.524895,0.423863,2016-06-06,0.0833151,0.0785991,1
5,3350-09D,0.330728,0.451592,0.502665,2018-06-18,0.346755,0.157765,1
6,3750-01D,0.506656,0.587505,0.106366,2014-07-09,0.0,0.0,1
7,4240-01D,0.607546,0.868856,0.116503,2015-10-28,0.359905,0.0883521,1
8,3270-01D,0.586728,0.669867,0.420593,2018-06-04,0.383375,0.0451029,1
9,4380-01D,0.160421,0.728206,0.359859,2017-08-15,0.106754,0.0958972,1
10,0801-09D,0.278466,0.780082,0.553674,2014-06-25,0.0,0.0,1


#### Normalize dates (UNUSED ANYMORE)

In [28]:
dates = dateToDay.(comb.DATE);
dates = normalize.(dates, minimum(dates), maximum(dates));

# Validate model

### Split train and validation sets

In [130]:
r_idx = shuffle(1:size(comb, 1));
train_ceil = floor(Int, size(r_idx, 1) * 0.8);
train_set = comb[r_idx[1:train_ceil], :];
val_set = comb[r_idx[train_ceil+1:size(r_idx, 1)], :];

### Train model on train set

#### Build the features and labels

In [131]:
train_features = convert(Matrix{Float64},train_set[:, [:TP_LAT, :TP_LNG, :TP_Z, :PCP_SUM, :PCP_MAX]]);
train_labels = train_set[:, :SURVERSE];

#### Build the model

In [132]:
val_model = build_forest(train_labels, train_features, 3, 30, 0.5, 10)

Ensemble of Decision Trees
Trees:      30
Avg Leaves: 328.56666666666666
Avg Depth:  10.0

### Validate model on validation set

#### Single validation

In [133]:
val_features = convert(Matrix{Float64},val_set[:, [:TP_LAT, :TP_LNG, :TP_Z, :PCP_SUM, :PCP_MAX]]);
val_labels = val_set[!, :SURVERSE];
val_pred = apply_forest(model, val_features);

r = roc(val_labels, val_pred);
f1score(r)

0.5229244114002478

#### Batch validation

In [None]:
niter = 10;
batch_score = 0;

for i=1:niter
    # Split train and val sets
    r_idx = shuffle(1:size(comb, 1));
    train_ceil = floor(Int, size(r_idx, 1) * 0.8);
    train_set = comb[r_idx[1:train_ceil], :];
    val_set = comb[r_idx[train_ceil+1:size(r_idx, 1)], :];
    
    # Build features and labels
    train_features = convert(Matrix{Float64},train_set[:, [:TP_LAT, :TP_LNG, :TP_Z, :PCP_SUM, :PCP_MAX]]);
    train_labels = train_set[:, :SURVERSE];
    
    # Build model
    val_model = build_forest(train_labels, train_features, 3, 30, 0.5, 10);
    
    # Validate model
    val_features = convert(Matrix{Float64},val_set[:, [:TP_LAT, :TP_LNG, :TP_Z, :PCP_SUM, :PCP_MAX]]);
    val_labels = val_set[!, :SURVERSE];
    val_pred = apply_forest(model, val_features);

    r = roc(val_labels, val_pred);
    batch_score += f1score(r);
end

batch_score = batch_score / niter

# Submission model creation

### Separate features and labels

In [109]:
full_train_features = convert(Matrix{Float64},comb[:, [:TP_LAT, :TP_LNG, :TP_Z, :PCP_SUM, :PCP_MAX]]);
# train_features = hcat(dates, train_features);

In [110]:
full_train_labels = comb[:, :SURVERSE];

### Build Model

#### Test with tree first

In [111]:
model_tree = build_tree(full_train_labels, full_train_features)

Decision Tree
Leaves: 3794
Depth:  36

In [112]:
model_tree = prune_tree(model_tree, 0.90)

Decision Tree
Leaves: 3544
Depth:  36

In [113]:
model = build_forest(full_train_labels, full_train_features, 3, 30, 0.5, 10)

Ensemble of Decision Trees
Trees:      30
Avg Leaves: 336.3
Avg Depth:  10.0

# Prediction

## Get the test data

In [37]:
test = CSV.read("data/test.csv");
rename!(test, :NO_OUVRAGE => :ID_OUVRAGE);
first(test, 10)

Unnamed: 0_level_0,ID_OUVRAGE,DATE
Unnamed: 0_level_1,String,Date
1,3260-01D,2019-05-02
2,3260-01D,2019-05-09
3,3260-01D,2019-05-10
4,3260-01D,2019-05-15
5,3260-01D,2019-05-20
6,3260-01D,2019-05-23
7,3260-01D,2019-05-24
8,3260-01D,2019-05-26
9,3260-01D,2019-05-30
10,3350-07D,2019-05-01


In [38]:
to_merge = unique(comb[!, [:ID_OUVRAGE, :TP_LAT, :TP_LNG, :TP_Z]], :ID_OUVRAGE);
test_comb = join(test, to_merge, on= [:ID_OUVRAGE]);
nrow(test_comb)

283

In [39]:
first(shuffleDf(test_comb), 10);

### Add PCP_SUM and PCP_MAX

#### Initialize default pcp

In [65]:
test_comb.PCP_MAX = zeros(size(test_comb, 1));
test_comb.PCP_SUM = zeros(size(test_comb, 1));
permutecols!(test_comb, [:ID_OUVRAGE, :TP_LAT, :TP_LNG, :TP_Z, :DATE, :PCP_SUM, :PCP_MAX]);

In [66]:
first(shuffleDf(test_comb), 10);

#### Populate pcp

In [67]:
for i=1:size(test_comb, 1)
    id_ouvrage = test_comb[i, 1]; 
    closest_station = "McTavish"; # initial value
    shortest_dist = -1;
    
    # Find closest station
    for j=1:size(station_df, 1)
        dist = findDistance(test_comb[i, :TP_LAT], test_comb[i, :TP_LNG], station_df[j, :LAT], station_df[j, :LNG]);
        
        if shortest_dist == -1 || dist < shortest_dist
            shortest_dist = dist;
            closest_station = station_df[j, :STATION];
        end
    end
    
    # Augment comb with a weighted p_sum, based on the distance to the station
    p_sum = pcp_sum[∈([test_comb[i, :DATE]]).(pcp_sum.date), Symbol(closest_station)];
    test_comb[i, :PCP_SUM] = p_sum[1] * (1 - shortest_dist); 
    # Augment comb with a weighted p_max, based on the distance to the station
    p_max = pcp_max[∈([test_comb[i, :DATE]]).(pcp_max.date), Symbol(closest_station)]
    test_comb[i, :PCP_MAX] = p_max[1] * (1 - shortest_dist);
end

In [68]:
first(shuffleDf(test_comb), 10)

Unnamed: 0_level_0,ID_OUVRAGE,TP_LAT,TP_LNG,TP_Z,DATE,PCP_SUM,PCP_MAX
Unnamed: 0_level_1,String,Float64,Float64,Float64,Date,Float64,Float64
1,3350-07D,0.353054,0.490395,0.405552,2019-07-05,21.2398,19.6665
2,3350-07D,0.353054,0.490395,0.405552,2019-08-14,0.0,0.0
3,4240-01D,0.607546,0.868856,0.116503,2019-09-07,116.937,61.7165
4,4350-01D,0.237379,0.74432,0.359859,2019-09-06,0.0,0.0
5,3350-07D,0.353054,0.490395,0.405552,2019-06-29,108.559,62.1462
6,4350-01D,0.237379,0.74432,0.359859,2019-06-14,92.4505,40.0301
7,4240-01D,0.607546,0.868856,0.116503,2019-05-24,9.09506,7.79577
8,4350-01D,0.237379,0.74432,0.359859,2019-09-01,0.0,0.0
9,3350-07D,0.353054,0.490395,0.405552,2019-06-02,53.4929,23.5998
10,3350-07D,0.353054,0.490395,0.405552,2019-06-19,0.0,0.0


#### Normalize pcp based on comb (not test_comb)

In [69]:
test_comb.PCP_SUM = normalize.(test_comb.PCP_SUM, min_pcp_sum, max_pcp_sum);
test_comb.PCP_MAX = normalize.(test_comb.PCP_MAX, min_pcp_max, max_pcp_max);

In [70]:
first(test_comb, 20)

Unnamed: 0_level_0,ID_OUVRAGE,TP_LAT,TP_LNG,TP_Z,DATE,PCP_SUM,PCP_MAX
Unnamed: 0_level_1,String,Float64,Float64,Float64,Date,Float64,Float64
1,3260-01D,0.609996,0.697487,0.386587,2019-05-02,0.0333484,0.0153915
2,3260-01D,0.609996,0.697487,0.386587,2019-05-09,0.114154,0.0577183
3,3260-01D,0.609996,0.697487,0.386587,2019-05-10,0.493812,0.0846535
4,3260-01D,0.609996,0.697487,0.386587,2019-05-15,0.00256526,0.00256526
5,3260-01D,0.609996,0.697487,0.386587,2019-05-20,0.0590009,0.0513052
6,3260-01D,0.609996,0.697487,0.386587,2019-05-23,0.22446,0.0808056
7,3260-01D,0.609996,0.697487,0.386587,2019-05-24,0.0166742,0.0115437
8,3260-01D,0.609996,0.697487,0.386587,2019-05-26,0.00384789,0.00384789
9,3260-01D,0.609996,0.697487,0.386587,2019-05-30,0.0089784,0.0089784
10,3350-07D,0.353054,0.490395,0.405552,2019-05-01,0.0739461,0.0330397


#### Normalize dates (UNUSED ANYMORE)

In [71]:
test_dates = dateToDay.(test_comb.DATE);
test_dates = normalize.(test_dates, minimum(test_dates), maximum(test_dates));

### Create Test features

In [72]:
test_features = convert(Matrix{Float64}, test_comb[:, [:TP_LAT, :TP_LNG, :TP_Z, :PCP_SUM, :PCP_MAX]]);
# test_features = hcat(test_dates, test_features);

test_features

283×5 Array{Float64,2}:
 0.609996  0.697487  0.386587  0.0333484   0.0153915 
 0.609996  0.697487  0.386587  0.114154    0.0577183 
 0.609996  0.697487  0.386587  0.493812    0.0846535 
 0.609996  0.697487  0.386587  0.00256526  0.00256526
 0.609996  0.697487  0.386587  0.0590009   0.0513052 
 0.609996  0.697487  0.386587  0.22446     0.0808056 
 0.609996  0.697487  0.386587  0.0166742   0.0115437 
 0.609996  0.697487  0.386587  0.00384789  0.00384789
 0.609996  0.697487  0.386587  0.0089784   0.0089784 
 0.353054  0.490395  0.405552  0.0739461   0.0330397 
 0.353054  0.490395  0.405552  0.0204532   0.0125866 
 0.353054  0.490395  0.405552  0.0         0.0       
 0.353054  0.490395  0.405552  0.135306    0.0676528 
 ⋮                                                   
 0.160421  0.728206  0.359859  0.0         0.0       
 0.160421  0.728206  0.359859  0.636902    0.110372  
 0.160421  0.728206  0.359859  0.41073     0.227982  
 0.160421  0.728206  0.359859  0.0         0.0       
 0.1

## Predict

In [73]:
test_labels = apply_forest(model, test_features)

283-element Array{Int64,1}:
 0
 0
 1
 0
 0
 0
 0
 0
 0
 0
 0
 0
 1
 ⋮
 0
 1
 1
 0
 0
 0
 0
 0
 0
 0
 0
 0

## Generate submission

In [68]:
ID = test_comb[:,:ID_OUVRAGE].*"_".*string.(test_comb[:,:DATE])
sampleSubmission = DataFrame(ID = ID, Surverse=test_labels)
CSV.write("submissions/mc-submission-5.csv",sampleSubmission)

"submissions/mc-submission-5.csv"