In [None]:
using CSV, DataFrames, Statistics, Dates, Gadfly, Random;
include("utils/precipitation.jl");

On garde les ouvrages d'intérêt.

In [None]:
# Load the structures table, give its columns explicit names, and keep only
# the :ID_OUVRAGE, :TP_LAT and :TP_LNG columns.
ouvrages = CSV.read("data/ouvrages-surverses.csv");
colnames = [
    "N_Env", "ID_SOMA", "ID_OUVRAGE", "NOM", "SOMA_SEC", "REGION",
    "TP_X", "TP_Y", "TP_Z", "TP_LAT", "TP_LNG",
    "EMI_X", "EMI_Y", "EMI_LNG", "EMI_LAT",
];
names!(ouvrages, map(Symbol, colnames));
select!(ouvrages, [:ID_OUVRAGE, :TP_LAT, :TP_LNG]);

In [None]:
# Show the dimensions (rows, columns) of the trimmed structures table.
size(ouvrages)

In [None]:
# Ids of the structures of interest; every other structure is dropped from
# `ouvrages`.
important_ouvrages = ["3260-01D", "3350-07D", "4240-01D", "4350-01D", "4380-01D"];
ouvrages = filter(row -> row.ID_OUVRAGE ∈ important_ouvrages, ouvrages);

In [None]:
# Preview up to the first 10 remaining structures.
first(ouvrages, 10)

In [None]:
# Load the overflow records; the sentinel "-99999" is read as missing.
surverses = CSV.read("data/surverses.csv", missingstring="-99999");

# Keep May through October only.
surverses = filter(row -> 5 <= month(row.DATE) <= 10, surverses);

# Records without a stated reason are labelled "Inconnue"; then only the
# reasons "P", "Inconnue" and "TS" are kept.
surverses[!, :RAISON] = coalesce.(surverses.RAISON, "Inconnue");
surverses = filter(row -> row.RAISON ∈ ["P","Inconnue","TS"], surverses);

# Reduce to the three columns used downstream and align the id column name
# with the one used in `ouvrages`.
select!(surverses, [:NO_OUVRAGE, :DATE, :SURVERSE]);
rename!(surverses, :NO_OUVRAGE => :ID_OUVRAGE);

In [None]:
# Keep only the records of the structures of interest, then drop every row
# that still contains a missing value.
surverses = filter(row -> row.ID_OUVRAGE ∈ important_ouvrages, surverses);
dropmissing!(surverses);

In [None]:
# Summary statistics of the SURVERSE column over all kept structures.
describe(surverses[!, :SURVERSE])

In [None]:
# Same summary restricted to the third structure of `important_ouvrages`.
curr = filter(row -> row.ID_OUVRAGE == important_ouvrages[3], surverses);
describe(curr[!, :SURVERSE])

La moyenne correspond ici au taux de surverses : il y a beaucoup plus d'observations sans surverse que d'observations avec surverse.

### Beaucoup plus de 0 que de 1 : problème de déséquilibre des classes
On le résoudra plus tard par sur-échantillonnage des 1 et sous-échantillonnage des 0.

## Précipitations

In [None]:
# Load the precipitation readings ("-99999" is read as missing) and give the
# "St-Hubert" column a name that is a plain Julia identifier.
precipitations = CSV.read("data/precipitations.csv",missingstring="-99999");
rename!(precipitations, Symbol("St-Hubert")=>:StHubert);

# Keep May through October only, then list the column names.
precipitations = filter(row -> 5 <= month(row.date) <= 10, precipitations);
names(precipitations)

In [None]:
# Summary statistics of the StHubert readings before any cleaning.
describe(precipitations[!, :StHubert])

### Traitement des données aberrantes

In [None]:
# Boolean mask of the rows whose McTavish reading exceeds 2000; a missing
# reading counts as "not an outlier".
idx_outliers = convert(
    Array{Bool, 1},
    coalesce.(precipitations[!, :McTavish] .> 2000, false),
);

# Remember the affected dates and show the offending rows.
date_outlier = precipitations[idx_outliers, :date];
precipitations[idx_outliers, :]

In [None]:
# On the flagged rows, reset both the McTavish and the StHubert readings to 0,
# then show the rows again to check the result.
for station in (:McTavish, :StHubert)
    precipitations[idx_outliers, station] .= 0
end
precipitations[idx_outliers, :]

In [None]:
# Hand-picked (date, station) pairs whose readings are reset to 0.
manual_outliers = [
    (Date(2017, 7, 20), :Bellevue),
    (Date(2017, 7, 20), :Trudeau),  # TODO: decide whether to keep this one
    (Date(2013, 6, 24), :Assomption),
    (Date(2014, 8, 5), :Assomption),
    (Date(2015, 6, 10), :Assomption),
    (Date(2018, 7, 26), :Trudeau),
]

for (day, station) in manual_outliers
    precipitations[precipitations.date .== day, station] .= 0
end

### Traitement des données manquantes

In [None]:
# Per-date fallback value for each station, computed by `mean_wo_missing`
# (defined outside this cell) over that date's readings.
precipitation_by_day = by(precipitations, :date,
                            McTavish = :McTavish=>mean_wo_missing,
                            Bellevue = :Bellevue=>mean_wo_missing,
                            Assomption = :Assomption=>mean_wo_missing,
                            Trudeau = :Trudeau=>mean_wo_missing,
                            StHubert = :StHubert=>mean_wo_missing)

# Map each date to its row in `precipitation_by_day` (one row per date, since
# the table is grouped by date). Each missing cell is then filled with a
# single lookup instead of a full scan of the daily table.
day_row = Dict(day => k for (k, day) in enumerate(precipitation_by_day.date))

# Replace every missing reading by the fallback value of the same date and
# station.
for i in 1:size(precipitations, 1)
    for station in (:McTavish, :Bellevue, :Assomption, :Trudeau, :StHubert)
        if ismissing(precipitations[i, station])
            k = day_row[precipitations[i, :date]]
            precipitations[i, station] = precipitation_by_day[k, station]
        end
    end
end

In [None]:
# Summary statistics of the StHubert readings after the missing-value fill.
describe(precipitations[!, :StHubert])

In [None]:
# Preview 10 rows. `shuffleDf` is defined outside this cell — presumably it
# returns the rows in random order (confirm).
first(shuffleDf(precipitations), 10)

In [None]:
# Per-date total of the readings, one column per station.
pcp_sum = by(precipitations, :date,  
            McTavish = :McTavish=>sum, 
            Bellevue = :Bellevue=>sum,
            Assomption = :Assomption=>sum, 
            Trudeau = :Trudeau=>sum, 
            StHubert = :StHubert=>sum);

In [None]:
# Preview 5 rows of the per-date totals (`shuffleDf` is defined elsewhere).
first(shuffleDf(pcp_sum), 5)

In [None]:
# Reshape to long format (:variable / :value columns keyed by :date) and
# scatter-plot the per-date totals, one color per station column.
df_for_plot = melt(pcp_sum, :date)
set_default_plot_size(25cm, 13cm)

plot(df_for_plot, x=:date, y=:value, Geom.point, color=:variable)

On réduit la plus grosse valeur aberrante.

In [None]:
# Per-date maximum reading, one column per station.
pcp_max = by(precipitations, :date,  
            McTavish = :McTavish=>maximum,
            Bellevue = :Bellevue=>maximum, 
            Assomption = :Assomption=>maximum,
            Trudeau = :Trudeau=>maximum,
            StHubert = :StHubert=>maximum);

In [None]:
# Long-format reshape and scatter plot of the per-date maxima, colored by
# station column.
df_for_plot = melt(pcp_max, :date)
plot(df_for_plot, x=:date, y=:value, Geom.point, color=:variable)

In [None]:
# Preview 5 rows of the per-date maxima (`shuffleDf` is defined elsewhere).
first(shuffleDf(pcp_max), 5)

In [None]:
# Per-date `maximum3` aggregate, one column per station. `maximum3` is defined
# outside this cell — NOTE(review): presumably a maximum over 3 consecutive
# readings; confirm against its definition.
pcp_max3 = by(precipitations, :date,
                McTavish = :McTavish=>maximum3,
                Bellevue = :Bellevue=>maximum3,
                Assomption = :Assomption=>maximum3,
                Trudeau = :Trudeau=>maximum3,
                StHubert = :StHubert=>maximum3);

In [None]:
# Long-format reshape and scatter plot of the per-date `maximum3` values,
# colored by station column.
df_for_plot = melt(pcp_max3, :date)
plot(df_for_plot, x=:date, y=:value, Geom.point, color=:variable)

In [None]:
# Preview 5 rows of the per-date `maximum3` table (`shuffleDf` is defined
# elsewhere).
first(shuffleDf(pcp_max3), 5)

## Standardiser les données de précipitations

In [None]:
"""
    standardize_col(col)

Return the z-scores of `col`: each value minus the column mean, divided by the
column standard deviation (`Statistics.std`, i.e. the corrected sample
estimate).

When the standard deviation is zero or `NaN` (constant or single-element
column) every value already equals the mean, so a column of zeros is returned
instead of `NaN`s produced by a division by zero.
"""
function standardize_col(col)
    center = mean(col)
    spread = std(col)

    # `spread` is `missing` when `col` contains missing values; in that case
    # fall through so the missing values propagate as before.
    if !ismissing(spread) && (iszero(spread) || isnan(spread))
        return zero.(col .- center)
    end

    return (col .- center) ./ spread
end

In [None]:
# Standardize every station column of the three per-date tables in place.
# Each column is standardized on its own, with its own mean and deviation.
for table in (pcp_sum, pcp_max, pcp_max3)
    for station in (:McTavish, :Bellevue, :Assomption, :Trudeau, :StHubert)
        table[!, station] = standardize_col(table[!, station])
    end
end

In [None]:
# Preview 10 rows of the standardized per-date totals (`shuffleDf` is defined
# elsewhere).
first(shuffleDf(pcp_sum), 10)

## Stations

In [None]:
# Coordinates of the five weather stations. The STATION names match the
# station column names of the precipitation tables.
station_df = DataFrame(
    STATION = String["McTavish", "Bellevue", "Assomption", "Trudeau", "StHubert"],
    LAT = Float64[45.504742, 45.427222, 45.809444, 45.467778, 45.5175],
    LNG = Float64[-73.579167, -73.929167, -73.434722, -73.741667, -73.416944],
);

station_df

### On ajoute les colonnes de précipitations

In [None]:
# `train_data` is the same object as `surverses` (no copy is made), so the
# columns added below also appear on `surverses`.
train_data = surverses;

# Zero-filled placeholders for the features of the closest (FS_*) and the
# second closest (SS_*) station; they are filled in by a later cell.
for feature in (:FS_dist, :SS_dist, :FS_sum, :FS_max, :FS_max3,
                :SS_sum, :SS_max, :SS_max3)
    train_data[!, feature] = zeros(size(train_data, 1))
end

In [None]:
# Preview 5 rows of the training table (`shuffleDf` is defined elsewhere).
first(shuffleDf(train_data), 5)

In [None]:
# Summary statistics of the SURVERSE column of the training table.
describe(train_data[!, :SURVERSE])

On remplit les champs de chaque observation.

In [None]:
# For every training record, find the two stations nearest to its structure
# and copy that date's standardized precipitation features from them.
for i in 1:size(train_data, 1)
    # Look the structure up by column name rather than by column position.
    curr_ouvrage = train_data[i, :ID_OUVRAGE]
    ouvrage_data = filter(row -> row.ID_OUVRAGE == curr_ouvrage, ouvrages)
    size(ouvrage_data, 1) == 0 &&
        error("No coordinates found for structure $(curr_ouvrage)")

    # Inf is a sentinel that any real distance beats, whatever the unit
    # `findDistance` (defined outside this cell) returns.
    closest_station = nothing
    closest_distance = Inf
    second_closest_station = nothing
    second_closest_distance = Inf

    # Single pass over all stations, tracking the best and second-best
    # distance.
    for j in 1:size(station_df, 1)
        current_station = station_df[j, :STATION]
        dist = findDistance(ouvrage_data[1, :TP_LAT], ouvrage_data[1, :TP_LNG],
                            station_df[j, :LAT], station_df[j, :LNG])

        if dist < closest_distance
            # The previous best is demoted to second best.
            second_closest_distance = closest_distance
            second_closest_station = closest_station
            closest_distance = dist
            closest_station = current_station
        elseif dist < second_closest_distance
            second_closest_distance = dist
            second_closest_station = current_station
        end
    end

    train_data[i, :FS_dist] = closest_distance
    train_data[i, :SS_dist] = second_closest_distance

    # The record date and the two station columns are shared by the six
    # lookups below.
    curr_date = train_data[i, :DATE]
    fs_col = Symbol(closest_station)
    ss_col = Symbol(second_closest_station)

    # Features of the closest station on the record's date.
    train_data[i, :FS_sum] = pcp_sum[∈([curr_date]).(pcp_sum.date), fs_col][1]
    train_data[i, :FS_max] = pcp_max[∈([curr_date]).(pcp_max.date), fs_col][1]
    train_data[i, :FS_max3] = pcp_max3[∈([curr_date]).(pcp_max3.date), fs_col][1]

    # Weight of the second station: 1 - log(sqrt(d2 / d1)). It equals 1 when
    # both stations are equidistant, decreases as the second one gets
    # relatively farther, and turns negative once d2 / d1 exceeds e^2.
    multiplier = 1 - log(sqrt(second_closest_distance / closest_distance))

    # Weighted features of the second closest station on the record's date.
    train_data[i, :SS_sum] =
        pcp_sum[∈([curr_date]).(pcp_sum.date), ss_col][1] * multiplier
    train_data[i, :SS_max] =
        pcp_max[∈([curr_date]).(pcp_max.date), ss_col][1] * multiplier
    train_data[i, :SS_max3] =
        pcp_max3[∈([curr_date]).(pcp_max3.date), ss_col][1] * multiplier
end

In [None]:
# Preview 10 rows of a subset of the training columns (the *_max3 features are
# left out of this display).
cols = [:ID_OUVRAGE, :SURVERSE, :FS_dist, :SS_dist, :FS_sum, :SS_sum, :FS_max, :SS_max];
first(shuffleDf(train_data[!, cols]), 10)

### Save dataframes in files per ouvrage

In [None]:
# Training rows of structure "3260-01D", written to CSV without the id column.
ouvrage_3260 = filter(row -> row.ID_OUVRAGE == "3260-01D", train_data);
select!(ouvrage_3260, Not(:ID_OUVRAGE));
CSV.write("data/parsed/ouvrage_3260.csv",ouvrage_3260)

In [None]:
# Training rows of structure "3350-07D", written to CSV without the id column.
ouvrage_3350 = filter(row -> row.ID_OUVRAGE == "3350-07D", train_data)
select!(ouvrage_3350, Not(:ID_OUVRAGE));
CSV.write("data/parsed/ouvrage_3350.csv",ouvrage_3350)

In [None]:
# Training rows of structure "4240-01D", written to CSV without the id column.
ouvrage_4240 = filter(row -> row.ID_OUVRAGE == "4240-01D", train_data)
select!(ouvrage_4240, Not(:ID_OUVRAGE));
CSV.write("data/parsed/ouvrage_4240.csv",ouvrage_4240)

In [None]:
# Training rows of structure "4350-01D", written to CSV without the id column.
ouvrage_4350 = filter(row -> row.ID_OUVRAGE == "4350-01D", train_data)
select!(ouvrage_4350, Not(:ID_OUVRAGE));
CSV.write("data/parsed/ouvrage_4350.csv",ouvrage_4350)

In [None]:
# Training rows of structure "4380-01D", written to CSV without the id column.
ouvrage_4380 = filter(row -> row.ID_OUVRAGE == "4380-01D", train_data)
select!(ouvrage_4380, Not(:ID_OUVRAGE));
CSV.write("data/parsed/ouvrage_4380.csv",ouvrage_4380)

### Tests

In [None]:
# Load the test set and rename its structure-id column to the name used in
# `ouvrages` and `train_data`.
test_data = CSV.read("data/test.csv");
rename!(test_data, :NO_OUVRAGE => :ID_OUVRAGE);

In [None]:
# List the distinct structure ids present in the test set.
levels(test_data[:,:ID_OUVRAGE])

In [None]:
# Zero-filled placeholders for the features of the closest (FS_*) and the
# second closest (SS_*) station; they are filled in by the next cell.
for feature in (:FS_dist, :SS_dist, :FS_sum, :FS_max, :FS_max3,
                :SS_sum, :SS_max, :SS_max3)
    test_data[!, feature] = zeros(size(test_data, 1))
end

In [None]:
# For every test record, find the two stations nearest to its structure and
# copy that date's standardized precipitation features from them.
for i in 1:size(test_data, 1)
    # Look the structure up by column name: the column order of the test CSV
    # is not guaranteed to put the id first.
    curr_ouvrage = test_data[i, :ID_OUVRAGE]
    ouvrage_data = filter(row -> row.ID_OUVRAGE == curr_ouvrage, ouvrages)
    size(ouvrage_data, 1) == 0 &&
        error("No coordinates found for structure $(curr_ouvrage)")

    # Inf is a sentinel that any real distance beats, whatever the unit
    # `findDistance` (defined outside this cell) returns.
    closest_station = nothing
    closest_distance = Inf
    second_closest_station = nothing
    second_closest_distance = Inf

    # Single pass over all stations, tracking the best and second-best
    # distance.
    for j in 1:size(station_df, 1)
        current_station = station_df[j, :STATION]
        dist = findDistance(ouvrage_data[1, :TP_LAT], ouvrage_data[1, :TP_LNG],
                            station_df[j, :LAT], station_df[j, :LNG])

        if dist < closest_distance
            # The previous best is demoted to second best.
            second_closest_distance = closest_distance
            second_closest_station = closest_station
            closest_distance = dist
            closest_station = current_station
        elseif dist < second_closest_distance
            second_closest_distance = dist
            second_closest_station = current_station
        end
    end

    test_data[i, :FS_dist] = closest_distance
    test_data[i, :SS_dist] = second_closest_distance

    # The record date and the two station columns are shared by the six
    # lookups below.
    curr_date = test_data[i, :DATE]
    fs_col = Symbol(closest_station)
    ss_col = Symbol(second_closest_station)

    # Features of the closest station on the record's date.
    test_data[i, :FS_sum] = pcp_sum[∈([curr_date]).(pcp_sum.date), fs_col][1]
    test_data[i, :FS_max] = pcp_max[∈([curr_date]).(pcp_max.date), fs_col][1]
    test_data[i, :FS_max3] = pcp_max3[∈([curr_date]).(pcp_max3.date), fs_col][1]

    # Weight of the second station: 1 - log(sqrt(d2 / d1)). It equals 1 when
    # both stations are equidistant, decreases as the second one gets
    # relatively farther, and turns negative once d2 / d1 exceeds e^2.
    multiplier = 1 - log(sqrt(second_closest_distance / closest_distance))

    # Weighted features of the second closest station on the record's date.
    test_data[i, :SS_sum] =
        pcp_sum[∈([curr_date]).(pcp_sum.date), ss_col][1] * multiplier
    test_data[i, :SS_max] =
        pcp_max[∈([curr_date]).(pcp_max.date), ss_col][1] * multiplier
    test_data[i, :SS_max3] =
        pcp_max3[∈([curr_date]).(pcp_max3.date), ss_col][1] * multiplier
end

In [None]:
# Preview 10 rows of a subset of the test columns (the *_max3 features are
# left out of this display).
cols = [:ID_OUVRAGE, :FS_dist, :SS_dist, :FS_sum, :SS_sum, :FS_max, :SS_max];
first(shuffleDf(test_data[!, cols]), 10)

In [None]:
# Test rows of structure "3260-01D", written to CSV without the id column.
test_3260 = filter(row -> row.ID_OUVRAGE == "3260-01D", test_data);
select!(test_3260, Not(:ID_OUVRAGE));
CSV.write("data/parsed/test_3260.csv",test_3260)

In [None]:
# Test rows of structure "3350-07D", written to CSV without the id column.
test_3350 = filter(row -> row.ID_OUVRAGE == "3350-07D", test_data);
select!(test_3350, Not(:ID_OUVRAGE));
CSV.write("data/parsed/test_3350.csv",test_3350)

In [None]:
# Test rows of structure "4240-01D", written to CSV without the id column.
test_4240 = filter(row -> row.ID_OUVRAGE == "4240-01D", test_data);
select!(test_4240, Not(:ID_OUVRAGE));
CSV.write("data/parsed/test_4240.csv",test_4240)

In [None]:
# Test rows of structure "4350-01D", written to CSV without the id column.
test_4350 = filter(row -> row.ID_OUVRAGE == "4350-01D", test_data);
select!(test_4350, Not(:ID_OUVRAGE));
CSV.write("data/parsed/test_4350.csv",test_4350)

In [None]:
# Test rows of structure "4380-01D", written to CSV without the id column.
test_4380 = filter(row -> row.ID_OUVRAGE == "4380-01D", test_data);
select!(test_4380, Not(:ID_OUVRAGE));
CSV.write("data/parsed/test_4380.csv",test_4380)