# Data Processing

In [None]:
using Pkg;
Pkg.add("CSV");
Pkg.add("Random");
Pkg.add("DataStructures");
Pkg.add("BenchmarkTools");
Pkg.add("DataFrames");
Pkg.add("Statistics");
Pkg.add("Dates");
Pkg.add("Gadfly");
Pkg.add("MLBase");
Pkg.add("DecisionTree");
Pkg.add("GLM");

In [None]:
using CSV, DataFrames, GLM, Statistics, Dates, Gadfly, Random, MLBase;
include("utils/precipitation.jl");

## Build features

### Get and filter the features

#### Latitude, Longitude, Height

In [None]:
# Load the overflow-structure table and give its columns fixed names.
features = CSV.read("data/ouvrages-surverses.csv");
colnames = [
    "N_Env", "ID_SOMA", "ID_OUVRAGE", "NOM", "SOMA_SEC", "REGION",
    "TP_X", "TP_Y", "TP_Z", "TP_LAT", "TP_LNG",
    "EMI_X", "EMI_Y", "EMI_LNG", "EMI_LAT",
];
names!(features, map(Symbol, colnames));
# Keep only the structure id and the TP_LAT / TP_LNG / TP_Z columns.
select!(features, [:ID_OUVRAGE, :TP_LAT, :TP_LNG, :TP_Z]);

#### Replace missing Z index with mean

In [None]:
# Impute missing TP_Z with the mean of the observed TP_Z values. `skipmissing`
# looks at TP_Z only, so a row with a valid TP_Z still contributes to the mean
# even when another column of that row is missing (a `completecases` mask over
# the whole table would have dropped it).
features.TP_Z = coalesce.(features.TP_Z, mean(skipmissing(features.TP_Z)));
first(shuffleDf(features), 10)

In [None]:
# Histogram of TP_Z over all structures, using 50 bins.
plot(
    features,
    x = :TP_Z,
    Geom.histogram(bincount = 50),
    Guide.xlabel("Height of TropPlein"),
    Guide.ylabel("Frequency"),
)

In [None]:
# Scatter plot of the structures: TP_LNG on the x axis, TP_LAT on the y axis.
plot(features, Geom.point, x = :TP_LNG, y = :TP_LAT)

### Load dates and surverses

In [None]:
# Read the overflow records; cells equal to "-99999" are parsed as `missing`.
surverses = CSV.read("data/surverses.csv"; missingstring = "-99999");

# Preview five rows (shuffleDf is defined outside this cell).
first(shuffleDf(surverses), 5)

#### Filter months

In [None]:
# Keep only records dated May through October (month numbers 5 to 10).
surverses = filter(row -> 4 < month(row.DATE) < 11, surverses);

#### Filter non rain surverses

In [None]:
# A missing reason is replaced by the explicit label "Inconnue".
raison = coalesce.(surverses[:, :RAISON], "Inconnue");
surverses[!, :RAISON] = raison;

# Keep the rows whose reason is one of the three accepted codes, then reduce
# the table to structure id, date and overflow label.
surverses = filter(row -> in(row.RAISON, ("P", "Inconnue", "TS")), surverses);
select!(surverses, [:NO_OUVRAGE, :DATE, :SURVERSE]);

#### Remove missing data and rename

In [None]:
# Drop every row containing a missing value, then rename the key column so it
# matches the `ID_OUVRAGE` column of `features`.
surverses = rename!(
    dropmissing(surverses, disallowmissing = true),
    :NO_OUVRAGE => :ID_OUVRAGE,
);
first(shuffleDf(surverses), 10)

### Augment features with dates and label

In [None]:
# Inner join (the default kind, spelled out here): one row per overflow
# record, carrying the coordinates of its structure.
comb = join(features, surverses, on = :ID_OUVRAGE, kind = :inner);
first(shuffleDf(comb), 10)

In [None]:
# Map of every structure on 2018-07-25, coloured by its overflow label.
df_for_geo_plot = filter(row -> row.DATE == Date(2018, 7, 25), comb)
# Use `df[!, :col]` to replace the column, consistent with the rest of the
# notebook; the bare `df[:col]` form is deprecated column indexing.
# NOTE(review): the Bool conversion is presumably there so the label is plotted
# as a discrete colour — confirm.
df_for_geo_plot[!, :SURVERSE] = convert(Array{Bool,1}, df_for_geo_plot.SURVERSE)
plot(df_for_geo_plot, x=:TP_LNG, y=:TP_LAT, Geom.point, color=:SURVERSE, Guide.title("2018-07-25, état des surverses"))

### Load precipitation data

#### Load and filter months between May & October included

In [None]:
# Read the precipitation table; cells equal to "-99999" are parsed as `missing`.
precipitation = CSV.read("data/precipitations.csv"; missingstring = "-99999");
# Rename the hyphenated column so it can be used as a plain identifier.
rename!(precipitation, Symbol("St-Hubert") => :StHubert);

# Keep only rows dated May through October (month numbers 5 to 10).
precipitation = filter(row -> 4 < month(row.date) < 11, precipitation);

In [None]:
# Preview five rows of the filtered precipitation table
# (shuffleDf is defined outside this cell; presumably it shuffles the rows).
first(shuffleDf(precipitation),5)

#### Replace missing data by 0

In [None]:
# Replace missing readings by 0 in each of the five station columns.
for station in (:McTavish, :Bellevue, :Assomption, :Trudeau, :StHubert)
    precipitation[!, station] = coalesce.(precipitation[:, station], 0)
end

first(shuffleDf(precipitation), 5)

### Extract features from precipitation

#### Sum of precipitation for the day

In [None]:
# Per-date sum of the readings of each station.
pcp_sum = by(
    precipitation,
    :date,
    McTavish = :McTavish => sum,
    Bellevue = :Bellevue => sum,
    Assomption = :Assomption => sum,
    Trudeau = :Trudeau => sum,
    StHubert = :StHubert => sum,
);
first(shuffleDf(pcp_sum), 5)

In [None]:
# Daily sums for 2018, reshaped to long format (one row per date and station)
# so each station is drawn as its own coloured line.
df_for_plot = melt(filter(row -> year(row.date) == 2018, pcp_sum), :date)

plot(df_for_plot, x = :date, y = :value, Geom.line, color = :variable)

#### Maximum precipitation in an hour for the day

In [None]:
# Per-date maximum single reading of each station.
pcp_max = by(
    precipitation,
    :date,
    McTavish = :McTavish => maximum,
    Bellevue = :Bellevue => maximum,
    Assomption = :Assomption => maximum,
    Trudeau = :Trudeau => maximum,
    StHubert = :StHubert => maximum,
)
first(shuffleDf(pcp_max), 5)

In [None]:
# The plot is interactive: you can choose which series to display.

# Daily maxima for 2018. The filter must read `pcp_max`; filtering `pcp_sum`
# here would redraw the daily sums instead of the maxima.
df_for_plot = filter(row -> year(row.date) == 2018, pcp_max);
df_for_plot = melt(df_for_plot, :date)
plot(df_for_plot, x=:date, y=:value, Geom.line, color=:variable)

#### Maximum precipitation during three consecutive hours in a day

In [None]:
# Per-date aggregate of each station computed by `maximum3`
# (defined outside this cell).
pcp_max3h = by(
    precipitation,
    :date,
    McTavish = :McTavish => maximum3,
    Bellevue = :Bellevue => maximum3,
    Assomption = :Assomption => maximum3,
    Trudeau = :Trudeau => maximum3,
    StHubert = :StHubert => maximum3,
)
first(shuffleDf(pcp_max3h), 5)

In [None]:
# 3-hour maxima for 2018. The filter must read `pcp_max3h`; filtering `pcp_sum`
# here would redraw the daily sums instead of the 3-hour maxima.
df_for_plot = filter(row -> year(row.date) == 2018, pcp_max3h);
df_for_plot = melt(df_for_plot, :date)
plot(df_for_plot, x=:date, y=:value, Geom.line, color=:variable)

#### Visualisation of all three aggregations for one weather station

In [None]:
# Build one two-column table (date, value) per aggregate for McTavish,
# restricted to 2018, with the value column named after the aggregate.
mct_sum = filter(row -> year(row.date) == 2018, pcp_sum[:, [:date, :McTavish]]);
rename!(mct_sum, :McTavish => :Sum);

mct_max = filter(row -> year(row.date) == 2018, pcp_max[:, [:date, :McTavish]]);
rename!(mct_max, :McTavish => :Max);

mct_max3h = filter(row -> year(row.date) == 2018, pcp_max3h[:, [:date, :McTavish]]);
rename!(mct_max3h, :McTavish => :Max3h);

# Join the three tables on the date (column order: Sum, Max3h, Max), then
# reshape to long format so each aggregate is drawn as its own coloured line.
df_for_plot = join(join(mct_sum, mct_max3h, on = :date), mct_max, on = :date);
df_for_plot = melt(df_for_plot, :date)

plot(df_for_plot, x = :date, y = :value, Geom.line, color = :variable)

### Add precipitation data to features

#### Get stations lat-lng

In [None]:
# Latitude / longitude of the five weather stations, one row per station.
station_df = DataFrame(
    STATION = ["McTavish", "Bellevue", "Assomption", "Trudeau", "StHubert"],
    LAT = [45.504742, 45.427222, 45.809444, 45.467778, 45.5175],
    LNG = [-73.579167, -73.929167, -73.434722, -73.741667, -73.416944]
);

station_df

### Standardize TP and station data

In [None]:
# Z-score the coordinates of `comb`. The station coordinates are rescaled with
# the mean and standard deviation measured on `comb`, so both tables share the
# same scale.
meanlat, stdlat = mean(comb.TP_LAT), std(comb.TP_LAT);
comb[!, :TP_LAT] = (comb.TP_LAT .- meanlat) ./ stdlat;
station_df[!, :LAT] = (station_df.LAT .- meanlat) ./ stdlat;

meanlng, stdlng = mean(comb.TP_LNG), std(comb.TP_LNG);
comb[!, :TP_LNG] = (comb.TP_LNG .- meanlng) ./ stdlng;
station_df[!, :LNG] = (station_df.LNG .- meanlng) ./ stdlng;

# TP_Z has no station counterpart; only `comb` is rescaled.
meanz, stdz = mean(comb.TP_Z), std(comb.TP_Z);
comb[!, :TP_Z] = (comb.TP_Z .- meanz) ./ stdz;

In [None]:
# Display the station table again, now with standardized LAT / LNG.
station_df

### Augment Features

#### Add PCP_SUM, PCP_MAX, PCP_MAX3 and METEO columns

In [None]:
# Zero-filled placeholders for the three precipitation features.
for col in (:PCP_SUM, :PCP_MAX, :PCP_MAX3)
    comb[!, col] = zeros(nrow(comb))
end
# Empty-string placeholder for the name of the station assigned to each row.
station_ref_df = DataFrame(METEO = fill("", nrow(comb)))
comb = hcat(comb, station_ref_df)

# Put the columns in their final order, with the SURVERSE label last.
permutecols!(
    comb,
    [:ID_OUVRAGE, :TP_LAT, :TP_LNG, :TP_Z, :DATE, :METEO, :PCP_SUM, :PCP_MAX, :PCP_MAX3, :SURVERSE],
);
first(comb, 5)

In [None]:
# Preview ten rows of `comb` (shuffleDf is defined outside this cell).
first(shuffleDf(comb), 10)

#### Find the closest station to each ouvrage and add its pcp_sum, pcp_max and pcp_max3h values

In [None]:
# For every row of `comb`: pick the station with the smallest `findDistance`
# to the structure, then copy that station's three daily aggregates for the
# row's date into PCP_SUM / PCP_MAX / PCP_MAX3 and record the station in METEO.
for i in 1:size(comb, 1)
    closest_station = "McTavish"  # placeholder; replaced by the first finite distance below
    shortest_dist = Inf           # any finite distance is smaller, so no -1 sentinel is needed

    # Find the closest station.
    for j in 1:size(station_df, 1)
        dist = findDistance(
            comb[i, :TP_LAT], comb[i, :TP_LNG],
            station_df[j, :LAT], station_df[j, :LNG],
        )
        if dist < shortest_dist
            shortest_dist = dist
            closest_station = station_df[j, :STATION]
        end
    end

    station_col = Symbol(closest_station)
    row_date = comb[i, :DATE]

    # The values are copied as-is (no distance weighting is applied).
    # `findfirst` stops at the first row with the matching date instead of
    # building a full boolean mask per lookup. It returns `nothing` when the
    # date is absent, so the indexing fails rather than silently keeping 0.
    comb[i, :PCP_SUM] = pcp_sum[findfirst(isequal(row_date), pcp_sum.date), station_col]
    comb[i, :PCP_MAX] = pcp_max[findfirst(isequal(row_date), pcp_max.date), station_col]
    comb[i, :PCP_MAX3] = pcp_max3h[findfirst(isequal(row_date), pcp_max3h.date), station_col]

    comb[i, :METEO] = closest_station
end

#### Cap outliers in PCP_SUM, PCP_MAX and PCP_MAX3 that would otherwise compress the scale

In [None]:
# Cap each precipitation feature at a fixed ceiling. `min.` leaves values at or
# below the ceiling untouched and replaces larger ones by the ceiling, without
# the deprecated `comb[:COL]` indexing or scalar assignment to a row selection.
comb[!, :PCP_SUM] = min.(comb.PCP_SUM, 750);
comb[!, :PCP_MAX] = min.(comb.PCP_MAX, 500);
comb[!, :PCP_MAX3] = min.(comb.PCP_MAX3, 750);

In [None]:
# Preview ten rows of `comb` whose SURVERSE label equals 1
# (shuffleDf is defined outside this cell).
first(shuffleDf(filter(row -> row.SURVERSE == 1, comb)), 10)

In [None]:
# Map of every structure on 2018-07-25, coloured by its assigned weather station.
df_for_geo_plot = filter(row -> row.DATE == Date(2018, 7, 25), comb)
# Use `df[!, :col]` to replace the column, consistent with the rest of the
# notebook; the bare `df[:col]` form is deprecated column indexing.
df_for_geo_plot[!, :SURVERSE] = convert(Array{Bool,1}, df_for_geo_plot.SURVERSE)
plot(df_for_geo_plot, x=:TP_LNG, y=:TP_LAT, Geom.point, color=:METEO, Guide.title("2018-07-25, Regroupement par station météo"))

In [None]:

# Structure ids that can be used below:
# 3260-01D in Rivière-des-Prairies
# 3350-07D in Ahuntsic
# 4240-01D in Pointe-aux-Trembles
# 4350-01D in Old Montreal
# 4380-01D in Verdun

# PCP_MAX3 over 2018 for one structure, coloured by its overflow label.
id_ouvrage_to_show = "3350-07D"
df_temp = filter(row -> row.ID_OUVRAGE ∈ [id_ouvrage_to_show], comb)
df_temp = filter(row -> year(row.DATE) == 2018, df_temp);
df_temp = df_temp[!,[:ID_OUVRAGE, :DATE, :PCP_MAX3, :SURVERSE]]
# Use `df[!, :col]` to replace the column, consistent with the rest of the
# notebook; the bare `df[:col]` form is deprecated column indexing.
df_temp[!, :SURVERSE] = convert(Array{Bool,1}, df_temp.SURVERSE)

plot(df_temp, x=:DATE, y=:PCP_MAX3, Geom.point, color=:SURVERSE,Guide.title(id_ouvrage_to_show))

In [None]:
# Structure ids that can be used below:
# 3260-01D in Rivière-des-Prairies
# 3350-07D in Ahuntsic
# 4240-01D in Pointe-aux-Trembles
# 4350-01D in Old Montreal
# 4380-01D in Verdun

# Histogram of TP_Z over the 2018 rows of one structure, coloured by its
# overflow label.
id_ouvrage_to_show = "4380-01D"
df_temp = filter(row -> row.ID_OUVRAGE ∈ [id_ouvrage_to_show], comb)
df_temp = filter(row -> year(row.DATE) == 2018, df_temp);
df_temp = df_temp[!,[:ID_OUVRAGE, :DATE, :TP_Z, :SURVERSE]]
# Use `df[!, :col]` to replace the column, consistent with the rest of the
# notebook; the bare `df[:col]` form is deprecated column indexing.
df_temp[!, :SURVERSE] = convert(Array{Bool,1}, df_temp.SURVERSE)

plot(df_temp, x=:TP_Z, Geom.histogram(bincount=10), color=:SURVERSE,Guide.title("Surverse en fonction de TP_Z"))

### Split dates into months and days

In [None]:
# Derive the month number and the day of the month from DATE as two new columns.
comb[!, :MONTH] = month.(comb[!, :DATE]);
comb[!, :DAY] = day.(comb[!, :DATE]);
first(shuffleDf(comb[!, [:DATE, :MONTH, :DAY]]), 5)

## Standardize the PCP and Date

In [None]:
# Z-score each precipitation feature using its own mean and standard deviation.
mean_pcpsum, std_pcpsum = mean(comb.PCP_SUM), std(comb.PCP_SUM);
comb[!, :PCP_SUM] = (comb.PCP_SUM .- mean_pcpsum) ./ std_pcpsum;

mean_pcpmax, std_pcpmax = mean(comb.PCP_MAX), std(comb.PCP_MAX);
comb[!, :PCP_MAX] = (comb.PCP_MAX .- mean_pcpmax) ./ std_pcpmax;

mean_pcpmax3, std_pcpmax3 = mean(comb.PCP_MAX3), std(comb.PCP_MAX3);
comb[!, :PCP_MAX3] = (comb.PCP_MAX3 .- mean_pcpmax3) ./ std_pcpmax3;

# Same treatment for the two calendar features.
meanmonth, stdmonth = mean(comb.MONTH), std(comb.MONTH);
comb[!, :MONTH] = (comb.MONTH .- meanmonth) ./ stdmonth;

meanday, stdday = mean(comb.DAY), std(comb.DAY);
comb[!, :DAY] = (comb.DAY .- meanday) ./ stdday;

In [None]:
# Preview ten standardized rows of `comb` whose SURVERSE label equals 1
# (shuffleDf is defined outside this cell).
first(shuffleDf(filter(row -> row.SURVERSE == 1, comb)), 10)