In [1]:
using CSV, DataFrames, Statistics, Dates, Gadfly, Random;
include("utils/precipitation.jl");

┌ Info: Loading DataFrames support into Gadfly.jl
└ @ Gadfly /home/chaime/.julia/packages/Gadfly/09PWZ/src/mapping.jl:228


On garde les ouvrages d'intérêt.

In [2]:
# Load the overflow structures and give the columns explicit names.
ouvrages = CSV.read("data/ouvrages-surverses.csv");
colnames = ["N_Env", "ID_SOMA", "ID_OUVRAGE", "NOM", "SOMA_SEC", "REGION", "TP_X", "TP_Y", "TP_Z", "TP_LAT", "TP_LNG", "EMI_X", "EMI_Y", "EMI_LNG", "EMI_LAT"];
# `rename!` with a full vector of names replaces the deprecated `names!`.
rename!(ouvrages, Symbol.(colnames));
# Keep only the identifier and the TP_* coordinates.
select!(ouvrages, [:ID_OUVRAGE, :TP_LAT, :TP_LNG, :TP_Z]);
# Replace a missing TP_Z by the mean TP_Z of the rows that have no missing value.
ouvrages.TP_Z = coalesce.(ouvrages.TP_Z, mean(ouvrages[completecases(ouvrages), :].TP_Z));

│   caller = top-level scope at In[2]:3
└ @ Core In[2]:3


In [3]:
# Dimensions (rows, columns) of the structures table.
size(ouvrages)

(167, 4)

In [4]:
# IDs of the five structures studied in the rest of the notebook.
important_ouvrages = ["3260-01D", "3350-07D", "4240-01D", "4350-01D", "4380-01D"];
# Keep only the rows whose ID is in that list (boolean-mask row selection).
ouvrages = ouvrages[in.(ouvrages.ID_OUVRAGE, Ref(important_ouvrages)), :];

In [5]:
# Preview the first five rows of the retained structures.
first(ouvrages, 5)

Unnamed: 0_level_0,ID_OUVRAGE,TP_LAT,TP_LNG,TP_Z
Unnamed: 0_level_1,String,Float64,Float64,Float64
1,3260-01D,45.6507,-73.5803,20.17
2,3350-07D,45.5461,-73.6921,20.75
3,4240-01D,45.6497,-73.4877,11.91
4,4350-01D,45.4991,-73.555,19.3526
5,4380-01D,45.4677,-73.5637,19.3526


In [41]:
# Load the overflow records; the sentinel "-99999" is read as `missing`.
surverses = CSV.read("data/surverses.csv", missingstring="-99999");
# Keep May through October only.
surverses = filter(row -> month(row.DATE) in 5:10, surverses);
# A missing reason is recorded as "Inconnue".
surverses.RAISON = coalesce.(surverses.RAISON, "Inconnue");

# Keep the rows whose reason is "P", "Inconnue" or "TS", then reduce the table to
# the three columns used afterwards and rename the structure ID column.
surverses = filter(row -> in(row.RAISON, ["P", "Inconnue", "TS"]), surverses);
rename!(select!(surverses, [:NO_OUVRAGE, :DATE, :SURVERSE]), :NO_OUVRAGE => :ID_OUVRAGE);

In [42]:
# Restrict to the structures of interest, then drop every row containing a missing value.
surverses = dropmissing!(filter(r -> in(r.ID_OUVRAGE, important_ouvrages), surverses));

In [43]:
# Summary statistics of the SURVERSE column over all retained structures.
describe(surverses[!, :SURVERSE])

Summary Stats:
Length:         5129
Missing Count:  0
Mean:           0.085202
Minimum:        0.000000
1st Quartile:   0.000000
Median:         0.000000
3rd Quartile:   0.000000
Maximum:        1.000000
Type:           Int64


In [44]:
# Same summary restricted to the third structure of `important_ouvrages`.
curr = surverses[surverses.ID_OUVRAGE .== important_ouvrages[3], :];
describe(curr.SURVERSE)

Summary Stats:
Length:         1100
Missing Count:  0
Mean:           0.062727
Minimum:        0.000000
1st Quartile:   0.000000
Median:         0.000000
3rd Quartile:   0.000000
Maximum:        1.000000
Type:           Int64


La moyenne correspond ici au taux de surverses : il y a beaucoup plus d'observations sans surverse que d'observations avec surverse.

### Beaucoup plus de 0 que de 1 -> problème de déséquilibre des classes (class imbalance)
On le corrigera plus tard par sur-échantillonnage (oversampling) des 1 et sous-échantillonnage (undersampling) des 0.

## Précipitations

In [10]:
# Load the precipitation readings; the sentinel "-99999" is read as `missing`.
precipitations = CSV.read("data/precipitations.csv", missingstring="-99999");
# "St-Hubert" is renamed to StHubert so the column has a plain identifier name.
rename!(precipitations, Symbol("St-Hubert") => :StHubert);

# Keep May through October only.
precipitations = filter(row -> 5 <= month(row.date) <= 10, precipitations);
names(precipitations)

7-element Array{Symbol,1}:
 :date      
 :heure     
 :McTavish  
 :Bellevue  
 :Assomption
 :Trudeau   
 :StHubert  

In [11]:
# Summary statistics of the StHubert column, including its missing count.
describe(precipitations[!, :StHubert])

Summary Stats:
Length:         30912
Missing Count:  5206
Mean:           1.223683
Minimum:        0.000000
1st Quartile:   0.000000
Median:         0.000000
3rd Quartile:   0.000000
Maximum:        307.000000
Type:           Union{Missing, Int64}


In [12]:
# Per-day value of each station, as computed by `mean_wo_missing` (defined outside
# this cell; presumably the mean over the non-missing readings of the day — TODO confirm).
precipitation_by_day = by(precipitations, :date,
                            McTavish = :McTavish=>mean_wo_missing,
                            Bellevue = :Bellevue=>mean_wo_missing,
                            Assomption = :Assomption=>mean_wo_missing,
                            Trudeau = :Trudeau=>mean_wo_missing,
                            StHubert = :StHubert=>mean_wo_missing)

# Replace every missing reading by the per-day value of the same station.
# `by` yields one row per date, so a date => row Dict gives the same row that a
# per-reading `filter` over `precipitation_by_day` would find, without rescanning
# the whole table for each reading.
let day_row = Dict(d => r for (r, d) in enumerate(precipitation_by_day.date))
    for station in (:McTavish, :Bellevue, :Assomption, :Trudeau, :StHubert)
        readings = precipitations[!, station]      # the column itself, updated in place
        daily = precipitation_by_day[!, station]
        for i in eachindex(readings)
            if ismissing(readings[i])
                readings[i] = daily[day_row[precipitations[i, :date]]]
            end
        end
    end
end

In [13]:
# Summary statistics of the StHubert column (mean, quartiles, missing count).
describe(precipitations[!, :StHubert])

Summary Stats:
Length:         30912
Missing Count:  0
Mean:           1.018957
Minimum:        0.000000
1st Quartile:   0.000000
Median:         0.000000
3rd Quartile:   0.000000
Maximum:        307.000000
Type:           Union{Missing, Int64}


In [17]:
# Show 10 rows of the result of `shuffleDf` (helper defined outside this view;
# presumably returns the rows in random order — TODO confirm).
first(shuffleDf(precipitations), 10)

Unnamed: 0_level_0,date,heure,McTavish,Bellevue,Assomption,Trudeau,StHubert
Unnamed: 0_level_1,Date,Int64,Int64⍰,Int64⍰,Int64⍰,Int64⍰,Int64⍰
1,2017-06-23,23,0,0,0,0,0
2,2016-05-09,15,0,0,0,0,0
3,2018-10-07,22,0,0,0,0,0
4,2013-10-26,20,14,6,0,6,0
5,2014-09-10,20,0,0,0,0,0
6,2016-07-22,5,0,0,0,0,0
7,2016-10-22,20,26,11,20,13,30
8,2015-08-15,8,0,0,0,3,0
9,2015-05-31,18,0,0,0,0,0
10,2014-07-03,17,0,0,0,0,0


In [18]:
# Per-date sum of the readings: one output column per station, named after the station.
pcp_sum = by(precipitations, :date;
             (station => (station => sum)
              for station in (:McTavish, :Bellevue, :Assomption, :Trudeau, :StHubert))...);

In [51]:
# Show 5 rows of the per-date sums after `shuffleDf` (helper defined outside this view).
first(shuffleDf(pcp_sum), 5)

Unnamed: 0_level_0,date,McTavish,Bellevue,Assomption,Trudeau,StHubert
Unnamed: 0_level_1,Date,Int64,Int64,Int64,Int64,Int64
1,2013-08-03,0,189,270,5,0
2,2016-05-24,0,0,0,0,0
3,2018-07-12,0,0,10,0,0
4,2016-06-05,562,234,360,283,424
5,2019-09-04,227,0,0,224,131


In [None]:
# Scatter plot of the per-date sums, one colour per station.
set_default_plot_size(25cm, 13cm)
# Long format: `melt` keeps :date and produces the :variable / :value columns plotted below.
df_for_plot = melt(pcp_sum, :date)

plot(df_for_plot, Geom.point, x=:date, y=:value, color=:variable)

On plafonne la grosse valeur aberrante (outlier).

In [19]:
# Cap the McTavish per-date sums at 750 to tame the extreme values.
# Written as an element-wise `min` on the whole column: this avoids the deprecated
# `df[:col]` lookup and scalar `df[rows, col] = v` assignment (both emitted warnings).
# The former leading `filter(...)` call was dropped: its result was never used or shown.
pcp_sum[!, :McTavish] = min.(pcp_sum[!, :McTavish], 750);

│   caller = top-level scope at In[19]:2
└ @ Core In[19]:2
│   caller = top-level scope at In[19]:2
└ @ Core In[19]:2


In [20]:
# Per-date maximum reading: one output column per station, named after the station.
pcp_max = by(precipitations, :date;
             (station => (station => maximum)
              for station in (:McTavish, :Bellevue, :Assomption, :Trudeau, :StHubert))...);

In [None]:
# Scatter plot of the per-date maxima, one colour per station
# (`melt` produces the :variable / :value columns plotted here).
df_for_plot = melt(pcp_max, :date)
plot(df_for_plot, Geom.point, x=:date, y=:value, color=:variable)

In [21]:
# Cap the McTavish per-date maxima at 300 to tame the extreme values.
# Written as an element-wise `min` on the whole column: this avoids the deprecated
# `df[:col]` lookup and scalar `df[rows, col] = v` assignment (both emitted warnings).
# The former leading `filter(...)` call was dropped: its result was never used or shown.
pcp_max[!, :McTavish] = min.(pcp_max[!, :McTavish], 300);

│   caller = top-level scope at In[21]:2
└ @ Core In[21]:2
│   caller = top-level scope at In[21]:2
└ @ Core In[21]:2


In [55]:
# Show 5 rows of the per-date maxima after `shuffleDf` (helper defined outside this view).
first(shuffleDf(pcp_max), 5)

Unnamed: 0_level_0,date,McTavish,Bellevue,Assomption,Trudeau,StHubert
Unnamed: 0_level_1,Date,Int64,Int64,Int64,Int64,Int64
1,2016-09-06,0,0,0,0,0
2,2017-10-05,0,0,0,0,2
3,2019-05-02,12,7,7,8,10
4,2013-08-08,0,0,175,0,0
5,2015-10-13,26,5,20,13,10


In [22]:
# Per-date `maximum3` of each station (`maximum3` is defined outside this view),
# one output column per station, named after the station.
pcp_max3 = by(precipitations, :date;
              (station => (station => maximum3)
               for station in (:McTavish, :Bellevue, :Assomption, :Trudeau, :StHubert))...);

In [None]:
# Scatter plot of the per-date `maximum3` values, one colour per station
# (`melt` produces the :variable / :value columns plotted here).
df_for_plot = melt(pcp_max3, :date)
plot(df_for_plot, Geom.point, x=:date, y=:value, color=:variable)

In [23]:
# Cap the McTavish per-date `maximum3` values at 550 to tame the extreme values.
# Written as an element-wise `min` on the whole column: this avoids the deprecated
# `df[:col]` lookup and scalar `df[rows, col] = v` assignment (both emitted warnings).
# The former leading `filter(...)` call was dropped: its result was never used or shown.
pcp_max3[!, :McTavish] = min.(pcp_max3[!, :McTavish], 550);

│   caller = top-level scope at In[23]:2
└ @ Core In[23]:2
│   caller = top-level scope at In[23]:2
└ @ Core In[23]:2


In [58]:
# Show 5 rows of the per-date `maximum3` values after `shuffleDf` (helper defined outside this view).
first(shuffleDf(pcp_max3), 5)

Unnamed: 0_level_0,date,McTavish,Bellevue,Assomption,Trudeau,StHubert
Unnamed: 0_level_1,Date,Int64,Int64,Int64,Int64,Int64
1,2015-08-22,0,0,0,0,0
2,2016-08-26,0,0,0,0,0
3,2014-07-25,0,0,0,0,0
4,2013-06-14,0,0,0,0,0
5,2014-08-23,0,0,20,0,0


## Standardiser les données de précipitations

In [72]:
"""
    standardize_col(col)

Return a new array holding `col` centred on its mean and divided by its sample
standard deviation (`Statistics.std`). `col` itself is not modified.

A column whose standard deviation is exactly zero (all values equal) is only
centred, so the result is all zeros rather than `NaN`.
"""
function standardize_col(col)
    mean_col = mean(col)
    std_col = std(col)

    # Dividing by a zero standard deviation would turn every entry into NaN.
    scale = iszero(std_col) ? one(std_col) : std_col

    return (col .- mean_col) ./ scale
end

standardize_col (generic function with 1 method)

In [73]:
# Standardise every station column of the three per-date tables, replacing each
# column by its standardised version.
for table in (pcp_sum, pcp_max, pcp_max3)
    for station in (:McTavish, :Bellevue, :Assomption, :Trudeau, :StHubert)
        table[!, station] = standardize_col(table[!, station])
    end
end

In [75]:
# Show 10 rows of the standardised per-date sums after `shuffleDf` (helper defined outside this view).
first(shuffleDf(pcp_sum), 10)

Unnamed: 0_level_0,date,McTavish,Bellevue,Assomption,Trudeau,StHubert
Unnamed: 0_level_1,Date,Float64,Float64,Float64,Float64,Float64
1,2019-07-23,-0.391555,-0.377483,-0.396733,-0.402043,-0.358123
2,2015-05-06,-0.391555,-0.377483,-0.272301,-0.402043,-0.358123
3,2018-08-15,-0.242534,-0.377483,-0.272301,-0.318575,-0.358123
4,2013-10-09,-0.391555,-0.377483,-0.272301,-0.402043,-0.358123
5,2018-06-06,-0.391555,-0.315351,-0.396733,-0.402043,-0.358123
6,2013-09-25,-0.391555,-0.377483,-0.396733,-0.402043,-0.358123
7,2015-06-13,-0.366718,-0.346417,-0.396733,-0.402043,-0.328835
8,2016-10-12,-0.391555,-0.377483,-0.396733,-0.402043,-0.358123
9,2017-08-07,-0.391555,-0.377483,-0.396733,-0.402043,-0.358123
10,2014-08-22,-0.391555,-0.377483,-0.259858,-0.402043,-0.358123


## Stations

In [76]:
# Weather stations and their coordinates; STATION matches the station column
# names of the precipitation tables.
station_df = DataFrame(
    STATION = ["McTavish", "Bellevue", "Assomption", "Trudeau", "StHubert"],
    LAT = [45.504742, 45.427222, 45.809444, 45.467778, 45.5175],
    LNG = [-73.579167, -73.929167, -73.434722, -73.741667, -73.416944],
);

station_df

Unnamed: 0_level_0,STATION,LAT,LNG
Unnamed: 0_level_1,String,Float64,Float64
1,McTavish,45.5047,-73.5792
2,Bellevue,45.4272,-73.9292
3,Assomption,45.8094,-73.4347
4,Trudeau,45.4678,-73.7417
5,StHubert,45.5175,-73.4169


### On ajoute les colonnes de précipitations

In [77]:
# Work on a copy: a plain `train_data = surverses` only aliases the same DataFrame,
# so the feature columns added below would also appear in `surverses`.
train_data = copy(surverses);

# Zero-initialised Float64 feature columns, added in this order.
for col in (:FS_dist, :SS_dist, :FS_sum, :FS_max, :FS_max3, :SS_sum, :SS_max, :SS_max3)
    train_data[!, col] = zeros(size(train_data, 1))
end

In [78]:
# Show 5 rows of the training table after `shuffleDf` (helper defined outside this view).
first(shuffleDf(train_data), 5)

Unnamed: 0_level_0,ID_OUVRAGE,DATE,SURVERSE,FS_dist,SS_dist,FS_sum,FS_max,FS_max3,SS_sum
Unnamed: 0_level_1,String,Date,Int64,Float64,Float64,Float64,Float64,Float64,Float64
1,3260-01D,2013-08-19,0,0.0,0.0,0.0,0.0,0.0,0.0
2,3260-01D,2014-08-01,0,0.0,0.0,0.0,0.0,0.0,0.0
3,4350-01D,2015-09-18,0,0.0,0.0,0.0,0.0,0.0,0.0
4,4380-01D,2017-08-20,0,0.0,0.0,0.0,0.0,0.0,0.0
5,4380-01D,2014-06-23,0,0.0,0.0,0.0,0.0,0.0,0.0


In [79]:
# Summary statistics of the SURVERSE column of the training table.
describe(train_data[!, :SURVERSE])

Summary Stats:
Length:         5129
Missing Count:  0
Mean:           0.085202
Minimum:        0.000000
1st Quartile:   0.000000
Median:         0.000000
3rd Quartile:   0.000000
Maximum:        1.000000
Type:           Int64


On remplit les champs de chaque observation.

In [84]:
# The helpers below are defined in this cell so that it can be run on its own.

"""
    first_row_index(dates)

Return a `Dict` mapping each distinct value of `dates` to the index of its first
occurrence.
"""
function first_row_index(dates)
    index = Dict{eltype(dates),Int}()
    for (i, d) in enumerate(dates)
        get!(index, d, i)   # keep the first occurrence only
    end
    return index
end

"""
    nearest_two_stations(id, ouvrages, station_df)

Return `(first_name, first_dist, second_name, second_dist)` for the two stations of
`station_df` closest to the structure `id`, using `findDistance` (defined outside
this cell) on the structure's `TP_LAT`/`TP_LNG` and the stations' `LAT`/`LNG`.
Ties are resolved in favour of the station listed first in `station_df`.
Throws an error when `id` is not found in `ouvrages`.
"""
function nearest_two_stations(id, ouvrages, station_df)
    k = findfirst(isequal(id), ouvrages[!, :ID_OUVRAGE])
    k === nothing && error("structure $id not found in `ouvrages`")
    lat, lng = ouvrages[k, :TP_LAT], ouvrages[k, :TP_LNG]

    dists = [findDistance(lat, lng, station_df[j, :LAT], station_df[j, :LNG])
             for j in 1:size(station_df, 1)]
    order = sortperm(dists)   # stable sort: equal distances keep table order
    a, b = order[1], order[2]
    return (station_df[a, :STATION], dists[a], station_df[b, :STATION], dists[b])
end

"""
    add_station_features!(data, ouvrages, station_df, pcp_sum, pcp_max, pcp_max3)

Fill in place, and return, the already existing columns `FS_dist`, `SS_dist`,
`FS_sum`, `FS_max`, `FS_max3`, `SS_sum`, `SS_max` and `SS_max3` of `data`.

For each row, `FS_*` receives the values of the station nearest to the row's
`ID_OUVRAGE` on the row's `DATE`; `SS_*` receives the values of the second nearest
station multiplied by `1 - log(sqrt(SS_dist / FS_dist))`.

Throws an `ArgumentError` when `station_df` has fewer than two rows, and an error
when a structure ID or a date cannot be found.
"""
function add_station_features!(data, ouvrages, station_df, pcp_sum, pcp_max, pcp_max3)
    size(station_df, 1) >= 2 ||
        throw(ArgumentError("at least two stations are required"))

    # The nearest stations depend only on the structure: resolve them once per
    # distinct ID rather than once per row.
    nearest = Dict(id => nearest_two_stations(id, ouvrages, station_df)
                   for id in unique(data[!, :ID_OUVRAGE]))

    # One date => row lookup per table replaces a full scan of the table per row.
    tables = [(suffix, table, first_row_index(table.date))
              for (suffix, table) in (("sum", pcp_sum), ("max", pcp_max), ("max3", pcp_max3))]

    for i in 1:size(data, 1)
        fs_name, fs_dist, ss_name, ss_dist = nearest[data[i, :ID_OUVRAGE]]
        data[i, :FS_dist] = fs_dist
        data[i, :SS_dist] = ss_dist

        # Weight applied to the second station: it shrinks as the second station
        # gets farther away relative to the first.
        multiplier = 1 - log(sqrt(ss_dist / fs_dist))

        date = data[i, :DATE]
        for (suffix, table, row_of) in tables
            haskey(row_of, date) || error("no precipitation row for date $date")
            r = row_of[date]
            data[i, Symbol("FS_", suffix)] = table[r, Symbol(fs_name)]
            data[i, Symbol("SS_", suffix)] = table[r, Symbol(ss_name)] * multiplier
        end
    end
    return data
end

add_station_features!(train_data, ouvrages, station_df, pcp_sum, pcp_max, pcp_max3);

In [95]:
# Columns to preview: identifier, label, and the distance / sum / max features.
cols = [:ID_OUVRAGE, :SURVERSE, :FS_dist, :SS_dist, :FS_sum, :SS_sum, :FS_max, :SS_max];
# Show 10 rows of that column subset after `shuffleDf` (helper defined outside this view).
first(shuffleDf(train_data[!, cols]), 10)

Unnamed: 0_level_0,ID_OUVRAGE,SURVERSE,FS_dist,SS_dist,FS_sum,SS_sum,FS_max,SS_max
Unnamed: 0_level_1,String,Int64,Float64,Float64,Float64,Float64,Float64,Float64
1,3260-01D,0,0.145981,0.210769,-0.391555,-0.292356,-0.41013,-0.306429
2,3260-01D,0,0.145981,0.210769,-0.292208,-0.208672,-0.348339,-0.161624
3,3260-01D,0,0.145981,0.210769,-0.391555,-0.292356,-0.41013,-0.306429
4,4350-01D,0,0.0248354,0.139269,-0.391555,-0.0493965,-0.41013,-0.0517744
5,4380-01D,0,0.0401006,0.154948,1.14832,-0.116085,0.887471,-0.121674
6,3350-07D,0,0.0927373,0.12027,-0.332486,-0.319051,-0.31887,-0.303061
7,3350-07D,0,0.0927373,0.12027,-0.0542623,-0.0813586,-0.0259745,-0.088026
8,4380-01D,1,0.0401006,0.154948,3.25944,1.19881,1.84522,0.660295
9,3260-01D,0,0.145981,0.210769,-0.391555,-0.292356,-0.41013,-0.306429
10,4350-01D,0,0.0248354,0.139269,0.800609,0.162693,0.238671,0.0705573


### Save dataframes in files per ouvrage

In [90]:
# Training rows of structure 3260-01D, without the ID column, written to CSV.
ouvrage_3260 = select(train_data[train_data.ID_OUVRAGE .== "3260-01D", :], Not(:ID_OUVRAGE));
CSV.write("data/parsed/ouvrage_3260.csv", ouvrage_3260)

"data/parsed/ouvrage_3260.csv"

In [96]:
# Training rows of structure 3350-07D, without the ID column, written to CSV.
ouvrage_3350 = select(train_data[train_data.ID_OUVRAGE .== "3350-07D", :], Not(:ID_OUVRAGE));
CSV.write("data/parsed/ouvrage_3350.csv", ouvrage_3350)

"data/parsed/ouvrage_3350.csv"

In [97]:
# Training rows of structure 4240-01D, without the ID column, written to CSV.
ouvrage_4240 = select(train_data[train_data.ID_OUVRAGE .== "4240-01D", :], Not(:ID_OUVRAGE));
CSV.write("data/parsed/ouvrage_4240.csv", ouvrage_4240)

"data/parsed/ouvrage_4240.csv"

In [98]:
# Training rows of structure 4350-01D, without the ID column, written to CSV.
ouvrage_4350 = select(train_data[train_data.ID_OUVRAGE .== "4350-01D", :], Not(:ID_OUVRAGE));
CSV.write("data/parsed/ouvrage_4350.csv", ouvrage_4350)

"data/parsed/ouvrage_4350.csv"

In [99]:
# Training rows of structure 4380-01D, without the ID column, written to CSV.
ouvrage_4380 = select(train_data[train_data.ID_OUVRAGE .== "4380-01D", :], Not(:ID_OUVRAGE));
CSV.write("data/parsed/ouvrage_4380.csv", ouvrage_4380)

"data/parsed/ouvrage_4380.csv"

### Tests

In [107]:
# Load the test set and rename its structure column to match the training table.
test_data = rename!(CSV.read("data/test.csv"), :NO_OUVRAGE => :ID_OUVRAGE);

In [108]:
# List the distinct structure IDs present in the test set.
levels(test_data[:,:ID_OUVRAGE])

5-element Array{String,1}:
 "3260-01D"
 "3350-07D"
 "4240-01D"
 "4350-01D"
 "4380-01D"

In [109]:
# Zero-initialised Float64 feature columns, added in this order.
for col in (:FS_dist, :SS_dist, :FS_sum, :FS_max, :FS_max3, :SS_sum, :SS_max, :SS_max3)
    test_data[!, col] = zeros(size(test_data, 1))
end

In [110]:
# The helpers below are defined in this cell so that it can be run on its own.

"""
    first_row_index(dates)

Return a `Dict` mapping each distinct value of `dates` to the index of its first
occurrence.
"""
function first_row_index(dates)
    index = Dict{eltype(dates),Int}()
    for (i, d) in enumerate(dates)
        get!(index, d, i)   # keep the first occurrence only
    end
    return index
end

"""
    nearest_two_stations(id, ouvrages, station_df)

Return `(first_name, first_dist, second_name, second_dist)` for the two stations of
`station_df` closest to the structure `id`, using `findDistance` (defined outside
this cell) on the structure's `TP_LAT`/`TP_LNG` and the stations' `LAT`/`LNG`.
Ties are resolved in favour of the station listed first in `station_df`.
Throws an error when `id` is not found in `ouvrages`.
"""
function nearest_two_stations(id, ouvrages, station_df)
    k = findfirst(isequal(id), ouvrages[!, :ID_OUVRAGE])
    k === nothing && error("structure $id not found in `ouvrages`")
    lat, lng = ouvrages[k, :TP_LAT], ouvrages[k, :TP_LNG]

    dists = [findDistance(lat, lng, station_df[j, :LAT], station_df[j, :LNG])
             for j in 1:size(station_df, 1)]
    order = sortperm(dists)   # stable sort: equal distances keep table order
    a, b = order[1], order[2]
    return (station_df[a, :STATION], dists[a], station_df[b, :STATION], dists[b])
end

"""
    add_station_features!(data, ouvrages, station_df, pcp_sum, pcp_max, pcp_max3)

Fill in place, and return, the already existing columns `FS_dist`, `SS_dist`,
`FS_sum`, `FS_max`, `FS_max3`, `SS_sum`, `SS_max` and `SS_max3` of `data`.

For each row, `FS_*` receives the values of the station nearest to the row's
`ID_OUVRAGE` on the row's `DATE`; `SS_*` receives the values of the second nearest
station multiplied by `1 - log(sqrt(SS_dist / FS_dist))`.

Throws an `ArgumentError` when `station_df` has fewer than two rows, and an error
when a structure ID or a date cannot be found.
"""
function add_station_features!(data, ouvrages, station_df, pcp_sum, pcp_max, pcp_max3)
    size(station_df, 1) >= 2 ||
        throw(ArgumentError("at least two stations are required"))

    # The nearest stations depend only on the structure: resolve them once per
    # distinct ID rather than once per row.
    nearest = Dict(id => nearest_two_stations(id, ouvrages, station_df)
                   for id in unique(data[!, :ID_OUVRAGE]))

    # One date => row lookup per table replaces a full scan of the table per row.
    tables = [(suffix, table, first_row_index(table.date))
              for (suffix, table) in (("sum", pcp_sum), ("max", pcp_max), ("max3", pcp_max3))]

    for i in 1:size(data, 1)
        fs_name, fs_dist, ss_name, ss_dist = nearest[data[i, :ID_OUVRAGE]]
        data[i, :FS_dist] = fs_dist
        data[i, :SS_dist] = ss_dist

        # Weight applied to the second station: it shrinks as the second station
        # gets farther away relative to the first.
        multiplier = 1 - log(sqrt(ss_dist / fs_dist))

        date = data[i, :DATE]
        for (suffix, table, row_of) in tables
            haskey(row_of, date) || error("no precipitation row for date $date")
            r = row_of[date]
            data[i, Symbol("FS_", suffix)] = table[r, Symbol(fs_name)]
            data[i, Symbol("SS_", suffix)] = table[r, Symbol(ss_name)] * multiplier
        end
    end
    return data
end

add_station_features!(test_data, ouvrages, station_df, pcp_sum, pcp_max, pcp_max3);

In [113]:
# Columns to preview: identifier and the distance / sum / max features.
cols = [:ID_OUVRAGE, :FS_dist, :SS_dist, :FS_sum, :SS_sum, :FS_max, :SS_max];
# Show 10 rows of that column subset after `shuffleDf` (helper defined outside this view).
first(shuffleDf(test_data[!, cols]), 10)

Unnamed: 0_level_0,ID_OUVRAGE,FS_dist,SS_dist,FS_sum,SS_sum,FS_max,SS_max
Unnamed: 0_level_1,String,Float64,Float64,Float64,Float64,Float64,Float64
1,4240-01D,0.149987,0.168295,-0.358123,-0.373886,-0.375363,-0.386708
2,4350-01D,0.0248354,0.139269,-0.391555,-0.0493965,-0.41013,-0.0517744
3,3260-01D,0.145981,0.210769,-0.155606,-0.184762,-0.162968,-0.161624
4,4240-01D,0.149987,0.168295,-0.358123,-0.373886,-0.375363,-0.386708
5,4240-01D,0.149987,0.168295,-0.358123,-0.373886,-0.375363,-0.386708
6,4350-01D,0.0248354,0.139269,4.38952,0.588891,1.62896,0.192889
7,3350-07D,0.0927373,0.12027,-0.402043,-0.340659,-0.416502,-0.35682
8,4240-01D,0.149987,0.168295,1.56027,-0.373886,1.15011,-0.386708
9,3260-01D,0.145981,0.210769,-0.230116,-0.124988,-0.132072,0.0411031
10,3350-07D,0.0927373,0.12027,0.54392,0.664131,0.559817,0.315166


In [114]:
# Test rows of structure 3260-01D, without the ID column, written to CSV.
test_3260 = select(test_data[test_data.ID_OUVRAGE .== "3260-01D", :], Not(:ID_OUVRAGE));
CSV.write("data/parsed/test_3260.csv", test_3260)

"data/parsed/test_3260.csv"

In [115]:
# Test rows of structure 3350-07D, without the ID column, written to CSV.
test_3350 = select(test_data[test_data.ID_OUVRAGE .== "3350-07D", :], Not(:ID_OUVRAGE));
CSV.write("data/parsed/test_3350.csv", test_3350)

"data/parsed/test_3350.csv"

In [116]:
# Test rows of structure 4240-01D, without the ID column, written to CSV.
test_4240 = select(test_data[test_data.ID_OUVRAGE .== "4240-01D", :], Not(:ID_OUVRAGE));
CSV.write("data/parsed/test_4240.csv", test_4240)

"data/parsed/test_4240.csv"

In [117]:
# Test rows of structure 4350-01D, without the ID column, written to CSV.
test_4350 = select(test_data[test_data.ID_OUVRAGE .== "4350-01D", :], Not(:ID_OUVRAGE));
CSV.write("data/parsed/test_4350.csv", test_4350)

"data/parsed/test_4350.csv"

In [118]:
# Test rows of structure 4380-01D, without the ID column, written to CSV.
test_4380 = select(test_data[test_data.ID_OUVRAGE .== "4380-01D", :], Not(:ID_OUVRAGE));
CSV.write("data/parsed/test_4380.csv", test_4380)

"data/parsed/test_4380.csv"