# Time prediction

## Description / Goal

Goal: Create a data model that predicts properties of a simulation (e.g., execution time, validity metric) from features describing that simulation. The end goal is to determine the approximation factor to use for that simulation so that it matches a defined execution budget (e.g., execution time).



## Import libraries

In [1]:
import os
import pandas as pd
import random
random.seed(42)
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

## Helper functions (data splitting, feature extraction, training, evaluation, storage)

In [2]:
def retrieve_list_cases_and_split_data_BVE(data_complete, sites_completes, ratio_train):
    """Split a dataset into training and testing subsets on a per-site basis.

    Sites (not rows) are sampled with the ``random`` module, so all the rows
    sharing a ``SiteNumber`` end up in exactly one of the two subsets.

    Args:
        data_complete: DataFrame holding a ``SiteNumber`` column.
        sites_completes: Site identifiers to split; either an object exposing
            ``tolist()`` (e.g. the array returned by ``Series.unique()``) or
            any other iterable of hashable identifiers.
        ratio_train: Fraction of the sites assigned to the training subset;
            the number of training sites is ``round(len(sites) * ratio_train)``.

    Returns:
        Tuple ``(data_train, data_test, training_cases, testing_cases)``.
        The two DataFrames keep the columns, dtypes and index labels of
        ``data_complete``; their rows are grouped site by site, following the
        order of the corresponding list of cases.
    """
    if hasattr(sites_completes, "tolist"):
        all_cases = sites_completes.tolist()
    else:
        all_cases = list(sites_completes)

    # Computing the number of sites that are used for the training
    training_nb_cases = round(len(all_cases) * ratio_train)

    # Selecting the sites for the training and testing sets randomly
    training_cases = random.sample(all_cases, training_nb_cases)
    training_lookup = set(training_cases)  # O(1) membership tests
    testing_cases = [case for case in all_cases if case not in training_lookup]

    def _rows_for(cases):
        # Collect the per-site slices and concatenate them once: growing a
        # frame inside the loop copies it at every step, and seeding it with
        # an empty object-dtype frame puts the column dtypes at risk.
        frames = [
            data_complete.loc[data_complete['SiteNumber'] == case]
            for case in cases
        ]
        if not frames:
            # No site selected: empty frame with the original columns/dtypes.
            return data_complete.iloc[0:0].copy()
        return pd.concat(frames, sort=False)

    data_train = _rows_for(training_cases)
    data_test = _rows_for(testing_cases)

    return data_train, data_test, training_cases, testing_cases

In [52]:
def extract_features_and_outputs_datasets_BVE(data_train, data_test, features):
    """Build the model inputs (X) and targets (y) of the train and test sets.

    The target is the ``Time`` column. The inputs are all the other columns
    except ``SiteNumber``, ``Chronicle``, ``Validation Metric``, ``Accuracy``
    and every name of the module-level ``all_features`` list that is not in
    ``features``. Columns that are not listed in ``all_features`` are kept.

    Args:
        data_train: Training DataFrame.
        data_test: Testing DataFrame.
        features: Names (taken from ``all_features``) to keep as inputs.

    Returns:
        Tuple ``(X_train, y_train, X_test, y_test)`` of DataFrames; each
        ``y`` frame has the single column ``Time``.

    Raises:
        KeyError: If a column to remove is missing from one of the datasets.
    """
    # Columns that are never model inputs (identifiers and outputs).
    non_input_columns = ["Chronicle", "Validation Metric", "Accuracy", "Time", "SiteNumber"]
    # Known features the caller did not ask for (read from the global list).
    unwanted_features = [str(name) for name in all_features if name not in features]

    def _inputs_and_target(frame):
        inputs = frame.drop(columns=non_input_columns)
        inputs = inputs.drop(columns=unwanted_features)
        target = frame[["Time"]]
        return inputs, target

    X_train, y_train = _inputs_and_target(data_train)
    X_test, y_test = _inputs_and_target(data_test)

    return X_train, y_train, X_test, y_test

In [53]:
def train_forest(X_train, y_train):
    """Fit a random-forest regressor on the training data and return it.

    Args:
        X_train: Training inputs.
        y_train: Training targets; must expose ``.values`` (it is flattened
            with ``ravel()`` before fitting).

    Returns:
        The fitted ``RandomForestRegressor``.
    """
    # Fixed hyper-parameters: 1000 trees, fixed seed for the forest itself,
    # bootstrap sampling with the out-of-bag score enabled.
    hyperparameters = {
        "n_estimators": 1000,
        "criterion": 'squared_error',
        "random_state": 1,
        "n_jobs": -1,
        "oob_score": True,
        "bootstrap": True,
    }
    model = RandomForestRegressor(**hyperparameters)

    flat_targets = y_train.values.ravel()
    model.fit(X_train, flat_targets)
    return model

In [54]:
def compute_standard_metrics(y_test, y_test_pred):
    """Score predictions against the observed values.

    Args:
        y_test: Observed targets; must expose ``.values`` (flattened with
            ``ravel()`` before scoring).
        y_test_pred: Predicted targets.

    Returns:
        Tuple ``(mse, r2)``: mean squared error and R2 score.
    """
    observed = y_test.values.ravel()
    return mean_squared_error(observed, y_test_pred), r2_score(observed, y_test_pred)

In [59]:
def update_and_store_data_with_time_pred(path, data_test, y_test, y_test_pred, testing_cases, one_case=False,
                                         nb_rates=None, features=None, scale=None):
    """Append observed/predicted times to the test set and store it as CSV.

    The file is written with ``;`` as separator under
    ``<path>/data/Output_Data/``; that directory is created when missing.

    Args:
        path: Root directory of the project.
        data_test: Testing DataFrame (left untouched; a new frame is built).
        y_test: Observed targets; must expose ``.values``.
        y_test_pred: Predicted targets, one per row of ``data_test``.
        testing_cases: Site identifiers of the testing set; they are sorted,
            converted to ``int`` and joined with ``_`` in the file name.
        one_case: If True, ``_OneCase`` is appended to the file name, marking
            a testing set made of one unique site instead of a ratio.
        nb_rates: Value used in the file name. When None, the module-level
            ``nb_rates`` is used.
        features: Feature names joined with ``_`` in the file name. When
            None, the module-level ``features`` is used.
        scale: Scale tag used in the file name. When None, the module-level
            ``scale`` is used.

    Returns:
        Copy of ``data_test`` with the extra columns ``Timetest`` (observed)
        and ``TimePred`` (predicted).

    Raises:
        NameError: If a file-name setting is neither passed nor defined at
            module level.
    """
    def _setting(name, value):
        # Explicit arguments win; otherwise fall back to the notebook-level
        # variable of the same name.
        if value is not None:
            return value
        try:
            return globals()[name]
        except KeyError:
            raise NameError("name '" + name + "' is not defined") from None

    nb_rates = _setting("nb_rates", nb_rates)
    features = _setting("features", features)
    scale = _setting("scale", scale)

    suffixe = "_".join(str(int(case)) for case in sorted(testing_cases))
    # If we want the testing dataset to correspond to one unique site instead of a ratio (e.g., 20%)
    if one_case:
        suffixe += "_OneCase"

    data_test = data_test.assign(Timetest=y_test.values.ravel(), TimePred=y_test_pred)

    output_dir = os.path.join(path, "data/Output_Data")
    # to_csv does not create missing directories.
    os.makedirs(output_dir, exist_ok=True)
    file_name = (
        "Data_Test_With_Time_pred_Rates_" + str(nb_rates)
        + "_Features_" + '_'.join(features)
        + "_" + str(scale) + "_" + suffixe + ".csv"
    )
    data_test.to_csv(os.path.join(output_dir, file_name), index=False, sep=";")
    return data_test

## Parameters

In [56]:
# Experiment settings
nb_rates = 30
chronicle = 0
scale = "BVE"
ratio_train = 0.8

# Project root: parent of the directory the notebook is executed from
path = os.path.abspath(
    os.path.join(os.path.abspath(""), os.pardir)
)
print(path)

# Groups of features available in the dataset
set_geomorph = ["Slope", "Elevation", "LC", "CW", "Area"]
set_CVHV = ["Coastal Vulnerability", "Hydrological Vulnerability"]
set_saturation = ["Satured Zone Area", "Vulnerability Sum", "Vulnerability Rate"]
set_cells = ["Number of Cells"]
all_features = [*set_geomorph, *set_CVHV, *set_saturation, *set_cells]

# Group of features used to train the model
features = set_cells
print(features)

/Users/june/Dev/LAPrediction
['Number of Cells']


## Pipeline

In [65]:
def pipeline_pred_time(path, nb_rates, features, scale, ratio_train):
    """Run one train/test experiment predicting the execution time.

    Steps: read the input CSV (``;`` separated) from ``<path>/data/``, split
    its sites into training and testing sets, extract inputs and targets,
    train the forest, predict on the testing set, print the scores and store
    the testing set with its predictions.

    Args:
        path: Root directory of the project.
        nb_rates: Value used in the name of the input file.
        features: Names of the features used as model inputs. The input file
            name always carries the ``Geomorph_CVHV_Saturation_Cells`` tag,
            whatever this argument is.
        scale: Scale tag used in the name of the input file.
        ratio_train: Fraction of the sites used for training.

    Returns:
        Tuple ``(mse, r2)`` measured on the testing set.
    """
    input_file = (
        f"Input_Data_Time_ValidMetric_Features_Rates_{nb_rates}"
        f"_Features_Geomorph_CVHV_Saturation_Cells_{scale}_Comparable.csv"
    )
    df_data = pd.read_csv(os.path.join(path, "data/", input_file), sep=";")

    # Split by site
    sites_completes = df_data["SiteNumber"].unique()
    split = retrieve_list_cases_and_split_data_BVE(df_data, sites_completes, ratio_train)
    data_train, data_test, training_cases, testing_cases = split

    # Train and predict
    X_train, y_train, X_test, y_test = extract_features_and_outputs_datasets_BVE(
        data_train, data_test, features
    )
    forest = train_forest(X_train, y_train)
    y_test_pred = forest.predict(X_test)

    # Evaluate and report
    mse, r2 = compute_standard_metrics(y_test, y_test_pred)
    print("MSE: ", mse)
    print("R2: ", r2)

    # Store the testing set along with its predictions
    update_and_store_data_with_time_pred(
        path, data_test, y_test, y_test_pred, testing_cases, one_case=False
    )
    return mse, r2

    

In [64]:
# Repeat the experiment 10 times (each run draws a new train/test split)
# and keep the R2 score of every run.
scores_r2 = list()
for i in range(10):
    run_scores = pipeline_pred_time(path, nb_rates, features, scale, ratio_train)
    _, r2 = run_scores
    scores_r2.append(r2)

MSE:  950185.5224610138
R2:  0.9485038184126715
MSE:  2132328.587199666
R2:  0.9463765724938249
MSE:  458149.17684877035
R2:  0.9910560694498923
MSE:  722627.0676465275
R2:  0.9810094462932545
MSE:  510244.4690254335
R2:  0.9569621481251954
MSE:  1999661.3422209334
R2:  0.9400487946207016
MSE:  637558.5071142132
R2:  0.9828691453099762
MSE:  2257499.110813523
R2:  0.9363827905882856
MSE:  563997.4232822
R2:  0.8217189517166186
MSE:  1529448.1337656127
R2:  0.9703002896914849


In [70]:
# Summarise the R2 scores collected over the repeated runs.
import statistics
print(scores_r2)
# Last expression of the cell: its value (the mean R2) is what the notebook displays.
statistics.mean(scores_r2)

[0.9485038184126715, 0.9463765724938249, 0.9910560694498923, 0.9810094462932545, 0.9569621481251954, 0.9400487946207016, 0.9828691453099762, 0.9363827905882856, 0.8217189517166186, 0.9703002896914849]


0.9475228026701905