# Approach : Context-Aware Approximate Scientific Computing

## Description / Goal

Goal: Tailor a simulation model to its context of use, expressed as an execution budget, by automatically and systematically applying approximate computing guided by predictive models.

# Pipeline

## Import libraries

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import os
import pandas as pd
import random
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn import metrics
random.seed(42)

## Functions to load 

In [3]:
def pipeline_time_prediction(budget, ratio_train):
    """Run the execution-time prediction stage on a fresh random split of the sites.

    The semicolon-separated input CSV is read from ``<path>/data/``; its name is
    built from the module-level ``nb_rates`` and ``scale`` settings.

    Parameters
    ----------
    budget : number
        Execution budget. NOTE: it is not forwarded by this function; the
        evaluation helper is called without it.
    ratio_train : float
        Fraction of the distinct ``SiteNumber`` values used for training.

    Returns
    -------
    tuple
        ``(df_data, training_cases, testing_cases, idxs, true_positive_time,
        true_negative_time, false_positive_time, false_negative_time)``.
    """
    # Load the complete dataset
    input_file = (
        "Input_Data_Time_ValidMetric_Features_Rates_" + str(nb_rates)
        + "_Features_" + "Geomorph_CVHV_Saturation_Cells"
        + "_" + str(scale) + "_Comparable.csv"
    )
    df_data = pd.read_csv(os.path.join(path, "data/", input_file), sep=";")

    # Draw the training / testing sites among the distinct site numbers
    training_cases, testing_cases = split_training_testing_cases(
        df_data.SiteNumber.unique(), ratio_train
    )

    # Train the time model and predict the execution time of the testing rows
    X_test, y_test_pred, y_test, data_test = get_time_prediction(df_data, training_cases, testing_cases)

    # Compare the predictions against the real execution times
    evaluation = get_evaluation_from_time_prediction(y_test_pred, y_test, X_test, data_test)
    true_positive_time, true_negative_time, false_positive_time, false_negative_time, idxs = evaluation

    return (
        df_data, training_cases, testing_cases, idxs,
        true_positive_time, true_negative_time, false_positive_time, false_negative_time,
    )

In [4]:
def split_training_testing_cases(sites_completes, ratio_train):
    """Randomly split the sites into a training list and a testing list.

    Parameters
    ----------
    sites_completes : array-like exposing ``.tolist()``
        All available site identifiers.
    ratio_train : float
        Fraction of the sites assigned to the training set (the count is
        obtained with ``round``).

    Returns
    -------
    tuple of list
        ``(training_cases, testing_cases)``; the testing sites keep the order
        they have in ``sites_completes``.
    """
    all_sites = sites_completes.tolist()

    # Number of sites dedicated to the training phase
    training_nb_cases = round(len(sites_completes) * ratio_train)

    # Random draw of the training sites; every remaining site is used for testing
    training_cases = random.sample(all_sites, training_nb_cases)
    testing_cases = list(filter(lambda site: site not in training_cases, all_sites))

    return training_cases, testing_cases

In [5]:
def get_time_prediction(df_data, training_cases, testing_cases):
    """Train the time regressor on the training sites and predict the testing sites.

    The feature subset comes from the module-level ``features`` list and the
    augmented testing data is written under the module-level ``path``.

    Returns
    -------
    tuple
        ``(X_test, y_test_pred, y_test, data_test)`` where ``data_test`` is the
        testing data returned by ``update_and_store_data_with_time_pred``.
    """
    train_rows, test_rows = retrieve_list_cases_and_split_data_BVE(df_data, training_cases, testing_cases)
    X_train, y_train, X_test, y_test = extract_features_and_outputs_datasets_BVE(train_rows, test_rows, features)

    # Fit the regressor and predict the execution time of every testing row
    y_test_pred = train_forest(X_train, y_train).predict(X_test)

    # MSE / RMSE / R2 can be obtained with compute_standard_metrics(y_test, y_test_pred)

    # Store the real and predicted times next to the testing data
    data_test = update_and_store_data_with_time_pred(path, test_rows, y_test, y_test_pred, testing_cases)
    return X_test, y_test_pred, y_test, data_test
    

In [6]:
def retrieve_list_cases_and_split_data_BVE(df_data, training_cases, testing_cases):
    """Return the rows of ``df_data`` for the training sites and for the testing sites.

    Returns
    -------
    tuple
        ``(data_train, data_test)``, each built by
        ``extract_data_from_list_of_sites``.
    """
    return (
        # Dataset for the training phase
        extract_data_from_list_of_sites(df_data, training_cases),
        # Dataset for the testing phase
        extract_data_from_list_of_sites(df_data, testing_cases),
    )

In [7]:
def extract_data_from_list_of_sites(data_complete, cases):
    """Return the rows of ``data_complete`` belonging to the sites in ``cases``.

    Rows are grouped site by site, following the order of ``cases``; within a
    site the original row order (and index) is kept.

    Parameters
    ----------
    data_complete : pandas.DataFrame
        Dataset containing a ``SiteNumber`` column.
    cases : iterable
        Site numbers to extract.

    Returns
    -------
    pandas.DataFrame
        The selected rows with the columns of ``data_complete``. When ``cases``
        is empty, an empty frame with the same columns is returned.
    """
    # Collect the per-site slices first and concatenate once: growing a frame
    # inside the loop is quadratic, and seeding it with an empty object-dtype
    # frame relies on a deprecated pandas dtype-inference rule.
    per_site_rows = [
        data_complete.loc[data_complete['SiteNumber'] == case] for case in cases
    ]
    if not per_site_rows:
        return pd.DataFrame(columns=data_complete.columns)

    return pd.concat(per_site_rows, sort=False)

In [8]:
def extract_features_and_outputs_datasets_BVE(data_train, data_test, features):
    """Build the inputs and the ``Time`` target of the time-prediction model.

    The module-level ``all_features`` list names every feature column present
    in the dataset; those not listed in ``features`` are removed from the
    inputs.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(X_train, y_train, X_test, y_test)``; each target frame holds the
        single ``Time`` column.
    """
    # Feature columns present in the dataset but not selected for this model
    unused_features = [str(name) for name in all_features if name not in features]

    def split_inputs_and_target(data):
        # Target: the execution time only
        target = data.filter(["SiteNumber", "Time"], axis=1).drop(columns="SiteNumber")
        # Inputs: everything but the outputs and the site identifier
        inputs = (
            data
            .drop(columns=["Chronicle", "Validation Metric", "Accuracy", "Time"])
            .drop(columns="SiteNumber")
        )
        for name in unused_features:
            del inputs[name]
        return inputs, target

    X_train, y_train = split_inputs_and_target(data_train)
    X_test, y_test = split_inputs_and_target(data_test)

    return X_train, y_train, X_test, y_test

In [9]:
def train_forest(X_train, y_train):
    """Fit and return a ``RandomForestRegressor`` on the training data.

    ``y_train`` is flattened with ``.values.ravel()`` before fitting.
    """
    hyperparameters = dict(
        n_estimators=1000,
        criterion='squared_error',
        random_state=1,
        n_jobs=-1,
        oob_score=True,
        bootstrap=True,
    )
    targets = y_train.values.ravel()

    forest = RandomForestRegressor(**hyperparameters)
    forest.fit(X_train, targets)
    return forest

In [10]:
def compute_standard_metrics(y_test, y_test_pred):
    """Compute the MSE, R2 and RMSE of the predictions.

    Parameters
    ----------
    y_test : pandas.DataFrame or pandas.Series
        Real values; flattened with ``.values.ravel()``.
    y_test_pred : array-like
        Predicted values.

    Returns
    -------
    tuple
        ``(mse, r2, rmse)``.
    """
    y_true = y_test.values.ravel()

    mse = mean_squared_error(y_true, y_test_pred)
    r2 = r2_score(y_true, y_test_pred)
    # The `squared=False` argument of mean_squared_error is deprecated and
    # removed in recent scikit-learn releases. The target is flattened to one
    # dimension, so the RMSE is simply the square root of the MSE.
    rmse = np.sqrt(mse)

    return mse, r2, rmse

In [11]:
def update_and_store_data_with_time_pred(path, data_test, y_test, y_test_pred, testing_cases):
    """Add the real and predicted times to the testing data and save it as CSV.

    The file is written under ``<path>/data/Output_Data/``; its name is built
    from the module-level ``nb_rates``, ``features`` and ``scale`` settings and
    from the sorted testing site numbers. The output directory is created when
    it does not exist yet.

    Parameters
    ----------
    path : str
        Root directory containing the ``data`` folder.
    data_test : pandas.DataFrame
        Testing rows.
    y_test : pandas.DataFrame
        Real times; stored in the ``Timetest`` column.
    y_test_pred : array-like
        Predicted times; stored in the ``TimePred`` column.
    testing_cases : iterable
        Testing site numbers, used as a suffix of the file name.

    Returns
    -------
    pandas.DataFrame
        A copy of ``data_test`` with the two additional columns.
    """
    suffixe = "_".join(str(int(site)) for site in sorted(testing_cases))

    data_test = data_test.assign(Timetest=y_test.values.ravel())
    data_test = data_test.assign(TimePred=y_test_pred)

    output_file = os.path.join(path,"data/Output_Data/Data_Test_With_Time_pred_Rates_" + str(nb_rates) + "_Features_" + str('_'.join(features)) + "_" + str(scale)  + "_" + str(suffixe) + ".csv")
    # to_csv does not create missing directories
    os.makedirs(os.path.dirname(output_file), exist_ok=True)
    data_test.to_csv(output_file, index=False, sep=";")
    return data_test

In [12]:
def get_evaluation_from_time_prediction(y_test_pred, y_test, X_test, data_test, budget=None, rates_per_site=30):
    """Evaluate, site by site, the rate selected from the predicted times.

    The predictions are processed in consecutive groups of ``rates_per_site``
    rows (one group per site). Within a group, the row of the *second*
    prediction strictly below ``budget`` is selected:

    * a row is selected and its real time is under budget -> true positive;
    * a row is selected but its real time is not under budget -> false positive;
    * no row is selected although a real time of the site is under budget
      -> false negative;
    * no row is selected and no real time of the site is under budget
      -> true negative.

    Real times are truncated with ``int`` before being compared to the budget.

    Parameters
    ----------
    y_test_pred : sequence of float
        Predicted times, ordered site by site.
    y_test : pandas.DataFrame
        Real times, in a ``Time`` column aligned with ``y_test_pred``.
    X_test : pandas.DataFrame
        Kept for interface compatibility; not used.
    data_test : pandas.DataFrame
        Testing data with a ``Time`` column aligned with ``y_test_pred``.
    budget : number, optional
        Execution budget. When omitted, the module-level ``budget`` variable is
        used, which is what callers passing four arguments rely on.
    rates_per_site : int, optional
        Number of consecutive rows belonging to each site (default 30).

    Returns
    -------
    tuple
        ``(true_positive_time, true_negative_time, false_positive_time,
        false_negative_time, idxs)``. ``idxs`` holds exactly one entry per
        site: the positional index of the selected row, or ``-9`` when no row
        was selected.

    Raises
    ------
    NameError
        If ``budget`` is omitted and no module-level ``budget`` exists.
    """
    if budget is None:
        # Backward compatibility with callers that do not pass the budget
        try:
            budget = globals()["budget"]
        except KeyError:
            raise NameError("name 'budget' is not defined") from None

    true_positive_time = 0
    true_negative_time = 0
    false_positive_time = 0
    false_negative_time = 0
    idxs = list()

    for site_start in range(0, len(y_test_pred), rates_per_site):
        site_predictions = y_test_pred[site_start:site_start + rates_per_site]

        # Look for the second prediction under the budget
        selected_row = None
        predictions_under_budget = 0
        for offset, pred_time in enumerate(site_predictions):
            if pred_time < budget:
                predictions_under_budget += 1
                if predictions_under_budget == 2:
                    selected_row = site_start + offset
                    break

        # An explicit "selected" marker replaces the former position test, which
        # also fired when the selected row was the last one of the site and
        # never fired for a trailing group shorter than `rates_per_site`.
        if selected_row is not None:
            idxs.append(selected_row)
            real_time_approach = int(y_test["Time"].iloc[selected_row])

            # In reality, is the simulation of this approximation rate respecting the budget
            if real_time_approach < budget:
                true_positive_time += 1
                print("TP p")
            else:
                false_positive_time += 1
                print("FP p")
        else:
            idxs.append(-9)
            site_end = site_start + len(site_predictions)
            site_real_times = data_test["Time"].iloc[site_start:site_end]

            # Was there, in reality, a simulation of this site under the budget
            if any(int(real_time) < budget for real_time in site_real_times):
                false_negative_time += 1
                print("FN p")
            else:
                true_negative_time += 1
                print("TN p")

    return true_positive_time, true_negative_time, false_positive_time, false_negative_time, idxs

In [13]:
def get_evaluation_with_time_prediction_under_budget(budget, real_time_approach):
    """Return 1 when the real time is strictly below the budget, 0 otherwise."""
    # In reality, is the simulation of this approximation rate respecting the budget
    return 1 if real_time_approach < budget else 0

## Pipeline for validity 

In [14]:
 def pipeline_prediction_validation(df_data, training_cases, testing_cases, idxs, features_validity):
    """Run the validity prediction stage and tally its outcomes.

    For every (predicted, real) validity pair returned for the testing sites,
    one outcome is counted and printed: matching pairs are true positives
    (truthy prediction) or true negatives, mismatching pairs are false
    positives (truthy prediction) or false negatives.

    Returns
    -------
    tuple of int
        ``(true_positive_valid, true_negative_valid, false_positive_valid,
        false_negative_valid)``.
    """
    y_test_pred_valid, y_test_valid = get_validity_prediction(df_data, training_cases, testing_cases, features_validity)
    validity_pairs = get_validity_prediction_values_for_corresponding_approximation_rates(y_test_pred_valid, y_test_valid, idxs)

    messages = {
        "TP": "TP validity",
        "TN": "TN validity",
        "FP": "FP validity ",
        "FN": "FN validity",
    }
    tally = dict.fromkeys(messages, 0)

    for pair in validity_pairs:
        predicted, real = pair[0], pair[1]
        if predicted == real:
            outcome = "TP" if predicted else "TN"
        else:
            outcome = "FP" if predicted else "FN"
        tally[outcome] += 1
        print(messages[outcome])

    return tally["TP"], tally["TN"], tally["FP"], tally["FN"]

In [15]:
def get_validity_prediction(df_data, training_cases, testing_cases, features_validity):
    """Train a k-nearest-neighbours classifier and predict the validity of the testing rows.

    A row is labelled ``0`` when its ``Validation Metric`` is strictly greater
    than the module-level ``validity_threshold`` and ``1`` otherwise. The
    number of neighbours comes from the module-level ``k``.

    The label is added to a copy of ``df_data``; the caller's frame is left
    untouched.

    Returns
    -------
    tuple
        ``(y_test_pred_validity, y_test_validity)``: the predicted labels and
        the frame holding the real ``Validity`` labels of the testing rows.
    """
    # `assign` returns a new frame, so the argument is not mutated
    labelled_data = df_data.assign(
        Validity=np.where(df_data["Validation Metric"] > validity_threshold, 0, 1)
    )
    data_train_validity, data_test_validity = retrieve_list_cases_and_split_data_BVE(labelled_data, training_cases, testing_cases)
    X_train_validity, y_train_validity, X_test_validity, y_test_validity = extract_features_and_outputs_datasets_BVE_validity(data_train_validity, data_test_validity, features_validity)

    classifier = KNeighborsClassifier(n_neighbors=k)
    classifier.fit(X_train_validity, y_train_validity.values.ravel().astype(int))
    y_test_pred_validity = classifier.predict(X_test_validity)

    # The accuracy can be obtained with compute_score(y_test_validity, y_test_pred_validity)

    return y_test_pred_validity, y_test_validity

In [16]:
def extract_features_and_outputs_datasets_BVE_validity(data_train, data_test, features):
    """Build the inputs and the ``Validity`` target of the validity model.

    The module-level ``all_features`` list names every feature column present
    in the dataset; those not listed in ``features`` are removed from the
    inputs.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(X_train, y_train, X_test, y_test)``.
    """
    X_train, y_train = create_input_and_output_datasets_for_validity(data_train)
    X_test, y_test = create_input_and_output_datasets_for_validity(data_test)

    # Feature columns present in the dataset but not selected for this model
    unused_features = [str(name) for name in all_features if name not in features]
    for inputs in (X_train, X_test):
        for name in unused_features:
            del inputs[name]

    return X_train, y_train, X_test, y_test

In [17]:
def create_input_and_output_datasets_for_validity(data):
    """Split ``data`` into the validity model inputs and its ``Validity`` target.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(X_data, y_data)``: ``X_data`` is ``data`` without the output columns
        and the site identifier, ``y_data`` holds the ``Validity`` column only.
        ``data`` itself is not modified.
    """
    y_data = data.filter(["SiteNumber", "Validity"], axis=1).drop(columns="SiteNumber")
    X_data = (
        data
        .drop(columns=["Chronicle", "Validation Metric", "Accuracy", "Time", "Validity"])
        .drop(columns="SiteNumber")
    )
    return X_data, y_data

In [18]:
def compute_score(y_test, y_test_pred):
    """Return the accuracy of the predicted labels against the real ones.

    ``y_test`` is flattened and cast to ``int`` before the comparison.
    """
    true_labels = y_test.values.ravel().astype(int)
    return metrics.accuracy_score(true_labels, y_test_pred)

In [19]:
def get_validity_prediction_values_for_corresponding_approximation_rates(y_test_pred_valid, y_test_real_valid, idxs, rates_per_site=30):
    """Pair the predicted and real validity of the row selected for each site.

    The predictions are considered in consecutive groups of ``rates_per_site``
    rows (one group per site); ``idxs[site]`` gives the positional index of the
    row selected for that site.

    Parameters
    ----------
    y_test_pred_valid : sequence
        Predicted validity labels of all testing rows.
    y_test_real_valid : pandas.DataFrame
        Real labels, in a ``Validity`` column aligned with the predictions.
    idxs : sequence of int
        One entry per site: index of the selected row, or ``-9`` when no row
        was selected for the site.
    rates_per_site : int, optional
        Number of consecutive rows belonging to each site (default 30).

    Returns
    -------
    list of list
        One ``[predicted, real]`` pair per site. A site marked ``-9`` yields
        ``[1, 1]``.
    """
    predicted_and_real_validity_values = list()

    # Only the number of groups matters, so the groups are not materialised
    number_of_sites = len(range(0, len(y_test_pred_valid), rates_per_site))

    for site in range(number_of_sites):
        selected_row = idxs[site]
        # Checking if the prediction for the approximation rate led to no simulation under the execution budget
        if selected_row == -9:
            predicted_and_real_validity_values.append([1, 1])
        else:
            valid_pred = y_test_pred_valid[selected_row]
            # Read the scalar directly: calling int() on a one-element Series is deprecated
            valid_real = int(y_test_real_valid["Validity"].iloc[selected_row])
            predicted_and_real_validity_values.append([valid_pred, valid_real])

    return predicted_and_real_validity_values

## Parameters

In [20]:
# Root directory of the project: the parent of the current working directory.
# The data files are read from and written to "<path>/data/".
path = os.path.abspath(os.path.join(os.path.abspath(""), os.pardir))
# Used to build the names of the input and output CSV files.
# NOTE(review): presumably the number of approximation rates per site; the
# evaluation functions group rows by a hard-coded 30 - confirm both must match.
nb_rates = 30
# NOTE(review): not referenced by the functions defined in this notebook.
chronicle = 0
# Used to build the names of the input and output CSV files.
scale = "BVE"

# Different sets of features (names of columns of the dataset)
set_geomorph = ["Slope", "Elevation", "LC", "CW", "Area"]
set_CVHV = ["Coastal Vulnerability", "Hydrological Vulnerability"]
set_saturation= ["Satured Zone Area", "Vulnerability Sum", "Vulnerability Rate"]
set_cells = ["Number of Cells"]
# Every feature column of the dataset; the extraction functions remove from the
# model inputs the ones that are not selected.
all_features = set_geomorph + set_CVHV + set_saturation + set_cells

# We select the type of features we want to use for training the time
# prediction model (also used in the name of the output CSV file)
features = set_cells

## Replications x 25

In [37]:
# Settings of the experiment.
# A row is labelled valid (1) unless its "Validation Metric" is strictly greater
# than this threshold (see get_validity_prediction).
validity_threshold = 0.1
# Number of neighbours of the validity classifier
k=3
# Features used by the validity classifier
features_valid = set_geomorph
# Execution budgets to evaluate. NOTE(review): presumably in seconds - confirm.
budgets = [300, 600, 1800, 3600, 18000]
# Fraction of the sites used for training
ratio_train = 0.95

# Outcome counters accumulated over all replications and budgets:
# time prediction stage, validity prediction stage, then combined approach.
true_positive_times = 0
true_negative_times = 0
false_positive_times = 0
false_negative_times = 0
true_positive_valids = 0
true_negative_valids = 0
false_positive_valids = 0
false_negative_valids = 0
true_positive_approach = 0
true_negative_approach = 0
false_positive_approach = 0
false_negative_approach = 0


for i in range(25):
    print("------------")
    print("Replication", i+1)
    # NOTE(review): never read afterwards in this notebook.
    correct_approach_replication = 0
    # The loop variable must stay named `budget`: get_evaluation_from_time_prediction
    # is called without a budget argument and reads this module-level name.
    for budget in budgets:
        print('----')
        print("Budget:", budget)
        # Each call re-reads the input CSV and draws a new random split of the sites
        df_data, training_cases, testing_cases, idxs, true_positive_time, true_negative_time, false_positive_time, false_negative_time = pipeline_time_prediction(budget, ratio_train)
        true_positive_valid, true_negative_valid, false_positive_valid, false_negative_valid = pipeline_prediction_validation(df_data, training_cases, testing_cases, idxs,features_valid)
                
        true_positive_times += true_positive_time
        true_negative_times += true_negative_time
        false_positive_times += false_positive_time
        false_negative_times += false_negative_time
        true_positive_valids += true_positive_valid
        true_negative_valids += true_negative_valid
        false_positive_valids += false_positive_valid
        false_negative_valids += false_negative_valid

        # Outcome of the combined approach for this run. The per-run counters are
        # used as booleans (non-zero means the outcome occurred at least once);
        # the first matching branch wins and any other combination is counted as
        # a true negative.
        if true_positive_time and true_positive_valid:
            true_positive_approach += 1
        elif (false_positive_time and true_positive_valid) or (false_positive_time and false_positive_valid) or (true_positive_time and false_positive_valid):
            false_positive_approach += 1
        elif (true_positive_time and false_negative_valid) or (false_negative_time and true_positive_valid):
            false_negative_approach += 1
        else:
            true_negative_approach += 1

print("------------")

------------
Replication 1
----
Budget: 300
TP p
TN validity
----
Budget: 600
FP p
TN validity
----
Budget: 1800
TP p
FN validity
----
Budget: 3600
TP p
TP validity
----
Budget: 18000
TP p
TP validity
------------
Replication 2
----
Budget: 300
TP p
TN validity
----
Budget: 600
TP p
TN validity
----
Budget: 1800
TP p
TN validity
----
Budget: 3600
FP p
FP validity 
----
Budget: 18000
TP p
TP validity
------------
Replication 3
----
Budget: 300
TP p
TN validity
----
Budget: 600
TP p
TN validity
----
Budget: 1800
TP p
TP validity
----
Budget: 3600
TP p
TP validity
----
Budget: 18000
TP p
TP validity
------------
Replication 4
----
Budget: 300
TP p
TN validity
----
Budget: 600
TP p
TN validity
----
Budget: 1800
FP p
TP validity
----
Budget: 3600
TP p
FN validity
----
Budget: 18000
TP p
TP validity
------------
Replication 5
----
Budget: 300
TP p
TN validity
----
Budget: 600
TP p
TN validity
----
Budget: 1800
TP p
FN validity
----
Budget: 3600
TP p
TP validity
----
Budget: 18000
TP p
TP val

# Accuracy scores

## Approximation rate

In [42]:
Accuracy_p = 0

# Accuracy of the time prediction stage: correct outcomes (true positives and
# true negatives) over all the outcomes accumulated during the replications.
number_of_prediction_times = (
    true_positive_times + true_negative_times
    + false_positive_times + false_negative_times
)
true_prediction_times = true_positive_times + true_negative_times

Accuracy_times = true_prediction_times / number_of_prediction_times
print(Accuracy_times)

0.824


## Validity

In [43]:
Accuracy_validity = 0

# Accuracy of the validity prediction stage: correct outcomes (true positives
# and true negatives) over all the outcomes accumulated during the replications.
number_of_prediction_valids = (
    true_positive_valids + true_negative_valids
    + false_positive_valids + false_negative_valids
)
true_prediction_valids = true_positive_valids + true_negative_valids

Accuracy_valids = true_prediction_valids / number_of_prediction_valids
print(Accuracy_valids)

0.952


## Approach

In [44]:
Accuracy_approach = 0

# Accuracy of the combined approach: correct outcomes (true positives and true
# negatives) over all the outcomes accumulated during the replications.
number_of_prediction_approach = (
    true_positive_approach + true_negative_approach
    + false_positive_approach + false_negative_approach
)
true_prediction_approach = true_positive_approach + true_negative_approach

Accuracy_approach = true_prediction_approach / number_of_prediction_approach
print(Accuracy_approach)

0.872
