# Testing parallelized versus serialized code time

Here we compare the running time of the serialized and parallelized Monte Carlo test, and then we profile several steps of our hypothesis testing procedure.

In [1]:
import numpy as np
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from nonconformist.cp import IcpRegressor
from nonconformist.nc import NcFactory
from sklearn.model_selection import train_test_split
from scipy import stats
import os
import time

# importing our module
from lcv.valid_pred_sets import Valid_pred_sets
from lcv.valid_pred_sets import LinearQuantileRegression
from lcv.valid_pred_sets import GradientBoostingQuantileRegression

from simulation import simulation

First, we fit our hypothesis testing object to a chosen kind of simulated data, using ICP as the prediction model and RF as the coverage evaluator:

In [2]:
def split(X, y, test_size = 0.4, calibrate = True, random_seed = 1250, calib_size = 0.25):
    """Partition (X, y) into train/test sets, optionally carving out a calibration set.

    Parameters
    ----------
    X, y : array-like
        Features and targets, forwarded unchanged to ``train_test_split``.
    test_size : float, default 0.4
        ``test_size`` of the first split (train vs. test).
    calibrate : bool, default True
        When true, the training portion is split a second time to obtain a
        calibration set.
    random_seed : int, default 1250
        Used as ``random_state`` for both splits.
    calib_size : float, default 0.25
        ``test_size`` of the second split (train vs. calibration). It is a
        fraction of the *training portion*, not of the full data set. Ignored
        when ``calibrate`` is false.

    Returns
    -------
    dict
        Always holds ``"X_train"``, ``"X_test"``, ``"y_train"``, ``"y_test"``;
        additionally ``"X_calib"`` and ``"y_calib"`` when ``calibrate`` is true.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size = test_size, random_state = random_seed)

    if not calibrate:
        return {"X_train": X_train, "X_test": X_test,
                "y_train": y_train, "y_test": y_test}

    # Second split: take the calibration set out of the training portion only,
    # so the test set stays untouched.
    X_train, X_calib, y_train, y_calib = train_test_split(
        X_train, y_train, test_size = calib_size, random_state = random_seed)
    return {"X_train": X_train, "X_calib": X_calib, "X_test": X_test,
            "y_train": y_train, "y_calib": y_calib, "y_test": y_test}
                
def prep_hyp_icp(kind = "homoscedastic", n = 10000, d = 20, coef = 2, B = 1000, 
                 random_seed = 1250, sig = 0.05, coverage_evaluator = "RF"):
    """Simulate data, fit and calibrate an ICP regressor, and fit the hypothesis test object.

    Parameters
    ----------
    kind : str, default "homoscedastic"
        Name of the ``simulation`` method that generates the data; it is looked
        up with ``getattr`` and called as ``method(n, random_seed=random_seed)``.
    n : int, default 10000
        First positional argument of the simulation method
        (presumably the sample size -- confirm against ``simulation``).
    d, coef : default 20, 2
        Forwarded to ``simulation(dim = d, coef = coef)``.
    B : int, default 1000
        Currently unused in this function; kept so existing calls keep working.
    random_seed : int, default 1250
        Seed used for the simulation, the data split, the random forest and
        ``Valid_pred_sets.fit``.
    sig : float, default 0.05
        Second positional argument of ``Valid_pred_sets``
        (presumably the significance level -- confirm against ``lcv``).
    coverage_evaluator : str, default "RF"
        Forwarded to ``Valid_pred_sets``.

    Returns
    -------
    The ``Valid_pred_sets`` instance after ``fit`` was called on the test split.
    """
    sim_obj = simulation(dim = d, coef = coef)
    sim_kind = getattr(sim_obj, kind)
    sim_kind(n, random_seed = random_seed)
    # Forward the seed: previously split() always fell back to its own default,
    # so a non-default random_seed did not affect the train/calib/test partition.
    split_icp = split(sim_obj.X, sim_obj.y, random_seed = random_seed)

    # Prediction model: random forest wrapped in an inductive conformal regressor.
    model = RandomForestRegressor(random_state = random_seed)
    nc = NcFactory.create_nc(model)
    icp = IcpRegressor(nc)
    icp.fit(split_icp["X_train"], split_icp["y_train"])
    icp.calibrate(split_icp["X_calib"], split_icp["y_calib"])

    # The hypothesis test object is fitted on the held-out test split only.
    hyp_icp = Valid_pred_sets(icp, sig, coverage_evaluator = coverage_evaluator)
    hyp_icp.fit(split_icp["X_test"], split_icp["y_test"], random_seed = random_seed)
    return hyp_icp

# Build the test object with all defaults (homoscedastic data, "RF" coverage evaluator).
hyp_icp = prep_hyp_icp()

Testing the execution time for homoscedastic data, using RF as the coverage evaluator and an ICP-calibrated model, with $B = 1000$:

In [3]:
# Wall-clock timing of the serialized Monte Carlo test (RF coverage evaluator).
# perf_counter() is a monotonic clock, so the measurement cannot be skewed by
# system clock adjustments during this long run.
start = time.perf_counter()
print("Serialized code:")
res_ser = hyp_icp.monte_carlo_test(B = 1000, random_seed = 1250)
end = time.perf_counter() - start  # elapsed seconds (not a timestamp)
print("Time Elapsed: ", end)

Serialized code:
Time Elapsed:  2268.2713577747345


Verifying the serialized test results:

In [4]:
# Display the result of the serialized run.
res_ser

{'p-value': 0.619, 'Observed statistic': 0.034725}

In [5]:
# Wall-clock timing of the Monte Carlo test with par = True (RF coverage evaluator).
# perf_counter() is a monotonic clock, so the measurement cannot be skewed by
# system clock adjustments during the run.
start = time.perf_counter()
print("Paralelized code:")
res_par = hyp_icp.monte_carlo_test(B = 1000, random_seed = 1250, par = True)
end = time.perf_counter() - start  # elapsed seconds (not a timestamp)
print("Time Elapsed: ", end)

Paralelized code:
Time Elapsed:  659.6085753440857


In [6]:
# Display the result of the run with par = True.
# NOTE(review): the recorded p-value here (0.587) differs from the serialized
# run (0.619) although both calls use random_seed = 1250 -- confirm whether the
# parallel path seeds its workers deterministically.
res_par

{'p-value': 0.587, 'Observed statistic': 0.034725}

Testing the nnet coverage evaluator. First, preparing and tuning the nnet hypothesis object:

In [3]:
# Same setup as the default case, but with the "nnet" coverage evaluator.
# NOTE(review): the log output suggests fitting runs hyperparameter-tuning trials -- confirm in lcv.
hyp_icp_nnet = prep_hyp_icp(coverage_evaluator="nnet")

[32m[I 2022-08-11 22:39:56,223][0m A new study created in memory with name: no-name-75c18a68-ff80-4a46-b5bf-65ffd9dfd694[0m
[32m[I 2022-08-11 22:40:05,932][0m Trial 0 finished with value: 0.11644074693322182 and parameters: {'n_layers': 1, 'n_units_l0': 26, 'dropout_l0': 0.2175008215425142, 'optimizer': 'RMSprop', 'lr': 0.005679432815250312}. Best is trial 0 with value: 0.11644074693322182.[0m
[32m[I 2022-08-11 22:40:12,606][0m Trial 1 finished with value: 0.11238567810505629 and parameters: {'n_layers': 4, 'n_units_l0': 47, 'dropout_l0': 0.23032529809825056, 'n_units_l1': 44, 'dropout_l1': 0.4307933982426165, 'n_units_l2': 112, 'dropout_l2': 0.4342109630511539, 'n_units_l3': 102, 'dropout_l3': 0.42679448672765236, 'optimizer': 'RMSprop', 'lr': 0.001665289601082679}. Best is trial 1 with value: 0.11238567810505629.[0m
[32m[I 2022-08-11 22:40:30,995][0m Trial 2 finished with value: 0.1339472010731697 and parameters: {'n_layers': 2, 'n_units_l0': 24, 'dropout_l0': 0.2082559248

The serialized version works fine:

In [8]:
# Wall-clock timing of the serialized Monte Carlo test (nnet coverage evaluator).
# perf_counter() is a monotonic clock, so the measurement cannot be skewed by
# system clock adjustments during this multi-hour run.
start = time.perf_counter()
print("Serialized code:")
res_ser = hyp_icp_nnet.monte_carlo_test(B = 1000, random_seed = 1250)
end = time.perf_counter() - start  # elapsed seconds (not a timestamp)
print("Time Elapsed: ", end)

Serialized code:
Time Elapsed:  8674.781376600266


Trying the parallelized version:

In [4]:
# Wall-clock timing of the Monte Carlo test with par = True (nnet coverage evaluator).
# perf_counter() is a monotonic clock, so the measurement cannot be skewed by
# system clock adjustments during the run.
start = time.perf_counter()
print("Paralelized code:")
res_par = hyp_icp_nnet.monte_carlo_test(B = 1000, random_seed = 1250, par = True)
end = time.perf_counter() - start  # elapsed seconds (not a timestamp)
print("Time Elapsed: ", end)

Paralelized code:
