In [None]:
# CODE WRITTEN BY: Max Bakker

from functions import *
from pystoned.plot import plot2d
import numpy as np
import pandas as pd
from sklearn.linear_model import Lasso
import copy
import multiprocessing
import functions
from itertools import repeat
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import kneed

# Specify the correlation matrix

In [None]:
# Pairwise correlation targets, stored as nested dictionaries: corrs[a][b] = correlation.
# Every index 0..nr_of_corr_variables gets its own (initially empty) inner dictionary;
# add an entry to request a correlation between two variables.
# NOTE(review): how the indices map onto variables is decided by the sampling helpers
# that receive these dictionaries -- confirm there.
nr_of_corr_variables = 9

corrs_TRUE = dict((idx, {}) for idx in range(nr_of_corr_variables + 1))
corrs_FALSE = dict((idx, {}) for idx in range(nr_of_corr_variables + 1))

# CORRELATION FOR TRUE INPUTS
corrs_TRUE[1].update({2: 0.8})

# CORRELATION FOR FALSE INPUTS
corrs_FALSE[1].update({2: 0.8, 3: 0.8})
corrs_FALSE[2].update({3: 0.8})
corrs_FALSE[4].update({5: 0.8})
corrs_FALSE[6].update({7: 0.8})


# Perform a single LASSO-SCNLS regression with plot
### If ETA = 0, you're performing a regular SCNLS regression

In [None]:
# Single LASSO-SCNLS run on one simulated data set, followed by a 2-D frontier plot.

# REGRESSION PARAMETERS
CORRELATION = CORRELATION_REDUNDANT_VARIABLES = True
SEED = 1
TRUE_INPUTS = 2
REDUNDANT_INPUTS = 7
NR_DMU = 25
VAR_mu = 0.1
ETA = 0.05
EMAIL = 'maxklaasbakker@gmail.com'

# Sample the true inputs, either correlated (using corrs_TRUE) or uniformly.
if CORRELATION:
    x = sample_correlated_parameters(i=TRUE_INPUTS,k=NR_DMU, rho_dict=corrs_TRUE, min_value=10, max_value=20, seed=SEED)
else:
    x = sample_uniform_parameters(i=TRUE_INPUTS,k=NR_DMU, min_value=10, max_value=20, seed=SEED)

# Disabled option kept from the original cell:
# x=x.T.apply(lambda x:np.sort(x)).T

# The outputs are computed here, before any redundant variable is added to x,
# so they depend on the true inputs only.
y_log_true = output_from_parameters(x, cons = 0)
y_log = output_from_parameters_with_noise(x, cons=0, var=VAR_mu)

# Extend x with the redundant inputs; a different seed (SEED+1) is used for them.
if CORRELATION_REDUNDANT_VARIABLES:
    x = add_random_correlated_variables(x, REDUNDANT_INPUTS, corrs_FALSE, min_value = 10, max_value = 20, seed=SEED+1)
else:
    x = add_random_variables(x, REDUNDANT_INPUTS, min_value = 10, max_value = 20, seed=SEED+1)

# Fit on log-inputs against the noisy output.
model_cnls = perform_CNLS_LASSO(x=np.log(x), y=y_log, eta=ETA, email=EMAIL)
alpha = model_cnls.get_alpha()

# Coefficients rounded to two decimals; the extra row labelled 'Total' holds the
# column-wise MEAN of the rounded coefficients (not a sum).
beta = pd.DataFrame(model_cnls.get_beta()).round(2)
beta.loc['Total',:] = beta.mean(axis=0).round(2)

# Fetch the residuals once and derive both error measures from them.
residuals = model_cnls.get_residual()
SSR_model = (residuals**2).sum()
MSE_model = (residuals**2).mean()

# A variable counts as deleted when its rounded mean coefficient is exactly 0.
nr_variables_deleted = (beta.loc['Total',:] == 0).sum()

# Columns 0..TRUE_INPUTS-1 are the true inputs (redundant inputs are added afterwards).
# Derived from TRUE_INPUTS so the count stays right when that parameter changes.
# NOTE(review): despite its name, this counts TRUE inputs whose mean coefficient is 0 -- confirm intent.
true_input_columns = list(range(TRUE_INPUTS))
nr_correct_variables_deleted = (beta.loc['Total', true_input_columns] == 0).sum()

display(beta)

# NOTE(review): x_select=8 is a fixed column index and the figure name mentions ETA_0_1
# while ETA is 0.05 here -- confirm both are intended.
plot2d(model_cnls, x_select=8, label_name="CNLS", fig_name='CNLS_frontier_0_1_variance_ETA_0_1')

# SIMULATION RUNS; 500 repetitions, DMUs=25, 2 True inputs, 7 False inputs

### LASSO-SCNLS

In [None]:
# LASSO-SCNLS simulation, 25 DMUs: one CSV file per scenario.

# Settings shared by every scenario.
TRUE_INPUTS, REDUNDANT_INPUTS = 2, 7
NR_DMU = 25
REPITITIONS = 500
EMAIL = 'maxklaasbakker@gmail.com'

# Initial values only: the scenario loop below rebinds all four of these names.
CORRELATION = CORRELATION_REDUNDANT_VARIABLES = False
ETA = 1
VAR_mu = 0.1

# NOTE(review): this pool is never closed in this cell; check whether later cells
# reuse `pool` before adding cleanup here.
pool = multiprocessing.Pool()

# Each row: (CORRELATION, CORRELATION_REDUNDANT_VARIABLES, VAR_mu, ETA)
scenarios = [
    (True, True, 0.1, 0.04),
    (True, True, 0.7, 0.09),
    (False, False, 0.1, 0.05),
    (False, False, 0.7, 0.15),
]

for CORRELATION, CORRELATION_REDUNDANT_VARIABLES, VAR_mu, ETA in scenarios:
    # One argument tuple per repetition; the repetition index is the first argument.
    task_args = [
        (rep, CORRELATION, CORRELATION_REDUNDANT_VARIABLES,
         TRUE_INPUTS, REDUNDANT_INPUTS, NR_DMU,
         ETA, VAR_mu,
         corrs_TRUE, corrs_FALSE, EMAIL)
        for rep in range(REPITITIONS)
    ]
    result = pool.starmap(functions.run_simulation_CNLS_LASSO, task_args)

    # One row per repetition, written as a ';'-separated file.
    result_df = pd.DataFrame(result)
    result_df.to_csv(
        f"1_results_SCNLS_LASSO_2_true_7_false_25_dmu_not_sorted_{str(VAR_mu).replace('.', '_')}_var_eta_{str(ETA).replace('.', '_')}_CORR_{CORRELATION}.csv", sep=';')


### RANDOM-SCNLS

In [None]:
# RANDOM-SCNLS simulation, 25 DMUs: one CSV file per scenario.
CORRELATION = CORRELATION_REDUNDANT_VARIABLES = False
TRUE_INPUTS = 2
REDUNDANT_INPUTS = 7
NR_DMU = 25
ETA = 0
VAR_mu = 0.1
REPITITIONS = 500
EMAIL = 'maxklaasbakker@gmail.com'

# This cell creates its own worker pool so it does not rely on a `pool` object
# left behind by another cell. The `with` block shuts the workers down on exit;
# starmap blocks until every repetition has returned, so no work is cut short.
with multiprocessing.Pool() as pool:
    # Each scenario rebinds CORRELATION, CORRELATION_REDUNDANT_VARIABLES and VAR_mu.
    for CORRELATION, CORRELATION_REDUNDANT_VARIABLES, VAR_mu in zip([True,True, False, False],[True,True,False,False],[0.1, 0.7,0.1, 0.7]):
        # One call per repetition; the repetition index is the first argument.
        result = pool.starmap(functions.run_simulation_RANDOM,
            zip(range(REPITITIONS),
            repeat(CORRELATION), repeat(CORRELATION_REDUNDANT_VARIABLES),
            repeat(TRUE_INPUTS), repeat(REDUNDANT_INPUTS), repeat(NR_DMU),
            repeat(ETA), repeat(VAR_mu),
            repeat(corrs_TRUE), repeat(corrs_FALSE), repeat(EMAIL)))
        result_df = pd.DataFrame(result)
        result_df.to_csv(
            f"1_results_SCNLS_Random_2_true_7_false_25_dmu_not_sorted_{str(VAR_mu).replace('.', '_')}_var_eta_{str(ETA).replace('.', '_')}_CORR_{CORRELATION}.csv", sep=';')

### Regular LASSO

In [None]:
# Regular LASSO simulation, 25 DMUs: grid-search the penalty, then run the repetitions.
CORRELATION = CORRELATION_REDUNDANT_VARIABLES = False
TRUE_INPUTS = 2
REDUNDANT_INPUTS = 7
NR_DMU = 25
ETA = 1
VAR_mu = 0.1
REPITITIONS = 500
EMAIL = 'maxklaasbakker@gmail.com'

# Candidate penalty values for the grid search. Kept as a tuple and turned into a
# fresh array for every call, so each call receives its own array.
REG_LASSO_ALPHAS = (0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2.0)

# Each scenario rebinds CORRELATION, CORRELATION_REDUNDANT_VARIABLES and VAR_mu.
for CORRELATION, CORRELATION_REDUNDANT_VARIABLES, VAR_mu in zip([True,True, False, False],[True,True,False,False],[0.1, 0.7,0.1, 0.7]):
    print('CORRELATION = ', CORRELATION, 'CORRELATION_REDUNDANT_VARIABLES = ', CORRELATION_REDUNDANT_VARIABLES)

    # Decide what eta to use by doing a grid search
    eta_reg_LASSO = perform_grid_search_reg_LASSO(alphas=np.array(REG_LASSO_ALPHAS), reps=30,
        CORRELATION=CORRELATION, CORRELATION_REDUNDANT_VARIABLES=CORRELATION_REDUNDANT_VARIABLES, TRUE_INPUTS=TRUE_INPUTS, REDUNDANT_INPUTS=REDUNDANT_INPUTS,
        NR_DMU=NR_DMU, VAR_mu=VAR_mu, corrs_TRUE=corrs_TRUE, corrs_FALSE=corrs_FALSE)
    print(f'Eta from grid search: {eta_reg_LASSO}')

    # A pool is opened after the grid search, as before, but is now shut down as soon
    # as the scenario's repetitions are done instead of leaking one pool per scenario.
    with multiprocessing.Pool() as pool:
        result = pool.starmap(functions.run_simulation_regular_LASSO,
            zip(range(REPITITIONS),
            repeat(CORRELATION), repeat(CORRELATION_REDUNDANT_VARIABLES),
            repeat(TRUE_INPUTS), repeat(REDUNDANT_INPUTS), repeat(NR_DMU),
            repeat(ETA), repeat(VAR_mu),
            repeat(corrs_TRUE), repeat(corrs_FALSE), repeat(eta_reg_LASSO), repeat(EMAIL)))

    # One row per repetition. The file name carries the grid-searched eta, rounded to 3 decimals.
    results_reg_LASSO = pd.DataFrame(result)
    results_reg_LASSO.to_csv(f"../Coding/Datasets_used_for_inference/1_results_regular_LASSO_2_true_7_false_25_dmu_not_sorted_{str(VAR_mu).replace('.', '_')}_var_eta_{str(eta_reg_LASSO.round(3)).replace('.', '_')}_CORR_{str(CORRELATION).replace('.', '_')}.csv", sep = ';')

### PCA-DEA

In [None]:
# PCA-DEA simulation, 25 DMUs: one CSV file per scenario.

# Specify the input parameters
CORRELATION = CORRELATION_REDUNDANT_VARIABLES = False
TRUE_INPUTS = 2
REDUNDANT_INPUTS = 7
NR_DMU = 25
ETA = 1
VAR_mu = 0.1
REPITITIONS = 500
EMAIL = 'maxklaasbakker@gmail.com'

# The `with` block shuts the worker pool down when the cell finishes (or fails);
# starmap blocks until every repetition has returned, so no work is cut short.
with multiprocessing.Pool() as pool:
    # Each scenario rebinds CORRELATION, CORRELATION_REDUNDANT_VARIABLES and VAR_mu;
    # ETA keeps the value set above and is passed through unchanged.
    for CORRELATION, CORRELATION_REDUNDANT_VARIABLES, VAR_mu in zip([True,True, False, False],[True,True,False,False],[0.1, 0.7,0.1, 0.7]):
        # One call per repetition; the repetition index is the first argument.
        result = pool.starmap(functions.run_simulation_PCA_DEA,
            zip(range(REPITITIONS),
            repeat(CORRELATION), repeat(CORRELATION_REDUNDANT_VARIABLES),
            repeat(TRUE_INPUTS), repeat(REDUNDANT_INPUTS), repeat(NR_DMU),
            repeat(ETA), repeat(VAR_mu),
            repeat(corrs_TRUE), repeat(corrs_FALSE), repeat(EMAIL)))
        result_df = pd.DataFrame(result)
        result_df.to_csv(
            f"1_results_PCA_DEA_2_true_7_false_25_dmu_not_sorted_{str(VAR_mu).replace('.', '_')}_var_eta_{str(ETA).replace('.', '_')}_CORR_{CORRELATION}.csv", sep=';')

# Simulation for incremental analysis: 100 DMUs, 2 True, 7 False, 30 repetitions for each nr. of included variables

### SCNLS-LASSO

In [None]:
# Incremental analysis for SCNLS-LASSO, 100 DMUs: the model is re-run with
# 1, 2, ..., TRUE_INPUTS + REDUNDANT_INPUTS variables and all runs of one scenario
# are written side by side into a single CSV file.

# Specify the input parameters
CORRELATION = CORRELATION_REDUNDANT_VARIABLES = False
TRUE_INPUTS = 2
REDUNDANT_INPUTS = 7
NR_DMU = 100
ETA = 1
VAR_mu = 0.1
REPITITIONS = 30
EMAIL = 'maxklaasbakker@gmail.com'

# Largest number of variables included in a run (2 + 7 = 9 with the settings above).
MAX_INCLUDED_VARIABLES = TRUE_INPUTS + REDUNDANT_INPUTS

# The `with` block shuts the worker pool down when the cell finishes (or fails).
with multiprocessing.Pool() as pool:
    # Loop through different combination values of var_mu and ETA
    for CORRELATION, CORRELATION_REDUNDANT_VARIABLES, VAR_mu, ETA in zip([True,True, False, False],[True,True,False,False],[0.1, 0.7,0.1, 0.7], [0.04,0.09,0.05,0.15]):
        results_SCNLS_LASSO_run = {}
        for i in range(1, MAX_INCLUDED_VARIABLES + 1):
            print(i)
            # Incrementally add variables for each run: the true inputs come first,
            # redundant inputs are only added once all true inputs are included.
            # Local names are used so the configuration constants are not overwritten.
            nr_true = min(i, TRUE_INPUTS)
            nr_redundant = max(i - TRUE_INPUTS, 0)

            result = pool.starmap(functions.run_simulation_CNLS_LASSO,
                zip(range(REPITITIONS),
                repeat(CORRELATION), repeat(CORRELATION_REDUNDANT_VARIABLES),
                repeat(nr_true), repeat(nr_redundant), repeat(NR_DMU),
                repeat(ETA), repeat(VAR_mu),
                repeat(corrs_TRUE), repeat(corrs_FALSE), repeat(EMAIL)))

            result_df = pd.DataFrame(result)
            results_SCNLS_LASSO_run[i] = result_df

        # Suffix every column with the number of included variables and join all runs
        # column-wise in one concat call instead of growing the frame inside a loop.
        results_SCNLS_LASSO_df = pd.concat(
            [results_SCNLS_LASSO_run[i].add_suffix(f'_{i}') for i in range(1, MAX_INCLUDED_VARIABLES + 1)],
            axis=1)
        results_SCNLS_LASSO_df.to_csv(f"1_testresults_SCNLS_LASSO_df_100_dmus_30_reps_eta_{str(ETA).replace('.', '_')}_notsorted_var_{str(VAR_mu).replace('.', '_')}_CORR_{CORRELATION}.csv")

### RANDOM-SCNLS

In [None]:
# Incremental analysis for RANDOM-SCNLS, 100 DMUs: the model is re-run with
# 1, 2, ..., TRUE_INPUTS + REDUNDANT_INPUTS variables and all runs of one scenario
# are written side by side into a single CSV file.
CORRELATION = CORRELATION_REDUNDANT_VARIABLES = False
TRUE_INPUTS = 2
REDUNDANT_INPUTS = 7
NR_DMU = 100
# NOTE(review): ETA is 1 here, while the output file name below is labelled eta_0 and
# the 25-DMU RANDOM-SCNLS cell uses ETA = 0 -- confirm which value is intended.
ETA = 1
VAR_mu = 0.1
REPITITIONS = 30
EMAIL = 'maxklaasbakker@gmail.com'

# Largest number of variables included in a run (2 + 7 = 9 with the settings above).
MAX_INCLUDED_VARIABLES = TRUE_INPUTS + REDUNDANT_INPUTS

# The `with` block shuts the worker pool down when the cell finishes (or fails).
with multiprocessing.Pool() as pool:
    # Each scenario rebinds CORRELATION, CORRELATION_REDUNDANT_VARIABLES and VAR_mu.
    for CORRELATION, CORRELATION_REDUNDANT_VARIABLES, VAR_mu in zip([True,True, False, False],[True,True,False,False],[0.1, 0.7,0.1, 0.7]):
        results_random_run = {}
        for i in range(1, MAX_INCLUDED_VARIABLES + 1):
            print(i)
            # Incrementally add variables for each run: the true inputs come first,
            # redundant inputs are only added once all true inputs are included.
            # Local names are used so the configuration constants are not overwritten.
            nr_true = min(i, TRUE_INPUTS)
            nr_redundant = max(i - TRUE_INPUTS, 0)

            result = pool.starmap(functions.run_simulation_RANDOM,
                zip(range(REPITITIONS),
                repeat(CORRELATION), repeat(CORRELATION_REDUNDANT_VARIABLES),
                repeat(nr_true), repeat(nr_redundant), repeat(NR_DMU),
                repeat(ETA), repeat(VAR_mu),
                repeat(corrs_TRUE), repeat(corrs_FALSE), repeat(EMAIL)))
            result_df = pd.DataFrame(result)
            results_random_run[i] = result_df

        # Suffix every column with the number of included variables and join all runs
        # column-wise in one concat call instead of growing the frame inside a loop.
        results_Random_df = pd.concat(
            [results_random_run[i].add_suffix(f'_{i}') for i in range(1, MAX_INCLUDED_VARIABLES + 1)],
            axis=1)
        results_Random_df.to_csv(f"1_testresults_Random_df_100_dmus_30_reps_eta_{str(0).replace('.', '_')}_notsorted_var_{str(VAR_mu).replace('.', '_')}_CORR_{CORRELATION}.csv")


### Regular LASSO

In [None]:
# Incremental analysis for regular LASSO, 100 DMUs: for every number of included
# variables the penalty is grid-searched first, then the repetitions are run.
# All runs of one scenario are written side by side into a single CSV file.
CORRELATION = CORRELATION_REDUNDANT_VARIABLES = False
TRUE_INPUTS = 2
REDUNDANT_INPUTS = 7
NR_DMU = 100
ETA = 1
VAR_mu = 0.1
REPITITIONS = 30
EMAIL = 'maxklaasbakker@gmail.com'

# Largest number of variables included in a run (2 + 7 = 9 with the settings above).
MAX_INCLUDED_VARIABLES = TRUE_INPUTS + REDUNDANT_INPUTS

# Candidate penalty values for the grid search. Kept as a tuple and turned into a
# fresh array for every call, so each call receives its own array.
REG_LASSO_ALPHAS = (0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2.0)

# Each scenario rebinds CORRELATION, CORRELATION_REDUNDANT_VARIABLES and VAR_mu.
for CORRELATION, CORRELATION_REDUNDANT_VARIABLES, VAR_mu in zip([True,True, False, False],[True,True,False,False],[0.1, 0.7,0.1, 0.7]):
    results_regular_LASSO_run = {}
    for i in range(1, MAX_INCLUDED_VARIABLES + 1):
        # The true inputs come first; redundant inputs are only added once all true
        # inputs are included. Local names keep the configuration constants intact.
        nr_true = min(i, TRUE_INPUTS)
        nr_redundant = max(i - TRUE_INPUTS, 0)

        # decide what eta to use
        eta_reg_LASSO = perform_grid_search_reg_LASSO(alphas=np.array(REG_LASSO_ALPHAS), reps=30,
            CORRELATION=CORRELATION, CORRELATION_REDUNDANT_VARIABLES=CORRELATION_REDUNDANT_VARIABLES, TRUE_INPUTS=nr_true, REDUNDANT_INPUTS=nr_redundant,
            NR_DMU=NR_DMU, VAR_mu=VAR_mu, corrs_TRUE=corrs_TRUE, corrs_FALSE=corrs_FALSE)
        print('Variables included = ', i, 'CORRELATION = ', CORRELATION, 'CORRELATION_REDUNDANT_VARIABLES = ', CORRELATION_REDUNDANT_VARIABLES, 'VAR_mu = ', VAR_mu, 'ETA from grid search= ', eta_reg_LASSO)

        # A pool is opened after the grid search, as before, but is now shut down as
        # soon as the repetitions are done instead of leaking one pool per iteration.
        with multiprocessing.Pool() as pool:
            result = pool.starmap(functions.run_simulation_regular_LASSO,
                zip(range(REPITITIONS),
                repeat(CORRELATION), repeat(CORRELATION_REDUNDANT_VARIABLES),
                repeat(nr_true), repeat(nr_redundant), repeat(NR_DMU),
                repeat(ETA), repeat(VAR_mu),
                repeat(corrs_TRUE), repeat(corrs_FALSE), repeat(eta_reg_LASSO), repeat(EMAIL)))
        results_regular_LASSO_run[i] = pd.DataFrame(result)

    # Suffix every column with the number of included variables and join all runs
    # column-wise in one concat call instead of growing the frame inside a loop.
    results_regular_LASSO_df = pd.concat(
        [results_regular_LASSO_run[i].add_suffix(f'_{i}') for i in range(1, MAX_INCLUDED_VARIABLES + 1)],
        axis=1)
    # NOTE: the eta in the file name is the one found for the LAST run of the inner
    # loop (all variables included); the earlier runs may have used other values.
    results_regular_LASSO_df.to_csv(f"1_testresults_regular_LASSO_df_100_dmus_30_reps_eta_{str(eta_reg_LASSO.round(3)).replace('.', '_')}_notsorted_var_{str(VAR_mu).replace('.', '_')}_CORR_{CORRELATION}.csv")
    

### PCA-DEA

In [None]:
# Incremental analysis for PCA-DEA, 100 DMUs: the model is re-run with
# 1, 2, ..., TRUE_INPUTS + REDUNDANT_INPUTS variables and all runs of one scenario
# are written side by side into a single CSV file.

# Specify the input parameters
CORRELATION = CORRELATION_REDUNDANT_VARIABLES = False
TRUE_INPUTS = 2
REDUNDANT_INPUTS = 7
NR_DMU = 100
# NOTE(review): ETA is 1 here, while the output file name below is labelled eta_0
# -- confirm whether the label or the value is the intended one.
ETA = 1
VAR_mu = 0.1
REPITITIONS = 30
EMAIL = 'maxklaasbakker@gmail.com'

# Largest number of variables included in a run (2 + 7 = 9 with the settings above).
MAX_INCLUDED_VARIABLES = TRUE_INPUTS + REDUNDANT_INPUTS

# The `with` block shuts the worker pool down when the cell finishes (or fails).
with multiprocessing.Pool() as pool:
    # Each scenario rebinds CORRELATION, CORRELATION_REDUNDANT_VARIABLES and VAR_mu.
    for CORRELATION, CORRELATION_REDUNDANT_VARIABLES, VAR_mu in zip([True,True, False, False],[True,True,False,False],[0.1, 0.7,0.1, 0.7]):
        results_PCA_run = {}
        for i in range(1, MAX_INCLUDED_VARIABLES + 1):
            print(i)
            # Incrementally add variables for each run: the true inputs come first,
            # redundant inputs are only added once all true inputs are included.
            # Local names are used so the configuration constants are not overwritten.
            nr_true = min(i, TRUE_INPUTS)
            nr_redundant = max(i - TRUE_INPUTS, 0)

            result = pool.starmap(functions.run_simulation_PCA_DEA,
                zip(range(REPITITIONS),
                repeat(CORRELATION), repeat(CORRELATION_REDUNDANT_VARIABLES),
                repeat(nr_true), repeat(nr_redundant), repeat(NR_DMU),
                repeat(ETA), repeat(VAR_mu),
                repeat(corrs_TRUE), repeat(corrs_FALSE), repeat(EMAIL)))
            result_df = pd.DataFrame(result)
            results_PCA_run[i] = result_df

        # Suffix every column with the number of included variables and join all runs
        # column-wise in one concat call instead of growing the frame inside a loop.
        results_PCA_DEA_df = pd.concat(
            [results_PCA_run[i].add_suffix(f'_{i}') for i in range(1, MAX_INCLUDED_VARIABLES + 1)],
            axis=1)
        results_PCA_DEA_df.to_csv(f"1_testresults_PCA_DEA_df_100_dmus_30_reps_eta_{str(0).replace('.', '_')}_notsorted_var_{str(VAR_mu).replace('.', '_')}_CORR_{CORRELATION}.csv")
