In [1]:
import os

# Make the project root the working directory so that `from utils import ...`
# and the relative "Results" output folder resolve.
#
# The unconditional `os.chdir("..")` climbed one directory further every time
# this cell was re-run in the same kernel. The guard below only steps up while
# the `utils` package directory is not visible from the current directory,
# which makes the cell safe to re-run.
# NOTE(review): assumes `utils` lives in the parent of the notebook folder and
# that no `utils` directory exists next to the notebook itself — confirm.
if not os.path.isdir("utils"):
    os.chdir("..")

In [34]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from utils import Thesis_viz as viz
from utils import synthetic_data_generation as sd

## Parameters

In [8]:
# --- Sweep grid --------------------------------------------------------------
# Values passed as `mode` to the data generator, one simulation per value.
modes = ['intercept', 'slope', 'both']

# Effective groups: 3..8; visible groups: 10, 30, ..., 250.
effective_groups = [*range(3, 9)]
visible_groups = [*range(10, 260, 20)]
number_of_contineous_features = 5

# --- Random effects ----------------------------------------------------------
# Alternative configuration: 'normal' with re_intercept = (0, 5) and
# re_slope = (0, 5).
random_effects_distribution = 'linspace'
re_intercept = (-10, 10)
re_slope = (-10, 10)

# --- Error term --------------------------------------------------------------
# Alternative configuration: 'on_group' with re_error = (0, 3).
error_type = 'on_target'
re_error = (0, 3)

shuffle_groups = True

### Simulation

In [5]:
# Simulation sweep: for every (mode, number of effective groups, number of
# visible groups, seed) combination, generate one synthetic dataset, fit every
# model on it and store the returned error / timing values as one result row.
#
# Every model is called with to_drop='gE'.
# NOTE(review): presumably this removes the effective-group column so that the
# models group on the visible groups — confirm in the model helpers.
# NOTE(review): Linear_Exclude_Group, Linear_Include_Group, LinearOHE, LMMNN,
# MERForest, ARMED and MixedLM are not defined or imported in the visible
# cells; they must already exist in the kernel namespace — confirm their origin
# and import them explicitly.
#
# Rows are accumulated as plain dicts and turned into a DataFrame once, instead
# of growing `results_gV` one cell at a time with `.at` (setting with
# enlargement), which reallocates on every assignment.
# NOTE(review): integer settings (seed, gE, gV) are now stored as given; the
# former cell-wise enlargement may have upcast them to float depending on the
# pandas version — confirm that no downstream reader relies on that.

records = []     # one dict per simulated dataset, in iteration order
target_y = []    # target vector of every generated dataset, in the same order
itr = -1         # running row number, also printed as progress

try:
    for mode in modes:

        for e in effective_groups:

            for v in visible_groups:

                seeds = [100]

                for s in seeds:

                    itr = itr + 1
                    print("\n",itr,mode,e,v)

                    df,_ = sd.create_data(
                        n = 1000,
                        n_effective_groups = e,
                        n_visible_groups = v,
                        n_contineous_features = number_of_contineous_features,
                        mode = mode,
                        fixed_slope=(0, 1),
                        fixed_intercept=0,
                        fixed_error=(0, 1),
                        random_effects_distribution=random_effects_distribution,
                        re_intercept=re_intercept,
                        re_slope=re_slope,
                        error_type=error_type,
                        re_error=re_error,
                        random_seed=s,
                        shuffle_groups=shuffle_groups,
                        show_random_effects=False,
                    )

                    target_y.append(list(df.y))

                    # Run all models first; a row is only recorded once every
                    # one of them has returned.
                    mse_Linear_Exclude_Group, time_Linear_Exclude_Group,_,_ = Linear_Exclude_Group(df)
                    mse_Linear_Include_Group, time_Linear_Include_Group,_,_ = Linear_Include_Group(df, to_drop='gE')
                    mse_linearohe, time_linearohe,_,_ = LinearOHE(df, to_drop='gE')
                    mse_lmmnn, time_lmmnn,_,_ = LMMNN(df, to_drop='gE')
                    mse_merf, time_merf,_,_ = MERForest(df, to_drop='gE', threshold=0.1)
                    mse_armed, time_armed,_,_ = ARMED(df, to_drop='gE')

                    # Key order defines the column order of results_gV.
                    record = {
                        'mode': mode,
                        'seed': s,
                        'gE': e,
                        'gV': v,
                        'mse_Linear_Exclude_Group': mse_Linear_Exclude_Group,
                        'time_Linear_Exclude_Group': time_Linear_Exclude_Group,
                        'mse_Linear_Include_Group': mse_Linear_Include_Group,
                        'time_Linear_Include_Group': time_Linear_Include_Group,
                        'mse_linearohe': mse_linearohe,
                        'time_linearohe': time_linearohe,
                        'mse_lmmnn': mse_lmmnn,
                        'time_lmmnn': time_lmmnn,
                        'mse_merf': mse_merf,
                        'time_merf': time_merf,
                        'mse_armed': mse_armed,
                        'time_armed': time_armed,
                    }
                    # Register the row before MixedLM runs so the other models'
                    # results are kept even if MixedLM fails unexpectedly.
                    records.append(record)

                    # MixedLM may raise KeyError; such runs are recorded as
                    # missing values instead of aborting the sweep.
                    try:
                        mse_mixedlm, time_mixedlm, _, _ = MixedLM(df, to_drop='gE', mode=mode)
                    except KeyError:
                        mse_mixedlm, time_mixedlm = None, None
                    record['mse_mixedlm'] = mse_mixedlm
                    record['time_mixedlm'] = time_mixedlm
finally:
    # Build the frame even when the sweep is interrupted, so the rows finished
    # so far are not lost.
    results_gV = pd.DataFrame(records)

# Only reached after a complete sweep, when target_y has one entry per row.
results_gV['Target_y'] = target_y


 0 intercept 3 10
n_uniques:  [10]







 1 intercept 3 30
n_uniques:  [30]

 2 intercept 3 50
n_uniques:  [50]

 3 intercept 3 70
n_uniques:  [70]

 4 intercept 3 90
n_uniques:  [90]

 5 intercept 3 110
n_uniques:  [110]

 6 intercept 3 130
n_uniques:  [130]

 7 intercept 3 150
n_uniques:  [150]

 8 intercept 3 170
n_uniques:  [170]

 9 intercept 3 190
n_uniques:  [190]

 10 intercept 3 210
n_uniques:  [210]

 11 intercept 3 230
n_uniques:  [230]

 12 intercept 3 250
n_uniques:  [250]

 13 intercept 4 10
n_uniques:  [10]

 14 intercept 4 30
n_uniques:  [30]

 15 intercept 4 50
n_uniques:  [50]

 16 intercept 4 70
n_uniques:  [70]

 17 intercept 4 90
n_uniques:  [90]

 18 intercept 4 110
n_uniques:  [110]

 19 intercept 4 130
n_uniques:  [130]

 20 intercept 4 150
n_uniques:  [150]

 21 intercept 4 170
n_uniques:  [170]

 22 intercept 4 190
n_uniques:  [190]

 23 intercept 4 210
n_uniques:  [210]

 24 intercept 4 230
n_uniques:  [230]

 25 intercept 4 250
n_uniques:  [250]

 26 intercept 5 10
n_uniques:  [10]

 27 intercept 

In [7]:
# Write the sweep results to Results/r6_single.csv.
# The former literal "Results\r6_single.csv" contained the escape sequence "\r"
# (carriage return), so the path was not the intended file inside the Results
# folder. Building the path from its components avoids any backslash escape and
# works on every platform.
results_path = os.path.join("Results", "r6_single.csv")
os.makedirs(os.path.dirname(results_path), exist_ok=True)  # to_csv does not create folders
results_gV.to_csv(results_path, index=False)

# Effective group performance

#### For the same datasets, model performance is now computed using the effective groups as the grouping feature; the visible groups are ignored.

#### When grouping by effective groups, the datasets are independent of the number of visible groups only if `re_error` is added with `error_type = 'on_target'`.
#### With `'on_group'`, the error is added per visible group, so the entire simulation would have to be re-run for every number of visible groups.
#### With `'on_target'`, the data stays the same for any number of visible groups, so a single value is sufficient. The simulation below runs with `error_type = 'on_target'`.

### Parameters

In [32]:
# --- Sweep grid --------------------------------------------------------------
# Values passed as `mode` to the data generator, one simulation per value.
modes = ['intercept', 'slope', 'both']

# Effective groups: 3..8; a single visible-group setting is used in this run.
effective_groups = [*range(3, 9)]
visible_groups = [10]
number_of_contineous_features = 5

# --- Random effects ----------------------------------------------------------
# Alternative configuration: 'normal' with re_intercept = (0, 5) and
# re_slope = (0, 5).
random_effects_distribution = 'linspace'
re_intercept = (-10, 10)
re_slope = (-10, 10)

# --- Error term --------------------------------------------------------------
# Alternative configuration: 'on_group' with re_error = (0, 3).
error_type = 'on_target'
re_error = (0, 3)

shuffle_groups = True

In [35]:
# Simulation sweep: for every (mode, number of effective groups, number of
# visible groups, seed) combination, generate one synthetic dataset, fit every
# model on it and store the returned error / timing values as one result row.
#
# Every model is called with to_drop='gV'.
# NOTE(review): presumably this removes the visible-group column so that the
# models group on the effective groups — confirm in the model helpers.
# NOTE(review): Linear_Exclude_Group, Linear_Include_Group, LinearOHE, LMMNN,
# MERForest, ARMED and MixedLM are not defined or imported in the visible
# cells; they must already exist in the kernel namespace — confirm their origin
# and import them explicitly.
#
# Rows are accumulated as plain dicts and turned into a DataFrame once, instead
# of growing `results_gV` one cell at a time with `.at` (setting with
# enlargement), which reallocates on every assignment.
# NOTE(review): integer settings (seed, gE, gV) are now stored as given; the
# former cell-wise enlargement may have upcast them to float depending on the
# pandas version — confirm that no downstream reader relies on that.

records = []     # one dict per simulated dataset, in iteration order
target_y = []    # target vector of every generated dataset, in the same order
itr = -1         # running row number, also printed as progress

try:
    for mode in modes:

        for e in effective_groups:

            for v in visible_groups:

                seeds = [100]

                for s in seeds:

                    itr = itr + 1
                    print("\n",itr,mode,e,v)

                    df,_ = sd.create_data(
                        n = 1000,
                        n_effective_groups = e,
                        n_visible_groups = v,
                        n_contineous_features = number_of_contineous_features,
                        mode = mode,
                        fixed_slope=(0, 1),
                        fixed_intercept=0,
                        fixed_error=(0, 1),
                        random_effects_distribution=random_effects_distribution,
                        re_intercept=re_intercept,
                        re_slope=re_slope,
                        error_type=error_type,
                        re_error=re_error,
                        random_seed=s,
                        shuffle_groups=shuffle_groups,
                        show_random_effects=False,
                    )

                    target_y.append(list(df.y))

                    # Run all models first; a row is only recorded once every
                    # one of them has returned.
                    mse_Linear_Exclude_Group, time_Linear_Exclude_Group,_,_ = Linear_Exclude_Group(df)
                    mse_Linear_Include_Group, time_Linear_Include_Group,_,_ = Linear_Include_Group(df, to_drop='gV')
                    mse_linearohe, time_linearohe,_,_ = LinearOHE(df, to_drop='gV')
                    mse_lmmnn, time_lmmnn,_,_ = LMMNN(df, to_drop='gV')
                    mse_merf, time_merf,_,_ = MERForest(df, to_drop='gV')
                    mse_armed, time_armed,_,_ = ARMED(df, to_drop='gV')

                    # Key order defines the column order of results_gV.
                    record = {
                        'mode': mode,
                        'seed': s,
                        'gE': e,
                        'gV': v,
                        'mse_Linear_Exclude_Group': mse_Linear_Exclude_Group,
                        'time_Linear_Exclude_Group': time_Linear_Exclude_Group,
                        'mse_Linear_Include_Group': mse_Linear_Include_Group,
                        'time_Linear_Include_Group': time_Linear_Include_Group,
                        'mse_linearohe': mse_linearohe,
                        'time_linearohe': time_linearohe,
                        'mse_lmmnn': mse_lmmnn,
                        'time_lmmnn': time_lmmnn,
                        'mse_merf': mse_merf,
                        'time_merf': time_merf,
                        'mse_armed': mse_armed,
                        'time_armed': time_armed,
                    }
                    # Register the row before MixedLM runs so the other models'
                    # results are kept even if MixedLM fails unexpectedly.
                    records.append(record)

                    # MixedLM may raise KeyError; such runs are recorded as
                    # missing values instead of aborting the sweep.
                    try:
                        mse_mixedlm, time_mixedlm, _, _ = MixedLM(df, to_drop='gV', mode=mode)
                    except KeyError:
                        mse_mixedlm, time_mixedlm = None, None
                    record['mse_mixedlm'] = mse_mixedlm
                    record['time_mixedlm'] = time_mixedlm
finally:
    # Build the frame even when the sweep is interrupted, so the rows finished
    # so far are not lost.
    results_gV = pd.DataFrame(records)

# Only reached after a complete sweep, when target_y has one entry per row.
results_gV['Target_y'] = target_y


 0 intercept 3 10
n_uniques:  [3]

 1 intercept 4 10
n_uniques:  [4]

 2 intercept 5 10
n_uniques:  [5]

 3 intercept 6 10
n_uniques:  [6]

 4 intercept 7 10
n_uniques:  [7]

 5 intercept 8 10
n_uniques:  [8]

 6 slope 3 10
n_uniques:  [3]

 7 slope 4 10
n_uniques:  [4]

 8 slope 5 10
n_uniques:  [5]

 9 slope 6 10
n_uniques:  [6]

 10 slope 7 10
n_uniques:  [7]

 11 slope 8 10
n_uniques:  [8]

 12 both 3 10
n_uniques:  [3]

 13 both 4 10
n_uniques:  [4]

 14 both 5 10
n_uniques:  [5]

 15 both 6 10
n_uniques:  [6]

 16 both 7 10
n_uniques:  [7]

 17 both 8 10
n_uniques:  [8]


In [6]:
# Write the results to Results/r1_multiple_features_effective_groups.csv.
# The former literal "Results\r1_multiple_features_effective_groups.csv"
# contained the escape sequence "\r" (carriage return), so the path was not the
# intended file inside the Results folder. Building the path from its
# components avoids any backslash escape and works on every platform.
results_path = os.path.join("Results", "r1_multiple_features_effective_groups.csv")
os.makedirs(os.path.dirname(results_path), exist_ok=True)  # to_csv does not create folders
results_gV.to_csv(results_path, index=False)