In [55]:
#Import modules for dataprocessing
import pandas as pd
import numpy as np
import itertools

#import analysis modules
from analysis import *

### I. Selecting all possible combinations

In [29]:
# Parameter grid for the sweep: each key maps to the list of values to try.
# Insertion order matters — it fixes the column order of every frame built
# from `parameters.keys()` further down.
parameters = {
    'seat_dist': [12],
    'init_patient': [3],
    'attend_rate': [0.25, 0.5, 0.75, 1],
    'inclass_lunch': [True, False],
    'mask_prob': [0, 0.25, 0.5, 0.75, 1],
    'iteration': [0],
}

In [30]:
# Cartesian product of every parameter's value list: one tuple per combination,
# with fields in the same order as the keys of `parameters`.
combinations = [*itertools.product(*parameters.values())]

# One output path per combination (see the `files` listing at the end of the notebook).
files = generate_file_names(combinations)
# NOTE(review): presumably loads and concatenates the per-file results into one
# frame keyed by a 'combo' column — confirm in the analysis module.
data = return_dataframe_params(files)

# Parameter table: one row per combination, numbered from 1.
dataset = (
    pd.DataFrame(combinations, columns=parameters.keys())
    .assign(combination_number=lambda frame: frame.index + 1)
)

In [31]:
# Preview the first rows of the parameter table (one row per combination,
# with its 1-based combination_number).
dataset.head()

Unnamed: 0,seat_dist,init_patient,attend_rate,inclass_lunch,mask_prob,iteration,combination_number
0,12,3,0.25,True,0.0,0,1
1,12,3,0.25,True,0.25,0,2
2,12,3,0.25,True,0.5,0,3
3,12,3,0.25,True,0.75,0,4
4,12,3,0.25,True,1.0,0,5


In [32]:
# Report the total number of rows in the frame returned by return_dataframe_params.
print(f'There are {len(data)} combined data elements in the merged dataframe.')

There are 54000 combined data elements in the merged dataframe.


### II. Generate Combination Plot

In [None]:
# Draw the combination plot from the simulation records (`data`) and the
# parameter table (`dataset`). The helper is not defined in this notebook —
# presumably it comes from `from analysis import *`.
generate_facetgrid(data, dataset)

**We cannot display the results here because the plot output would greatly inflate the notebook's file size. Please see the report if necessary.**

### III. Generate Heatmaps

In [None]:
# Produce heatmaps from the per-combination output files. The helper is not
# defined in this notebook — presumably it comes from `from analysis import *`.
generate_heatmaps(files)

**We cannot display the results here because of security concerns regarding the dataset. Please see the report if necessary.**

### IV. Assessing Preliminary Parameter Importances via a Random Forest Regressor 

In [12]:
# Last recorded cov_positive value for each combo. groupby sorts its keys, so
# the array lines up with the rows of `dataset` only if 'combo' carries the
# 1-based combination number (the later merge on combo == combination_number
# relies on the same correspondence).
final_positive = data.groupby('combo')['cov_positive'].last().values

# Rebuild the parameter table from scratch so it holds only the swept parameters.
dataset = pd.DataFrame(combinations, columns=parameters.keys())

# Normalise the final count by 500 * attend_rate.
# NOTE(review): 500 looks like the total population size, making the
# denominator the number of attendees — confirm against the simulation setup.
dataset['output'] = final_positive / (500 * dataset['attend_rate'])

In [13]:
# Fit a random forest on the swept parameters to obtain a rough importance
# ranking for each of them.
#
# Features are selected by name instead of `dataset.iloc[:, :-1]`. The
# positional slice only means "all parameters" while 'output' is the last
# column; once another column is appended to `dataset` (a later cell adds
# 'combination_number'), re-running this cell would feed 'output' itself to the
# model as a feature. Selecting by the keys of `parameters` also guarantees the
# order matches the labels later attached to `feature_importances_`.
feature_columns = list(parameters.keys())
regressor = RandomForestRegressor(n_estimators=50, random_state=200)
regressor.fit(dataset[feature_columns].values, dataset['output'])

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=50,
                      n_jobs=None, oob_score=False, random_state=200, verbose=0,
                      warm_start=False)

In [14]:
# Label each importance with the parameter it belongs to. The importances come
# back in the order of the feature matrix columns, which follows `parameters`.
columns = parameters.keys()
# Single-row frame: one column per parameter.
feature_importances = pd.DataFrame([regressor.feature_importances_], columns=columns)
feature_importances

Unnamed: 0,seat_dist,init_patient,attend_rate,inclass_lunch,mask_prob,iteration
0,0.0,0.0,0.300142,0.45689,0.242969,0.0


In [15]:
# Number the combinations from 1 so they can be joined to the 'combo' key in `data`.
dataset['combination_number'] = dataset.index + 1

# Attach each record's parameter values (inner join on the combination number).
merged = data.merge(dataset, left_on='combo', right_on='combination_number')

# Rescale the raw positive counts by 500 * attend_rate, the same normalisation
# applied to 'output' above.
normaliser = 500 * merged['attend_rate']
merged['cov_positive'] = merged['cov_positive'].div(normaliser)

### V. Assessing whether any specific parameter(s) produce significant differences in distributions

In [16]:
# Group the normalised records by combination.
merged_grouped = merged.groupby(['combo'])
# Every unordered pair of combination keys (iterating the groups mapping yields its keys).
unique_combos = [*itertools.combinations(merged_grouped.groups, 2)]

In [18]:
# Run the pairwise comparison for every pair of combinations and keep the
# verdict alongside the two combination keys.
results = [
    [first, second, return_significant_difference(first, second, merged_grouped)]
    for first, second in unique_combos
]
results_df = pd.DataFrame(results, columns=['combo_1', 'combo_2', 'result'])

In [20]:
# Keep only the pairs whose comparison result equals True.
is_significant = results_df['result'].eq(True)
all_significant = results_df.loc[is_significant]

In [21]:
# Look up the parameter values for both members of each significant pair.
# The second merge meets column names already present from the first, so pandas
# applies its default suffixes: '_x' for combo_1's parameters, '_y' for combo_2's.
compare_combinations = (
    all_significant
    .merge(dataset, left_on='combo_1', right_on='combination_number')
    .merge(dataset, left_on='combo_2', right_on='combination_number')
)

In [22]:
# Restrict to the three parameters that actually vary in the sweep, for both
# sides of the pair ('_x' columns first, then '_y').
varied_columns = [
    'attend_rate_x', 'inclass_lunch_x', 'mask_prob_x',
    'attend_rate_y', 'inclass_lunch_y', 'mask_prob_y',
]
significant_combinations = compare_combinations.loc[:, varied_columns]

In [23]:
# Apply get_different_column to every row (axis=1) of the significant pairs.
# NOTE(review): judging by the value_counts output below, the helper appears to
# return a hyphen-joined label of the parameters that differ between the two
# combinations — confirm in the analysis module.
different_variables = significant_combinations.apply(get_different_column, axis=1)

In [24]:
# Count how many significant pairs fall under each label produced above.
different_variables.value_counts()

attend_rate-inclass_lunch-mask_prob    125
attend_rate-mask_prob                   71
inclass_lunch-mask_prob                 36
attend_rate-inclass_lunch               29
mask_prob                               16
attend_rate                              9
inclass_lunch                            9
dtype: int64

In [56]:
# Show the generated output paths, one per parameter combination.
files

['outputs/output_seat_dist_12_init_patient_3_attend_rate_0.25_inclass_lunch_True_mask_prob_0_iteration_0',
 'outputs/output_seat_dist_12_init_patient_3_attend_rate_0.25_inclass_lunch_True_mask_prob_0.25_iteration_0',
 'outputs/output_seat_dist_12_init_patient_3_attend_rate_0.25_inclass_lunch_True_mask_prob_0.5_iteration_0',
 'outputs/output_seat_dist_12_init_patient_3_attend_rate_0.25_inclass_lunch_True_mask_prob_0.75_iteration_0',
 'outputs/output_seat_dist_12_init_patient_3_attend_rate_0.25_inclass_lunch_True_mask_prob_1_iteration_0',
 'outputs/output_seat_dist_12_init_patient_3_attend_rate_0.25_inclass_lunch_False_mask_prob_0_iteration_0',
 'outputs/output_seat_dist_12_init_patient_3_attend_rate_0.25_inclass_lunch_False_mask_prob_0.25_iteration_0',
 'outputs/output_seat_dist_12_init_patient_3_attend_rate_0.25_inclass_lunch_False_mask_prob_0.5_iteration_0',
 'outputs/output_seat_dist_12_init_patient_3_attend_rate_0.25_inclass_lunch_False_mask_prob_0.75_iteration_0',
 'outputs/output_