In [75]:
import numpy as np

In [76]:
# Restore variables previously persisted with IPython's %store magic.
# NOTE(review): these must have been stored by an earlier notebook/session;
# on a machine without that store the names will not be defined.
%store -r thoracic_export
%store -r thoracic_headers


In [77]:
# Execute validation.ipynb so the names it defines become available here.
# NOTE(review): presumably this is where run_fit / run_predict (used further
# down) come from -- confirm against validation.ipynb.
%run validation.ipynb

In [78]:
# Restore the alternative header set used for the "Gaussian" fits below.
%store -r thoracic_gaussian_headers

In [79]:
def re_run_thoracic(headers):
    """
    Build the single-entry datasets dict for the thoracic surgery data.

    The returned dict maps "thoracic_surgery" to a two-element list holding
    the module-level `thoracic_export` object (by reference, not copied) and
    the `headers` argument.
    """
    return {"thoracic_surgery": [thoracic_export, headers]}

In [80]:
def train_test_split(datasets, train_validation_percent):
    """
    Split every dataset into a train+validation part and a held-out test part.

    Rows are shuffled with NumPy's global random state on a private copy of
    each dataset, so the caller's arrays are left untouched and splits
    returned by an earlier call are not reshuffled by a later one.

    Parameters
    ----------
    datasets : dict
        Maps dataset name -> [data, headers]. `data` must be convertible to a
        NumPy array; it is shuffled along its first axis.
    train_validation_percent : float
        Fraction of rows assigned to train+validation; the remaining rows
        form the test part. The row count is truncated with int().

    Returns
    -------
    dict
        {'train_and_validation': {name: [rows, headers]},
         'test': {name: rows}}
    """
    split = {'train_and_validation': {}, 'test': {}}

    for dataset_name, dataset in datasets.items():
        data, headers = dataset[0], dataset[1]

        # np.array() copies by default; shuffling the copy keeps the caller's
        # data (and any slices handed out by previous calls) intact.
        shuffled = np.array(data)
        np.random.shuffle(shuffled)

        train_length = int(len(shuffled) * train_validation_percent)

        # Train+validation keeps [data, headers] for each dataset ...
        split['train_and_validation'][dataset_name] = [shuffled[:train_length], headers]
        # ... while the set-aside test entry holds the data only.
        split['test'][dataset_name] = shuffled[train_length:]

    return split

In [81]:
# NOTE(review): `datasets` is not assigned by any earlier cell in this
# notebook; presumably it comes from validation.ipynb or from leftover kernel
# state -- confirm this cell survives Restart & Run All.
print(datasets)


{'thoracic_surgery': [array([[ 3.  ,  3.88,  2.84, ...,  0.  , 66.  ,  1.  ],
       [ 4.  ,  2.72,  2.04, ...,  0.  , 75.  ,  0.  ],
       [ 3.  ,  3.52,  2.92, ...,  0.  , 63.  ,  0.  ],
       ...,
       [ 3.  ,  2.96,  2.32, ...,  0.  , 51.  ,  0.  ],
       [ 3.  ,  2.1 , 69.1 , ...,  0.  , 62.  ,  0.  ],
       [ 5.  ,  3.8 ,  2.98, ...,  0.  , 60.  ,  1.  ]]), array(['dgn_code_g', 'forced_vital_capacity_g',
       'volume_exhaled_after_fs_forced_expiration_g',
       'performance_status_g', 'pain_before_b', 'haemoptysis_before_b',
       'dyspnoea_before_b', 'cough_before_b', 'weakness_before_b',
       'size_of_original_tumour_g', 'type_2_diabetes_b',
       'mi_up_to_6_months_b', 'pad_b', 'smoking_b', 'asthma_b',
       'age_at_surgery_g', 'survival_after_one_year_b'], dtype=object)]}


In [82]:
# Build the datasets dict for the thoracic data using the restored
# thoracic_headers.
datasets = re_run_thoracic(thoracic_headers)
'''
85% train + validation
15% test
'''
split_data = train_test_split(datasets,0.85)

# Held-out test data, keyed by dataset name.
test_val_new = split_data['test']
'''
Set train_val variable for tests
'''
train_val = split_data['train_and_validation']
        
# NOTE(review): presumably train_val, test_val_new and
# best_logistic_model_args are read by splits_tests.ipynb, so their names
# must not change -- confirm against that notebook. The meaning of the four
# values below is not visible from this file.
best_logistic_model_args= [1,0.01,False,1e-2]
# Execute splits_tests.ipynb; its printed results appear as this cell's output.
%run splits_tests.ipynb

Splitting with training 10.0% for each dataset -----
	 thoracic_surgery Train Size: 39
	 thoracic_surgery Test Size: 71

	Fitting thoracic_surgery with LogisticRegression 
	Predicting thoracic_surgery with LogisticRegression
		Got accuracy of: 0.8732394366197183

	Fitting thoracic_surgery with NaiveBayes 
	Predicting thoracic_surgery with NaiveBayes
		Got accuracy of: 0.7323943661971831

	Fitting thoracic_surgery with NaiveBayesLog 
	Predicting thoracic_surgery with NaiveBayesLog
		Got accuracy of: 0.6901408450704225
Splitting with training 30.0% for each dataset -----
	 thoracic_surgery Train Size: 119
	 thoracic_surgery Test Size: 71

	Fitting thoracic_surgery with LogisticRegression 
	Predicting thoracic_surgery with LogisticRegression
		Got accuracy of: 0.8732394366197183

	Fitting thoracic_surgery with NaiveBayes 
	Predicting thoracic_surgery with NaiveBayes
		Got accuracy of: 0.5774647887323944

	Fitting thoracic_surgery with NaiveBayesLog 
	Predicting thoracic_surgery with Naive

In [99]:
def thoracic_comparison_tests(n_runs=5, train_validation_percent=0.85):
    """
    Repeatedly compare NaiveBayes / NaiveBayesLog fits using the two header sets.

    Each run draws a fresh random split, then fits both models on the
    train+validation rows and prints the value returned by run_predict on the
    held-out test rows -- first with `thoracic_headers` (labelled
    "Non-Gaussian"), then with `thoracic_gaussian_headers` (labelled
    "Gaussian").

    Parameters
    ----------
    n_runs : int, optional
        Number of independent split/fit/predict rounds (default 5).
    train_validation_percent : float, optional
        Fraction of rows used for train+validation; the rest is the test
        set (default 0.85, i.e. 85% / 15%).
    """
    header_sets = (
        ("Non-Gaussian", thoracic_headers),
        ("Gaussian", thoracic_gaussian_headers),
    )

    for i in range(n_runs):
        datasets = re_run_thoracic(thoracic_headers)
        split_data = train_test_split(datasets, train_validation_percent)

        train_rows = split_data['train_and_validation']['thoracic_surgery'][0]
        test_rows = split_data['test']['thoracic_surgery']

        for label, headers in header_sets:
            print(label + " Fitting:")
            # NOTE(review): the 4th argument ([]) and the trailing [False]/[True]
            # are passed through exactly as before; their meaning is defined by
            # run_fit, which is not visible in this file.
            model_nb = run_fit(train_rows, "NaiveBayes", headers, [], [False])
            model_nb_log = run_fit(train_rows, "NaiveBayesLog", headers, [], [True])

            # Label results with the header set actually used; the Gaussian
            # results were previously printed as "Non-Gaussian Test".
            print(label + " Test NaiveBayes #", i, ":", run_predict(test_rows, model_nb))
            print(label + " Test NaiveBayesLog #", i, ":", run_predict(test_rows, model_nb_log))

In [100]:
# Run the header-set comparison; results are printed by the function itself.
thoracic_comparison_tests()

Non-Gaussian Fitting:
Non-Gaussian Test NaiveBayes # 0 : 0.8028169014084507
Non-Gaussian Test NaiveBayesLog # 0 : 0.704225352112676
Gaussian Fitting:
Non-Gaussian Test NaiveBayes # 0 : 0.8028169014084507
Non-Gaussian Test NaiveBayesLog # 0 : 0.704225352112676
Non-Gaussian Fitting:
Non-Gaussian Test NaiveBayes # 1 : 0.7887323943661971
Non-Gaussian Test NaiveBayesLog # 1 : 0.6338028169014085
Gaussian Fitting:
Non-Gaussian Test NaiveBayes # 1 : 0.7887323943661971
Non-Gaussian Test NaiveBayesLog # 1 : 0.6338028169014085
Non-Gaussian Fitting:
Non-Gaussian Test NaiveBayes # 2 : 0.8309859154929577
Non-Gaussian Test NaiveBayesLog # 2 : 0.7323943661971831
Gaussian Fitting:
Non-Gaussian Test NaiveBayes # 2 : 0.8309859154929577
Non-Gaussian Test NaiveBayesLog # 2 : 0.7323943661971831
Non-Gaussian Fitting:
Non-Gaussian Test NaiveBayes # 3 : 0.7464788732394366
Non-Gaussian Test NaiveBayesLog # 3 : 0.7183098591549296
Gaussian Fitting:
Non-Gaussian Test NaiveBayes # 3 : 0.7464788732394366
Non-Gaussia