In [1]:
# Imports, CSV loading helpers, and loading of the raw dataset
import pandas as pd
import numpy as np
def load_raw_data(path, data_name, index=None):
    """Read the raw dataset stored at ``<path><data_name>_full.csv``.

    Args:
        path: Directory prefix. It is concatenated as-is, so it must already
            end with a path separator.
        data_name: Base name of the dataset; ``_full.csv`` is appended.
        index: Truthy to use the first CSV column as the row index; falsy
            (the default) to let pandas build a fresh integer index.

    Returns:
        pandas.DataFrame with the contents of the CSV file.
    """
    csv_file = ''.join((path, data_name, '_full.csv'))
    index_col = 0 if index else None
    return pd.read_csv(csv_file, index_col=index_col)
def load_data(path, index=None, imputation=False, imputation_method=None, missing_mech=None):
    """Read an imputed dataset stored at ``<path><method>_data_<mech>.csv``.

    Args:
        path: Directory prefix. It is concatenated as-is, so it must already
            end with a path separator.
        index: Truthy to use the first CSV column as the row index.
        imputation: When falsy nothing is read and ``None`` is returned.
        imputation_method: Imputation method part of the file name.
        missing_mech: Missingness mechanism part of the file name.

    Returns:
        pandas.DataFrame with the imputed data, or ``None`` when
        ``imputation`` is falsy.

    Raises:
        ValueError: If ``imputation`` is truthy but ``imputation_method`` or
            ``missing_mech`` is ``None``.
    """
    # Nothing to load unless an imputed dataset was explicitly requested.
    if not imputation:
        return None
    if imputation_method is None or missing_mech is None:
        raise ValueError("Both imputation_method and missing_mech must be specified if imputation is True.")
    csv_file = ''.join((path, imputation_method, '_data_', missing_mech, '.csv'))
    index_col = 0 if index else None
    return pd.read_csv(csv_file, index_col=index_col)

# Directory prefixes (relative to this notebook) for the original and the
# imputed CSV files; both end with a separator because the loaders
# concatenate them directly with the file name.
raw_data_path = '../../data/'
imputed_data_path = '../../imp_data/'

# Dataset base name and the name of its label column.
data_name = 'eeg_eye_state'
target_name = 'target'

# X_raw is the full frame as read from disk (the label column is not
# dropped); y_raw is that label column.
X_raw = load_raw_data(raw_data_path, data_name)
y_raw = X_raw[target_name]

In [2]:
#loading imputed data
# Every (missingness mechanism, imputation method) pair is expected to have
# its own CSV under imputed_data_path.
missing_mech = ['mcar', 'mar', 'mnar']
imputation_method = ['knn', 'gan', 'interpolated', 'locf']

# Imputed frames keyed as "<method>_<mechanism>", e.g. "knn_mcar".
X_imputed = {}
for mech in missing_mech:
    for imp_method in imputation_method:
        df = load_data(
            imputed_data_path,
            index=None,
            imputation=True,
            imputation_method=imp_method,
            missing_mech=mech,
        )
        if df is None:
            continue
        X_imputed[f"{imp_method}_{mech}"] = df
        

In [3]:
#EDA
import pprint
def eda(X, y):
    """Print a short summary of a feature frame and its target.

    Args:
        X: pandas.DataFrame to summarise (shape and column names are printed).
        y: Target as a pandas.Series, or ``None`` when the frame has no
            target column. With ``None`` the target lines are reduced to a
            single "not available" notice instead of raising.

    Returns:
        None. The summary is written to stdout.
    """
    print("EDA Summary:")
    print(f"Shape of X: {X.shape}")
    # Callers pass None when the target column is missing from the frame;
    # without this guard y.shape / y.unique() raise AttributeError.
    if y is None:
        print("Shape of y: not available (no target provided)")
    else:
        print(f"Shape of y: {y.shape}")
    # pprint on a str prints its (possibly wrapped) repr, which is why the
    # output appears quoted; kept so existing output stays unchanged.
    pprint.pprint(f"Columns in X: {X.columns.tolist()}")
    if y is not None:
        pprint.pprint(f"Unique values in y: {y.unique()}")

# Print the EDA summary for the raw dataset and its target column.
eda(X_raw, y_raw)

EDA Summary:
Shape of X: (14980, 15)
Shape of y: (14980,)
("Columns in X: ['AF3', 'F7', 'F3', 'FC5', 'T7', 'P7', 'O1', 'O2', 'P8', 'T8', "
 "'FC6', 'F4', 'F8', 'AF4', 'target']")
'Unique values in y: [0 1]'


In [4]:
# Run the same EDA summary on every imputed frame; the target is taken from
# the frame itself when present, otherwise None is passed.
for key, df in X_imputed.items():
    print(f"\nEDA for {key}:")
    if target_name in df.columns:
        eda(df, df[target_name])
    else:
        eda(df, None)


EDA for knn_mcar:
EDA Summary:
Shape of X: (14980, 15)
Shape of y: (14980,)
("Columns in X: ['AF3', 'F7', 'F3', 'FC5', 'T7', 'P7', 'O1', 'O2', 'P8', 'T8', "
 "'FC6', 'F4', 'F8', 'AF4', 'target']")
'Unique values in y: [0 1]'

EDA for gan_mcar:
EDA Summary:
Shape of X: (14980, 15)
Shape of y: (14980,)
("Columns in X: ['AF3', 'F7', 'F3', 'FC5', 'T7', 'P7', 'O1', 'O2', 'P8', 'T8', "
 "'FC6', 'F4', 'F8', 'AF4', 'target']")
'Unique values in y: [0 1]'

EDA for interpolated_mcar:
EDA Summary:
Shape of X: (14980, 15)
Shape of y: (14980,)
("Columns in X: ['AF3', 'F7', 'F3', 'FC5', 'T7', 'P7', 'O1', 'O2', 'P8', 'T8', "
 "'FC6', 'F4', 'F8', 'AF4', 'target']")
'Unique values in y: [0 1]'

EDA for locf_mcar:
EDA Summary:
Shape of X: (14980, 15)
Shape of y: (14980,)
("Columns in X: ['AF3', 'F7', 'F3', 'FC5', 'T7', 'P7', 'O1', 'O2', 'P8', 'T8', "
 "'FC6', 'F4', 'F8', 'AF4', 'target']")
'Unique values in y: [0 1]'

EDA for knn_mar:
EDA Summary:
Shape of X: (14980, 15)
Shape of y: (14980,)
("Columns 

In [5]:
#statistical similarity check between raw and imputed data
#mean and std
def compare_statistics(X_raw, X_imputed):
    """Per-column absolute difference in mean and std, raw vs. each imputed frame.

    NOTE: a later cell defines another ``compare_statistics`` that returns
    scalar aggregates; once that cell has run, this per-column version is
    shadowed.

    Args:
        X_raw: pandas.DataFrame with the original data.
        X_imputed: Mapping of label -> pandas.DataFrame with imputed data.

    Returns:
        dict mapping each label to ``{'mean_diff': Series, 'std_diff': Series}``,
        where each Series is indexed by column name. Columns are aligned by
        name by the pandas subtraction.
    """
    # The raw statistics are the same for every imputed frame: compute once.
    raw_mean = X_raw.mean()
    raw_std = X_raw.std()
    return {
        key: {
            'mean_diff': (raw_mean - df.mean()).abs(),
            'std_diff': (raw_std - df.std()).abs(),
        }
        for key, df in X_imputed.items()
    }
# Per-column differences between the raw data and every imputed variant.
stats = compare_statistics(X_raw, X_imputed)
for key, stat in stats.items():
    print(
        f"\nStatistics for {key}:",
        f"Mean difference:\n{stat['mean_diff']}",
        f"Standard deviation difference:\n{stat['std_diff']}",
        sep="\n",
    )



Statistics for knn_mcar:
Mean difference:
AF3        0.679459
F7         0.524836
F3         0.247419
FC5        0.303246
T7         0.004005
P7         0.170951
O1        37.840965
O2         0.004969
P8         0.012137
T8         0.004838
FC6        0.032092
F4         0.294783
F8         9.623894
AF4       55.306972
target     0.000000
dtype: float64
Standard deviation difference:
AF3          0.274762
F7           1.388711
F3           0.408739
FC5          0.019852
T7           3.246947
P7           0.045293
O1        4568.988435
O2           1.254827
P8           0.009020
T8           0.465367
FC6          0.783864
F4           6.989073
F8        1176.021434
AF4       5847.548085
target       0.000000
dtype: float64

Statistics for gan_mcar:
Mean difference:
AF3       123.335722
F7         46.102608
F3         10.118474
FC5       270.965677
T7         10.319623
P7         45.285485
O1         27.819345
O2          1.017026
P8         16.712303
T8          1.485824
FC6        52

In [6]:
def compare_statistics(X_raw, X_imputed):
    """Aggregate raw-vs-imputed differences into one scalar per statistic.

    For each imputed frame the absolute per-column difference in mean, std
    and variance is computed and then averaged over all columns of the
    frames as passed in (no column is excluded here). This definition
    replaces any earlier ``compare_statistics`` in the notebook.

    Args:
        X_raw: pandas.DataFrame with the original data.
        X_imputed: Mapping of label -> pandas.DataFrame with imputed data.

    Returns:
        dict mapping each label to
        ``{'mean_diff': float, 'std_diff': float, 'variance_diff': float}``.
    """
    # The raw statistics are the same for every imputed frame: compute once.
    raw_mean = X_raw.mean()
    raw_std = X_raw.std()
    raw_var = X_raw.var()

    stats = {}
    for key, df in X_imputed.items():
        stats[key] = {
            'mean_diff': (raw_mean - df.mean()).abs().mean(),      # average over columns
            'std_diff': (raw_std - df.std()).abs().mean(),         # average over columns
            'variance_diff': (raw_var - df.var()).abs().mean(),    # average over columns
        }
    return stats

stats = compare_statistics(X_raw, X_imputed)

# Fixed-width table: one row per "<method>_<mechanism>" key.
# NOTE(review): in the captured output the gan_mar and gan_mnar rows are
# identical to every printed digit - possibly the same file saved under both
# names; verify the imputed CSVs.
print("\nAggregate Statistics:")
print("{:<20} {:<15} {:<15} {:<15}".format('Method', 'Mean Diff', 'Std Diff', 'Variance Diff'))
print("-" * 50)
for key, stat in stats.items():
    print("{:<20} {:<15.4f} {:<15.4f} {:<15.4f}".format(
        key, stat['mean_diff'], stat['std_diff'], stat['variance_diff']
    ))
    


Aggregate Statistics:
Method               Mean Diff       Std Diff        Variance Diff  
--------------------------------------------------
knn_mcar             7.0034          773.8296        3822326.5021   
gan_mcar             55.0161         800.6301        3968696.5451   
interpolated_mcar    6.9227          774.1287        3822349.3074   
locf_mcar            7.0011          776.1782        3822557.0177   
knn_mar              3.8149          383.7331        815578.5530    
gan_mar              97.1730         406.2390        957254.4489    
interpolated_mar     3.5708          382.1999        815424.9380    
locf_mar             3.5496          381.7321        815361.4453    
knn_mnar             17.4589         1088.4172       4541047.2277   
gan_mnar             97.1730         406.2390        957254.4489    
interpolated_mnar    7.2198          387.0916        815801.0357    
locf_mnar            6.9309          386.9535        815803.4447    


In [6]:
import os
import glob
import pandas as pd

# Directory (relative to this notebook) holding the classification result CSVs.
perf_root_dir = '../../classification_results/'

# glob.glob returns matches in arbitrary, filesystem-dependent order; sorting
# makes the cell output reproducible across machines and re-runs.
for csv_file in sorted(glob.glob(os.path.join(perf_root_dir, '*.csv'))):
    df = pd.read_csv(csv_file)
    print(f"\nPerformance metrics from {csv_file}:")

    print(df.head())  # Display the first few rows of the DataFrame


Performance metrics from ../../classification_results/summary_results.csv:
  Missing Mechanism    Imputation          Accuracy         Precision  \
0               MAR           gan  0.7645 (±0.0058)  0.7643 (±0.0058)   
1               MAR  interpolated  0.9457 (±0.0035)  0.9458 (±0.0035)   
2               MAR           knn  0.7634 (±0.0042)  0.7632 (±0.0043)   
3               MAR          locf  0.9506 (±0.0008)  0.9507 (±0.0008)   
4              MCAR           gan  0.7408 (±0.0049)  0.7403 (±0.0046)   

             Recall                F1  
0  0.7645 (±0.0058)  0.7643 (±0.0057)  
1  0.9457 (±0.0035)  0.9457 (±0.0035)  
2  0.7634 (±0.0042)  0.7631 (±0.0043)  
3  0.9506 (±0.0008)  0.9506 (±0.0008)  
4  0.7408 (±0.0049)  0.7394 (±0.0050)  
