In [None]:
import pandas as pd
import xgboost as xgb
import numpy as np
import sklearn
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import make_pipeline
from imblearn.combine import SMOTEENN
from imblearn.over_sampling import SMOTENC
import matplotlib.pyplot as plt
from PIL import Image
from scipy.interpolate import BSpline, make_interp_spline, interp1d
#import rpy2.robjects as robjects
#from rpy2.robjects.packages import importr
import csv
from dfply import *
from xgboost import XGBClassifier
import itertools
import os
import logging
from glob import glob
import pickle
import utils_function

import pickle

In [None]:
def generate_corr(configs_variables):
    """Compute and store the correlation matrix of one site's ``bt3pos`` table.

    The table ``<datafolder><site>/bt3pos_<site>_<stg>_3000.pkl`` is loaded,
    the ``PATID``, ``ENCOUNTERID`` and ``SINCE_ADMIT`` columns are dropped, and
    the result of ``DataFrame.corr()`` on the remaining columns is pickled to
    ``<datafolder><site>/bt3corr_<site>_<stg>_3000.pkl``.

    Parameters
    ----------
    configs_variables : mapping
        Site configuration.  It is handed to ``utils_function.get_commons`` to
        obtain the site name and data folder, and must also provide the keys
        ``'stg'`` and ``'rerun_flag'``.

    Returns
    -------
    None
        When ``rerun_flag`` is falsy and the output file already exists, the
        function only reports that and returns without recomputing.
    """
    site, datafolder, _home_directory = utils_function.get_commons(configs_variables)
    stg = configs_variables['stg']
    print('Running bt3corr on site ' + site, flush=True)

    # Input and output live in the same per-site folder and share a name tail.
    site_dir = datafolder + site + '/'
    file_tail = site + '_' + stg + '_3000.pkl'
    corr_name = 'bt3corr_' + file_tail

    # Reuse a previously written matrix unless a rerun was requested.
    reuse_cached = not configs_variables['rerun_flag']
    if reuse_cached and os.path.exists(site_dir + corr_name):
        print('Existed: ' + corr_name)
        return

    excluded_columns = ['PATID', 'ENCOUNTERID', 'SINCE_ADMIT']
    features = pd.read_pickle(site_dir + 'bt3pos_' + file_tail).drop(columns=excluded_columns)
    features.corr().to_pickle(site_dir + corr_name)

In [None]:
def calculate_corr_occurence_new(configs_variables):
    """Choose which of each highly correlated feature pair to drop, across sites.

    Every site's ``bt3corr_<site>_<stg>_3000.pkl`` correlation matrix is read.
    For each ordered feature pair the median correlation over the sites that
    report it is taken, and pairs whose absolute median is at least
    ``threshold_correlation`` are visited from strongest to weakest.  A pair
    is skipped when one of its features has already been selected.  Otherwise:

    * both features occur at every site: the one whose median correlation with
      ``FLAG`` has the smaller absolute value is selected (on a tie, or when
      the comparison involves NaN, the first feature of the pair is selected);
    * exactly one feature occurs at every site: the other one is selected;
    * neither occurs at every site: nothing is selected.

    Finally every ``'SEX_F'`` entry in the selection is replaced by ``'SEX_M'``.

    Parameters
    ----------
    configs_variables : sequence of mapping
        One configuration per site, each accepted by
        ``utils_function.get_commons``.  ``'stg'`` and
        ``'threshold_correlation'`` are read from the first entry only.

    Returns
    -------
    None
        The selection is written as a one-column (``'features'``) DataFrame to
        ``<datafolder>/meltcorrallcount_<threshold>.pkl``, where ``datafolder``
        is the one reported for the last configuration.
    """
    first_config = configs_variables[0]
    site, datafolder, home_directory = utils_function.get_commons(first_config)
    stg = first_config['stg']
    print('Running bt3corr on site '+site, flush = True)
    threshold = float(first_config['threshold_correlation'])

    # Keep the site label next to its file instead of re-parsing it from the
    # file name, which would break for a site name containing '_'.
    sources = []
    for configs_variable in configs_variables:
        site, datafolder, home_directory = utils_function.get_commons(configs_variable)
        sources.append((site, datafolder+site+'/bt3corr_'+site+'_'+stg+'_3000.pkl'))

    flag_corr = []      # per-site Series: correlation of each feature with FLAG
    meltcorrlist = []   # per-site long table of (v1, v2, corr, site)

    for site, file in sources:
        # NOTE: unpickling executes code from the file; only point this at
        # files produced by this pipeline.
        corr = pd.read_pickle(file)
        flag_corr.append(corr['FLAG'])

        # Feature-vs-feature part of the matrix, in long form without the diagonal.
        corr = corr.drop(index='FLAG', columns='FLAG')
        meltcorr = pd.melt(corr.reset_index(), id_vars=['index'])
        meltcorr.columns = ['v1', 'v2', 'corr']
        meltcorr = meltcorr[meltcorr['v1'] != meltcorr['v2']]
        # assign() returns a new frame, so no column is written into a slice.
        meltcorrlist.append(meltcorr.assign(site=site))

    # Median FLAG correlation per feature.  Only the numeric series is
    # aggregated: taking the median of a frame that also carries the string
    # site label raises TypeError on pandas >= 2.0.
    flag_corr_dict = pd.concat(flag_corr).groupby(level=0).median().to_dict()

    meltcorrall = pd.concat(meltcorrlist)

    # Number of distinct sites in which each feature appears in some pair.
    feature_site_dict = (
        meltcorrall[['v1', 'site']]
        .drop_duplicates()
        .groupby('v1')['site']
        .count()
        .to_dict()
    )

    # Cross-site median per pair, strongest absolute correlation first.  The
    # initial sort on the signed value is kept so that the order among pairs
    # with equal absolute correlation stays as it was.
    removal_order = (
        meltcorrall[['v1', 'v2', 'corr']]
        .groupby(['v1', 'v2'])
        .median()
        .reset_index()
        .sort_values('corr', ascending=False)
        .reset_index(drop=True)
        .dropna()
        .assign(abs_corr=lambda pairs: pairs['corr'].abs())
        .sort_values('abs_corr', ascending=False)
    )
    removal_order = removal_order[removal_order['abs_corr'] >= threshold].reset_index(drop=True)

    n_sites = len(configs_variables)
    removal_list = []   # selection, in the order features were chosen
    removed = set()     # same content, for constant-time membership tests

    for v1, v2 in zip(removal_order['v1'], removal_order['v2']):
        if v1 in removed or v2 in removed:
            # Diagnostic output: the already selected member of this pair.
            print(v1 if v1 in removed else v2)
            continue

        v1_everywhere = feature_site_dict[v1] == n_sites
        v2_everywhere = feature_site_dict[v2] == n_sites

        if v1_everywhere and v2_everywhere:
            # Preserve the variable more correlated with FLAG.
            if abs(flag_corr_dict[v1]) > abs(flag_corr_dict[v2]):
                selected = v2
            else:
                selected = v1
        elif v1_everywhere:
            selected = v2
        elif v2_everywhere:
            selected = v1
        else:
            # Neither feature is available at every site: keep both.
            continue

        removal_list.append(selected)
        removed.add(selected)

    # Whenever SEX_F was selected, SEX_M is listed in its place.
    removal_list = ['SEX_M' if x == 'SEX_F' else x for x in removal_list]

    removal_list = pd.DataFrame(removal_list, columns=['features'])
    removal_list.to_pickle(datafolder+'/'+'meltcorrallcount_'+str(threshold)+'.pkl')
    
if __name__ == "__main__":
    # Read one configuration per site, then build the cross-site list of
    # correlated features to drop.
    site_list = [
        'KUMC', 'UTSW', 'MCW', 'UofU', 'UIOWA',
        'UMHC', 'UPITT', 'UTHSCSA', 'UNMC',
    ]
    configs_variables = list(map(utils_function.read_config, site_list))
    calculate_corr_occurence_new(configs_variables)
    

In [None]:
def remove_correlated_features(configs_variables):
    """Write a copy of one site's ``bt3pos`` table without the listed features.

    The feature list is read from the ``'features'`` column of
    ``<datafolder>/meltcorrallcount_<threshold>.pkl``.  Every column of
    ``<datafolder><site>/bt3pos_<site>_<stg>_3000.pkl`` whose name is not in
    that list is kept, and the result is pickled to
    ``<datafolder><site>/bt3posnc_<site>_<stg>_3000.pkl``.

    Parameters
    ----------
    configs_variables : mapping
        Site configuration.  It is handed to ``utils_function.get_commons`` to
        obtain the site name and data folder, and must also provide the keys
        ``'threshold_correlation'`` and ``'stg'``.

    Returns
    -------
    None
    """
    site, datafolder, _home_directory = utils_function.get_commons(configs_variables)
    threshold = float(configs_variables['threshold_correlation'])
    stg = configs_variables['stg']

    # The removal list is shared by all sites and keyed by the threshold value.
    removal_path = datafolder + '/' + 'meltcorrallcount_' + str(threshold) + '.pkl'
    dropped = pd.read_pickle(removal_path)['features'].to_list()

    print(f"Removing correlation {site}")

    file_tail = site + '_' + stg + '_3000.pkl'
    table = pd.read_pickle(datafolder + site + '/bt3pos_' + file_tail)

    # Keep the surviving columns in their original order.
    kept = []
    for column in table.columns:
        if column not in dropped:
            kept.append(column)

    table[kept].to_pickle(datafolder + site + '/bt3posnc_' + file_tail)
    
if __name__ == "__main__":
    # Read one configuration per site and strip the correlated features from
    # each site's table in turn.
    site_list = [
        'KUMC', 'UTSW', 'MCW', 'UofU', 'UIOWA',
        'UMHC', 'UPITT', 'UTHSCSA', 'UNMC',
    ]
    configs_variables = list(map(utils_function.read_config, site_list))
    for configs_variable in configs_variables:
        remove_correlated_features(configs_variable)