In [None]:
import numpy as np
import pandas as pd
import shap
import joblib
import os
from sklearn.model_selection import LeaveOneGroupOut
import shap
import pandas as pd

# Values substituted for <dim> in the "dim_<dim>_budget_<budget>_conf_<id>"
# configuration names built by the driver loop of this script.
dims = [5, 30]

# Values substituted for <budget> in the same configuration names.
budgets = [500, 2000, 5000, 10000, 50000]

# Pairs of configuration ids.  The driver loop reads element 0 as the
# "module on" id and element 1 as the "module off" id.
configs = [
    (0, 810),
    (270, 1080),
    (1, 811),
    (2, 812),
    (3, 813),
    (4, 814),
    (5, 815),
    (6, 816),
    (7, 817),
    (8, 818),
    (271, 1081),
]

# Feature names.  The position of a name in this list is the row it occupies
# in the frequency tables written by the driver loop.
features = [
    'diff_mean_02', 'diff_mean_05', 'diff_mean_10', 'diff_mean_25',
    'diff_median_02', 'diff_median_05', 'diff_median_10', 'diff_median_25',
    'dist_ratio.coeff_var',
    'eps.max', 'eps.ratio', 'eps.s',
    'expl_var.cor_init', 'expl_var.cor_x',
    'expl_var.cov_init', 'expl_var.cov_x',
    'expl_var_PC1.cor_init', 'expl_var_PC1.cor_x',
    'expl_var_PC1.cov_init', 'expl_var_PC1.cov_x',
    'h.max',
    'kurtosis',
    'lin_simple.adj_r2', 'lin_simple.coef.max', 'lin_simple.coef.max_by_min',
    'lin_simple.coef.min', 'lin_simple.intercept',
    'lin_w_interact.adj_r2',
    'm0',
    'nb_fitness.cor',
    'nn_nb.cor', 'nn_nb.mean_ratio', 'nn_nb.sd_ratio',
    'number_of_peaks',
    'quad_simple.adj_r2', 'quad_simple.cond',
    'quad_w_interact.adj_r2',
    'ratio_mean_02', 'ratio_mean_05', 'ratio_mean_10', 'ratio_mean_25',
    'ratio_median_02', 'ratio_median_05', 'ratio_median_10', 'ratio_median_25',
    'skewness',
]

def get_global_shap(conf_name):
    """Average SHAP values per (function, instance) pair for one configuration.

    Loads the pickled model ``./results/models/RF/<conf_name>.pkl`` and the
    data set ``./Data/processed/<conf_name>.csv`` and iterates over a
    leave-one-group-out split that uses the ``instance`` column as the group.
    In every fold the model is refitted on the training rows, a
    ``shap.TreeExplainer`` is built on the refitted model, and the SHAP values
    of those same training rows are collected.  The collected rows are then
    averaged per (function, instance) pair.

    Parameters
    ----------
    conf_name : str
        Base name (without extension) shared by the model and the CSV file.

    Returns
    -------
    list of numpy.ndarray
        One averaged SHAP vector per (function, instance) pair.  Pairs follow
        the fixed function order listed in the body, with instances 1..5
        nested inside each function.  Vector entries follow the CSV column
        order after ``target``, ``instance`` and ``function`` are removed.
    """
    loaded_model = joblib.load('./results/models/RF/'+conf_name+'.pkl')
    df = pd.read_csv("./Data/processed/"+conf_name+".csv", index_col = 0)
    X = df.drop(['target'], axis=1)
    # Keep only the part of each column name after the last "/".
    X = X.rename(columns=lambda s: s.split("/")[-1])
    # NOTE(review): the response is read as the LAST column; presumably that
    # is the 'target' column dropped from X above -- confirm.
    y = df.iloc[:, -1]
    groups = X['instance']

    dt = []
    for train_index, _ in LeaveOneGroupOut().split(X, y, groups):
        X_train = X.iloc[train_index]
        # The split yields positional indices, so index y by position as well;
        # plain y[...] would look the numbers up as index labels.
        y_train = y.iloc[train_index]
        X_train_removed = X_train.drop(['instance', 'function'], axis=1)

        # The fit and the explainer depend only on the fold, not on the
        # instance being explained, so they are built once per fold.
        loaded_model = loaded_model.fit(X_train_removed, y_train)
        explainer = shap.TreeExplainer(loaded_model)

        for instance in np.unique(np.array(X_train['instance'])):
            data = X_train[X_train['instance'] == instance]
            # Column vectors of whatever length this instance has, so that
            # they can be stacked in front of the SHAP matrix.
            func = np.array(data['function']).reshape(-1, 1)
            ins = np.array(data['instance']).reshape(-1, 1)
            data = data.drop(['instance', 'function'], axis=1)
            # Row layout: [function, instance, shap_1, ..., shap_k].
            shap_values = np.concatenate(
                (func, ins, explainer.shap_values(data)), axis=1)
            dt.extend(shap_values)

    df_dt = pd.DataFrame(dt)

    # Fixed function order; kept unchanged so the position of every pair in
    # the returned list stays the same.
    function_order = [10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 1,
                      20, 21, 22, 23, 24, 2, 3, 4, 5, 6, 7, 8, 9]
    final_shap = []
    for f in function_order:
        for i in range(1, 6):
            # Column 0 holds the function id, column 1 the instance id.
            d = df_dt[(df_dt[0] == f) & (df_dt[1] == i)]
            d = d.drop([d.columns[0], d.columns[1]], axis=1)
            final_shap.append(np.average(d, axis=0))
    return final_shap

def get_most_important_features(conf_name, num_features):
    """Return the names of the highest-ranked features of one configuration.

    Features are ranked by the mean absolute value of the SHAP vectors
    returned by ``get_global_shap(conf_name)``, largest first.

    Parameters
    ----------
    conf_name : str
        Base name of the configuration; ``./Data/processed/<conf_name>.csv``
        supplies the feature names and ``get_global_shap`` reads the model
        and data belonging to the same name.
    num_features : int
        Number of leading feature names to return.

    Returns
    -------
    numpy.ndarray
        The first ``num_features`` feature names in descending order of
        importance (fewer if there are fewer features).
    """
    # The CSV is read only for its column names; the model itself is loaded
    # inside get_global_shap, so it is not unpickled a second time here.
    df = pd.read_csv("./Data/processed/"+conf_name+".csv", index_col = 0)
    X = df.drop(['target', 'function', 'instance'], axis=1)
    X = X.rename(columns=lambda s: s.split("/")[-1])

    shap_values = get_global_shap(conf_name)
    # Mean |SHAP| per feature across all returned vectors.
    vals = np.abs(shap_values).mean(0)

    feature_importance = pd.DataFrame(
        list(zip(X.columns, vals)),
        columns=['col_name', 'feature_importance_vals'])
    feature_importance = feature_importance.sort_values(
        by=['feature_importance_vals'], ascending=False)
    return np.array(feature_importance['col_name'])[0:num_features]
# For every (top_features, dim) pair, count how often each feature appears
# among the top-ranked features of the "off" and "on" configurations, per
# budget, and write the counts to one CSV table.
top_feature_counts = [10, 15, 20]
max_top_features = max(top_feature_counts)

# Configuration name -> ranking of its max_top_features best features.  The
# ranking does not depend on how many names are requested, so it is computed
# once per configuration and sliced for every top_features value.
ranking_cache = {}

# Two columns per budget: "0-<budget>" (off ids) then "1-<budget>" (on ids).
column_labels = [str(flag)+"-"+str(b) for b in budgets for flag in (0, 1)]

output_dir = "./results/frequency_tables/"
# Make sure the target directory exists before the computation starts.
os.makedirs(output_dir, exist_ok=True)

for top_features in top_feature_counts:
    for dim in dims:
        data = np.zeros((2*len(budgets), len(features)))
        for i, budget in enumerate(budgets):
            for module_on, module_off in configs:
                # Row 2*i collects the "off" ids, row 2*i+1 the "on" ids.
                for row, module in ((2*i, module_off), (2*i+1, module_on)):
                    conf_name = "dim_"+str(dim)+"_budget_"+str(budget)+"_conf_"+str(module)
                    if conf_name not in ranking_cache:
                        ranking_cache[conf_name] = get_most_important_features(
                            conf_name, max_top_features)
                    for feature in ranking_cache[conf_name][:top_features]:
                        data[row][features.index(feature)] += 1
        df = pd.DataFrame(data.transpose(), index=features, columns=column_labels)
        df.to_csv(output_dir+"top_"+str(top_features)+"_elitism_dim_"+str(dim)+".csv")
        # Report the table just written; the budget loop has already ended
        # here, so a budget value would always be the last one.
        print("top_features = "+str(top_features))
        print("dim = "+str(dim))