In [1]:
!pip install entmax

Collecting entmax
  Downloading entmax-1.3-py3-none-any.whl.metadata (348 bytes)
Downloading entmax-1.3-py3-none-any.whl (13 kB)
Installing collected packages: entmax
Successfully installed entmax-1.3


In [2]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_validate, train_test_split
from sklearn.metrics import accuracy_score, matthews_corrcoef, make_scorer
from ncart import NCARClassifier

In [4]:
def prepare_train_test_dataset(datasets, test_idx):
  """Build train/test arrays for one round of fold-based cross validation.

  The fold at position ``test_idx`` is held out as the test set and every
  other fold is concatenated into the training set. In each fold the first
  column is skipped, the last column is used as the target, and the columns
  in between are used as features.

  Args:
    datasets: list of fold DataFrames that share one column layout.
    test_idx: position in ``datasets`` of the fold to hold out.

  Returns:
    Tuple ``(X_train, X_test, y_train, y_test)`` of NumPy arrays.
  """
  held_out = datasets[test_idx]
  training_folds = [fold for idx, fold in enumerate(datasets) if idx != test_idx]
  combined = pd.concat(training_folds)

  def split_features_target(frame):
    # Features: every column except the first and the last; target: last column.
    return frame.iloc[:, 1:-1].values, frame.iloc[:, -1].values

  X_train, y_train = split_features_target(combined)
  X_test, y_test = split_features_target(held_out)

  return X_train, X_test, y_train, y_test

In [5]:
import torch

def train_with_base_ncart(datasets, key):
  """Train one NCART classifier per cross-validation fold and keep the best.

  Each fold stored under ``datasets[key]`` is held out once; a fresh
  ``NCARClassifier`` is fitted on the remaining folds and scored on the
  held-out fold with the Matthews correlation coefficient (MCC).

  Args:
    datasets: dict mapping a dataset key to a list of fold DataFrames.
    key: entry of ``datasets`` to train on.

  Returns:
    The fitted model that achieved the highest MCC on its held-out fold.

  Raises:
    ValueError: if ``datasets[key]`` contains no folds.
  """
  print("Currently training NCART model with dataset key:", key)

  dataset_list = datasets[key]
  if len(dataset_list) == 0:
    raise ValueError(f"No folds available for dataset key: {key!r}")

  # Start below the MCC range [-1, 1] so the first fold always becomes the
  # incumbent; starting at 0 would return None whenever no fold beats chance.
  best_mcc = float("-inf")
  best_model = None

  # Hold out every available fold once instead of assuming exactly ten.
  for fold_idx in range(len(dataset_list)):
    X_train, X_test, y_train, y_test = prepare_train_test_dataset(dataset_list, fold_idx)

    # Train the model
    model = NCARClassifier()
    # NOTE(review): the held-out fold is also passed as the validation set.
    # If fit() uses X_val/y_val for early stopping or model selection, the
    # MCC below is optimistic -- confirm against the ncart documentation.
    model.fit(X=X_train, y=y_train, X_val=X_test, y_val=y_test)

    # Evaluate the model
    y_pred = model.predict(X_test)
    mcc = matthews_corrcoef(y_test, y_pred)

    if best_model is None or mcc > best_mcc:
      best_mcc = mcc
      best_model = model

  print("Best MCC:", best_mcc)

  return best_model

In [6]:
def generate_NCART_feature_importance_from_dataset(datasets, key):
  """Train NCART on ``datasets[key]`` and tabulate its feature importances.

  Args:
    datasets: dict mapping a dataset key to a list of fold DataFrames.
    key: entry of ``datasets`` to train on.

  Returns:
    DataFrame with columns ``'Feature'`` and ``'Importance'``, ordered from
    the highest importance to the lowest.
  """
  best_model = train_with_base_ncart(datasets, key)

  # Feature names come from the first fold: every column except the first
  # and the last, matching the slice used to build the training matrices.
  first_fold = datasets[key][0]
  ranking = pd.DataFrame({
      'Feature': first_fold.columns[1:-1],
      'Importance': best_model.get_importance(),
  })

  return ranking.sort_values(by='Importance', ascending=False)

In [7]:
# Load the cross-validation folds for every (subset, variant) combination.
#
# Files live at  <DATA_ROOT>/<subset>/cv/<variant>/fold_<n>.csv  and each
# combination is stored under the key 'IX_<subset>_<variant>' as a list of
# fold DataFrames ordered fold_1 .. fold_<N_FOLDS>. An individual fold is
# reachable as datasets[key][n - 1]; no per-fold global variables are created.

DATA_ROOT = '/content/drive/MyDrive/OrderedDataset/IX'
SUBSETS = ['all', '500', '400', '300']
PCC_VARIANTS = ['nopcc', 'pcc95', 'pcc75']
N_FOLDS = 10

datasets = {
    f'IX_{subset}_{variant}': [
        pd.read_csv(f'{DATA_ROOT}/{subset}/cv/{variant}/fold_{fold}.csv')
        for fold in range(1, N_FOLDS + 1)
    ]
    for subset in SUBSETS
    for variant in PCC_VARIANTS
}


In [8]:
def get_single_feature_list(feature_lists):
  """Combine several ranked feature tables into summed rank scores.

  Every table is expected to have a ``'Feature'`` column whose row order is
  the ranking (first row = rank 0). A feature's score is the sum of its
  positions across all tables, so a lower score means it ranked nearer the
  top overall.

  Note: a feature that is absent from a table contributes nothing for that
  table, so features appearing in fewer tables tend to get lower scores.

  Args:
    feature_lists: iterable of DataFrames with a ``'Feature'`` column.

  Returns:
    dict mapping feature name to summed rank, in first-seen order.
  """
  features = {}
  # Use a distinct loop name: the original reused `feature_lists` for the loop
  # variable, shadowing the parameter while iterating over it.
  for feature_table in feature_lists:
    for rank, feature in enumerate(feature_table['Feature'].tolist()):
      features[feature] = features.get(feature, 0) + rank
  return features

In [9]:
# Train NCART and collect a feature-importance table for every dataset key.
# A separator line is printed after each run except the last one so the
# training logs of consecutive runs can be told apart.
_RUN_SEPARATOR = "\n---------------------------------------------------------------------\n"


def _importance_then_separator(key):
  """Return the importance table for `key`, then print the run separator."""
  table = generate_NCART_feature_importance_from_dataset(datasets, key)
  print(_RUN_SEPARATOR)
  return table


# Subset: all
IX_all_nopcc_results_df = _importance_then_separator('IX_all_nopcc')
IX_all_pcc95_results_df = _importance_then_separator('IX_all_pcc95')
IX_all_pcc75_results_df = _importance_then_separator('IX_all_pcc75')

# Subset: 500
IX_500_nopcc_results_df = _importance_then_separator('IX_500_nopcc')
IX_500_pcc95_results_df = _importance_then_separator('IX_500_pcc95')
IX_500_pcc75_results_df = _importance_then_separator('IX_500_pcc75')

# Subset: 400
IX_400_nopcc_results_df = _importance_then_separator('IX_400_nopcc')
IX_400_pcc95_results_df = _importance_then_separator('IX_400_pcc95')
IX_400_pcc75_results_df = _importance_then_separator('IX_400_pcc75')

# Subset: 300 (the final run is not followed by a separator)
IX_300_nopcc_results_df = _importance_then_separator('IX_300_nopcc')
IX_300_pcc95_results_df = _importance_then_separator('IX_300_pcc95')
IX_300_pcc75_results_df = generate_NCART_feature_importance_from_dataset(datasets, 'IX_300_pcc75')

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Epoch 65: Train Loss 0.533804 |  Val Loss 0.582488
Epoch 66: Train Loss 0.530407 |  Val Loss 0.581185
Epoch 67: Train Loss 0.526844 |  Val Loss 0.579382
Epoch 68: Train Loss 0.523001 |  Val Loss 0.576376
Epoch 69: Train Loss 0.519351 |  Val Loss 0.574360
Epoch 70: Train Loss 0.515423 |  Val Loss 0.572692
Epoch 71: Train Loss 0.511732 |  Val Loss 0.569480
Epoch 72: Train Loss 0.507758 |  Val Loss 0.567195
Epoch 73: Train Loss 0.503753 |  Val Loss 0.564708
Epoch 74: Train Loss 0.499891 |  Val Loss 0.562750
Epoch 75: Train Loss 0.495620 |  Val Loss 0.561594
Epoch 76: Train Loss 0.491670 |  Val Loss 0.559039
Epoch 77: Train Loss 0.487318 |  Val Loss 0.556748
Epoch 78: Train Loss 0.483152 |  Val Loss 0.553783
Epoch 79: Train Loss 0.478859 |  Val Loss 0.550786
Epoch 80: Train Loss 0.474338 |  Val Loss 0.548064
Epoch 81: Train Loss 0.470335 |  Val Loss 0.546648
Epoch 82: Train Loss 0.465831 |  Val Loss 0.544040
Epoch 83: Train L

In [10]:
# Group the importance tables by variant; each list holds the tables of the
# 'all', '500', '400' and '300' subsets, in that order.
top_IX_nopcc = [
    IX_all_nopcc_results_df,
    IX_500_nopcc_results_df,
    IX_400_nopcc_results_df,
    IX_300_nopcc_results_df,
]
top_IX_pcc95 = [
    IX_all_pcc95_results_df,
    IX_500_pcc95_results_df,
    IX_400_pcc95_results_df,
    IX_300_pcc95_results_df,
]
top_IX_pcc75 = [
    IX_all_pcc75_results_df,
    IX_500_pcc75_results_df,
    IX_400_pcc75_results_df,
    IX_300_pcc75_results_df,
]

In [11]:
# Sum each feature's rank across the subset tables of a variant ...
most_important_features_IX_nopcc = get_single_feature_list(top_IX_nopcc)
most_important_features_IX_pcc95 = get_single_feature_list(top_IX_pcc95)
most_important_features_IX_pcc75 = get_single_feature_list(top_IX_pcc75)


def _to_rank_table(rank_totals):
  """Turn a {feature: summed rank} dict into a table sorted by ascending rank."""
  table = pd.DataFrame(list(rank_totals.items()), columns=['Feature', 'Rank Importance'])
  return table.sort_values(by='Rank Importance', ascending=True)


# ... and tabulate the totals, lowest (best) summed rank first.
most_important_features_IX_nopcc_df = _to_rank_table(most_important_features_IX_nopcc)
most_important_features_IX_pcc95_df = _to_rank_table(most_important_features_IX_pcc95)
most_important_features_IX_pcc75_df = _to_rank_table(most_important_features_IX_pcc75)

In [12]:
# Show the 20 features with the lowest summed rank for the `nopcc` variant
# (the table is sorted ascending, so the first rows ranked nearest the top).
most_important_features_IX_nopcc_df.head(20)

Unnamed: 0,Feature,Rank Importance
8,peoe_VSA6,20
6,smr_VSA10,22
1,peoe_VSA8,31
16,peoe_VSA7,33
18,slogp_VSA8,35
12,smr_VSA3,37
10,smr_VSA7,39
0,SlogP,54
2,smr_VSA9,55
23,peoe_VSA2,57


In [13]:
# Show the 20 features with the lowest summed rank for the `pcc95` variant
# (the table is sorted ascending, so the first rows ranked nearest the top).
most_important_features_IX_pcc95_df.head(20)

Unnamed: 0,Feature,Rank Importance
2,smr_VSA9,9
0,smr_VSA3,16
4,slogp_VSA6,23
6,peoe_VSA6,26
18,peoe_VSA2,33
3,slogp_VSA12,39
12,slogp_VSA8,39
10,slogp_VSA1,45
14,SMR,51
21,slogp_VSA4,52


In [14]:
# Show the 20 features with the lowest summed rank for the `pcc75` variant
# (the table is sorted ascending, so the first rows ranked nearest the top).
most_important_features_IX_pcc75_df.head(20)

Unnamed: 0,Feature,Rank Importance
8,smr_VSA10,11
9,peoe_VSA6,12
7,slogp_VSA1,16
5,SlogP,21
0,smr_VSA3,22
12,slogp_VSA12,25
11,peoe_VSA7,26
1,slogp_VSA10,29
4,peoe_VSA8,35
14,peoe_VSA11,40
