In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import copy as cp
import importanceMatrix
import shap
import os
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score
from sklearn.preprocessing import scale
from matplotlib import pyplot as plt

# Project root is taken to be the parent of the kernel's current working directory,
# so every path below depends on where the notebook is launched from.
proj_path = os.path.dirname(os.getcwd())
# Figure output folder, relative to the project root (not used in the visible cells).
fig_path = proj_path + '/2_docs/LaTeX/Figures'
# One `data_info` list is appended per dataset cell below. Re-running a dataset cell
# without re-running this one appends a duplicate entry.
datasets = []

In [None]:

def plot_confusion_matrix(y_true, y_pred, classes,
                          title='Normalized confusion matrix',
                          cmap=plt.cm.plasma):
    """Plot a row-normalised confusion matrix as an annotated heat map.

    Each row (true label) is rescaled to percentages of that row's total.
    A row whose total is zero (a label that is predicted but never appears in
    ``y_true``) is shown as all zeros instead of NaN.

    Parameters
    ----------
    y_true, y_pred : array-like
        True and predicted labels, forwarded to ``confusion_matrix``.
    classes : sequence
        Tick labels for both axes. They are NOT passed to ``confusion_matrix``,
        so they must already be in the order of the sorted labels found in
        ``y_true``/``y_pred``.
    title : str
        Axes title.
    cmap : matplotlib colormap
        Colormap used for the heat map.

    Returns
    -------
    (fig, ax) : the matplotlib figure and axes that were created.
    """
    # Raw counts, then row-wise percentages with a guard against empty rows.
    counts = confusion_matrix(y_true, y_pred).astype('float')
    row_totals = counts.sum(axis=1)[:, np.newaxis]
    cm = np.divide(counts, row_totals,
                   out=np.zeros_like(counts), where=row_totals != 0) * 100

    fig, ax = plt.subplots()
    im = ax.imshow(cm, interpolation='nearest', cmap=cmap)
    ax.figure.colorbar(im, ax=ax, label='Percentage')
    # Show every tick and label it with the corresponding class name.
    ax.set(xticks=np.arange(cm.shape[1]),
           yticks=np.arange(cm.shape[0]),
           xticklabels=classes, yticklabels=classes,
           title=title,
           ylabel='True label',
           xlabel='Predicted label')

    # Rotate the x tick labels so long class names do not overlap.
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
             rotation_mode="anchor")

    # Annotate each cell with its rounded percentage. Cells above half of the
    # maximum get black text, the others white text.
    fmt = '.0f'
    thresh = cm.max() / 2.
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(j, i, format(cm[i, j], fmt),
                    ha="center", va="center",
                    color="black" if cm[i, j] > thresh else "white")
    plt.tight_layout()
    return fig, ax

In [53]:
def top_features(rf_clf, feature_names, X_train, y_train, y):
    """Select the top-ranked features according to several importance measures.

    Three global rankings are computed (forest ``feature_importances_``, mean
    normalised |SHAP| over classes, mean of the per-class importance matrix
    returned by ``importanceMatrix.calcImportanceMatrix``) plus two rankings
    restricted to the rarest class of ``y`` (SHAP and per-class importance).

    Parameters
    ----------
    rf_clf : fitted forest classifier
        Must expose ``feature_importances_`` and ``classes_`` and be accepted
        by ``shap.TreeExplainer`` and ``importanceMatrix.calcImportanceMatrix``.
    feature_names : array-like of str
        Feature names, in the column order of ``X_train``.
    X_train, y_train : arrays
        Training data the classifier was fitted on.
    y : array
        Full label vector (train + test), used to determine the rarest class.

    Returns
    -------
    (top_features_lst, rare_class)
        ``top_features_lst`` is
        ``[global, shap, pcfi, rare-class shap, rare-class pcfi]``, each an
        array of feature names; ``rare_class`` is the least frequent label in
        ``y``.

    Raises
    ------
    ValueError
        If the rarest class of ``y`` is not among ``rf_clf.classes_``.
    """
    feature_names = np.asarray(feature_names)

    # Global importance ranking, most important feature first.
    global_index = np.argsort(rf_clf.feature_importances_)[::-1]

    # SHAP ranking: mean |SHAP| per class, each class row normalised to sum 1,
    # then averaged over classes.
    # NOTE(review): this iterates `shap_values` as a sequence of per-class
    # arrays - confirm this matches the installed shap version.
    explainer = shap.TreeExplainer(rf_clf)
    shap_values = explainer.shap_values(X_train)
    shap_imp_mat = np.array([np.mean(abs(class_shap), axis=0) for class_shap in shap_values])
    shap_imp_mat = shap_imp_mat / shap_imp_mat.sum(axis=1)[:, np.newaxis]
    shap_index = np.argsort(np.mean(shap_imp_mat, axis=0))[::-1]

    # Per-class feature importance (PCFI) ranking, averaged over classes.
    imp_mat = importanceMatrix.calcImportanceMatrix(rf_clf)
    pcfi_index = np.argsort(np.mean(imp_mat, axis=0))[::-1]

    # Rarest class in the training labels and in the full label vector.
    train_classes, train_counts = np.unique(y_train, return_counts=True)
    rare_class_train = train_classes[np.argsort(train_counts)[0]]
    all_classes, all_counts = np.unique(y, return_counts=True)
    rare_class = all_classes[np.argsort(all_counts)[0]]
    print('Do y and y_train have same rare class?', rare_class==rare_class_train)

    # The per-class matrices come from the fitted model, so the row of the rare
    # class must be looked up among the classes the model knows, not by its
    # position in np.unique(y), which differs if a class is missing from y_train.
    # NOTE(review): assumes the rows of `imp_mat` follow `rf_clf.classes_` order
    # - confirm against importanceMatrix.calcImportanceMatrix.
    model_rows = np.flatnonzero(np.asarray(rf_clf.classes_) == rare_class)
    if model_rows.size == 0:
        raise ValueError('Rare class {!r} is not among the classes seen by the '
                         'classifier: {}'.format(rare_class, rf_clf.classes_))
    rare_class_index = model_rows[0]

    rare_class_pcfi_index = np.argsort(imp_mat[rare_class_index])[::-1]
    rare_class_shap_index = np.argsort(shap_imp_mat[rare_class_index])[::-1]

    # Positions at which the SHAP and PCFI global rankings disagree.
    index = [i for i in range(len(feature_names)) if shap_index[i] != pcfi_index[i]]
    print(index)
    # Keep features up to and including the first disagreement when it occurs
    # after the third position; otherwise keep exactly three.
    n_top = index[0] + 1 if len(index) > 0 and index[0] > 2 else 3
    top_global_features = feature_names[global_index[:n_top]]
    top_shap_features = feature_names[shap_index[:n_top]]
    top_pcfi_features = feature_names[pcfi_index[:n_top]]
    # Rare-class selections always use the top three features.
    top_rare_pcfi_features = feature_names[rare_class_pcfi_index[:3]]
    top_rare_shap_features = feature_names[rare_class_shap_index[:3]]

    # If both rare-class selections hold the same features, use one ordering.
    if all(lab in top_rare_pcfi_features for lab in top_rare_shap_features):
        top_rare_shap_features = top_rare_pcfi_features

    # Do the same for the three global selections: when a later selection only
    # contains features of an earlier one, reuse the earlier ordering.
    top_features_lst = [top_global_features, top_shap_features, top_pcfi_features,
                        top_rare_shap_features, top_rare_pcfi_features]
    for k in np.arange(2):
        f1 = top_features_lst[k]
        for j in np.arange(k+1,3):
            f2 = top_features_lst[j]
            if all(lab in f1 for lab in f2):
                print(k,j)
                top_features_lst[j] = f1
    return top_features_lst, rare_class

In [54]:
def fit_scores(y_test, predictions):
    """Return ``(accuracy, macro F1)`` as percentages rounded to two decimals."""
    percentages = (
        100 * accuracy_score(y_test, predictions),
        100 * f1_score(y_test, predictions, average='macro'),
    )
    return tuple(round(value, 2) for value in percentages)

def fitRF_and_rank(data, feature_names, class_col, rnd_seed=45):#class_names, class_tags, 
    """Grid-search a random forest on ``data`` and rank its features.

    The data are split with ``train_test_split`` (default test size, not
    stratified), a ``RandomForestClassifier`` is tuned with a 3-fold
    ``GridSearchCV`` on macro F1, the best estimator is scored on the test
    split, and ``top_features`` is called on it.

    Parameters
    ----------
    data : pandas.DataFrame
        Table holding the feature columns and the class column.
    feature_names : array-like of str
        Columns of ``data`` used as features.
    class_col : str
        Column of ``data`` holding the class labels.
    rnd_seed : int
        Seed for both the split and the forest.

    Returns
    -------
    (rf_gscv, top_feature_lst, rare_class)
        The fitted ``GridSearchCV`` object and the two values returned by
        ``top_features``.
    """
    import inspect  # local import: only needed for the version check below

    X = np.array(data.loc[:,feature_names])
    y = np.array(data.loc[:,class_col])
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rnd_seed)

    # Print dataset info so the reader can check that every class is
    # represented in both y_train and y_test.
    unique_classes, count_classes = np.unique(y, return_counts=True)
    # Balanced means every class has the same number of samples.
    is_balanced = len(set(count_classes.tolist())) <= 1
    print('Is the dataset balanced?', is_balanced)
    print(unique_classes, count_classes)
    print(np.unique(y_train, return_counts=True))
    print(np.unique(y_test, return_counts=True))

    # Train the classifier.
    search_kwargs = dict(
        estimator=RandomForestClassifier(random_state=rnd_seed),
        param_grid={'n_estimators':[10, 25, 50, 75, 100, 150, 200, 250, 300],
                    'max_depth': [10, 15, 20, 25, None]},
        scoring='f1_macro', cv=3)
    # `iid` was removed from GridSearchCV in scikit-learn 0.24 and passing it
    # there raises TypeError. Pass iid=False only where it is still accepted,
    # so older installations keep the exact original behaviour.
    if 'iid' in inspect.signature(GridSearchCV.__init__).parameters:
        search_kwargs['iid'] = False
    rf_gscv = GridSearchCV(**search_kwargs)
    rf_gscv.fit(X_train, y_train)
    print('Best score:', rf_gscv.best_score_)
    rf_clf = rf_gscv.best_estimator_

    predictions = rf_clf.predict(X_test)
    accuracy, f1score = fit_scores(y_test, predictions)
    print('Model performances:')
    print('Accuracy: {}'.format(accuracy))
    print('F1 macro score : {}'.format(f1score))
    top_feature_lst, rare_class = top_features(rf_clf, feature_names, X_train, y_train, y)
    return (rf_gscv, top_feature_lst, rare_class)

def refitRF(data, feature_names, class_col, rf_gscv, rare_class, rnd_seed=45, global_score=False):
    """Refit a forest on a feature subset and score it on the test split.

    The forest reuses the ``max_depth`` and ``n_estimators`` selected by
    ``rf_gscv`` and the same seed for the split and the model.

    Returns the ``fit_scores`` tuple when ``global_score`` is true. Otherwise
    prints ``rare_class`` and returns the F1 score of that class alone.
    """
    features = np.array(data.loc[:, feature_names])
    labels = np.array(data.loc[:, class_col])
    X_train, X_test, y_train, y_test = train_test_split(features, labels, random_state=rnd_seed)

    # Rebuild the forest with the hyper-parameters chosen by the grid search.
    best_params = rf_gscv.best_params_
    forest = RandomForestClassifier(
        random_state=rnd_seed,
        max_depth=best_params['max_depth'],
        n_estimators=best_params['n_estimators'],
    )
    forest.fit(X_train, y_train)
    predictions = forest.predict(X_test)

    if global_score:
        return fit_scores(y_test, predictions)

    # Rare-class mode: F1 restricted to the single label of interest.
    print(rare_class)
    return f1_score(y_test, predictions, labels=[rare_class], average=None)[0]

In [55]:
# Import dermatology data (author's result note: PCFI ONLY GLOBAL)
col_names = np.array([
    'erythema', 'scaling', 'definite borders',
    'itching', 'koebner phenomenon', 'polygonal papules',
    'follicular papules', 'oral mucosal involvement', 'knee and elbow involvement',
    'scalp involvement', 'family history', 'melanin incontinence',
    'eosinophils in the infiltrate', 'PNL infiltrate', 'fibrosis of the papillary dermis',
    'exocytosis', 'acanthosis', 'hyperkeratosis',
    'parakeratosis', 'clubbing of the rete ridges', 'elongation of the rete ridges',
    'thinning of the suprapapillary epidermis', 'spongiform pustule', 'munro microabcess',
    'focal hypergranulosis', 'disappearance of the granular layer',
    'vacuolisation and damage of basal layer',
    'spongiosis', 'saw-tooth appearance of retes', 'follicular horn plug',
    'perifollicular parakeratosis', 'inflammatory monoluclear inflitrate',
    'band-like infiltrate',
    'Age', 'Class'
])
col_names = np.array([lab.capitalize() for lab in col_names])
# Every column except the last one is a feature; the last one is the target.
feature_names = np.array(col_names[:-1])
class_col = col_names[-1]
class_names = np.array(['psoriasis', 'seboreic dermatitis', 'lichen planus',
                        'pityriasis rosea', 'cronic dermatitis', 'pityriasis rubra pilaris'])
class_names = np.array([lab.capitalize() for lab in class_names])
class_tags = np.arange(len(class_names)) + 1
data = pd.read_csv(proj_path+'/0_data/dermatology.data.csv', header=None, names=col_names)
# Drop the rows whose age is missing (encoded as '?'). Take an explicit copy so
# the column assignment below acts on an independent frame instead of a
# filtered view (avoids pandas' chained-assignment warning).
skip_rows = data.Age == '?'
data = data[~skip_rows].copy()
data['Age'] = data['Age'].astype(int)

rnd_seed = 45
data_info = ['Dermatology', data, feature_names, class_col, class_names, class_tags, rnd_seed]
datasets.append(data_info)

# Fit the forest on all features, print the selected feature subsets, then
# refit on each subset and print its score.
rf_gscv, top_features_lst, rare_class = fitRF_and_rank(data, feature_names, class_col, rnd_seed)
for lab,feat in zip(['Global: ', 'Shap: ', 'Pcfi: ', 'Rare class Shap: ', 'Rare class Pcfi: '], top_features_lst):
    print(lab,feat)
label_lst = ['Global', 'Shap', 'Pcfi', 'Rare Shap', 'Rare Pfci']
# The first three subsets get (accuracy, macro F1); the last two get rare-class F1.
is_global_score = [True, True, True, False, False]
scores = [refitRF(data, subset, class_col, rf_gscv, rare_class, rnd_seed, global_score=global_score)
             for global_score,subset in zip(is_global_score,top_features_lst)]
for lab,score in zip(label_lst, scores):
    print(lab, score)

Is the dataset balanced? False
[1 2 3 4 5 6] [111  60  71  48  48  20]
(array([1, 2, 3, 4, 5, 6]), array([80, 46, 58, 34, 36, 14]))
(array([1, 2, 3, 4, 5, 6]), array([31, 14, 13, 14, 12,  6]))
Best score: 0.9827938369873853
Model performances:
Accuracy: 98.89
F1 macro score : 98.81
Do y and y_train have same rare class? True
[0, 1, 2, 3, 5, 6, 7, 8, 9, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 30, 31, 32]
Global:  ['Clubbing of the rete ridges' 'Fibrosis of the papillary dermis'
 'Thinning of the suprapapillary epidermis']
Shap:  ['Clubbing of the rete ridges' 'Fibrosis of the papillary dermis'
 'Elongation of the rete ridges']
Pcfi:  ['Fibrosis of the papillary dermis' 'Clubbing of the rete ridges'
 'Koebner phenomenon']
Rare class Shap:  ['Perifollicular parakeratosis' 'Knee and elbow involvement'
 'Follicular horn plug']
Rare class Pcfi:  ['Perifollicular parakeratosis' 'Follicular horn plug'
 'Follicular papules']


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


6
6
Global (63.33, 45.44)
Shap (63.33, 43.87)
Pcfi (76.67, 62.59)
Rare Shap 1.0
Rare Pfci 1.0


In [56]:
# Students data (author's result note: PCFI ONLY GLOBAL)
col_names = np.array([
    'Gender', 'Caste', 'Class X Percentage', 'Class XII Percentage', 'Internal Assessment Percentage',
    'End Semester Percentage', 'Whether the student has back or arrear papers', 'Marital Status',
    'Lived in Town or Village', 'Admission Category', 'Family Monthly Income', 'Family Size',
    'Father Qualification', 'Mother Qualification', 'Father Occupation', 'Mother Occupation',
    'Number of Friends', 'Study Hours', 'Student School attended at Class X level', 'Medium',
    'Home to College Travel Time', 'Class Attendance Percentage'
])
col_names = np.array([lab.capitalize() for lab in col_names])
students_data = pd.read_csv(proj_path+'/0_data/Student_performances.csv', header=None, names=col_names)
# Drop the 'Marital status' column; copy so the recoding below works on an
# independent frame.
students_data = students_data.loc[:,col_names!='Marital status'].copy()

# One string -> integer code table per remaining column, in column order.
string_to_int_list = [
    {'M':0,'F':1}, {'G':0,'ST':1,'SC':2,'OBC':3,'MOBC':4}, {'Best':4,'Vg':3,'Good':2,'Pass':1,'Fail':0},
    {'Best':4,'Vg':3,'Good':2,'Pass':1,'Fail':0}, {'Best':4,'Vg':3,'Good':2,'Pass':1,'Fail':0},
    {'Best':4,'Vg':3,'Good':2,'Pass':1,'Fail':0}, {'Y':1,'N':0},
    {'T':1,'V':0}, {'Free':0,'Paid':1},
    {'Vh':4,'High':3,'Am':2,'Medium':1,'Low':0}, {'Large':2,'Average':1,'Small':0},
    {'Il':0,'Um':1,'10':2,'12':3,'Degree':4,'Pg':5}, {'Il':0,'Um':1,'10':2,'12':3,'Degree':4,'Pg':5},
    {'Service':0,'Business':1,'Retired':2,'Farmer':3,'Others':4},
    {'Service':0,'Business':1,'Retired':2,'Housewife':3,'Others':4},
    {'Large':2,'Average':1,'Small':0}, {'Good':2,'Average':1,'Poor':0},
    {'Govt':1,'Private':0}, {'Eng':0,'Asm':1,'Hin':2,'Ben':3},
    {'Large':2,'Average':1,'Small':0}, {'Good':2,'Average':1,'Poor':0}
]
for col_name,string_to_int in zip(students_data.columns, string_to_int_list):
    # Replace the whole column instead of writing through `.loc[:, col]`: from
    # pandas 2.0 the `.loc` form writes in place and keeps the object dtype,
    # which scikit-learn rejects for the label column. An unknown category
    # still raises KeyError, as before.
    students_data[col_name] = [string_to_int[value] for value in students_data[col_name]]

feature_names = np.array(students_data.columns[students_data.columns!='End semester percentage'])
class_col = 'End semester percentage'
class_names = np.array(['Fail', 'Pass', 'Good', 'Vg', 'Best'])
class_tags = np.arange(len(class_names))

# Author's seed notes: 44 -> 70% accuracy, makes no point; 45 -> no rare class to test;
# 46 -> too low accuracy; 47 -> (no note)
rnd_seed = 44

data_info = ['Student_finals', students_data, feature_names, class_col, class_names, class_tags, rnd_seed]
datasets.append(data_info)


# Fit the forest on all features, print the selected feature subsets, then
# refit on each subset and print its score.
data = students_data
rf_gscv, top_features_lst, rare_class = fitRF_and_rank(data, feature_names, class_col, rnd_seed)
for lab,feat in zip(['Global: ', 'Shap: ', 'Pcfi: ', 'Rare class Shap: ', 'Rare class Pcfi: '], top_features_lst):
    print(lab,feat)
label_lst = ['Global', 'Shap', 'Pcfi', 'Rare Shap', 'Rare Pfci']
# The first three subsets get (accuracy, macro F1); the last two get rare-class F1.
is_global_score = [True, True, True, False, False]
scores = [refitRF(data, subset, class_col, rf_gscv, rare_class, rnd_seed, global_score=global_score)
             for global_score,subset in zip(is_global_score,top_features_lst)]
for lab,score in zip(label_lst, scores):
    print(lab, score)

Is the dataset balanced? False
[1 2 3 4] [27 54 42  8]
(array([1, 2, 3, 4]), array([22, 41, 29,  6]))
(array([1, 2, 3, 4]), array([ 5, 13, 13,  2]))


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Best score: 0.5892526488466422
Model performances:
Accuracy: 69.7
F1 macro score : 63.28
Do y and y_train have same rare class? True
[3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18]
Global:  ['Class x percentage' 'Internal assessment percentage'
 'Class xii percentage' 'Father qualification']
Shap:  ['Class x percentage' 'Internal assessment percentage'
 'Class xii percentage' 'Home to college travel time']
Pcfi:  ['Class x percentage' 'Internal assessment percentage'
 'Class xii percentage' 'Mother qualification']
Rare class Shap:  ['Internal assessment percentage' 'Class xii percentage'
 'Class x percentage']
Rare class Pcfi:  ['Internal assessment percentage' 'Class xii percentage'
 'Class x percentage']
4
4
Global (54.55, 40.46)
Shap (51.52, 35.53)
Pcfi (66.67, 53.97)
Rare Shap 0.0
Rare Pfci 0.0


In [57]:
# Import tumor data (author's result note: PCFI GLOBAL)
col_names = np.array([
    'class', 'age', 'sex', 'histologic-type', 'degree-of-diffe', 'bone', 'bone-marrow', 'lung', 'pleura',
    'peritoneum', 'liver', 'brain', 'skin', 'neck', 'supraclavicular', 'axillar', 'mediastinum', 'abdominal'
])
col_names = np.array([lab.capitalize() for lab in col_names])
class_names = np.array([
    'lung', 'head & neck', 'esophasus', 'thyroid', 'stomach', 'duoden & sm.int',
    'colon', 'rectum', 'anus', 'salivary glands', 'pancreas', 'gallblader',
    'liver', 'kidney', 'bladder', 'testis', 'prostate', 'ovary', 'corpus uteri', 
    'cervix uteri', 'vagina', 'breast'
])
tumor_data = pd.read_csv(proj_path+'/0_data/primary-tumor.data.csv', header=None, names=col_names)
# Discard the two columns that are not used as features.
tumor_data = tumor_data.loc[:,[col for col in col_names if not(col in ['Histologic-type', 'Degree-of-diffe'])]]
# Drop every row that has a missing value ('?') in any remaining column.
# Vectorised mask; the original also ran a row-wise apply whose result was discarded.
has_missing = (tumor_data == '?').any(axis=1)
tumor_data = tumor_data.loc[~has_missing,:]
# Columns that contained '?' were read as strings; make them numeric so the
# model and the explainers receive a numeric matrix.
tumor_data = tumor_data.apply(pd.to_numeric)

# Keep only the classes with at least 15 remaining samples.
class_tags, class_counts = np.unique(tumor_data.Class, return_counts=True)
keep = class_tags[class_counts>=15]
tumor_data = tumor_data.loc[tumor_data.Class.isin(keep)]

col_names = tumor_data.columns
feature_names = np.array(col_names[col_names!='Class'])
# Class tags are 1-based positions into `class_names`; `keep` is sorted, so the
# original name order is preserved.
class_names = class_names[keep-1]
class_tags = np.unique(tumor_data.Class)
class_col = 'Class'
rnd_seed = 45
data_info = ['Tumor', tumor_data, feature_names, class_col, class_names, class_tags, rnd_seed]
datasets.append(data_info)

# Fit the forest on all features, print the selected feature subsets, then
# refit on each subset and print its score.
data = tumor_data
rf_gscv, top_features_lst, rare_class = fitRF_and_rank(data, feature_names, class_col, rnd_seed)
for lab,feat in zip(['Global: ', 'Shap: ', 'Pcfi: ', 'Rare class Shap: ', 'Rare class Pcfi: '], top_features_lst):
    print(lab,feat)
label_lst = ['Global', 'Shap', 'Pcfi', 'Rare Shap', 'Rare Pfci']
# The first three subsets get (accuracy, macro F1); the last two get rare-class F1.
is_global_score = [True, True, True, False, False]
scores = [refitRF(data, subset, class_col, rf_gscv, rare_class, rnd_seed, global_score=global_score)
             for global_score,subset in zip(is_global_score,top_features_lst)]
for lab,score in zip(label_lst, scores):
    print(lab, score)

Is the dataset balanced? False
[ 1  2  5 11 12 14 18 22] [82 20 39 28 16 24 29 24]
(array([ 1,  2,  5, 11, 12, 14, 18, 22]), array([59, 15, 32, 23,  9, 21, 25, 12]))
(array([ 1,  2,  5, 11, 12, 14, 18, 22]), array([23,  5,  7,  5,  7,  3,  4, 12]))


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

Best score: 0.46806892045862636
Model performances:
Accuracy: 53.03
F1 macro score : 46.63
Do y and y_train have same rare class? True
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]
Global:  ['Sex' 'Age' 'Lung']
Shap:  ['Peritoneum' 'Sex' 'Liver']
Pcfi:  ['Sex' 'Age' 'Abdominal']
Rare class Shap:  ['Age' 'Sex' 'Abdominal']
Rare class Pcfi:  ['Age' 'Sex' 'Abdominal']
12
12
Global (25.76, 14.21)
Shap (39.39, 17.28)
Pcfi (34.85, 20.06)
Rare Shap 0.588235294117647
Rare Pfci 0.588235294117647


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [58]:
# Import flag data (author's result note: SHAP WINS BOTH)
col_names = np.array([
    'name', 'landmass', 'zone', 'area', 'population', 'language', 'religion', 'bars', 'stripes', 'colours',
    'red', 'green', 'blue', 'gold', 'white', 'black', 'orange', 'mainhue', 'circles', 'crosses', 'saltires',
    'quarters', 'sunstars', 'crescent', 'triangle', 'icon', 'animate', 'text', 'topleft', 'botright'
])
col_names = np.array([lab.capitalize() for lab in col_names])

# Every column except the target ('Religion') and the country name is a feature.
feature_names = np.array(col_names[np.logical_and(col_names!='Religion', col_names!='Name')])
class_col = 'Religion'
class_names = np.array(['Catholic', 'Other Christian', 'Muslim',# 'Buddhist', 'Hindu',
                        'Ethnic', 'Marxist', 'Others'])

class_tags = np.arange(len(class_names))
flag_data = pd.read_csv(proj_path+'/0_data/flag.data.csv', header=None, names=col_names)

# Encode the three colour-name columns as integers.
string_to_int = {'black':0, 'blue':1, 'brown':2, 'gold':3, 'green':4, 'orange':5, 'red':6, 'white':7}
for col_name in ['Mainhue', 'Topleft', 'Botright']:
    # Replace the whole column instead of writing through `.loc[:, col]`: from
    # pandas 2.0 the `.loc` form keeps the object dtype of the string column.
    # An unknown colour still raises KeyError, as before.
    flag_data[col_name] = [string_to_int[colour] for colour in flag_data[col_name]]
# Recode the religion labels: codes 3, 4 and 7 are merged into class 5, and
# codes 5 and 6 are renumbered to 3 and 4.
string_to_int = {0:0, 1:1, 2:2, 3:5, 4:5, 5:3, 6:4, 7:5}
flag_data['Religion'] = [string_to_int[code] for code in flag_data['Religion']]

# Author's seed note: 45 (lower than 70 accuracy)
rnd_seed = 45

data_info = ['Flag', flag_data, feature_names, class_col, class_names, class_tags, rnd_seed]
datasets.append(data_info)


# Fit the forest on all features, print the selected feature subsets, then
# refit on each subset and print its score.
data = flag_data
rf_gscv, top_features_lst, rare_class = fitRF_and_rank(data, feature_names, class_col, rnd_seed)
for lab,feat in zip(['Global: ', 'Shap: ', 'Pcfi: ', 'Rare class Shap: ', 'Rare class Pcfi: '], top_features_lst):
    print(lab,feat)
label_lst = ['Global', 'Shap', 'Pcfi', 'Rare Shap', 'Rare Pfci']
# The first three subsets get (accuracy, macro F1); the last two get rare-class F1.
is_global_score = [True, True, True, False, False]
scores = [refitRF(data, subset, class_col, rf_gscv, rare_class, rnd_seed, global_score=global_score)
             for global_score,subset in zip(is_global_score,top_features_lst)]
for lab,score in zip(label_lst, scores):
    print(lab, score)

Is the dataset balanced? False
[0 1 2 3 4 5] [40 60 36 27 15 16]
(array([0, 1, 2, 3, 4, 5]), array([33, 47, 27, 18,  9, 11]))
(array([0, 1, 2, 3, 4, 5]), array([ 7, 13,  9,  9,  6,  5]))


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Best score: 0.4388230236841348
Model performances:
Accuracy: 61.22
F1 macro score : 53.81
Do y and y_train have same rare class? True
[2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26]
0 2
Global:  ['Language' 'Landmass' 'Population']
Shap:  ['Landmass' 'Language' 'Zone']
Pcfi:  ['Language' 'Landmass' 'Population']
Rare class Shap:  ['Zone' 'Landmass' 'Language']
Rare class Pcfi:  ['Area' 'Sunstars' 'Language']
4
4
Global (61.22, 50.71)
Shap (61.22, 59.32)
Pcfi (61.22, 50.71)
Rare Shap 0.5
Rare Pfci 0.0


In [59]:
# Breast tissue data (author's result note: BAD PCFI GLOBAL)

breast_data = pd.read_excel(proj_path+'/0_data/BreastTissue.xls', sheet_name=1)
# Drop the first column; copy so the recoding below works on an independent frame.
breast_data = breast_data.iloc[:,1:].copy()

# After the drop, the first column is the target and the rest are features.
feature_names = np.array(breast_data.columns[1:])
class_col = 'Class'
class_names = np.unique(breast_data.Class)
class_tags = np.arange(len(class_names))

# Encode each class name as its position in the sorted list of class names.
string_to_int = dict(zip(class_names, class_tags))
# Replace the whole column instead of writing through `.loc[:, 'Class']`: from
# pandas 2.0 the `.loc` form keeps the object dtype of the string column, and
# scikit-learn rejects an object-dtype label vector of integers.
breast_data['Class'] = [string_to_int[name] for name in breast_data['Class']]

# Author's seed note: 45 is nonsensical, 47 is 85%
rnd_seed = 45

data_info = ['Breast_tissue', breast_data, feature_names, class_col, class_names, class_tags, rnd_seed]
datasets.append(data_info)


# Fit the forest on all features, print the selected feature subsets, then
# refit on each subset and print its score.

data = breast_data
print(data.head())
print(feature_names)
rf_gscv, top_features_lst, rare_class = fitRF_and_rank(data, feature_names, class_col, rnd_seed)
for lab,feat in zip(['Global: ', 'Shap: ', 'Pcfi: ', 'Rare class Shap: ', 'Rare class Pcfi: '], top_features_lst):
    print(lab,feat)
label_lst = ['Global', 'Shap', 'Pcfi', 'Rare Shap', 'Rare Pfci']
# The first three subsets get (accuracy, macro F1); the last two get rare-class F1.
is_global_score = [True, True, True, False, False]
scores = [refitRF(data, subset, class_col, rf_gscv, rare_class, rnd_seed, global_score=global_score)
             for global_score,subset in zip(is_global_score,top_features_lst)]
for lab,score in zip(label_lst, scores):
    print(lab, score)

   Class          I0     PA500       HFS          DA          Area       A/DA  \
0      1  524.794072  0.187448  0.032114  228.800228   6843.598481  29.910803   
1      1  330.000000  0.226893  0.265290  121.154201   3163.239472  26.109202   
2      1  551.879287  0.232478  0.063530  264.804935  11888.391827  44.894903   
3      1  380.000000  0.240855  0.286234  137.640111   5402.171180  39.248524   
4      1  362.831266  0.200713  0.244346  124.912559   3290.462446  26.342127   

      Max IP          DR           P  
0  60.204880  220.737212  556.828334  
1  69.717361   99.084964  400.225776  
2  77.793297  253.785300  656.769449  
3  88.758446  105.198568  493.701814  
4  69.389389  103.866552  424.796503  
['I0' 'PA500' 'HFS' 'DA' 'Area' 'A/DA' 'Max IP' 'DR' 'P']
Is the dataset balanced? False
[0 1 2 3 4 5] [22 21 14 15 16 18]
(array([0, 1, 2, 3, 4, 5]), array([17, 14, 11, 11, 12, 14]))
(array([0, 1, 2, 3, 4, 5]), array([5, 7, 3, 4, 4, 4]))


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Best score: 0.6238492988492989
Model performances:
Accuracy: 74.07
F1 macro score : 70.94
Do y and y_train have same rare class? True
[5, 6]
0 1
Global:  ['I0' 'P' 'DA' 'PA500' 'Max IP' 'A/DA']
Shap:  ['I0' 'P' 'DA' 'PA500' 'Max IP' 'A/DA']
Pcfi:  ['I0' 'P' 'DA' 'PA500' 'Max IP' 'DR']
Rare class Shap:  ['I0' 'P' 'DA']
Rare class Pcfi:  ['I0' 'P' 'DA']
2
2
Global (81.48, 79.89)
Shap (81.48, 79.89)
Pcfi (77.78, 74.11)
Rare Shap 1.0
Rare Pfci 1.0


In [60]:
#Car data # SHAP GLOBAL
# Load the car data CSV, encode every categorical column as integers,
# register the dataset in `datasets`, then rank features and refit the model
# on each top-ranked feature subset.

col_names = np.array([
    'buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety', 'class'
])
col_names = np.array([lab.capitalize() for lab in col_names])
car_data = pd.read_csv(proj_path+'/0_data/car.data.csv', header=None, names=col_names)

# One string -> int mapping per column, listed in the same order as `col_names`.
string_to_int_list = [
    {'vhigh':3, 'high':2, 'med':1, 'low':0},
    {'vhigh':3, 'high':2, 'med':1, 'low':0},
    {'2':2, '3':3, '4':4, '5more':5},
    {'2':0, '4':1, 'more':2},
    {'small':0, 'med':1, 'big':2},
    {'low':0, 'med':1, 'high':2},
    {'unacc':0, 'acc':1, 'good':2, 'vgood':3}
]

for col_name, string_to_int in zip(car_data.columns, string_to_int_list):
    # Column-wise `map` replaces the former row-wise `apply(axis=1)`.
    encoded = car_data[col_name].map(string_to_int)
    # `map` yields NaN for values missing from the mapping; keep failing
    # loudly with a KeyError, as the dictionary lookup used to do.
    unmapped = encoded.isna()
    if unmapped.any():
        raise KeyError('Unmapped categories in column {!r}: {}'.format(
            col_name, car_data.loc[unmapped, col_name].unique()))
    # Plain column assignment plus an explicit cast guarantees an integer
    # dtype; `car_data.loc[:, col] = ...` may keep the original object dtype.
    car_data[col_name] = encoded.astype(int)


class_col = 'Class'
feature_names = np.array(car_data.columns[car_data.columns != class_col])
class_names = np.array(['unacc', 'acc', 'good', 'vgood'])
class_tags = np.arange(len(class_names))
print(feature_names, class_names, class_tags)

rnd_seed = 45
data_info = ['Cars', car_data, feature_names, class_col, class_names, class_tags, rnd_seed]
datasets.append(data_info)


###
# Rank the features, then refit once per ranked subset.
# `fitRF_and_rank` and `refitRF` are defined elsewhere in this notebook.
data = car_data
print(data.head())
print(feature_names)
rf_gscv, top_features_lst, rare_class = fitRF_and_rank(data, feature_names, class_col, rnd_seed)
ranking_titles = ['Global: ', 'Shap: ', 'Pcfi: ', 'Rare class Shap: ', 'Rare class Pcfi: ']
for lab, feat in zip(ranking_titles, top_features_lst):
    print(lab, feat)

label_lst = ['Global', 'Shap', 'Pcfi', 'Rare Shap', 'Rare Pcfi']
# The first three subsets are refit with global_score=True, the two
# rare-class subsets with global_score=False.
is_global_score = [True, True, True, False, False]
scores = [
    refitRF(data, feature_subset, class_col, rf_gscv, rare_class, rnd_seed,
            global_score=use_global)
    for use_global, feature_subset in zip(is_global_score, top_features_lst)
]
for lab, score in zip(label_lst, scores):
    print(lab, score)

['Buying' 'Maint' 'Doors' 'Persons' 'Lug_boot' 'Safety'] ['unacc' 'acc' 'good' 'vgood'] [0 1 2 3]
   Buying  Maint  Doors  Persons  Lug_boot  Safety  Class
0       3      3      2        0         0       0      0
1       3      3      2        0         0       1      0
2       3      3      2        0         0       2      0
3       3      3      2        0         1       0      0
4       3      3      2        0         1       1      0
['Buying' 'Maint' 'Doors' 'Persons' 'Lug_boot' 'Safety']
Is the dataset balanced? False
[0 1 2 3] [1210  384   69   65]
(array([0, 1, 2, 3]), array([908, 288,  51,  49]))
(array([0, 1, 2, 3]), array([302,  96,  18,  16]))
Best score: 0.8936141113688872
Model performances:
Accuracy: 97.22
F1 macro score : 93.67
Do y and y_train have same rare class? True
[2, 3]
0 2
Global:  ['Safety' 'Persons' 'Maint']
Shap:  ['Safety' 'Persons' 'Buying']
Pcfi:  ['Safety' 'Persons' 'Maint']
Rare class Shap:  ['Safety' 'Buying' 'Lug_boot']
Rare class Pcfi:  ['Safety'

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


3
3
Global (76.16, 35.7)
Shap (80.32, 50.56)
Pcfi (76.16, 35.7)
Rare Shap 0.4827586206896552
Rare Pfci 0.4827586206896552


In [61]:
#WiFi signals data #ALL =
# Load the WiFi signals spreadsheet, register it in `datasets`, then rank
# features and refit the model on each top-ranked feature subset.
wifi_data = pd.read_excel(proj_path+'/0_data/WiFi_signals.xls')

class_col = 'Room'
# The distinct room labels serve as both class names and class tags.
class_names = np.unique(wifi_data[class_col])
feature_names = np.array(wifi_data.columns[wifi_data.columns != class_col])
class_tags = class_names

rnd_seed = 45

data_info = ['Wifi', wifi_data, feature_names, class_col, class_names, class_tags, rnd_seed]
datasets.append(data_info)


###
# Rank the features, then refit once per ranked subset.
# `fitRF_and_rank` and `refitRF` are defined elsewhere in this notebook.
data = wifi_data
print(data.head())
print(feature_names)
rf_gscv, top_features_lst, rare_class = fitRF_and_rank(data, feature_names, class_col, rnd_seed)
ranking_titles = ['Global: ', 'Shap: ', 'Pcfi: ', 'Rare class Shap: ', 'Rare class Pcfi: ']
for lab, feat in zip(ranking_titles, top_features_lst):
    print(lab, feat)

label_lst = ['Global', 'Shap', 'Pcfi', 'Rare Shap', 'Rare Pcfi']
# The first three subsets are refit with global_score=True, the two
# rare-class subsets with global_score=False.
is_global_score = [True, True, True, False, False]
scores = [
    refitRF(data, feature_subset, class_col, rf_gscv, rare_class, rnd_seed,
            global_score=use_global)
    for use_global, feature_subset in zip(is_global_score, top_features_lst)
]
for lab, score in zip(label_lst, scores):
    print(lab, score)

   Signal 1  Signal 2  Signal 3  Signal 4  Signal 5  Signal 6  Signal 7  Room
0       -64       -56       -61       -66       -71       -82       -81     1
1       -68       -57       -61       -65       -71       -85       -85     1
2       -63       -60       -60       -67       -76       -85       -84     1
3       -61       -60       -68       -62       -77       -90       -80     1
4       -63       -65       -60       -63       -77       -81       -87     1
['Signal 1' 'Signal 2' 'Signal 3' 'Signal 4' 'Signal 5' 'Signal 6'
 'Signal 7']
Is the dataset balanced? True
[1 2 3 4] [500 500 500 500]
(array([1, 2, 3, 4]), array([372, 379, 378, 371]))
(array([1, 2, 3, 4]), array([128, 121, 122, 129]))
Best score: 0.9847998472723013
Model performances:
Accuracy: 98.0
F1 macro score : 97.99
Do y and y_train have same rare class? False
[]
0 1
0 2
1 2
Global:  ['Signal 1' 'Signal 5' 'Signal 4']
Shap:  ['Signal 1' 'Signal 5' 'Signal 4']
Pcfi:  ['Signal 1' 'Signal 5' 'Signal 4']
Rare class Shap

In [62]:
#Import iris #ALL =
# Build a DataFrame from scikit-learn's iris dataset, register it in
# `datasets`, then rank features and refit the model on each top-ranked
# feature subset.
from sklearn.datasets import load_iris

iris = load_iris()
feature_names = np.array(iris.feature_names)
class_names = iris.target_names
# One integer tag per class name, built the same way as in the car data cell.
class_tags = np.arange(len(class_names))
class_col = 'Class'

iris_data = pd.DataFrame(iris.data, columns=feature_names)
# Positional assignment of the target array; no index alignment is involved.
iris_data[class_col] = iris.target

rnd_seed = 45
data_info = ['Iris', iris_data, feature_names, class_col, class_names, class_tags, rnd_seed]
datasets.append(data_info)


###
# Rank the features, then refit once per ranked subset.
# `fitRF_and_rank` and `refitRF` are defined elsewhere in this notebook.
data = iris_data
print(data.head())
print(feature_names)
rf_gscv, top_features_lst, rare_class = fitRF_and_rank(data, feature_names, class_col, rnd_seed)
ranking_titles = ['Global: ', 'Shap: ', 'Pcfi: ', 'Rare class Shap: ', 'Rare class Pcfi: ']
for lab, feat in zip(ranking_titles, top_features_lst):
    print(lab, feat)

label_lst = ['Global', 'Shap', 'Pcfi', 'Rare Shap', 'Rare Pcfi']
# The first three subsets are refit with global_score=True, the two
# rare-class subsets with global_score=False.
is_global_score = [True, True, True, False, False]
scores = [
    refitRF(data, feature_subset, class_col, rf_gscv, rare_class, rnd_seed,
            global_score=use_global)
    for use_global, feature_subset in zip(is_global_score, top_features_lst)
]
for lab, score in zip(label_lst, scores):
    print(lab, score)

   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  \
0                5.1               3.5                1.4               0.2   
1                4.9               3.0                1.4               0.2   
2                4.7               3.2                1.3               0.2   
3                4.6               3.1                1.5               0.2   
4                5.0               3.6                1.4               0.2   

   Class  
0      0  
1      0  
2      0  
3      0  
4      0  
['sepal length (cm)' 'sepal width (cm)' 'petal length (cm)'
 'petal width (cm)']
Is the dataset balanced? True
[0 1 2] [50 50 50]
(array([0, 1, 2]), array([36, 41, 35]))
(array([0, 1, 2]), array([14,  9, 15]))
Best score: 0.9641562141562142
Model performances:
Accuracy: 94.74
F1 macro score : 94.29
Do y and y_train have same rare class? False
[]
0 1
0 2
1 2
Global:  ['petal width (cm)' 'petal length (cm)' 'sepal length (cm)']
Shap:  ['petal width (cm)' 'p

In [63]:
#Import wine #ALL =
# Build a DataFrame from scikit-learn's wine dataset, register it in
# `datasets`, then rank features and refit the model on each top-ranked
# feature subset.
from sklearn.datasets import load_wine

wines = load_wine()
feature_names = np.array(wines.feature_names)
class_names = wines.target_names
# One integer tag per class name, built the same way as in the car data cell.
class_tags = np.arange(len(class_names))
class_col = 'Class'

wine_data = pd.DataFrame(wines.data, columns=feature_names)
# Positional assignment of the target array; no index alignment is involved.
wine_data[class_col] = wines.target

rnd_seed = 45
data_info = ['Wine', wine_data, feature_names, class_col, class_names, class_tags, rnd_seed]
datasets.append(data_info)


###
# Rank the features, then refit once per ranked subset.
# `fitRF_and_rank` and `refitRF` are defined elsewhere in this notebook.
data = wine_data
print(data.head())
print(feature_names)
rf_gscv, top_features_lst, rare_class = fitRF_and_rank(data, feature_names, class_col, rnd_seed)
ranking_titles = ['Global: ', 'Shap: ', 'Pcfi: ', 'Rare class Shap: ', 'Rare class Pcfi: ']
for lab, feat in zip(ranking_titles, top_features_lst):
    print(lab, feat)

label_lst = ['Global', 'Shap', 'Pcfi', 'Rare Shap', 'Rare Pcfi']
# The first three subsets are refit with global_score=True, the two
# rare-class subsets with global_score=False.
is_global_score = [True, True, True, False, False]
scores = [
    refitRF(data, feature_subset, class_col, rf_gscv, rare_class, rnd_seed,
            global_score=use_global)
    for use_global, feature_subset in zip(is_global_score, top_features_lst)
]
for lab, score in zip(label_lst, scores):
    print(lab, score)

   alcohol  malic_acid   ash  alcalinity_of_ash  magnesium  total_phenols  \
0    14.23        1.71  2.43               15.6      127.0           2.80   
1    13.20        1.78  2.14               11.2      100.0           2.65   
2    13.16        2.36  2.67               18.6      101.0           2.80   
3    14.37        1.95  2.50               16.8      113.0           3.85   
4    13.24        2.59  2.87               21.0      118.0           2.80   

   flavanoids  nonflavanoid_phenols  proanthocyanins  color_intensity   hue  \
0        3.06                  0.28             2.29             5.64  1.04   
1        2.76                  0.26             1.28             4.38  1.05   
2        3.24                  0.30             2.81             5.68  1.03   
3        3.49                  0.24             2.18             7.80  0.86   
4        2.69                  0.39             1.82             4.32  1.04   

   od280/od315_of_diluted_wines  proline  Class  
0           