In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from XRF import XRandomForestClassifier
from sklearn.metrics import f1_score, accuracy_score
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
import warnings
# Silence every warning category for the rest of the session so that library
# messages do not clutter the cell outputs below.
# NOTE(review): this blanket filter also hides deprecation notices raised by
# pandas / scikit-learn; consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

## Yeast Dataset

### Load dataset from URL

In [3]:
# Location of the raw Yeast data file in the UCI Machine Learning Repository.
data_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/yeast/yeast.data'

# Column names are supplied explicitly: one identifier column, eight numeric
# feature columns, and the class label column.
names = ['Sequence_Name','mcg', 'gvh', 'alm', 'mit', 'erl','pox','vac','nuc', 'Localization_Site']

# Fields are separated by runs of whitespace. `sep=r'\s+'` is the documented
# equivalent of `delim_whitespace=True`, which is deprecated in recent pandas
# releases (the deprecation notice is hidden by the global warning filter).
pdf_yeast = pd.read_csv(data_url, names=names, sep=r'\s+')

In [4]:
# Preview the first five rows to confirm the columns were parsed as expected.
pdf_yeast.head(5)

Unnamed: 0,Sequence_Name,mcg,gvh,alm,mit,erl,pox,vac,nuc,Localization_Site
0,ADT1_YEAST,0.58,0.61,0.47,0.13,0.5,0.0,0.48,0.22,MIT
1,ADT2_YEAST,0.43,0.67,0.48,0.27,0.5,0.0,0.53,0.22,MIT
2,ADT3_YEAST,0.64,0.62,0.49,0.15,0.5,0.0,0.53,0.22,MIT
3,AAR2_YEAST,0.58,0.44,0.57,0.13,0.5,0.0,0.54,0.22,NUC
4,AATM_YEAST,0.42,0.44,0.48,0.54,0.5,0.0,0.48,0.22,MIT


### Preprocess the data

In [5]:
# Column roles: the class label to predict and the per-row identifier.
target = 'Localization_Site'
key = 'Sequence_Name'

# Keep only rows whose label is one of these four localization sites;
# rows carrying any other (scarce) label are dropped.
kept_sites = ['CYT', 'NUC', 'MIT', 'ME3']
has_kept_site = pdf_yeast[target].isin(kept_sites)
pdf_yeast = pdf_yeast[has_kept_site]

# Feature matrix: every column except the identifier and the label.
X_yeast = pdf_yeast.drop(columns=[key, target])
# Label, kept as a one-column DataFrame (note the double brackets).
y_yeast = pdf_yeast[[target]]

### Set hyper-parameters

In [6]:
# RF's hyper-parameters.
# These three values are passed unchanged to both the baseline
# RandomForestClassifier and the XRandomForestClassifier below, so the two
# models are built with the same forest settings.
n_estimators = 40
max_depth = 5
max_samples = 0.4

# GA's hyper-parameters.
# Forwarded only to XRandomForestClassifier.
# NOTE(review): presumably these drive the genetic search whose per-generation
# log is printed during XRF fitting -- confirm against the XRF package.
mutation_probability = 0.7
mating_probability = 0.5
num_generations = 20

# XRF's config.
# `alpha` is handed to XRandomForestClassifier as `xai_weight`; the two strings
# are passed as `normalize_weights` and `target_function` respectively.
normalize_weights = 'Softmax'
target_function = 'Cosine'
alpha = 3.5

# Random seed.
# Shared by the train/test split and by both models.
random_state = 42

### Split data into train and test sets

In [7]:
# Hold out 20% of the rows for testing. The split is stratified on the label
# so both parts keep the same class proportions, and seeded for repeatability.
# The label is flattened to a 1-D array for the returned y_trn / y_tst.
X_trn, X_tst, y_trn, y_tst = train_test_split(
    X_yeast,
    y_yeast.values.ravel(),
    test_size=0.2,
    stratify=y_yeast,
    random_state=random_state,
)

### Models

In [8]:
# One preference value per feature column, in column order:
# mcg, gvh, alm, mit, erl, pox, vac, nuc.
# The vector is compared with each model's feature importances via cosine
# similarity (the "XS" score), passed to XRandomForestClassifier, and used to
# color columns in the final importance table (positive -> blue,
# negative -> magenta, zero -> unstyled).
# NOTE(review): presumably +1 means "should matter", -1 "should not matter",
# and 0 "no preference" -- confirm against the XRF package documentation.
feature_preferences = [0, 0, -1, -1, 1, 1, 0, 0]

In [9]:
# Empty accumulator tables; later cells append one row at a time.

# Feature-importance table: an 'ID' label column followed by every feature column.
pdf_ftr_imprt = pd.DataFrame(columns=['ID', *X_trn.columns])

# Metrics table: one row per (model, data split) pair.
metric_columns = ['ID', 'Set', 'Accuracy', 'Macro f1 score', 'XS']
pdf_results = pd.DataFrame(columns=metric_columns)

#### RF

In [10]:
# Baseline model: a plain random forest built from the shared forest settings.
rf_mdl = RandomForestClassifier(
    n_estimators=n_estimators,
    max_depth=max_depth,
    max_samples=max_samples,
    bootstrap=True,
    random_state=random_state,
)
# Bind the return value so the cell does not echo the estimator repr.
_ = rf_mdl.fit(X_trn, y_trn)

In [11]:
# Predict on the training split first, then on the test split.
rf_preds_trn, rf_preds_tst = (rf_mdl.predict(split) for split in (X_trn, X_tst))

# XS score: cosine similarity between the preference vector and the model's
# feature importances, rescaled from [-1, 1] to [0, 1].
rf_pref_vec = np.asarray(feature_preferences).reshape(1, -1)
rf_imp_vec = np.asarray(rf_mdl.feature_importances_).reshape(1, -1)
rf_cosine = cosine_similarity(rf_pref_vec, rf_imp_vec)[0][0]
rf_xs = (rf_cosine + 1) / 2

In [12]:
# Append the baseline's feature importances as a new row labelled 'RF FI'.
pdf_ftr_imprt.loc[len(pdf_ftr_imprt)] = ['RF FI', *rf_mdl.feature_importances_]

# Append one metrics row per split; XS is recorded on the test row only.
rf_rows = (
    ['RF', 'Training', accuracy_score(y_trn, rf_preds_trn), f1_score(y_trn, rf_preds_trn, average="macro"), np.nan],
    ['RF', 'Test', accuracy_score(y_tst, rf_preds_tst), f1_score(y_tst, rf_preds_tst, average="macro"), rf_xs],
)
for rf_row in rf_rows:
    pdf_results.loc[len(pdf_results)] = rf_row

#### XRF

In [13]:
# Preference-aware forest. The forest settings match the baseline RF; the
# remaining arguments configure the XRF-specific behaviour.
xrf_mdl = XRandomForestClassifier(
    # Forest settings shared with the baseline.
    n_estimators=n_estimators,
    max_depth=max_depth,
    max_samples=max_samples,
    bootstrap=True,
    random_state=random_state,
    # Preference vector and its configuration.
    feature_preferences=feature_preferences,
    xai_weight=alpha,
    normalize_weights=normalize_weights,
    target_function=target_function,
    performance_metric='accuracy',
    # Search settings.
    mutation_probability=mutation_probability,
    mating_probability=mating_probability,
    num_generations=num_generations,
    # Execution and logging.
    ext_verbose=1,
    n_jobs=-1,
)
# Bind the return value so the cell does not echo the estimator repr.
_ = xrf_mdl.fit(X_trn, y_trn)

gen	nevals	avg    	std       	min    	max    
0  	100   	1.35971	0.00551144	1.34602	1.37832
1  	84    	1.36592	0.00530446	1.35621	1.37832
2  	88    	1.37157	0.00426008	1.36143	1.37938
3  	86    	1.37523	0.00364116	1.36283	1.38404
4  	86    	1.37766	0.00323074	1.36871	1.38664
5  	87    	1.3797 	0.00406199	1.36519	1.3883 
6  	88    	1.3838 	0.00272353	1.378  	1.38914
7  	69    	1.38627	0.00216032	1.37906	1.39176
8  	81    	1.38854	0.00237746	1.38319	1.39694
9  	81    	1.3919 	0.00339331	1.38409	1.40229
10 	92    	1.39597	0.00252292	1.38936	1.4055 
11 	81    	1.39895	0.00295929	1.39242	1.40578
12 	81    	1.40253	0.00311802	1.39058	1.40859
13 	83    	1.40524	0.00189508	1.39903	1.41112
14 	85    	1.40686	0.00268926	1.39757	1.41713
15 	85    	1.40946	0.00405056	1.40192	1.42559
16 	82    	1.41485	0.00543883	1.40426	1.42986
17 	89    	1.42104	0.00497782	1.40574	1.43332
18 	82    	1.42565	0.00340776	1.41486	1.43685
19 	88    	1.42959	0.00379322	1.41937	1.44079
20 	84    	1.43324	0.0034341 	1.42

In [14]:
# Predict on the training split first, then on the test split.
xrf_preds_trn, xrf_preds_tst = (xrf_mdl.predict(split) for split in (X_trn, X_tst))

# XS score: cosine similarity between the preference vector and the model's
# feature importances, rescaled from [-1, 1] to [0, 1].
xrf_pref_vec = np.asarray(feature_preferences).reshape(1, -1)
xrf_imp_vec = np.asarray(xrf_mdl.feature_importances_).reshape(1, -1)
xrf_cosine = cosine_similarity(xrf_pref_vec, xrf_imp_vec)[0][0]
xrf_xs = (xrf_cosine + 1) / 2

In [15]:

# Append one metrics row per split; XS is recorded on the test row only.
xrf_rows = (
    ['XRF', 'Training', accuracy_score(y_trn, xrf_preds_trn), f1_score(y_trn, xrf_preds_trn, average="macro"), np.nan],
    ['XRF', 'Test', accuracy_score(y_tst, xrf_preds_tst), f1_score(y_tst, xrf_preds_tst, average="macro"), xrf_xs],
)
for xrf_row in xrf_rows:
    pdf_results.loc[len(pdf_results)] = xrf_row

# Append the preference vector itself ('FP'), then the XRF importances, so the
# final table shows them next to the baseline's importances.
xrf_ftr_rows = (
    ['FP', *feature_preferences],
    ['XRF FI', *xrf_mdl.feature_importances_],
)
for xrf_ftr_row in xrf_ftr_rows:
    pdf_ftr_imprt.loc[len(pdf_ftr_imprt)] = xrf_ftr_row

## Comparisons

In [16]:
# Move the identifier columns into the index of each table, in place.
# The "'ID' in columns" guard makes the cell safe to re-run: once 'ID' has
# been moved into the index it is no longer a column, so nothing happens.
for frame, index_cols in ((pdf_results, ['ID', 'Set']), (pdf_ftr_imprt, 'ID')):
    if 'ID' in frame.columns:
        frame.set_index(index_cols, drop=True, inplace=True)

In [17]:
def highlight_cols(df_in, preferences=None):
    """Build a CSS style frame that colors whole columns by preference sign.

    Intended for ``Styler.apply(highlight_cols, axis=None)``, which expects a
    DataFrame of CSS strings with the same index and columns as its input.

    Parameters
    ----------
    df_in : pandas.DataFrame
        The frame being styled; only its index and columns are used.
    preferences : sequence of numbers, optional
        One value per column, matched to columns by position. A positive
        value colors the column blue, a negative value magenta, and zero
        leaves it unstyled. When omitted, the notebook-level
        ``feature_preferences`` list is used, which keeps existing
        ``Styler.apply(highlight_cols, axis=None)`` calls working unchanged.

    Returns
    -------
    pandas.DataFrame
        CSS declarations (or empty strings) shaped like ``df_in``.

    Raises
    ------
    IndexError
        If ``preferences`` has more entries than ``df_in`` has columns.
    """
    if preferences is None:
        # Backward-compatible default: fall back to the notebook global.
        preferences = feature_preferences

    positive_css = 'background-color: blue'
    negative_css = 'background-color: magenta'

    # Start fully unstyled; columns without a preference entry stay that way.
    df_sty = pd.DataFrame('', index=df_in.index, columns=df_in.columns)
    for col_pos, preference in enumerate(preferences):
        if preference > 0:
            df_sty.iloc[:, col_pos] = positive_css
        elif preference < 0:
            df_sty.iloc[:, col_pos] = negative_css
    return df_sty

In [18]:
# Metrics per model and data split.
display(pdf_results)

# Feature importances next to the preference vector, with preferred and
# penalised feature columns color-coded by highlight_cols.
importances_styled = pdf_ftr_imprt.style.apply(highlight_cols, axis=None)
display(importances_styled)

Unnamed: 0_level_0,Unnamed: 1_level_0,Accuracy,Macro f1 score,XS
ID,Set,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
RF,Training,0.685274,0.71318,
RF,Test,0.611538,0.646408,0.192312
XRF,Training,0.689124,0.716157,
XRF,Test,0.626923,0.670479,0.215649


Unnamed: 0_level_0,mcg,gvh,alm,mit,erl,pox,vac,nuc
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
RF FI,0.078432,0.096517,0.361567,0.236278,0.002684,0.001634,0.058114,0.164776
FP,0.0,0.0,-1.0,-1.0,1.0,1.0,0.0,0.0
XRF FI,0.089688,0.112325,0.376093,0.179957,0.001769,0.007822,0.048062,0.184285
