In [7]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import roc_auc_score

from mlxtend.feature_selection import ExhaustiveFeatureSelector as EFS

In [8]:
# load dataset
data = pd.read_excel('../dati.xlsx', nrows=50000)
data.shape

ImportError: Missing optional dependency 'xlrd'. Install xlrd >= 1.0.0 for Excel support Use pip or conda to install xlrd.

In [53]:
data.tail()

Unnamed: 0,ctx,codice_dm,lista_id,pers_sesso,pers_eta_ingresso_in_lista,diagnosi,diagnosi_specifica,eltr,lista_ingresso_data,tot_tx_organo,...,donatore_HBeAg,donatore_HBeAb,donatore_antiCoreTot,donatore_hbv_dna,donatore_hcv_rna,donatore_steatosiMacro,donatore_biopsiaEpatica,donatore_biopsiaRenale,donatore_livelloRischioPreLT,donatore_variazioneLivello
2306,BOLOGNA,89357,40632,M,54,Epatocarcinoma associato a cirrosi,,E1 : Cancers-Hepatocellular carcinoma and cirr...,2020-10-19,0.0,...,,,NEG,NEG,NEG,,Descrizione macroscopica: biopsia epatica cune...,,NON STANDARD con RISCHIO ACCETTABILE,
2307,BOLOGNA,89384,40643,M,48,Localizzazione epatiche di neoplasie maligne o...,Metastasi epatiche non-resecabili di origine c...,E11 : Cancers-Secondary liver tumors - Colorectal,2020-10-21,0.0,...,,,NEG,NEG,NEG,,,,NON STANDARD con RISCHIO ACCETTABILE,
2308,BOLOGNA,89625,40686,M,62,Epatocarcinoma associato a cirrosi,,E1 : Cancers-Hepatocellular carcinoma and cirr...,2020-11-05,0.0,...,,,NEG,none seguito,none seguito,25.0,Cuneo epatico sottoglissoniano e agobiopsia ep...,,NON STANDARD con RISCHIO ACCETTABILE,MGUS
2309,BOLOGNA,89889,40719,F,69,Cirrosi biliare primitiva,,B2 : Cholestatic disease-Primary biliary cirrh...,2020-11-26,0.0,...,,,NEG,NEG,NEG,30.0,Cuneo sottoglissoniano e biopsia intraparenchi...,"Biopsia del rene destro, comprendente compless...",Standard,
2310,BOLOGNA,89931,40727,M,31,Altra epatopatia colestatica,,B4 : Cholestatic disease-Others specify,2020-11-27,0.0,...,,,POS,NEG,NEG,,,,NON STANDARD con RISCHIO TRASCURABILE,


In [54]:
# Encoding categorical variables into numbers
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
numerical_vars = list(data.select_dtypes(include=numerics).columns)
data = data[numerical_vars]
data.shape

(2311, 58)

In [73]:
# separate train and test sets
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(labels=['codice_dm', 'lista_id'], axis=1),
    data['codice_dm'],
    test_size=0.3,
    random_state=0)

X_train.shape, X_test.shape

((1617, 56), (694, 56))

In [74]:
# find and remove correlated features
# in order to reduce the feature space a bit
# so that the algorithm takes shorter

def correlation(dataset, threshold):
    col_corr = set()  # Set of all the names of correlated columns
    corr_matrix = dataset.corr()
    for i in range(len(corr_matrix.columns)):
        for j in range(i):
            if abs(corr_matrix.iloc[i, j]) > threshold: # we are interested in absolute coeff value
                colname = corr_matrix.columns[i]  # getting the name of column
                col_corr.add(colname)
    return col_corr

corr_features = correlation(X_train, 0.8)
print('correlated features: ', len(set(corr_features)) )

correlated features:  23


In [59]:
# removed correlated  features
X_train.drop(labels=corr_features, axis=1, inplace=True)
X_test.drop(labels=corr_features, axis=1, inplace=True)

X_train.shape, X_test.shape

((1617, 33), (694, 33))

In [60]:
X_train.columns[0:20]

Index(['pers_eta_ingresso_in_lista', 'tot_tx_organo', 'Na', 'Peso', 'Altezza',
       'BMI', 'MELD_in_ingresso', 'HCC_noduli', 'HCC_diam_max',
       'HCC_diam_totale', 'AFP', 'ISO2_valid_increment', 'ISO2_valid_giorni',
       'codice_sit_donazione', 'donatore_eta', 'trap_don_decesso_id',
       'donatore_peso', 'donatore_altezza', 'donatore_BMI',
       'donatore_giorni_ricovero'],
      dtype='object')

In [62]:
# exhaustive feature selection
# Using 10 features with ROC_AUC Scoring

efs1 = EFS(RandomForestClassifier(n_jobs=4, random_state=0), 
           min_features=1,
           max_features=4, 
           #scoring='roc_auc',
           print_progress=True,
           cv=2)

efs1 = efs1.fit(np.array(X_train[X_train.columns[0:4]].fillna(0)), y_train)

Features: 15/15

In [63]:
def run_randomForests(X_train, X_test, y_train, y_test):
    rf = RandomForestClassifier(n_estimators=200, random_state=39, max_depth=4)
    rf.fit(X_train, y_train)
    print('Train set')
    pred = rf.predict_proba(X_train)
    print('Random Forests roc-auc: {}'.format(roc_auc_score(y_train, pred[:,1])))
    print('Test set')
    pred = rf.predict_proba(X_test)
    print('Random Forests roc-auc: {}'.format(roc_auc_score(y_test, pred[:,1])))

In [64]:
efs1.best_idx_

(0, 3)

In [65]:
selected_feat= X_train.columns[list(efs1.best_idx_)]
selected_feat

Index(['pers_eta_ingresso_in_lista', 'Peso'], dtype='object')

In [67]:
selected_feat= X_train.columns[list(efs1.best_idx_)]
selected_feat

Index(['pers_eta_ingresso_in_lista', 'Peso'], dtype='object')

In [68]:
# evaluate performance of classifier using selected features

run_randomForests(X_train[selected_feat].fillna(0),
                  X_test[selected_feat].fillna(0),
                  y_train, y_test)

Train set


ValueError: multi_class must be in ('ovo', 'ovr')