# Imports

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import zipfile
import _pickle
import pickle
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, auc, roc_auc_score
import time

# Data loading
Remember to change path if needed

In [2]:
path = "D:/GitHub/Machine-Learning/Kaggle/"
# path = "C:/Users/Javi/Documents/GitHub/Machine-Learning/Kaggle/"


def load_zipped_pickle(zip_path, member):
    """Unpickle the file `member` stored inside the zip archive at `zip_path`.

    The archive and the member stream are opened with context managers, so
    both are closed even when reading or unpickling raises.

    NOTE(review): unpickling can execute arbitrary code; only use this on
    archives that come from a trusted source.
    """
    with zipfile.ZipFile(zip_path, 'r') as archive:
        with archive.open(member) as stream:
            return pickle.loads(stream.read())


df_train = load_zipped_pickle(path+'zipped_TrainData.zip', 'TrainData.pkl')
df_test = load_zipped_pickle(path+'zipped_TestDataUnlabeled.zip', 'TestDataUnlabeled.pkl')

# Data split & Spectrum in regular bins
The data is split into a proper training set (later used in cross-validation) and a test_train set. Since we have labels for the test_train set, it helps us detect under/overfitting.

Spectra are divided into regular, fixed-size bins (the same bins for every sample) so that each bin can be treated as a feature, without worrying about the different mz scales of the individual spectra. According to the literature, the peaks contain the relevant information, so for each bin (a range of mz coordinates) we only keep the maximum value, which ensures that peak information is never lost. Moreover, with this binning, peaks at very close mz values (same compound, small mz differences due to experimental uncertainty) fall into the same bin and are therefore seen by the machine as the same feature. This makes it straightforward to use peak values as features.

Also, peak values are normalized by the maximum peak value of the spectrum, as specific values are experiment-dependent and do not carry information, only the relation between peak sizes does.

In [3]:
def spectrum_in_bins(df,m,M,bin_size):
    """Resample every spectrum of `df` onto a fixed grid of mz bins.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per sample. The columns 'intensity' and 'coord_mz' are read;
        each cell is expected to hold a 1-D array-like, with both cells of a
        row having the same length.
    m, M, bin_size : int
        Bins start at every value of ``range(m, M, bin_size)`` and each one
        covers the half-open interval ``[start, start + bin_size)``.

    Returns
    -------
    pandas.DataFrame
        Indexed like `df`, one column per bin labelled with the integer
        midpoint of the bin. Each value is the maximum intensity found in the
        bin (0 when the bin is empty), divided by the largest binned value of
        that row.

    Notes
    -----
    * The lower edge of each bin is inclusive, so a point lying exactly on a
      bin boundary is assigned to a bin instead of being dropped.
    * A row whose binned values are all zero (e.g. no point inside [m, M)) is
      returned as zeros instead of being divided by zero into NaNs.
    """
    bin_starts = list(range(m, M, bin_size))
    # Label of a bin = truncated mean of its two limits
    bin_labels = [int(np.mean([start, start + bin_size])) for start in bin_starts]

    n_samples = len(df)
    n_bins = len(bin_starts)
    binned = np.zeros((n_samples, n_bins))

    for idx in range(n_samples):
        intensity = np.asarray(df['intensity'].iloc[idx])
        mzcoord = np.asarray(df['coord_mz'].iloc[idx])
        row = binned[idx]  # view: writes below land directly in `binned`

        for i, start in enumerate(bin_starts):
            in_bin = (mzcoord >= start) & (mzcoord < start + bin_size)
            if in_bin.any():
                # keep only the peak of the bin; empty bins stay at 0
                row[i] = np.max(intensity[in_bin])

        # Normalize the amplitude of the spectrum (skipped when there is nothing to scale)
        peak = np.max(row) if n_bins else 0.0
        if peak != 0:
            row /= peak

    return pd.DataFrame(data=binned, columns=bin_labels, index=df.index)

In [4]:
# Binning configuration passed to spectrum_in_bins below
m = 2000
M = 20000
bin_size = 50

# From df_train: the last two columns are taken as the spectra data,
# and columns 1 up to (not including) the last two as the targets
targets = df_train.iloc[:,1:-2]
data = df_train.iloc[:,-2:]

# Hold out 20% of the labelled samples as the test_train set (fixed seed)
(data_train,
 data_test_train,
 targets_train,
 targets_test_train) = train_test_split(data, targets, test_size=0.2, random_state=42)
print('Training samples: '+str(len(data_train))+' and test_train samples: ' + str(len(data_test_train)) )

# Bin every spectrum on the same grid so that all samples share one feature space.
# spectrum_train / spectrum_test_train are the X matrices used for training and evaluation.
spectrum_train = spectrum_in_bins(data_train, m, M, bin_size)
spectrum_test_train = spectrum_in_bins(data_test_train, m, M, bin_size)
print('Spectrum regularized!')

Training samples: 412 and test_train samples: 104
Spectrum regularized!


# Try different classifiers
The try_clf function takes a classifier and a parameter dictionary (for hyperparameter cross-validation), trains one classifier per antibiotic, and returns the results. This allows fast testing of different classifiers. The function also takes care of removing the samples with NaN target values, which occur for amikacina, levofloxacino and tobramicina.

In [5]:
def clean_nan_samples(spectrum,targets, c, cat):
    """Build the (X, Y) pair for one target, dropping samples whose label is NaN.

    Parameters
    ----------
    spectrum : pandas.DataFrame
        Feature matrix, one row per sample.
    targets : pandas.DataFrame
        Label matrix with the same index as `spectrum`, one column per target.
    c : int
        Positional index of the target column inside `targets`.
    cat : label
        Column name of the same target (used for the NaN check).

    Returns
    -------
    X : pandas.DataFrame
        The columns of `spectrum` for the rows whose `cat` label is not NaN
        (a deep copy of `spectrum` when no label is missing).
    Y : numpy.ndarray
        1-D array with the matching labels.

    The column offsets are derived from ``spectrum.shape[1]``, so the function
    works for any number of target columns.
    """
    if targets[cat].isnull().any():
        n_features = spectrum.shape[1]
        # Concatenate side by side so that features and labels are dropped together
        merged = pd.concat([spectrum, targets], axis=1)
        clean = merged.dropna(subset=[cat])

        X = clean.iloc[:, :n_features]
        Y = clean.iloc[:, n_features + c].to_numpy().reshape(-1,)
    else:
        X = spectrum.copy(deep=True)
        Y = targets.iloc[:, c].to_numpy().reshape(-1,)
    return X , Y

In [6]:
def try_clf(clf,params,n_cv=5):
    """Grid-search and evaluate one classifier per target column.

    For every column of the notebook-level `targets_train`, samples with a NaN
    label are removed (`clean_nan_samples`), `clf` is tuned with GridSearchCV
    over `params`, and the refitted best estimator is scored (accuracy and ROC
    AUC) on the training split and on the held-out test_train split. Progress
    and scores are printed along the way.

    Reads the notebook globals `spectrum_train`, `targets_train`,
    `spectrum_test_train` and `targets_test_train`.

    Parameters
    ----------
    clf : estimator
        Estimator handed to GridSearchCV.
    params : dict
        Parameter grid handed to GridSearchCV.
    n_cv : int, default 5
        Value passed as GridSearchCV's `cv`.

    Returns
    -------
    tuple of five lists, each with one entry per target column:
        best estimators, train accuracies, test_train accuracies,
        train AUCs, test_train AUCs.
    """
    import inspect

    t1 = time.time()

    # `iid` no longer exists in recent scikit-learn releases, where passing it
    # raises a TypeError. Keep iid=False where the installed GridSearchCV still
    # accepts it, so results on older versions are unchanged.
    grid_kwargs = {'param_grid': params, 'cv': n_cv}
    if 'iid' in inspect.signature(GridSearchCV).parameters:
        grid_kwargs['iid'] = False

    best_classifiers = []
    accuracies_train = []
    accuracies_test_train = []
    AUC_train = []
    AUC_test_train = []

    categories = targets_train.columns[:]
    for c,cat in enumerate(categories):

        print([cat]) # indicate in which antibiotic we are

        # Selection of train and test data (depending on whether there are NaN target values)
        X_train, Y_train = clean_nan_samples(spectrum_train,targets_train, c, cat)
        X_test_train, Y_test_train = clean_nan_samples(spectrum_test_train,targets_test_train, c, cat)

        # perform a GridSearchCV in order to train a classifier for this antibiotic
        grid = GridSearchCV(clf, **grid_kwargs)
        grid.fit(X_train, Y_train)

        # print the best parameters (to detect edge values), and save that classifier
        print('The best parameters are: ',grid.best_params_)
        best_clf = grid.best_estimator_
        best_classifiers.append(best_clf)

        # compute the accuracy of the classifier
        acc_train = best_clf.score(X_train, Y_train)
        acc_test = best_clf.score(X_test_train, Y_test_train)
        print('Train accuracy: ',np.round(acc_train,4),' and test_train accuracy: ',np.round(acc_test,4))
        accuracies_train.append(acc_train)
        accuracies_test_train.append(acc_test)

        # compute the AUC of the classifier; estimators without predict_proba
        # are ranked with their decision_function scores instead
        if callable(getattr(best_clf,"predict_proba",None)):
            pred_train = best_clf.predict_proba(X_train)[:,-1] # only take last column, the prob of Y = +1
            pred_test = best_clf.predict_proba(X_test_train)[:,-1]
        else:
            print('Using decision_function instead of predict_proba')
            pred_train = best_clf.decision_function(X_train)
            pred_test = best_clf.decision_function(X_test_train)
        auc_score_train = roc_auc_score(Y_train, pred_train)
        auc_score_test = roc_auc_score(Y_test_train, pred_test)
        print('Train AUC: ',np.round(auc_score_train,4),' and test_train AUC: ',np.round(auc_score_test,4))
        AUC_train.append(auc_score_train)
        AUC_test_train.append(auc_score_test)

    avg_AUC_train = np.mean(AUC_train)
    avg_AUC_test_train = np.mean(AUC_test_train)
    print('\n\nThe average train AUC is',np.round(avg_AUC_train,4),'and the avg test_train AUC is',np.round(avg_AUC_test_train,4))

    t2 = time.time()
    print('\nFull execution took ',np.round(t2-t1,1),'seconds')
    print('\nDONE!')
    return best_classifiers, accuracies_train, accuracies_test_train, AUC_train, AUC_test_train

## L1 feature selection

In [7]:
def get_l1_clfs():
    """Train one L1-penalised logistic regression per target via `try_clf`.

    The regularisation strength C is searched over 10**-2 ... 10**4.

    Returns
    -------
    list
        The best estimator found for each target (first element of the
        `try_clf` result); the accuracy and AUC lists are discarded.
    """
    from sklearn.linear_model import LogisticRegression
    # max_iter must be an integer: recent scikit-learn versions validate
    # parameter types and reject the float 1e6.
    clf = LogisticRegression(penalty='l1',solver='liblinear',max_iter=1000000, class_weight='balanced')
    params = {'C':10.**np.arange(-2,5)}
    l1_best_clfs,_,_,_,_ = try_clf(clf,params)
    return l1_best_clfs

def obtain_l1_vects(l1_best_clfs,spectrum_train,targets_train):
    """Turn each fitted L1 classifier into a boolean feature mask.

    For every column of `targets_train` (by position), the mask marks the
    coefficients of the matching classifier in `l1_best_clfs` that are
    non-zero. If a classifier has no non-zero coefficient at all, its C is
    multiplied by 10 and it is refitted on the NaN-cleaned training data,
    repeatedly, until at least one coefficient survives. Refitted classifiers
    are written back into `l1_best_clfs` (the list is modified in place).

    Returns
    -------
    list of 1-D boolean numpy arrays, one per target column.
    """
    def _n_nonzero(model):
        # number of coefficients the L1 penalty did not shrink to exactly zero
        return np.sum(np.abs(model.coef_) > 0)

    masks = []
    for pos, antibiotic in enumerate(targets_train.columns[:]):
        n_selected = _n_nonzero(l1_best_clfs[pos])
        print('Number of features:',n_selected)

        while n_selected == 0:
            model = l1_best_clfs[pos]
            # weaker regularisation: ten times the current C
            model.set_params(C=model.get_params()['C'] * 10)
            X_fit, Y_fit = clean_nan_samples(spectrum_train,targets_train, pos, antibiotic)
            model.fit(X_fit, Y_fit)
            l1_best_clfs[pos] = model
            n_selected = _n_nonzero(model)
            print(n_selected)

        # at this point at least one coefficient is non-zero
        masks.append((np.abs(l1_best_clfs[pos].coef_) > 0).reshape(-1,))
    return masks

# to be applied to each category
def apply_l1_feature_selection(spectrum_train,vect): # vect is l1_feat_list[c]
    """Return a copy of `spectrum_train` keeping only the columns picked by `vect`.

    `vect` is used as the column selector of `.iloc` (in this notebook, the
    boolean mask of one target). The input frame is left untouched.
    """
    spectrum_copy = spectrum_train.copy(deep=True)
    selected = spectrum_copy.iloc[:, vect]
    return selected

# to obtain a list, with an element for each category
def apply_l1_feature_selection_listmode(spectrum_train,vect_list): # vect_list is l1_feat_list
    """Apply every column selector of `vect_list` to `spectrum_train`.

    Returns a list with one DataFrame per element of `vect_list`, each being
    an independent copy of `spectrum_train` restricted to the columns selected
    by that element (used as the column selector of `.iloc`).

    The function iterates over `vect_list` itself, so it does not depend on
    any notebook-level variable.
    """
    return [spectrum_train.copy(deep=True).iloc[:, vect] for vect in vect_list]

In [14]:
# l1_best_clfs = get_l1_clfs()

['OXACILINA']
The best parameters are:  {'C': 100.0}
Train accuracy:  0.9248  and test_train accuracy:  0.7788
Train AUC:  0.9742  and test_train AUC:  0.8348
['AMIKACINA']
The best parameters are:  {'C': 100.0}
Train accuracy:  0.9104  and test_train accuracy:  0.7927
Train AUC:  0.9734  and test_train AUC:  0.7327
['AMOXI/CLAV']
The best parameters are:  {'C': 100.0}
Train accuracy:  0.9175  and test_train accuracy:  0.7596
Train AUC:  0.9744  and test_train AUC:  0.815
['CIPROFLOXACINO']
The best parameters are:  {'C': 100.0}
Train accuracy:  0.8762  and test_train accuracy:  0.7596
Train AUC:  0.9552  and test_train AUC:  0.8066
['CLINDAMICINA']
The best parameters are:  {'C': 0.01}
Train accuracy:  0.7961  and test_train accuracy:  0.7788
Train AUC:  0.5  and test_train AUC:  0.5
['ERITROMICINA']
The best parameters are:  {'C': 100.0}
Train accuracy:  0.8398  and test_train accuracy:  0.6731
Train AUC:  0.9266  and test_train AUC:  0.7207
['LEVOFLOXACINO']
The best parameters are:

NameError: name 'l1_feat_list' is not defined

In [21]:
# Derive one boolean feature mask per target from the fitted L1 classifiers
# (obtain_l1_vects refits, with a larger C, any classifier that kept no feature) ...
l1_feat_list = obtain_l1_vects(l1_best_clfs,spectrum_train,targets_train)
# ... and build, for each target, the training spectra restricted to its selected bins
new_spectrum_list = apply_l1_feature_selection_listmode(spectrum_train,l1_feat_list)

Number of features: 93
Number of features: 82
Number of features: 94
Number of features: 99
Number of features: 0
9
Number of features: 104
Number of features: 101
Number of features: 129
Number of features: 84


In [26]:
# Inspect the boolean feature mask stored at index 4 of l1_feat_list
print(l1_feat_list[4])

[False  True False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False  True  True False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False  True False False False
  True False False False False False  True False False False False False
 False False False False False False False False False False False False
 False False False  True False False False False False False False False
  True False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False  True False False False
 False False False False False False False False Fa

In [27]:
# Inspect the feature-selected training spectra stored at index 4 of new_spectrum_list
print(new_spectrum_list[4])

         2075      3425      3475      4825      5025      5325      6375  \
ID                                                                          
329  0.096192  0.077655  0.058617  0.073146  0.150301  0.264529  0.241483   
173  0.071889  0.212948  0.073907  0.167141  0.258142  0.444178  0.239354   
272  0.063321  0.214897  0.070609  0.195325  0.219065  0.511882  0.237442   
496  0.041735  0.138843  0.045350  0.083894  0.220917  0.512740  0.229494   
182  0.166476  0.466049  0.170670  0.626819  0.319971  0.992097  0.390499   
..        ...       ...       ...       ...       ...       ...       ...   
71   0.091350  0.437960  0.104902  0.468475  0.336644  0.712556  0.231952   
106  0.092219  0.161396  0.073400  0.309547  0.233002  0.611231  0.355546   
270  0.122623  0.139775  0.081102  0.139697  0.343888  0.560652  0.466123   
435  0.150422  0.185517  0.563532  0.359765  0.457895  0.593363  0.250892   
102  0.186658  0.277406  0.094011  0.287433  0.273583  1.000000  0.328262   

In [28]:
# save the l1_best_clfs, as it takes so long to train

# with open(path+'l1_best_clfs.data', 'wb') as filehandle:
#     # store the data as binary data stream
#     pickle.dump(l1_best_clfs, filehandle)
    
# with open(path+'l1_feat_list.data', 'wb') as filehandle:
#     # store the data as binary data stream
#     pickle.dump(l1_feat_list, filehandle)

In [30]:
# NOTE(review): l1_best_clfs2 is only assigned inside commented-out code (the
# pickle reload check), so this cell raises NameError on a fresh kernel unless
# that code is re-enabled and run first.
print(l1_best_clfs2)

[LogisticRegression(C=100.0, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=1000000.0, multi_class='warn', n_jobs=None,
                   penalty='l1', random_state=None, solver='liblinear',
                   tol=0.0001, verbose=0, warm_start=False), LogisticRegression(C=100.0, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=1000000.0, multi_class='warn', n_jobs=None,
                   penalty='l1', random_state=None, solver='liblinear',
                   tol=0.0001, verbose=0, warm_start=False), LogisticRegression(C=100.0, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=1000000.0, multi_class='warn', n_jobs=None,
                   penalty='l1', random_state=None, solver='liblinear',
                   tol=0.

In [29]:
# with open(path+'l1_best_clfs.data', 'rb') as filehandle:
#     # read the data as binary data stream
#     l1_best_clfs2 = pickle.load(filehandle)
# with open(path+'l1_feat_list.data', 'rb') as filehandle:
#     # read the data as binary data stream
#     l1_feat_list2 = pickle.load(filehandle)

    
# print(apply_l1_feature_selection(spectrum_train,l1_feat_list2[4]))

         2075      3425      3475      4825      5025      5325      6375  \
ID                                                                          
329  0.096192  0.077655  0.058617  0.073146  0.150301  0.264529  0.241483   
173  0.071889  0.212948  0.073907  0.167141  0.258142  0.444178  0.239354   
272  0.063321  0.214897  0.070609  0.195325  0.219065  0.511882  0.237442   
496  0.041735  0.138843  0.045350  0.083894  0.220917  0.512740  0.229494   
182  0.166476  0.466049  0.170670  0.626819  0.319971  0.992097  0.390499   
..        ...       ...       ...       ...       ...       ...       ...   
71   0.091350  0.437960  0.104902  0.468475  0.336644  0.712556  0.231952   
106  0.092219  0.161396  0.073400  0.309547  0.233002  0.611231  0.355546   
270  0.122623  0.139775  0.081102  0.139697  0.343888  0.560652  0.466123   
435  0.150422  0.185517  0.563532  0.359765  0.457895  0.593363  0.250892   
102  0.186658  0.277406  0.094011  0.287433  0.273583  1.000000  0.328262   

In [8]:
# and, more often, retrieve this data
# Both files are read from `path`; they must have been written there beforehand.
# NOTE(review): pickle.load can execute arbitrary code - only load files you trust.

with open(path+'l1_best_clfs.data', 'rb') as filehandle:
    # read the data as binary data stream
    l1_best_clfs = pickle.load(filehandle)
with open(path+'l1_feat_list.data', 'rb') as filehandle:
    # read the data as binary data stream
    l1_feat_list = pickle.load(filehandle)

## try_clf_feat_selection
Let's modify the original try_clf so that it accepts a feature_vector_list and uses it to train classifiers on fewer features, with the aim of making them more powerful.

In [9]:
def try_clf_feat_selection(clf, params, feature_vector_list, n_cv=10):
    """Grid-search and evaluate one classifier per target on a feature subset.

    For every column of the notebook-level `targets_train` (by position `c`),
    samples with a NaN label are removed (`clean_nan_samples`), the features
    are restricted with `feature_vector_list[c]`
    (`apply_l1_feature_selection`), `clf` is tuned with GridSearchCV over
    `params`, and the refitted best estimator is scored (accuracy and ROC AUC)
    on the training split and on the held-out test_train split. Progress and
    scores are printed along the way.

    Reads the notebook globals `spectrum_train`, `targets_train`,
    `spectrum_test_train` and `targets_test_train`.

    Parameters
    ----------
    clf : estimator
        Estimator handed to GridSearchCV.
    params : dict
        Parameter grid handed to GridSearchCV.
    feature_vector_list : sequence
        One column selector per target column, in the same order.
    n_cv : int, default 10
        Value passed as GridSearchCV's `cv`.

    Returns
    -------
    tuple of five lists, each with one entry per target column:
        best estimators, train accuracies, test_train accuracies,
        train AUCs, test_train AUCs.
    """
    import inspect

    t1 = time.time()

    # `iid` no longer exists in recent scikit-learn releases, where passing it
    # raises a TypeError. Keep iid=False where the installed GridSearchCV still
    # accepts it, so results on older versions are unchanged.
    grid_kwargs = {'param_grid': params, 'cv': n_cv}
    if 'iid' in inspect.signature(GridSearchCV).parameters:
        grid_kwargs['iid'] = False

    best_classifiers = []
    accuracies_train = []
    accuracies_test_train = []
    AUC_train = []
    AUC_test_train = []

    categories = targets_train.columns[:]
    for c,cat in enumerate(categories):

        print([cat]) # indicate in which antibiotic we are

        # Selection of train and test data (depending on whether there are NaN target values)
        X_train, Y_train = clean_nan_samples(spectrum_train,targets_train, c, cat)
        X_test_train, Y_test_train = clean_nan_samples(spectrum_test_train,targets_test_train, c, cat)

        # keep only the features selected for this antibiotic
        X_train = apply_l1_feature_selection(X_train,feature_vector_list[c])
        X_test_train = apply_l1_feature_selection(X_test_train,feature_vector_list[c])

        # perform a GridSearchCV in order to train a classifier for this antibiotic
        grid = GridSearchCV(clf, **grid_kwargs)
        grid.fit(X_train, Y_train)

        # print the best parameters (to detect edge values), and save that classifier
        print('The best parameters are: ',grid.best_params_)
        best_clf = grid.best_estimator_
        best_classifiers.append(best_clf)

        # compute the accuracy of the classifier
        acc_train = best_clf.score(X_train, Y_train)
        acc_test = best_clf.score(X_test_train, Y_test_train)
        print('Train accuracy: ',np.round(acc_train,4),' and test_train accuracy: ',np.round(acc_test,4))
        accuracies_train.append(acc_train)
        accuracies_test_train.append(acc_test)

        # compute the AUC of the classifier; estimators without predict_proba
        # are ranked with their decision_function scores instead
        if callable(getattr(best_clf,"predict_proba",None)):
            pred_train = best_clf.predict_proba(X_train)[:,-1] # only take last column, the prob of Y = +1
            pred_test = best_clf.predict_proba(X_test_train)[:,-1]
        else:
            print('Using decision_function instead of predict_proba')
            pred_train = best_clf.decision_function(X_train)
            pred_test = best_clf.decision_function(X_test_train)
        auc_score_train = roc_auc_score(Y_train, pred_train)
        auc_score_test = roc_auc_score(Y_test_train, pred_test)
        print('Train AUC: ',np.round(auc_score_train,4),' and test_train AUC: ',np.round(auc_score_test,4))
        AUC_train.append(auc_score_train)
        AUC_test_train.append(auc_score_test)

    avg_AUC_train = np.mean(AUC_train)
    avg_AUC_test_train = np.mean(AUC_test_train)
    print('\n\nThe average train AUC is',np.round(avg_AUC_train,4),'and the avg test_train AUC is',np.round(avg_AUC_test_train,4))

    t2 = time.time()
    print('\nFull execution took ',np.round(t2-t1,1),'seconds')
    print('\nDONE!')
    return best_classifiers, accuracies_train, accuracies_test_train, AUC_train, AUC_test_train

In [35]:
from sklearn.svm import SVC
# RBF-kernel SVM, configured with class_weight='balanced'
clf = SVC(kernel='rbf', class_weight='balanced')

# Log-spaced grid: C in 1e-4 ... 1e5 (10 values) and gamma in 1e-4 ... 1e3 (8 values)
C_vector = 10. ** np.arange(-4,6)
gamma_vector = 10. ** np.arange(-4,4)
params = {'C':C_vector, 'gamma':gamma_vector}

# Tune one SVM per target on its L1-selected features; only the best estimators
# and the two AUC lists are kept, the accuracy lists are discarded
rbf_SVM_best_clfs, _, _, rbf_SVM_AUC_train, rbf_SVM_AUC_test_train = try_clf_feat_selection(clf,params,l1_feat_list)

['OXACILINA']
The best parameters are:  {'C': 10000.0, 'gamma': 0.1}
Train accuracy:  0.9636  and test_train accuracy:  0.7981
Using decision_function instead of predict_proba
Train AUC:  0.9872  and test_train AUC:  0.8687
['AMIKACINA']
The best parameters are:  {'C': 10000.0, 'gamma': 0.01}
Train accuracy:  0.8613  and test_train accuracy:  0.7805
Using decision_function instead of predict_proba
Train AUC:  0.9415  and test_train AUC:  0.7294
['AMOXI/CLAV']
The best parameters are:  {'C': 10000.0, 'gamma': 0.1}
Train accuracy:  0.9587  and test_train accuracy:  0.7692
Using decision_function instead of predict_proba
Train AUC:  0.9886  and test_train AUC:  0.8562
['CIPROFLOXACINO']
The best parameters are:  {'C': 100000.0, 'gamma': 0.01}
Train accuracy:  0.9053  and test_train accuracy:  0.7788
Using decision_function instead of predict_proba
Train AUC:  0.9573  and test_train AUC:  0.7867
['CLINDAMICINA']
The best parameters are:  {'C': 1.0, 'gamma': 1000.0}
Train accuracy:  0.9903 

In [37]:
# RandomForestClassifier is already imported in the imports cell at the top of the notebook.
# random_state is fixed so that re-running this cell rebuilds the same forests
# and reports the same scores (the value matches the seed used for train_test_split).
clf = RandomForestClassifier(class_weight='balanced',min_samples_split=5,random_state=42)
params = {'n_estimators':[10,20,30,50,70],'max_depth':np.arange(1,10)}
# later try
# params = {'n_estimators':[10,20,30,50,70,100,150],'max_depth':np.arange(1,30)}

# Tune one forest per target on its L1-selected features; only the best estimators
# and the two AUC lists are kept
rf_best_clfs, _, _, rf_AUC_train, rf_AUC_test_train = try_clf_feat_selection(clf,params,l1_feat_list)

['OXACILINA']
The best parameters are:  {'max_depth': 9, 'n_estimators': 50}
Train accuracy:  0.9951  and test_train accuracy:  0.7692
Train AUC:  1.0  and test_train AUC:  0.796
['AMIKACINA']
The best parameters are:  {'max_depth': 9, 'n_estimators': 50}
Train accuracy:  0.9884  and test_train accuracy:  0.7683
Train AUC:  0.9991  and test_train AUC:  0.7052
['AMOXI/CLAV']
The best parameters are:  {'max_depth': 9, 'n_estimators': 20}
Train accuracy:  0.9854  and test_train accuracy:  0.7212
Train AUC:  0.9994  and test_train AUC:  0.7241
['CIPROFLOXACINO']
The best parameters are:  {'max_depth': 6, 'n_estimators': 30}
Train accuracy:  0.9733  and test_train accuracy:  0.7115
Train AUC:  0.9975  and test_train AUC:  0.6877
['CLINDAMICINA']
The best parameters are:  {'max_depth': 9, 'n_estimators': 50}
Train accuracy:  0.9879  and test_train accuracy:  0.7596
Train AUC:  0.9975  and test_train AUC:  0.5574
['ERITROMICINA']
The best parameters are:  {'max_depth': 8, 'n_estimators': 30}


In [11]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
# AdaBoost over depth-1 trees (decision stumps); the depth can be changed.
# random_state is fixed so that re-running this cell gives the same models and scores.
clf = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1), random_state=42) # we can change that
# single-point grid: GridSearchCV is only used here to cross-validate n_estimators=150
params = {'n_estimators':[150]}
ab_best_clfs, _, _, ab_AUC_train, ab_AUC_test_train = try_clf_feat_selection(clf,params,l1_feat_list)

['OXACILINA']
The best parameters are:  {'n_estimators': 150}
Train accuracy:  0.9951  and test_train accuracy:  0.6827
Train AUC:  1.0  and test_train AUC:  0.7351
['AMIKACINA']
The best parameters are:  {'n_estimators': 150}
Train accuracy:  0.9884  and test_train accuracy:  0.7439
Train AUC:  0.9997  and test_train AUC:  0.7327
['AMOXI/CLAV']
The best parameters are:  {'n_estimators': 150}
Train accuracy:  0.9951  and test_train accuracy:  0.7019
Train AUC:  0.9999  and test_train AUC:  0.7336
['CIPROFLOXACINO']
The best parameters are:  {'n_estimators': 150}
Train accuracy:  0.9879  and test_train accuracy:  0.7212
Train AUC:  0.9997  and test_train AUC:  0.7133
['CLINDAMICINA']
The best parameters are:  {'n_estimators': 150}
Train accuracy:  0.949  and test_train accuracy:  0.7308
Train AUC:  0.9933  and test_train AUC:  0.5891
['ERITROMICINA']
The best parameters are:  {'n_estimators': 150}
Train accuracy:  0.9951  and test_train accuracy:  0.625
Train AUC:  0.9999  and test_trai