# Imports

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import zipfile
import _pickle
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, auc, roc_auc_score
import time

# Data loading
Remember to change path if needed

In [2]:
# path = "D:/GitHub/Machine-Learning/Kaggle/"
path = "C:/Users/Javi/Documents/GitHub/Machine-Learning/Kaggle/"

zf = zipfile.ZipFile(path+'zipped_TrainData.zip', 'r')
df_train = _pickle.loads(zf.open('TrainData.pkl').read())
zf.close()

zf = zipfile.ZipFile(path+'zipped_TestDataUnlabeled.zip', 'r')
df_test = _pickle.loads(zf.open('TestDataUnlabeled.pkl').read())
zf.close()

# Data split & Spectrum in regular bins
Data is split between a proper training set (later used in cross-validation), and a test_train set, which will help us in determining under/overfitting as we do have labels for them.

Spectrums are divided in regular size bins, always the same, so that we can treat them as features, not worrying about different mz scales. According to the literature the peaks contain the relevant information, then we only save the maximum value in the bin (range of mz coordinates) so that peak information is never lost. Moreover, by performing this regularization in bins, peaks at very close mz values (same compound, small mz differences due to experimental uncertainty) are seen by the machine as belonging to the same bin and therefore the same feature. Therefore, it facilitates to use peaks as values.

Also, peak values are normalized by the maximum peak value of the spectrum, as specific values are experiment-dependent and do not carry information, only the relation between peak sizes does.

In [3]:
def spectrum_in_bins(df,m,M,bin_size):
    # Now, let's define the mz ranges, and the label associated to each of them (the mean of the limiting values of each bin)
    range_min = []; range_max = []; range_label = [];
    for mz in range(m,M,bin_size):
        range_min.append(mz)
        range_max.append(mz+bin_size)
        range_label.append(np.mean([range_min[-1],range_max[-1]]).astype(int))


    N = len(df)  # number of samples
    L = len(range_min)  # length of new spectrum (number of bins)
    all_data = np.zeros((N,L))
    for idx in range(N): 
        intensity = df[['intensity']].iloc[idx].values[0]
        mzcoord   = df[['coord_mz']].iloc[idx].values[0]
        idx_data_in_bins = np.zeros((1,L))
        for i,mz in enumerate(range_min):
            intensity_range = intensity[(mzcoord > mz) & (mzcoord < (mz+bin_size))]
            if len(intensity_range) > 0 :
                idx_data_in_bins[0,i] = np.max(intensity_range)
            else: # if those mz coordinates are not in that spectrum
                idx_data_in_bins[0,i] = 0   

        # Normalize the amplitude of the spectrum
        idx_data_in_bins[0,:] = idx_data_in_bins[0,:] / np.max(idx_data_in_bins[0,:])
        
        
        all_data[idx,:] = idx_data_in_bins
    new_df = pd.DataFrame(data=all_data, columns = range_label, index = df.index)
    return new_df

In [4]:
# Extract data (spectra) and targets of the df_train set
data = df_train.iloc[:,-2:]
targets = df_train.iloc[:,1:-2]

m = 2000; M = 20000; 
bin_size = 50;

# Then, split into a train and test_train set
data_train, data_test_train, targets_train, targets_test_train = train_test_split(data, targets, test_size=0.2, random_state=42) # split the data
print('Training samples: '+str(len(data_train))+' and test_train samples: ' + str(len(data_test_train)) )

# apply the bins to all spectra, so that our feature space becomes the same for all samples (make them regular, all the same)
spectrum_train = spectrum_in_bins(data_train,m,M,bin_size)
spectrum_test_train = spectrum_in_bins(data_test_train,m,M,bin_size)
print('Spectrum regularized!')
# these spectrum_... are our X for training

Training samples: 412 and test_train samples: 104
Spectrum regularized!


# Try different classifiers
The try_clf function has been built for, given a classifier and a parameter dictionary (for hyperparameter cross-validation), create a classifier for each antibiotic, and return the results. This enables for fast testing of different classifiers. Moreover, the function also takes charge of suppressing NaN values in the targets ocurring for amikacina, levofloxacino and tobramicina.

In [62]:
def clean_nan_samples(spectrum,targets, c, cat):
# if there are any NaN values, we should remove those samples
    if (targets[cat].isnull().sum() > 0).all(): 
#         print('There are NaN values in',cat)
        merged = pd.concat([spectrum , targets],axis=1,copy=True)
        clean = merged.dropna(subset=[cat])
#         print('Dropped ',len(merged)-len(clean))

        Y = clean.iloc[:,-9+c].to_numpy().reshape(-1,)
        X = clean.iloc[:,:-9]

    else:
        Y = targets.iloc[:,c].to_numpy().reshape(-1,)
        X = spectrum.copy(deep=True)
    return X , Y

In [56]:
def try_clf(clf,params,n_cv=5):
    t1 = time.time()
    
    best_classifiers = [];
    accuracies_train = []; accuracies_test_train = [];
    AUC_train = []; AUC_test_train = [];
    
    categories = targets_train.columns[:]    
    for c,cat in enumerate(categories):

        print([cat]) # indicate in which antibiotic we are
        
        # Selection of train and test data (depending on whether there are NaN target values)
        X_train, Y_train = clean_nan_samples(spectrum_train,targets_train, c, cat)
        X_test_train, Y_test_train = clean_nan_samples(spectrum_test_train,targets_test_train, c, cat)
            
        # perform a GridSearchCV in order to train a classifier for this antibiotic
        grid = GridSearchCV(clf,param_grid=params, cv=n_cv, iid=False)
        grid.fit(X_train, Y_train)

        # print the best parameters (to detect edge values), and save that classifier
        print('The best parameters are: ',grid.best_params_)
        best_clf = grid.best_estimator_
        best_classifiers.append(best_clf)
        
        # compute the accuracy of the classifier
        acc_train = best_clf.score(X_train, Y_train)
        acc_test = best_clf.score(X_test_train, Y_test_train)
        print('Train accuracy: ',np.round(acc_train,4),' and test_train accuracy: ',np.round(acc_test,4))
        accuracies_train.append(acc_train)
        accuracies_test_train.append(acc_test)
        
        # compute the AUC of the classifier
        if callable(getattr(best_clf,"predict_proba",None)):
            pred_train = best_clf.predict_proba(X_train)[:,-1] # only take last column, the prob of Y = +1
            pred_test = best_clf.predict_proba(X_test_train)[:,-1]
        else:
            print('Using decision_function instead of predict_proba')
            pred_train = best_clf.decision_function(X_train)
            pred_test = best_clf.decision_function(X_test_train)            
        auc_score_train = roc_auc_score(Y_train, pred_train)
        auc_score_test = roc_auc_score(Y_test_train, pred_test)
        print('Train AUC: ',np.round(auc_score_train,4),' and test_train AUC: ',np.round(auc_score_test,4))
        AUC_train.append(auc_score_train)
        AUC_test_train.append(auc_score_test)
        
    avg_AUC_train = np.mean(AUC_train)
    avg_AUC_test_train = np.mean(AUC_test_train)
    print('\n\nThe average train AUC is',np.round(avg_AUC_train,4),'and the avg test_train AUC is',np.round(avg_AUC_test_train,4))
    
    t2 = time.time()
    print('\nFull execution took ',np.round(t2-t1,1),'seconds')
    print('\nDONE!')
    return best_classifiers, accuracies_train, accuracies_test_train, AUC_train, AUC_test_train

## Logistic regressor classifier

In [64]:
from sklearn.linear_model import LogisticRegression
# clf = LogisticRegression(penalty='l1',solver='liblinear',max_iter=1e6, class_weight='balanced')
clf = LogisticRegression(solver='lbfgs',max_iter=1e4,class_weight='balanced')
params = {'C':10.**np.arange(-2,5)}
l1_best_clfs, _, _, l1_AUC_train, l1_AUC_test_train = try_clf(clf,params)

['OXACILINA']
The best parameters are:  {'C': 1000.0}
Train accuracy:  0.8932  and test_train accuracy:  0.7404
Train AUC:  0.9554  and test_train AUC:  0.8335
['AMIKACINA']
The best parameters are:  {'C': 10000.0}
Train accuracy:  0.9335  and test_train accuracy:  0.7439
Train AUC:  0.9861  and test_train AUC:  0.7544
['AMOXI/CLAV']
The best parameters are:  {'C': 1000.0}
Train accuracy:  0.8859  and test_train accuracy:  0.7404
Train AUC:  0.9543  and test_train AUC:  0.8102
['CIPROFLOXACINO']
The best parameters are:  {'C': 1000.0}
Train accuracy:  0.8495  and test_train accuracy:  0.7692
Train AUC:  0.9361  and test_train AUC:  0.814
['CLINDAMICINA']
The best parameters are:  {'C': 10000.0}
Train accuracy:  0.9126  and test_train accuracy:  0.6923
Train AUC:  0.9762  and test_train AUC:  0.6621
['ERITROMICINA']
The best parameters are:  {'C': 1000.0}
Train accuracy:  0.8155  and test_train accuracy:  0.6827
Train AUC:  0.905  and test_train AUC:  0.7122
['LEVOFLOXACINO']
The best p

In [8]:
print('Total number of features:',len(spectrum_train.columns.values))
for i in range(len(l1_best_clfs)):
    n = np.sum(np.abs(l1_best_clfs[i].coef_) > 0)
    print('Number of non-zero weights:',n)


Total number of features: 360
Number of non-zero weights: 93
Number of non-zero weights: 82
Number of non-zero weights: 94
Number of non-zero weights: 98
Number of non-zero weights: 0
Number of non-zero weights: 103
Number of non-zero weights: 98
Number of non-zero weights: 82
Number of non-zero weights: 84


In [10]:
spectrum_train.head()

Unnamed: 0_level_0,2025,2075,2125,2175,2225,2275,2325,2375,2425,2475,...,19525,19575,19625,19675,19725,19775,19825,19875,19925,19975
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
329,0.111222,0.096192,0.097194,0.092184,0.08517,0.09018,0.072645,0.081663,0.072645,0.076653,...,0.029058,0.029058,0.029559,0.031062,0.033066,0.030561,0.031062,0.032565,0.029559,0.03006
173,0.053588,0.071889,0.086696,0.086047,0.051625,0.046581,0.046023,0.042744,0.062649,0.043717,...,0.009475,0.009529,0.009547,0.009601,0.009313,0.009331,0.009421,0.009457,0.009385,0.009421
272,0.045602,0.063321,0.089425,0.083965,0.043969,0.04102,0.038436,0.037949,0.050111,0.037315,...,0.010017,0.00992,0.009725,0.009969,0.009871,0.010017,0.009774,0.009969,0.009969,0.010212
496,0.035652,0.041735,0.036051,0.063376,0.034231,0.029394,0.029444,0.029619,0.037148,0.031214,...,0.009274,0.009274,0.009648,0.009349,0.00925,0.00905,0.009299,0.009175,0.00925,0.00925
182,0.132068,0.166476,0.228702,0.221056,0.120827,0.129415,0.105078,0.095777,0.12214,0.100856,...,0.009158,0.009472,0.009073,0.009101,0.009472,0.00933,0.009415,0.009044,0.009187,0.009558


In [24]:
new_spectrum = spectrum_train.copy(deep=True)
coefs_matrix = np.vstack([np.abs(l1_best_clfs[0].coef_)>0]*len(new_spectrum))

new_spectrum = new_spectrum.multiply(coefs_matrix)
print(coefs_matrix[:5,:])
new_spectrum.head()

[[ True  True  True ... False False False]
 [ True  True  True ... False False False]
 [ True  True  True ... False False False]
 [ True  True  True ... False False False]
 [ True  True  True ... False False False]]


Unnamed: 0_level_0,2025,2075,2125,2175,2225,2275,2325,2375,2425,2475,...,19525,19575,19625,19675,19725,19775,19825,19875,19925,19975
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
329,0.111222,0.096192,0.097194,0.0,0.0,0.09018,0.072645,0.0,0.0,0.076653,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
173,0.053588,0.071889,0.086696,0.0,0.0,0.046581,0.046023,0.0,0.0,0.043717,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
272,0.045602,0.063321,0.089425,0.0,0.0,0.04102,0.038436,0.0,0.0,0.037315,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
496,0.035652,0.041735,0.036051,0.0,0.0,0.029394,0.029444,0.0,0.0,0.031214,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
182,0.132068,0.166476,0.228702,0.0,0.0,0.129415,0.105078,0.0,0.0,0.100856,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [43]:
def get_l1_clfs():
    from sklearn.linear_model import LogisticRegression
    clf = LogisticRegression(penalty='l1',solver='liblinear',max_iter=1e6, class_weight='balanced')
    params = {'C':10.**np.arange(-2,5)}
    l1_best_clfs,_,_,_,_ = try_clf(clf,params)
    return l1_best_clfs

def obtain_l1_vects(l1_best_clfs,spectrum_train,targets_train):
    l1_vect_list = []    
    categories = targets_train.columns[:]
    
    for c, cat in enumerate(categories):
        n = np.sum(np.abs(l1_best_clfs[c].coef_) > 0)
        print('Number of features:',n)
        while n == 0:
            clf = l1_best_clfs[c]
            c = clf.get_params()['C']
            new_c = c * 10
            clf.set_params(C=new_c)
            
            if (df_train[cat].isnull().sum() > 0).all(): # if there are NaN values, we should remove those samples
                merged_train = pd.concat([spectrum_train , targets_train],axis=1,copy=True)
                clean_train = merged_train.dropna(subset=column_name)
    
                Y_train = clean_train.iloc[:,-9+c].to_numpy().reshape(-1,)
                X_train = clean_train.iloc[:,:-9]
            else:
                Y_train = targets_train.iloc[:,c].to_numpy().reshape(-1,)
                X_train = spectrum_train.copy(deep=True)
            
            clf.fit(X_train, Y_train) # refit with higher C
            l1_best_clfs[c] = clf
            n = np.sum(np.abs(clf.coef_) > 0)
            print(n)
    
        # once we know we have at least one non-zero feature
        vect = (np.abs(l1_best_clfs[c].coef_)>0).reshape(-1,)
        l1_feat_list.append(vect)
    return l1_feat_list

# to be applyied to each category
def apply_l1_feature_selection(spectrum_train,vect): # vect is l1_feat_list[c]
    new_spectrum = spectrum_train.copy(deep=True).iloc[:,vect]   
    return new_spectrum

# to obtain a list, with an element for each category
def apply_l1_feature_selection(spectrum_train,vect_list): # vect is l1_feat_list
    new_spectrum_list = []
    categories = targets_train.columns[:]
    for c,cat in enumerate(categories):
        new_spectrum = spectrum_train.copy(deep=True).iloc[:,vect_list[c]]   
        new_spectrum_list.append(new_spectrum)
    return new_spectrum_list

In [44]:
l1_coef_matrix_list = train_l1_feature_selection(spectrum_train,targets_train)
new_spectrum_list = apply_l1_feature_selection(l1_coef_matrix_list)


['OXACILINA']
The best parameters are:  {'C': 100.0}
Train accuracy:  0.9272  and test_train accuracy:  0.7692
Train AUC:  0.9742  and test_train AUC:  0.8348

['AMIKACINA']
The best parameters are:  {'C': 100.0}
Train accuracy:  0.9104  and test_train accuracy:  0.7927
Train AUC:  0.9734  and test_train AUC:  0.7335

['AMOXI/CLAV']
The best parameters are:  {'C': 100.0}
Train accuracy:  0.9175  and test_train accuracy:  0.7596
Train AUC:  0.9744  and test_train AUC:  0.8154

['CIPROFLOXACINO']
The best parameters are:  {'C': 100.0}
Train accuracy:  0.8762  and test_train accuracy:  0.7596
Train AUC:  0.9553  and test_train AUC:  0.8066

['CLINDAMICINA']
The best parameters are:  {'C': 0.01}
Train accuracy:  0.7961  and test_train accuracy:  0.7788
Train AUC:  0.5  and test_train AUC:  0.5

['ERITROMICINA']
The best parameters are:  {'C': 100.0}
Train accuracy:  0.8398  and test_train accuracy:  0.6731
Train AUC:  0.9267  and test_train AUC:  0.7207

['LEVOFLOXACINO']
The best paramet

NameError: name 'antibiotic_idx' is not defined

In [53]:
# probar a deletear esas columnas
vect = (np.abs(l1_best_clfs[0].coef_)>0).reshape(-1,)
# print(vect)
print(np.sum(vect))
new_spectrum = spectrum_train.iloc[:,vect]
new_spectrum.head()

93


Unnamed: 0_level_0,2025,2075,2125,2275,2325,2475,2525,2575,2625,2675,...,9225,9575,9625,9675,9775,10125,10175,10475,13725,16525
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
329,0.111222,0.096192,0.097194,0.09018,0.072645,0.076653,0.072645,0.071142,0.07014,0.071643,...,0.061122,0.092184,0.302605,0.138277,0.061623,0.058617,0.046593,0.072144,0.046092,0.042585
173,0.053588,0.071889,0.086696,0.046581,0.046023,0.043717,0.061676,0.045699,0.10343,0.107933,...,0.046581,0.099719,0.29786,0.103069,0.048905,0.064432,0.038187,0.101232,0.019976,0.016662
272,0.045602,0.063321,0.089425,0.04102,0.038436,0.037315,0.050087,0.03985,0.132443,0.133125,...,0.040898,0.092398,0.321431,0.09386,0.046772,0.048673,0.032489,0.076263,0.018816,0.016501
496,0.035652,0.041735,0.036051,0.029394,0.029444,0.031214,0.035453,0.029943,0.031339,0.100873,...,0.029419,0.066293,0.214111,0.079681,0.035079,0.031189,0.024258,0.043381,0.019721,0.015532
182,0.132068,0.166476,0.228702,0.129415,0.105078,0.100856,0.127618,0.096833,0.461683,0.428445,...,0.080571,0.193638,0.545136,0.198402,0.093096,0.089957,0.054693,0.130071,0.019058,0.01475
