The idea is that we have several (two) spectra per sample. Then, instead of averaging them or discarding one of them randomly, let's select those results which are closer to zero or one (with some sort of Gini index).

In [135]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import zipfile
import _pickle
import pickle
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_curve, auc, roc_auc_score
import time
import os
import peakutils

from sklearn.svm import SVC

In [17]:
m = 2000; M = 12000; 
bin_size = 1;

path = "C:/Users/Javi/Documents/GitHub/Machine-Learning/Kaggle/"

zf = zipfile.ZipFile(path+'zipped_TrainData.zip', 'r')
df_train = _pickle.loads(zf.open('TrainData.pkl').read())
zf.close()

zf = zipfile.ZipFile(path+'zipped_TestDataUnlabeled.zip', 'r')
df_test = _pickle.loads(zf.open('TestDataUnlabeled.pkl').read())
zf.close()

In [18]:
def spectrum_in_bins(df,m,M,bin_size):
    # Now, let's define the mz ranges, and the label associated to each of them (the mean of the limiting values of each bin)
    range_min = []; range_max = []; range_label = [];
    for mz in range(m,M,bin_size):
        range_min.append(mz)
        range_max.append(mz+bin_size)
        range_label.append(np.mean([range_min[-1],range_max[-1]]).astype(int))
    N = len(df)  # number of samples
    L = len(range_min)  # length of new spectrum (number of bins)
    all_data = np.zeros((N,L))
    for idx in range(N): 
        intensity = df[['intensity']].iloc[idx].values[0]
        mzcoord   = df[['coord_mz']].iloc[idx].values[0]
        idx_data_in_bins = np.zeros((1,L))
        for i,mz in enumerate(range_min):
            intensity_range = intensity[(mzcoord > mz) & (mzcoord < (mz+bin_size))]
            if len(intensity_range) > 0 :
                idx_data_in_bins[0,i] = np.max(intensity_range)
            else: # if those mz coordinates are not in that spectrum
                idx_data_in_bins[0,i] = 0   

        # Normalize the amplitude of the spectrum
        idx_data_in_bins[0,:] = idx_data_in_bins[0,:] / np.max(idx_data_in_bins[0,:])
        all_data[idx,:] = idx_data_in_bins
    new_df = pd.DataFrame(data=all_data, columns = range_label, index = df.index)
    return new_df

In [23]:
def clean_nan_samples(spectrum,targets, c, cat):
# if there are any NaN values, we should remove those samples
    if (targets[cat].isnull().sum() > 0).all(): 
        merged = pd.concat([spectrum , targets],axis=1,copy=True)
        clean = merged.dropna(subset=[cat])
        Y = clean.iloc[:,-9+c].to_numpy().reshape(-1,)
        X = clean.iloc[:,:-9]

    else:
        Y = targets.iloc[:,c].to_numpy().reshape(-1,)
        X = spectrum.copy(deep=True)
    return X , Y

In [132]:
def spectrum_in_bins_6(df, m, M, bin_size): # allows binsize < 1
    
    range_min = []; range_max = []; range_mean = []
    for mz in np.arange(m,M,bin_size):
        range_min.append(mz)
        range_max.append(mz+bin_size)
        range_mean.append(np.mean([range_min[-1],range_max[-1]]))
    N = len(df)  # number of samples
    L = len(range_min)  # length of new spectrum (number of bins)
    all_data = np.zeros((N,L))
    for idx in range(N):
        intensity = df[['intensity']].iloc[idx].values[0]
        mzcoord   = df[['coord_mz']].iloc[idx].values[0]
        interpolated_spectrum = np.interp(x=range_mean,xp=mzcoord,fp=intensity)
        idx_data_in_bins = np.zeros((L,))
        for i,mz in enumerate(range_min):
            intensity_range = intensity[(mzcoord > mz) & (mzcoord < (mz+bin_size))]
            if len(intensity_range) > 0 :
                # as we are interested in peak values, let's keep the maximum value in the interval
                idx_data_in_bins[i] = np.max(intensity_range)
            else: # if those mz coordinates are not in that spectrum we interpolate
                idx_data_in_bins[i] = interpolated_spectrum[i]

        # Remove baseline
        idx_data_in_bins -= peakutils.baseline(idx_data_in_bins,deg=4)
        # Normalize the amplitude of the spectrum
        idx_data_in_bins = idx_data_in_bins / np.max(idx_data_in_bins)
        # Store in matrix
        all_data[idx,:] = idx_data_in_bins.reshape(1,-1)
    new_df = pd.DataFrame(data=all_data, columns = range_mean, index = df.index)
    print('DONE!')
    return new_df

In [141]:
# Extract data (spectra) and targets of the df_train set
data = df_train.iloc[:,-2:]
targets = df_train.iloc[:,1:-2] # so modify function to take targets.iloc[:,1:]
IDs = df_train[['ID_sample']]

limit = 100
data_train = data.iloc[limit:,:]
targets_train = targets.iloc[limit:,:]
IDs_train = IDs.iloc[limit:,:]
data_test_train = data.iloc[:limit,:]
targets_test_train = targets.iloc[:limit,:]
IDs_test_train = IDs.iloc[:limit,:]

In [142]:
def try_clf(clf, params, spectrum_train, targets_train, n_cv=5, njobs=5,
            FEATURE_SELECTION=False, feature_vector_list=None):  
    # new version --> Incorporates feature selection
    t1 = time.time()

    best_classifiers = []
    grid_list = []
    AUC_train = []; AUC_valid = []

    categories = targets_train.columns[:]
    for c, cat in enumerate(categories):

        print([cat])  # indicate in which antibiotic we are

        # Selection of train and test data (depending on whether there are NaN target values)
        X_train, Y_train = clean_nan_samples(spectrum_train, targets_train, c, cat)

        if FEATURE_SELECTION:  # a boolean that decides whether to apply feature selection
            # (feature list has to be already defined, and input to the function)
            X_train = apply_feature_selection(X_train, feature_vector_list[c])

        # perform a GridSearchCV in order to train a classifier for this antibiotic
        grid = GridSearchCV(clf, param_grid=params, scoring='roc_auc', n_jobs=njobs, 
                            pre_dispatch='2*n_jobs', cv=n_cv, return_train_score=True)
        grid.fit(X_train, Y_train)

        # print the best parameters (to detect edge values), and save that classifier
        print('The best parameters are: ', grid.best_params_)
        best_clf = grid.best_estimator_
        best_classifiers.append(best_clf)
        grid_list.append(grid)

        best_clf = np.where(grid.cv_results_['rank_test_score'] == 1)[0][0]
        AUC_train.append(grid.cv_results_['mean_train_score'][best_clf])
        AUC_valid.append(grid.cv_results_['mean_test_score'][best_clf])

        print('Train AUC: ', np.round(AUC_train[-1], 4), ' and validation AUC: ', np.round(AUC_valid[-1], 4))

    avg_AUC_train = np.mean(AUC_train)
    avg_AUC_valid = np.mean(AUC_valid)
    print('\n\nThe average train AUC is', np.round(avg_AUC_train, 4), 'and the avg validation AUC is',
          np.round(avg_AUC_valid, 4))

    t2 = time.time()
    print('\nFull execution took ', np.round(t2 - t1, 1), 'seconds')
    print('\nDONE!')
    return best_classifiers, grid_list, AUC_train, AUC_valid

In [143]:
def spectrum_in_bins_5(df, m, M, bin_size): # allows binsize < 1
    
    range_min = []; range_max = []; range_mean = []
    for mz in np.arange(m,M,bin_size):
        range_min.append(mz)
        range_max.append(mz+bin_size)
        range_mean.append(np.mean([range_min[-1],range_max[-1]]))
    N = len(df)  # number of samples
    L = len(range_min)  # length of new spectrum (number of bins)
    all_data = np.zeros((N,L))
    for idx in range(N):
        intensity = df[['intensity']].iloc[idx].values[0]
        mzcoord   = df[['coord_mz']].iloc[idx].values[0]
        idx_data_in_bins = np.zeros((1,L))
        for i,mz in enumerate(range_min):
            intensity_range = intensity[(mzcoord > mz) & (mzcoord < (mz+bin_size))]
            if len(intensity_range) > 0 :
                # as we are interested in peak values, let's keep the maximum value in the interval
                idx_data_in_bins[0,i] = np.max(intensity_range)
            else: # if those mz coordinates are not in that spectrum we interpolate
                idx_data_in_bins[0,i] = np.interp(x=range_mean[i],xp=mzcoord,fp=intensity)

        # Remove baseline
        idx_data_in_bins[0,:] -= peakutils.baseline(idx_data_in_bins[0,:],deg=4)
        # Normalize the amplitude of the spectrum
        idx_data_in_bins[0,:] = idx_data_in_bins[0,:] / np.max(idx_data_in_bins[0,:])
        # Store in matrix
        all_data[idx,:] = idx_data_in_bins
    new_df = pd.DataFrame(data=all_data, columns = range_mean, index = df.index)
    print('DONE!')
    return new_df

In [144]:
bin_size = 5; m = 2000; M = 12500;
spectrum_train_normal5 = spectrum_in_bins_5(data_train,m,M,bin_size)
spectrum_test_train_normal5 = spectrum_in_bins_5(data_test_train,m,M,bin_size)

DONE!
DONE!


In [146]:
from sklearn.svm import SVC
clf = SVC(class_weight='balanced', probability=True,kernel='rbf')
params = {'C':[0.05,0.1,0.5,1,5,10],'gamma':[0.01,0.1,0.5,1,5]}
clf_list_normal5, grid_list, AUC_train_normal5, AUC_valid_normal5 = try_clf(clf, params, spectrum_train_normal5, targets_train)
AUC_test_normal5 = get_test_score(clf_list_normal5, spectrum_test_train_normal5, targets_test_train)

['OXACILINA']




The best parameters are:  {'C': 1, 'gamma': 0.5}
Train AUC:  0.9026  and validation AUC:  0.7325
['AMIKACINA']




The best parameters are:  {'C': 10, 'gamma': 0.1}
Train AUC:  0.93  and validation AUC:  0.6612
['AMOXI/CLAV']




The best parameters are:  {'C': 1, 'gamma': 0.5}
Train AUC:  0.9024  and validation AUC:  0.7246
['CIPROFLOXACINO']




The best parameters are:  {'C': 10, 'gamma': 0.1}
Train AUC:  0.9285  and validation AUC:  0.7587
['CLINDAMICINA']




The best parameters are:  {'C': 0.5, 'gamma': 0.1}
Train AUC:  0.7494  and validation AUC:  0.6466
['ERITROMICINA']




The best parameters are:  {'C': 10, 'gamma': 1}
Train AUC:  0.9953  and validation AUC:  0.662
['LEVOFLOXACINO']




The best parameters are:  {'C': 10, 'gamma': 0.1}
Train AUC:  0.939  and validation AUC:  0.7745
['PENICILINA']




The best parameters are:  {'C': 0.05, 'gamma': 5}
Train AUC:  0.9988  and validation AUC:  0.6971
['TOBRAMICINA']




The best parameters are:  {'C': 5, 'gamma': 0.5}
Train AUC:  0.9798  and validation AUC:  0.6666


The average train AUC is 0.9251 and the avg validation AUC is 0.7026

Full execution took  1398.4 seconds

DONE!


NameError: name 'get_test_score' is not defined

In [147]:
def get_test_predictions(clf_list, spectrum_test, IDs_test):
    C = len(clf_list)
    all_predictions_test = np.zeros((spectrum_test.shape[0], C))
    for c in range(C):
        pred_test = clf_list[c].predict_proba(spectrum_test)[:,1]
        all_predictions_test[:,c] = pred_test.reshape(-1,)
    df_pred_test = pd.DataFrame(data=all_predictions_test, index = spectrum_test.index, columns = targets.columns)
#     df_pred_test[['ID_sample']] = IDs_test[['ID_sample']]
    return df_pred_test

In [148]:
pred_test = get_test_predictions(clf_list_normal5, spectrum_test_train_normal5, IDs_test_train)

In [149]:
merged = pd.concat([targets_test_train, pred_test, IDs_test_train],axis=1,copy=True)
clean = merged.dropna(subset=targets_test_train.columns)
targets_test_train = clean.iloc[:,:9]
pred_test = clean.iloc[:,9:-1]
IDs_test_train = clean[['ID_sample']]

IT WORKS!!
Now get a better classifier :)

In [150]:
# when I have all_predictions_test
N = pred_test.shape[0]
C = pred_test.shape[1]

# get unique ID samples
ID_samples = IDs_test_train.drop_duplicates(subset='ID_sample')
ID_samples = ID_samples[['ID_sample']].to_numpy().astype(int).reshape(-1,)

# create empty matrix for new predictions
new_predictions = np.zeros((len(ID_samples),pred_test.shape[1]))

# for each ID sample, compare both predictions and take the better one
for counter, id in enumerate(ID_samples):
#     print(id)
    predictions = pred_test.loc[IDs_test_train['ID_sample'].to_numpy().astype(int)==id]
#     print(predictions)
    L = len(predictions)
    if L > 1: # more than one spectrum for that sample
        decisivity_index = np.zeros((L,)) # decisivity index ~= Gini = sum( (1-value)*value) 
        # --> low for close to 1 or 0, high for close to 0.5
        for l in range(L):
            di = 0
            for c in range(C): # sum the decisivity index for all categories
                val = predictions.iloc[l,c]
                di += (1-val)*val
            decisivity_index[l] = di
        idx = np.argsort(decisivity_index)[0]
        new_predictions[counter,:] = predictions.iloc[idx,:]

    elif L==1: # just one prediction for that ID
        new_predictions[counter,:] = predictions 
    else:
        print('Weird, no sample for ID %d. Bug in code.' %id)
        
# save those results in a df with ID sample as ID
predictions_more_expressive = pd.DataFrame(data = new_predictions, index = ID_samples, columns = targets.columns)



In [154]:
targets_withID.head()

Unnamed: 0_level_0,OXACILINA,AMIKACINA,AMOXI/CLAV,CIPROFLOXACINO,CLINDAMICINA,ERITROMICINA,LEVOFLOXACINO,PENICILINA,TOBRAMICINA
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
24,0,0.0,0,0,0,0,0.0,1,0.0
28,1,1.0,1,1,0,1,1.0,1,1.0
30,0,0.0,0,1,0,1,1.0,1,0.0
34,0,0.0,0,0,0,0,0.0,1,0.0
52,0,0.0,0,0,0,0,0.0,1,0.0


In [151]:
# get scores with targets_test
targets_withID = targets_test_train.copy(deep=True)
targets_withID[['ID_sample']] = IDs_test_train
targets_withID = targets_withID.drop_duplicates(subset='ID_sample')
targets_withID = targets_withID.drop(columns='ID_sample')
score_new = roc_auc_score(targets_withID.to_numpy(), predictions_more_expressive)
print('New score:',np.round(score_new,3))

# get scores just making drop_unique
pred_test_withID = pred_test.copy(deep=True)
pred_test_withID[['ID_sample']] = IDs_test_train
pred_test_withID = pred_test_withID.drop_duplicates(subset='ID_sample')
pred_test_withID = pred_test_withID.drop(columns='ID_sample')
score_old = roc_auc_score(targets_withID.to_numpy(), pred_test_withID)
print('Old score:',np.round(score_old,3))

ValueError: Only one class present in y_true. ROC AUC score is not defined in that case.