The idea is that we have several (two) spectra per sample. Then, instead of averaging them or discarding one of them randomly, let's select those results which are closer to zero or one (with some sort of Gini index).

In [3]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import zipfile
import _pickle
import pickle
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_curve, auc, roc_auc_score
import time
import os

from sklearn.svm import SVC

In [17]:
m = 2000; M = 12000; 
bin_size = 1;

path = "C:/Users/Javi/Documents/GitHub/Machine-Learning/Kaggle/"

zf = zipfile.ZipFile(path+'zipped_TrainData.zip', 'r')
df_train = _pickle.loads(zf.open('TrainData.pkl').read())
zf.close()

zf = zipfile.ZipFile(path+'zipped_TestDataUnlabeled.zip', 'r')
df_test = _pickle.loads(zf.open('TestDataUnlabeled.pkl').read())
zf.close()

In [18]:
def spectrum_in_bins(df,m,M,bin_size):
    # Now, let's define the mz ranges, and the label associated to each of them (the mean of the limiting values of each bin)
    range_min = []; range_max = []; range_label = [];
    for mz in range(m,M,bin_size):
        range_min.append(mz)
        range_max.append(mz+bin_size)
        range_label.append(np.mean([range_min[-1],range_max[-1]]).astype(int))
    N = len(df)  # number of samples
    L = len(range_min)  # length of new spectrum (number of bins)
    all_data = np.zeros((N,L))
    for idx in range(N): 
        intensity = df[['intensity']].iloc[idx].values[0]
        mzcoord   = df[['coord_mz']].iloc[idx].values[0]
        idx_data_in_bins = np.zeros((1,L))
        for i,mz in enumerate(range_min):
            intensity_range = intensity[(mzcoord > mz) & (mzcoord < (mz+bin_size))]
            if len(intensity_range) > 0 :
                idx_data_in_bins[0,i] = np.max(intensity_range)
            else: # if those mz coordinates are not in that spectrum
                idx_data_in_bins[0,i] = 0   

        # Normalize the amplitude of the spectrum
        idx_data_in_bins[0,:] = idx_data_in_bins[0,:] / np.max(idx_data_in_bins[0,:])
        all_data[idx,:] = idx_data_in_bins
    new_df = pd.DataFrame(data=all_data, columns = range_label, index = df.index)
    return new_df

In [23]:
def clean_nan_samples(spectrum,targets, c, cat):
# if there are any NaN values, we should remove those samples
    if (targets[cat].isnull().sum() > 0).all(): 
        merged = pd.concat([spectrum , targets],axis=1,copy=True)
        clean = merged.dropna(subset=[cat])
        Y = clean.iloc[:,-9+c].to_numpy().reshape(-1,)
        X = clean.iloc[:,:-9]

    else:
        Y = targets.iloc[:,c].to_numpy().reshape(-1,)
        X = spectrum.copy(deep=True)
    return X , Y

In [36]:
# Extract data (spectra) and targets of the df_train set
data = df_train.iloc[:,-2:]
targets = df_train.iloc[:,1:-2] # so modify function to take targets.iloc[:,1:]
IDs = df_train[['ID_sample']]

limit = 350
data_train = data.iloc[:limit,:]
targets_train = targets.iloc[:limit,:]
IDs_train = IDs.iloc[:limit,:]
data_test_train = data.iloc[limit:,:]
targets_test_train = targets.iloc[limit:,:]
IDs_test_train = IDs.iloc[limit:,:]

spectrum_train = spectrum_in_bins(data_train,m,M,bin_size)
spectrum_test_train = spectrum_in_bins(data_test_train,m,M,bin_size)
print('Spectrum regularized!')

Spectrum regularized!


In [20]:
targets_test.head()

Unnamed: 0_level_0,ID_sample,OXACILINA,AMIKACINA,AMOXI/CLAV,CIPROFLOXACINO,CLINDAMICINA,ERITROMICINA,LEVOFLOXACINO,PENICILINA,TOBRAMICINA
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
350,1776,1,0.0,1,0,1,1,0.0,1,0.0
351,1788,1,1.0,1,1,0,0,1.0,1,1.0
352,1788,1,1.0,1,1,0,0,1.0,1,1.0
353,1811,1,1.0,1,1,1,1,1.0,1,1.0
354,1811,1,1.0,1,1,1,1,1.0,1,1.0


In [46]:
def try_clf(clf,params,n_cv=5): # also output prediction for test set
    t1 = time.time()
    
    best_classifiers = [];
    accuracies_train = []; accuracies_test_train = [];
    AUC_train = []; AUC_test_train = [];
    
    categories = targets_train.columns[:]    
    for c,cat in enumerate(categories):

        print([cat]) # indicate in which antibiotic we are
        
        # Selection of train and test data (depending on whether there are NaN target values)
        X_train, Y_train = clean_nan_samples(spectrum_train,targets_train, c, cat)
        X_test_train, Y_test_train = clean_nan_samples(spectrum_test_train,targets_test_train, c, cat)
            
        # perform a GridSearchCV in order to train a classifier for this antibiotic
        grid = GridSearchCV(clf,param_grid=params, cv=n_cv, iid=False, scoring='roc_auc')
        grid.fit(X_train, Y_train)

        # print the best parameters (to detect edge values), and save that classifier
        print('The best parameters are: ',grid.best_params_)
        best_clf = grid.best_estimator_
        best_classifiers.append(best_clf)


        # compute the AUC of the classifier
        if callable(getattr(best_clf,"predict_proba",None)):
            pred_train = best_clf.predict_proba(X_train)[:,-1] # only take last column, the prob of Y = +1
            pred_test = best_clf.predict_proba(X_test_train)[:,-1]
        else:
            print('Using decision_function instead of predict_proba')
            pred_train = best_clf.decision_function(X_train)
            pred_test = best_clf.decision_function(X_test_train)            
        auc_score_train = roc_auc_score(Y_train, pred_train)
        auc_score_test = roc_auc_score(Y_test_train, pred_test)
        print('Train AUC: ',np.round(auc_score_train,4),' and test_train AUC: ',np.round(auc_score_test,4))
        AUC_train.append(auc_score_train)
        AUC_test_train.append(auc_score_test)
        
    avg_AUC_train = np.mean(AUC_train)
    avg_AUC_test_train = np.mean(AUC_test_train)
    print('\n\nThe average train AUC is',np.round(avg_AUC_train,4),'and the avg test_train AUC is',np.round(avg_AUC_test_train,4))
    
    t2 = time.time()
    print('\nFull execution took ',np.round(t2-t1,1),'seconds')
    print('\nDONE!')
    return best_classifiers, AUC_train, AUC_test_train

In [48]:
clf = SVC(class_weight='balanced', probability=True,kernel='linear')
params = {'C':[0.1,0.5,1]}
best_classifiers, AUC_train, AUC_test_train = try_clf(clf,params,3)

['OXACILINA']
The best parameters are:  {'C': 0.5}
Train AUC:  0.9118  and test_train AUC:  0.7135
['AMIKACINA']
The best parameters are:  {'C': 1}
Train AUC:  0.9382  and test_train AUC:  0.646
['AMOXI/CLAV']
The best parameters are:  {'C': 1}
Train AUC:  0.9267  and test_train AUC:  0.6764
['CIPROFLOXACINO']
The best parameters are:  {'C': 1}
Train AUC:  0.934  and test_train AUC:  0.7628
['CLINDAMICINA']
The best parameters are:  {'C': 0.1}
Train AUC:  0.7508  and test_train AUC:  0.523
['ERITROMICINA']
The best parameters are:  {'C': 1}
Train AUC:  0.8893  and test_train AUC:  0.6038
['LEVOFLOXACINO']
The best parameters are:  {'C': 1}
Train AUC:  0.9478  and test_train AUC:  0.8245
['PENICILINA']
The best parameters are:  {'C': 0.1}
Train AUC:  0.8449  and test_train AUC:  0.8071
['TOBRAMICINA']
The best parameters are:  {'C': 1}
Train AUC:  0.942  and test_train AUC:  0.6389


The average train AUC is 0.8984 and the avg test_train AUC is 0.6884

Full execution took  282.7 seconds

In [56]:
def get_test_predictions(clf_list, spectrum_test, IDs_test):
    C = len(clf_list)
    all_predictions_test = np.zeros((spectrum_test.shape[0], C))
    for c in range(C):
        pred_test = clf_list[c].predict_proba(spectrum_test)[:,1]
        all_predictions_test[:,c] = pred_test.reshape(-1,)
    df_pred_test = pd.DataFrame(data=all_predictions_test, index = spectrum_test.index, columns = targets.columns)
#     df_pred_test[['ID_sample']] = IDs_test[['ID_sample']]
    return df_pred_test

In [60]:
pred_test = get_test_predictions(best_classifiers, spectrum_test_train, IDs_test_train)

In [87]:
# when I have all_predictions_test
N = pred_test.shape[0]
C = pred_test.shape[1]

# get unique ID samples
ID_samples = IDs_test_train.drop_duplicates(subset='ID_sample')
ID_samples = ID_samples[['ID_sample']].to_numpy().astype(int).reshape(-1,)

# create empty matrix for new predictions
new_predictions = np.zeros((len(ID_samples),pred_test.shape[1]))

# for each ID sample, compare both predictions and take the better one
for counter, id in enumerate(ID_samples):
#     print(id)
    predictions = pred_test.loc[IDs_test_train['ID_sample'].to_numpy().astype(int)==id]
#     print(predictions)
    L = len(predictions)
    if L > 1: # more than one spectrum for that sample
        decisivity_index = np.zeros((L,))
        for l in range(L):
            
            
            # compute "decisivity_index"
            print('a')
            
    elif L==1: # just one prediction for that ID
        new_predictions[counter,:] = predictions 
    else:
        print('Weird, no sample for that ID. Bug in code.')
        
# save those results in a df with ID sample as ID
predictions_more_expressive = pd.DataFrame(data = new_predictions, index = ID_samples, columns = targets.columns)



a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a


In [88]:
# get scores with targets_test
targets_withID = targets_test_train.copy(deep=True)
targets_withID[['ID_sample']] = IDs_test_train
targets_withID.drop_duplicates(subset='ID_sample')
targets_withID.drop(columns='ID_sample')
score_new = roc_auc_score(targets_test_train.to_numpy(), new_predictions)


# get scores just making drop_unique
pred_test_withID = pred_test.copy(deep=True)
pred_test_withID[['ID_sample']] = IDs_test_train
pred_test_withID.drop_duplicates(subset='ID_sample')
pred_test_withID.drop(columns='ID_sample')

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [92]:
np.sum(targets_test_train.isna())

OXACILINA         0
AMIKACINA         0
AMOXI/CLAV        0
CIPROFLOXACINO    0
CLINDAMICINA      0
ERITROMICINA      0
LEVOFLOXACINO     2
PENICILINA        0
TOBRAMICINA       0
dtype: int64

In [83]:
predictions

Unnamed: 0_level_0,OXACILINA,AMIKACINA,AMOXI/CLAV,CIPROFLOXACINO,CLINDAMICINA,ERITROMICINA,LEVOFLOXACINO,PENICILINA,TOBRAMICINA
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
