# Imports

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import zipfile
import _pickle
import pickle
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_curve, auc, roc_auc_score
import time
import os

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

# Data loading and preprocessing
Remember to change the path if needed be

### Hyperparameters of preprocessing

In [2]:
m = 2000; M = 12000; 
bin_size = 5;

path = "D:/GitHub/Machine-Learning/Kaggle/"
# path = "C:/Users/Javi/Documents/GitHub/Machine-Learning/Kaggle/"

# # Take the data from Google Drive
# from google.colab import drive
# drive.mount('/content/drive',force_remount=True)
# path = "/content/drive/My Drive/Colab Notebooks/Kaggle/"

In [3]:
savepath = path + 'Kaggle classifiers/bin size 5/'
path_results = path + 'Kaggle_results/'

ncpu = os.cpu_count()
if (ncpu>2): njobs = ncpu - 2; 
else: njobs = 1;

In [4]:
zf = zipfile.ZipFile(path+'zipped_TrainData.zip', 'r')
df_train = _pickle.loads(zf.open('TrainData.pkl').read())
zf.close()

zf = zipfile.ZipFile(path+'zipped_TestDataUnlabeled.zip', 'r')
df_test = _pickle.loads(zf.open('TestDataUnlabeled.pkl').read())
zf.close()

In [5]:
def spectrum_in_bins(df,m,M,bin_size):
    # Now, let's define the mz ranges, and the label associated to each of them (the mean of the limiting values of each bin)
    range_min = []; range_max = []; range_label = [];
    for mz in range(m,M,bin_size):
        range_min.append(mz)
        range_max.append(mz+bin_size)
        range_label.append(np.mean([range_min[-1],range_max[-1]]).astype(int))
    N = len(df)  # number of samples
    L = len(range_min)  # length of new spectrum (number of bins)
    all_data = np.zeros((N,L))
    for idx in range(N): 
        intensity = df[['intensity']].iloc[idx].values[0]
        mzcoord   = df[['coord_mz']].iloc[idx].values[0]
        idx_data_in_bins = np.zeros((1,L))
        for i,mz in enumerate(range_min):
            intensity_range = intensity[(mzcoord > mz) & (mzcoord < (mz+bin_size))]
            if len(intensity_range) > 0 :
                idx_data_in_bins[0,i] = np.max(intensity_range)
            else: # if those mz coordinates are not in that spectrum
                idx_data_in_bins[0,i] = 0   

        # Normalize the amplitude of the spectrum
        idx_data_in_bins[0,:] = idx_data_in_bins[0,:] / np.max(idx_data_in_bins[0,:])
        all_data[idx,:] = idx_data_in_bins
    new_df = pd.DataFrame(data=all_data, columns = range_label, index = df.index)
    return new_df

In [6]:
df_train = df_train.drop_duplicates(subset='ID_sample') # eliminate duplicates
# Let's work without duplicates from now on, to avoid having same sample in training and test_training sets

# Extract data (spectra) and targets of the df_train set
data_train = df_train.iloc[:,-2:]
targets_train = df_train.iloc[:,1:-2]

# apply the bins to all spectra, so that our feature space becomes the same for all samples (make them regular, all the same)
spectrum_train = spectrum_in_bins(data_train,m,M,bin_size)
print('Spectrum regularized!')
# these spectrum_... are our X for training

Spectrum regularized!


In [7]:
def clean_nan_samples(spectrum,targets, c, cat):
# if there are any NaN values, we should remove those samples
    if (targets[cat].isnull().sum() > 0).all(): 
        merged = pd.concat([spectrum , targets],axis=1,copy=True)
        clean = merged.dropna(subset=[cat])
        Y = clean.iloc[:,-9+c].to_numpy().reshape(-1,)
        X = clean.iloc[:,:-9]

    else:
        Y = targets.iloc[:,c].to_numpy().reshape(-1,)
        X = spectrum.copy(deep=True)
    return X , Y

# Try different classifiers
The try_clf function has been built for, given a classifier and a parameter dictionary (for hyperparameter cross-validation), create a classifier for each antibiotic, and return the results. This enables for fast testing of different classifiers. Moreover, the function also takes charge of suppressing NaN values in the targets ocurring for amikacina, levofloxacino and tobramicina.

In [8]:
def try_clf(clf,params,n_cv=5,L1_FEATURE_SELECTION=False, feature_vector_list=None):  # new version! (after Sevilla)
    # new version --> Incorporates feature selection
    t1 = time.time()
    
    best_classifiers = [];
    grid_list = [];
    AUC_train = []; AUC_valid = [];
    
    categories = targets_train.columns[:]    
    for c,cat in enumerate(categories):

        print([cat]) # indicate in which antibiotic we are
        
        # Selection of train and test data (depending on whether there are NaN target values)
        X_train, Y_train = clean_nan_samples(spectrum_train,targets_train, c, cat)
        
        if L1_FEATURE_SELECTION: # a boolean that decides whether to apply L1 feature selection (L1 feature list has to be already defined, and input to the function)
            X_train = apply_l1_feature_selection(X_train,feature_vector_list[c])
            
        # perform a GridSearchCV in order to train a classifier for this antibiotic
        grid = GridSearchCV(clf,param_grid=params,scoring='roc_auc',n_jobs=njobs,pre_dispatch='2*n_jobs', cv=n_cv, iid=False,return_train_score=True)
        grid.fit(X_train, Y_train)

        # print the best parameters (to detect edge values), and save that classifier
        print('The best parameters are: ',grid.best_params_)
        best_clf = grid.best_estimator_
        best_classifiers.append(best_clf)
        grid_list.append(grid)
        
        best_clf = np.where(grid.cv_results_['rank_test_score']==1)[0][0]
        AUC_train.append(grid.cv_results_['mean_train_score'][best_clf])
        AUC_valid.append(grid.cv_results_['mean_test_score'][best_clf])
        
        print('Train AUC: ',np.round(AUC_train[-1],4),' and validation AUC: ',np.round(AUC_valid[-1],4))
        
    avg_AUC_train = np.mean(AUC_train)
    avg_AUC_valid = np.mean(AUC_valid)
    print('\n\nThe average train AUC is',np.round(avg_AUC_train,4),'and the avg validation AUC is',np.round(avg_AUC_valid,4))
    
    t2 = time.time()
    print('\nFull execution took ',np.round(t2-t1,1),'seconds')
    print('\nDONE!')
    return best_classifiers, grid_list, AUC_train, AUC_valid

## Enable L1 feature selection

In [9]:
with open(path+'l1_best_clfs.data', 'rb') as filehandle:
    l1_best_clfs = pickle.load(filehandle)
with open(path+'l1_feat_list.data', 'rb') as filehandle:
    l1_feat_list = pickle.load(filehandle)

# to be applyied to each category
def apply_l1_feature_selection(spectrum_train,vect): # vect is l1_feat_list[c]
    new_spectrum = spectrum_train.copy(deep=True).iloc[:,vect]   
    return new_spectrum



## save_clf  &  load_clf
Let's implement these functions so that we can load previous results without need to perform try_clf on all executions of the notebook.

In [10]:
def save_clf(savepath,filename,clf_list):
  # filename must be without extension
  if (savepath[-1] != '/'): savepath = savepath + '/'
  with open(savepath+filename+'.data','wb') as filehandle:
    pickle.dump(clf_list,filehandle)

def load_clf(savepath,filename):
  if (savepath[-1] != '/'): savepath = savepath + '/'
  if os.path.isfile(savepath+filename+'.data'):
    with open(savepath+filename+'.data','rb') as filehandle:
      new_list = pickle.load(filehandle)
    print('Loaded!')
  else:
    print('File not found')
    new_list = []
  return new_list

## Logistic regressor classifier

In [11]:
clf = LogisticRegression(max_iter=1e6, solver='lbfgs',class_weight='balanced')
params = {'C':10.**np.arange(-2,4)}
lr_best_clfs, _, _, lr_AUC_train, lr_AUC_test_train = try_clf(clf,params,10)

save_clf(savepath,'logreg',lr_best_clfs)

['OXACILINA']




The best parameters are:  {'C': 1000.0}
Train AUC:  0.9999  and validation AUC:  0.7788
['AMIKACINA']




The best parameters are:  {'C': 100.0}
Train AUC:  0.993  and validation AUC:  0.7325
['AMOXI/CLAV']




The best parameters are:  {'C': 100.0}
Train AUC:  0.9867  and validation AUC:  0.7553
['CIPROFLOXACINO']




The best parameters are:  {'C': 1000.0}
Train AUC:  1.0  and validation AUC:  0.7483
['CLINDAMICINA']




The best parameters are:  {'C': 0.01}
Train AUC:  0.6433  and validation AUC:  0.6012
['ERITROMICINA']




The best parameters are:  {'C': 100.0}
Train AUC:  0.9739  and validation AUC:  0.6185
['LEVOFLOXACINO']




The best parameters are:  {'C': 10.0}
Train AUC:  0.9412  and validation AUC:  0.7604
['PENICILINA']




The best parameters are:  {'C': 0.1}
Train AUC:  0.7671  and validation AUC:  0.6291
['TOBRAMICINA']
The best parameters are:  {'C': 100.0}
Train AUC:  0.9928  and validation AUC:  0.7462


The average train AUC is 0.922 and the avg validation AUC is 0.7078

Full execution took  33.2 seconds

DONE!




ValueError: not enough values to unpack (expected 5, got 4)

## KNN

In [17]:
from sklearn.neighbors import KNeighborsClassifier
clf = KNeighborsClassifier()
params = {'n_neighbors':np.arange(1,30)}
knn_best_clfs, _, _, knn_AUC_train, knn_AUC_test_train = try_clf(clf,params,10)

save_clf(savepath,'knn',knn_best_clfs)

['OXACILINA']
The best parameters are:  {'n_neighbors': 6}
Train accuracy:  0.7985  and test_train accuracy:  0.7019
Train AUC:  0.8732  and test_train AUC:  0.7306
['AMIKACINA']
Dropped  66
Dropped  22
The best parameters are:  {'n_neighbors': 4}
Train accuracy:  0.8324  and test_train accuracy:  0.7561
Train AUC:  0.8794  and test_train AUC:  0.7109
['AMOXI/CLAV']
The best parameters are:  {'n_neighbors': 6}
Train accuracy:  0.7985  and test_train accuracy:  0.7019
Train AUC:  0.8714  and test_train AUC:  0.7182
['CIPROFLOXACINO']
The best parameters are:  {'n_neighbors': 2}
Train accuracy:  0.8641  and test_train accuracy:  0.7212
Train AUC:  0.9467  and test_train AUC:  0.6756
['CLINDAMICINA']
The best parameters are:  {'n_neighbors': 20}
Train accuracy:  0.7961  and test_train accuracy:  0.7788
Train AUC:  0.7476  and test_train AUC:  0.6753
['ERITROMICINA']
The best parameters are:  {'n_neighbors': 22}
Train accuracy:  0.665  and test_train accuracy:  0.6154
Train AUC:  0.7218  a

## SVM - Linear

In [18]:
from sklearn.svm import SVC
clf = SVC(kernel='linear', class_weight='balanced')
params = {'C':10.**np.arange(-1,6)}
linear_SVM_best_clfs, _, _, linear_SVM_AUC_train, linear_SVM_AUC_test_train = try_clf(clf,params,10)
save_clf(savepath,'linear_SVM',linear_SVM_best_clfs)

['OXACILINA']
The best parameters are:  {'C': 10.0}
Train accuracy:  0.8981  and test_train accuracy:  0.7404
Using decision_function instead of predict_proba
Train AUC:  0.9479  and test_train AUC:  0.7719
['AMIKACINA']
Dropped  66
Dropped  22
The best parameters are:  {'C': 10.0}
Train accuracy:  0.8786  and test_train accuracy:  0.7561
Using decision_function instead of predict_proba
Train AUC:  0.9334  and test_train AUC:  0.7964
['AMOXI/CLAV']
The best parameters are:  {'C': 100.0}
Train accuracy:  0.949  and test_train accuracy:  0.7115
Using decision_function instead of predict_proba
Train AUC:  0.9856  and test_train AUC:  0.7711
['CIPROFLOXACINO']
The best parameters are:  {'C': 10.0}
Train accuracy:  0.9053  and test_train accuracy:  0.7404
Using decision_function instead of predict_proba
Train AUC:  0.9506  and test_train AUC:  0.7836
['CLINDAMICINA']
The best parameters are:  {'C': 1000.0}
Train accuracy:  0.9903  and test_train accuracy:  0.6923
Using decision_function ins

## SVM - Polynomial
*It can not be done, the huge amount of features makes it computationally unfeasible*

## SVM - RBF

In [19]:
clf = SVC(kernel='rbf', class_weight='balanced')

C_vector = 10. ** np.arange(-1,6)
gamma_vector = 10. ** np.arange(-4,1)
params = {'C':C_vector, 'gamma':gamma_vector}

rbf_SVM_best_clfs, _, _, rbf_SVM_AUC_train, rbf_SVM_AUC_test_train = try_clf(clf,params,10)

save_clf(savepath,'rbf_SVM',rbf_SVM_best_clfs)

['OXACILINA']
The best parameters are:  {'C': 100.0, 'gamma': 0.1}
Train accuracy:  0.9587  and test_train accuracy:  0.7308
Using decision_function instead of predict_proba
Train AUC:  0.9842  and test_train AUC:  0.8209
['AMIKACINA']
Dropped  66
Dropped  22
The best parameters are:  {'C': 1000.0, 'gamma': 0.1}
Train accuracy:  0.9855  and test_train accuracy:  0.7805
Using decision_function instead of predict_proba
Train AUC:  0.9935  and test_train AUC:  0.823
['AMOXI/CLAV']
The best parameters are:  {'C': 100.0, 'gamma': 0.1}
Train accuracy:  0.9563  and test_train accuracy:  0.75
Using decision_function instead of predict_proba
Train AUC:  0.9824  and test_train AUC:  0.7957
['CIPROFLOXACINO']
The best parameters are:  {'C': 100.0, 'gamma': 0.1}
Train accuracy:  0.9539  and test_train accuracy:  0.7692
Using decision_function instead of predict_proba
Train AUC:  0.98  and test_train AUC:  0.834
['CLINDAMICINA']
The best parameters are:  {'C': 100.0, 'gamma': 1.0}
Train accuracy:  

## RF with balanced classes and Cross-Validation

In [20]:
clf = RandomForestClassifier(class_weight='balanced',min_samples_split=5)
params = {'n_estimators':[10,30,50,70,150,250],'max_depth':np.arange(1,50)}
rf_best_clfs, _, _, rf_AUC_train, rf_AUC_test_train = try_clf(clf,params)
save_clf(savepath,'rf_cv',rf_best_clfs)

['OXACILINA']
The best parameters are:  {'max_depth': 6, 'n_estimators': 50}
Train accuracy:  0.9709  and test_train accuracy:  0.7404
Train AUC:  0.9965  and test_train AUC:  0.7143
['AMIKACINA']
Dropped  66
Dropped  22


KeyboardInterrupt: 

## Real Adaboost

In [21]:
clf = AdaBoostClassifier(DecisionTreeClassifier(max_depth=2)) # we can change that
params = {'n_estimators':[500]}
ab_best_clfs, _, _, ab_AUC_train, ab_AUC_test_train = try_clf(clf,params)
save_clf(savepath,'ab_sklearn_cv',ab_best_clfs)

['OXACILINA']
The best parameters are:  {'n_estimators': 500}
Train accuracy:  0.9951  and test_train accuracy:  0.8173
Train AUC:  1.0  and test_train AUC:  0.8356
['AMIKACINA']
Dropped  66
Dropped  22
The best parameters are:  {'n_estimators': 500}
Train accuracy:  0.9884  and test_train accuracy:  0.6707
Train AUC:  0.9997  and test_train AUC:  0.6923
['AMOXI/CLAV']
The best parameters are:  {'n_estimators': 500}
Train accuracy:  0.9951  and test_train accuracy:  0.7308
Train AUC:  0.9999  and test_train AUC:  0.7553
['CIPROFLOXACINO']
The best parameters are:  {'n_estimators': 500}
Train accuracy:  0.9879  and test_train accuracy:  0.7212
Train AUC:  0.9997  and test_train AUC:  0.7125
['CLINDAMICINA']
The best parameters are:  {'n_estimators': 500}
Train accuracy:  0.9903  and test_train accuracy:  0.7692
Train AUC:  0.9997  and test_train AUC:  0.6755
['ERITROMICINA']
The best parameters are:  {'n_estimators': 500}
Train accuracy:  0.9951  and test_train accuracy:  0.7212
Train A

## Our own Real AdaBoost
The one coded for Homework 2. After some trials, it seems to work much better than the sklearn method. The following code is a modification. Moreover, the number of learners will be set to 500 as in some test (up to 2.000 learners) that seemed a good point.

In [22]:
from sklearn.metrics import accuracy_score as acc

class OurAdaBoostEnsemble():
    def __init__(self,T):
        self.T = T

    def fit(self,X_train, Y_train):
        T = self.T
        
        Y_train[Y_train==0] = -1 # change labels 0 to -1, if needed

        alpha = np.zeros((T,1))
        Dt_all = np.zeros((T,X_train.shape[0]))
        Dt_all[0,:] = np.ones((1,X_train.shape[0])) / X_train.shape[0]  # Initialize all weights as 1 / n_samples

        outputs_train = np.zeros((T,Y_train.shape[0]))

        tree_list = []

        for i in range(T):
            mytree = DecisionTreeClassifier(max_depth=2)
            mytree.fit(X_train,Y_train,sample_weight=Dt_all[i,:])
            tree_list.append(mytree)

            # For real-valued predictions:
            outputs_train[i,:] = np.dot(mytree.predict_proba(X_train),mytree.classes_)
            
            # Get gamma and alpha_i values
            gamma = np.dot(Dt_all[i,:],np.multiply(outputs_train[i,:],Y_train))
            alpha[i] = 0.5 * np.log((1+gamma)/(1-gamma))

            # Update emphasis function (except if in the last iteration)
            if (i < T-1):
                emphasis = np.multiply(Dt_all[i,:],np.exp(-1*alpha[i]*np.multiply(outputs_train[i,:],Y_train)))
                Dt_all[i+1,:] = emphasis / np.sum(emphasis) # normalize

        # save neccessary parameters for the other methods
        self.T = T
        self.alpha = alpha
        self.tree_list = tree_list

        return self

    def predict_proba(self, X):
        outputs = np.zeros((self.T,X.shape[0]))
        for t in range(self.T):
            mytree = self.tree_list[t]
            outputs[t,:] = mytree.predict_proba(X)[:,-1].T
        f = np.sum(np.multiply(outputs,self.alpha),axis=0).reshape((-1,1))
        return f


# 5-fold cross-validation of resulting classifiers
We have already used cross-validation to select the most suitable hyperparameters for the classifiers. Now, let's do a 10-fold split in order to see the general performance and std of each classifier over each of the "test" splits, in order to choose the best performing ones for each antibiotic.

In [23]:
from sklearn.model_selection import StratifiedKFold

spectrum_full_train = spectrum_in_bins(data,m,M,bin_size)
targets_full_train = targets.copy(deep=True)

N = 10
ab_ours = [OurAdaBoostEnsemble(T=500)]*9
list_of_best_clfs_list = [lr_best_clfs,knn_best_clfs,rbf_SVM_best_clfs,linear_SVM_best_clfs,ab_best_clfs,ab_ours]
labs = ['LogReg','KNN','RBF SVM','Linear SVM','AdaBoost','Our AdaBoost']
categories = targets.columns[:]
test_aucs = np.zeros((len(categories),len(list_of_best_clfs_list),N))   # The dimensions are cat x type of clf x fold
train_aucs = np.zeros((len(categories),len(list_of_best_clfs_list),N))
      
cv = StratifiedKFold(n_splits=N, random_state=123, shuffle=True)

for c, cat in enumerate(categories):
    X, Y = clean_nan_samples(spectrum_full_train,targets_full_train, c, cat)  
    
    for (train, test), i in zip(cv.split(X, Y), range(N)):
        X_train = X.iloc[train]; Y_train = Y[train];
        X_test_train = X.iloc[test]; Y_test_train = Y[test];
        for nclf, l in enumerate(list_of_best_clfs_list):
            clf = l[c].fit(X_train,Y_train)
            if callable(getattr(clf,"predict_proba",None)):
                pred_train = clf.predict_proba(X_train)[:,-1] # only take last column, the prob of Y = +1
                pred_test = clf.predict_proba(X_test_train)[:,-1]
            else:
                pred_train = clf.decision_function(X_train)
                pred_test = clf.decision_function(X_test_train) 
            train_aucs[c,nclf,i] = roc_auc_score(Y_train, pred_train)
            test_aucs[c,nclf,i] = roc_auc_score(Y_test_train, pred_test)
        print('\tFold '+str(i+1)+' finished!')
    print('Category '+cat+' finished!\n')

# The dimensions are cat x type of clf x fold
mean_test_auc = np.mean(test_aucs,axis=2)
std_test_auc  = np.std( test_aucs,axis=2)
print('\n\nDONE!')

	Fold 1 finished!
	Fold 2 finished!
	Fold 3 finished!
	Fold 4 finished!
	Fold 5 finished!
	Fold 6 finished!
	Fold 7 finished!
	Fold 8 finished!
	Fold 9 finished!
	Fold 10 finished!
Category OXACILINA finished!

Dropped  88
	Fold 1 finished!
	Fold 2 finished!
	Fold 3 finished!
	Fold 4 finished!
	Fold 5 finished!
	Fold 6 finished!
	Fold 7 finished!
	Fold 8 finished!
	Fold 9 finished!
	Fold 10 finished!
Category AMIKACINA finished!

	Fold 1 finished!
	Fold 2 finished!
	Fold 3 finished!
	Fold 4 finished!
	Fold 5 finished!
	Fold 6 finished!
	Fold 7 finished!
	Fold 8 finished!
	Fold 9 finished!
	Fold 10 finished!
Category AMOXI/CLAV finished!

	Fold 1 finished!
	Fold 2 finished!
	Fold 3 finished!
	Fold 4 finished!
	Fold 5 finished!
	Fold 6 finished!
	Fold 7 finished!
	Fold 8 finished!
	Fold 9 finished!
	Fold 10 finished!
Category CIPROFLOXACINO finished!

	Fold 1 finished!
	Fold 2 finished!
	Fold 3 finished!
	Fold 4 finished!


KeyboardInterrupt: 

In [None]:
plt.figure(figsize=(15,8))
col_names = targets.columns.values
for i in range(len(list_of_best_clfs_list)):
    plt.errorbar(col_names,mean_test_auc[:,i],std_test_auc[:,i],label=labs[i])
plt.title('AUC for test_train set with '+str(N)+' fold testing')
plt.legend()
plt.show()

In [None]:
# save all of this
mydict = {'listoflist':list_of_best_clfs_list,'labels':labs,'AUC_test':test_aucs}
save_clf(savepath,'dict_bs5_allclfs_10fold',mydict)

In [None]:
# many support vectors means that the model is overfitted
N = len(X_train) # total number of samples

for c, cat in enumerate(categories):
    print('Linear SVM:')
    n_sv = len(linear_SVM_best_clfs[c].support_)
    print('Proportion of support vectors: ',n_sv*100/N,'%')

for c, cat in enumerate(categories):
    print('\nRBF SVM:')
    n_sv = len(rbf_SVM_best_clfs[c].support_)
    print('Proportion of support vectors: ',n_sv*100/N,'%')

# Generate final results in CSV format
From "Jagger_4_GenerateCSV_v2"

In [0]:
def get_unique_spectre(spectre, df):
  # Include the ID_sample column for the group_by
  spectre['ID_sample'] = df.ID_sample
  # MEAN OF THE SPECTRE
  spectre = spectre.groupby('ID_sample').mean().reset_index()
  return spectre

def generate_csv_from_clf(clf_list, path, path_results, file_name):
  # classifiers must be provided with parameters, and in a list [clf_antibiotic0, clf_antibiotic1, ...]
  # spectrum and targets full train (containing all training points) will be used for training the clfs in clf list
  # df_test must be provided as loaded from the file

  # read all data from files
  zf = zipfile.ZipFile(path+'zipped_TrainData.zip', 'r')
  df_full_train = _pickle.loads(zf.open('TrainData.pkl').read());   zf.close()

  zf = zipfile.ZipFile(path+'zipped_TestDataUnlabeled.zip', 'r')
  df_test = _pickle.loads(zf.open('TestDataUnlabeled.pkl').read());   zf.close()

  # Process test df to get UNIQUE samples and convert to spectrum

  # df_unique_test = df_test.drop_duplicates(subset='ID_sample')

  spectrum_test_forcsv = spectrum_in_bins(df_test,m,M,bin_size)
  spectrum_test_forcsv = get_unique_spectre(spectrum_test_forcsv, df_test)

  # Process train set to later train the clfs
  spectrum_full_train = spectrum_in_bins(df_full_train.iloc[:,-2:],m,M,bin_size)
  targets_full_train  = df_full_train.iloc[:,1:-2]  

  # read the submission example file
  df_submission = pd.read_csv(path+'SubmissionSample.csv') 
  categories = df_submission.columns[1:]
  df_submission['ID']= spectrum_test_forcsv['ID_sample'].values
  # To eliminate the ID_sample from the spectrum
  spectrum_test_forcsv = spectrum_test_forcsv.drop(columns=['ID_sample'])
  for c, cat in enumerate(categories): 
      # clean NaN values
      X_train, Y_train = clean_nan_samples(spectrum_full_train,targets_full_train, c, cat)

      # fit the classifier
      clf_base = clf_list[c].fit(X_train,Y_train)
      # Compute its test prestiction and save this output
      o_test = clf_base.predict_proba(spectrum_test_forcsv)[:,1]
      df_submission[cat] = o_test

  # Save the dataframe with the predicted outputs
  df_submission = df_submission.set_index('ID')
  df_submission.to_csv(path_results + file_name + '.csv')
  print('DONE!')
  return df_submission

In [0]:
# Let's create manually a list of good classifiers
clf_oxalacina       = #
clf_amikacina       = #
clf_amoxiclav       = #
clf_ciprofloxacino  = #
clf_clindamicina    = #
clf_eritromicina    = #
clf_levofloxacino   = #
clf_penicilina      = #
clf_tobramicina     = #

clf_list = [clf_oxalacina, clf_amikacina, clf_amoxiclav, clf_ciprofloxacino, clf_clindamicina,
            clf_eritromicina, clf_levofloxacino, clf_penicilina, clf_tobramicina]

In [0]:
savepath = '/content/drive/My Drive/Colab Notebooks/Kaggle/Kaggle classifiers/bin size 10'
clf_list_bs10 = load_clf(savepath,'logreg')
savepath = '/content/drive/My Drive/Colab Notebooks/Kaggle/Kaggle classifiers/bin size 50'
clf_list_bs50 = load_clf(savepath,'logreg')

In [0]:
name = 'logreg_bs10'
df_submission = generate_csv_from_clf(clf_list_bs10,path,path_results, name)
print('File: '+name+' has been successfully generated')

Dropped  88
Dropped  4
Dropped  82
DONE!
File: logreg_bs10 has been successfully generated


In [0]:
name = 'logreg_bs50'
df_submission = generate_csv_from_clf(clf_list_bs50,path,path_results, name)
print('File: '+name+' has been successfully generated')

Dropped  88
Dropped  4
Dropped  82
DONE!
File: logreg_bs50 has been successfully generated
