In [1]:
import importlib

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from helpers import *
# importlib.reload(helpers)
# from skbio.stats.composition import clr, ilr
from sklearn.model_selection import train_test_split
from sklearn.metrics import balanced_accuracy_score

from sklearn import svm
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

from time import time

In [2]:
def CLR_transform(X, scale):
    """Replace zeros by a pseudo-count, take the log and centre the result.

    Every zero entry is replaced by ``scale`` times the smallest non-zero
    entry of ``X``; the natural logarithm is then applied and the mean of
    each column (``axis=0``) is subtracted.

    Parameters
    ----------
    X : array-like
        Numeric matrix containing at least one non-zero entry. The input is
        left untouched; the computation runs on a float copy.
    scale : float
        Factor strictly between 0 and 1 applied to the smallest non-zero
        entry to obtain the zero-replacement value.

    Returns
    -------
    numpy.ndarray
        Float array with the same shape as ``X``.

    Raises
    ------
    AssertionError
        If ``scale`` is not strictly between 0 and 1.
    ValueError
        If ``X`` has no non-zero entry (``np.min`` of an empty selection).
    """
    assert 0 < scale and scale < 1

    # Work on a float copy: the caller's array must not be modified, and an
    # integer array would truncate the fractional pseudo-count (possibly back
    # to 0, which then yields -inf after the log).
    X = np.array(X, dtype=float)

    minval = np.min(X[np.nonzero(X)])
    X[X == 0] = minval * scale
    X = np.log(X)

    # NOTE(review): centring is done per column (axis=0). The textbook CLR
    # centres each composition, i.e. each row (axis=1 with keepdims=True).
    # Behaviour is kept as-is here; confirm which one is intended.
    return X - np.mean(X, axis=0)


def load_csv_data(data_path, n_min=1000, CLR_scale=None):
    """Load a labelled count table from CSV and return labels, features, ids.

    The file is read with one header line skipped. Column 0 is used as the
    row id, column 1 as a string class label, and every remaining column as
    a numeric feature.

    Parameters
    ----------
    data_path : str or path-like
        Location of the comma-separated file.
    n_min : number, default 1000
        Rows whose feature sum is below this value are dropped.
    CLR_scale : float or None, default None
        When truthy, the features are passed through
        ``CLR_transform(X, CLR_scale)``. Otherwise each row is divided by
        its own sum (counts to frequencies).

    Returns
    -------
    yb : numpy.ndarray
        Float labels: 0 where the label column equals ``'Prokaryote'``,
        1 for any other value.
    X : numpy.ndarray
        Transformed feature matrix for the rows that were kept.
    ids : numpy.ndarray
        Integer ids (column 0) of the rows that were kept.
    """
    print('Loading data...')
    # The file is parsed twice: once as text to get the label column, once
    # as floats for everything else (the label column becomes NaN there and
    # is never used). atleast_1d/atleast_2d keep the shapes consistent when
    # the file holds a single data row.
    y = np.atleast_1d(
        np.genfromtxt(data_path, delimiter=",", skip_header=1, dtype=str, usecols=1)
    )
    data = np.atleast_2d(np.genfromtxt(data_path, delimiter=",", skip_header=1))
    # Builtin int: the np.int alias was removed in NumPy 1.24.
    ids = data[:, 0].astype(int)
    X = data[:, 2:]

    # convert class labels from strings to binary (0,1)
    yb = np.ones(len(y))
    yb[y == 'Prokaryote'] = 0

    # Remove rows having less than n_min counts
    print('Removing rows with less than n_min counts...')
    # Written as "not (sum < n_min)" so that the kept set is exactly the
    # complement of the rows the comparison flags for removal.
    keep = ~(X.sum(axis=1) < n_min)
    yb, ids, X = yb[keep], ids[keep], X[keep]

    if CLR_scale:
        print('Counts to CLR transformed...')
        X = CLR_transform(X, CLR_scale)
    else:
        print('Counts to frequencies...')
        # NOTE(review): a row summing to 0 can only survive the filter when
        # n_min <= 0; it would produce NaN here.
        X = X / X.sum(axis=1, keepdims=True)

    print('Data loaded!')
    return yb, X, ids

In [3]:
# Settings shared by every estimator below.
seed = 42       # fixed random_state so repeated runs give the same fits
n_jobs = None   # forwarded to the estimators that accept an n_jobs argument

lin_svc = LinearSVC(random_state=seed)
ker_svc = svm.SVC(random_state=seed)
log_reg = LogisticRegression(random_state=seed, n_jobs=n_jobs, max_iter=10000)
rf = RandomForestClassifier(random_state=seed, n_jobs=n_jobs, max_depth=10)
nn = MLPClassifier(random_state=seed, solver='adam', max_iter=500)

# Display name -> estimator instance; the keys end up in the 'method' column
# of the results table. For a quick trial run, cut this down to one entry.
methods = {
    'linear svc': lin_svc,
    'kernel svc': ker_svc,
    'logistic regression': log_reg,
    'random forest': rf,
    'neural network': nn,
}

In [4]:
# One train/test split per feature transformation, keyed by a short name.
# The mapped value is the CLR_scale handed to load_csv_data: None selects
# the counts-to-frequencies path, a float selects the CLR path.
# (An ILR variant based on skbio was tried earlier and is currently disabled.)
counts_file = "Counts_n10000_k5_s5000.csv"
clr_scale_by_name = {'freq': None, 'clr': 0.65}

datas = {}
for name, clr_scale in clr_scale_by_name.items():
    y, X, _ = load_csv_data(counts_file, CLR_scale=clr_scale)
    # Same seed for every transformation; stored as
    # (X_train, X_test, y_train, y_test).
    datas[name] = tuple(train_test_split(X, y, random_state=seed))

Loading data...
Removing rows with less than n_min counts...
Counts to frequencies...
Data loaded!
Loading data...
Removing rows with less than n_min counts...
Counts to CLR transformed...
Data loaded!


In [5]:
# Fit every estimator on every transformed dataset and tabulate scores and
# wall-clock timings, one row per (method, transformation) pair.
verbose = True

result_columns = ['method', 'transformation',
                  'accuracy', 'euk_acc', 'pro_acc',
                  'learning time', 'prediction time']

# Rows are collected in a plain list and turned into a DataFrame once at the
# end: DataFrame.append was removed in pandas 2.0, and growing a frame row by
# row copies it on every iteration.
records = []
for m, model in methods.items():
    for t, (X_train, X_test, y_train, y_test) in datas.items():
        if verbose:
            print('{} with {}'.format(m, t))

        t1 = time()
        model.fit(X_train, y_train)
        t2 = time()
        y_pred = model.predict(X_test)
        t3 = time()

        # euk_accuracy / pro_accuracy come from the star import of `helpers`;
        # presumably per-class accuracies -- confirm against that module.
        records.append({
            'method': m,
            'transformation': t,
            'accuracy': balanced_accuracy_score(y_test, y_pred),
            'euk_acc': euk_accuracy(y_test, y_pred),
            'pro_acc': pro_accuracy(y_test, y_pred),
            'learning time': t2 - t1,
            'prediction time': t3 - t2,
        })

df = pd.DataFrame(records, columns=result_columns)

linear svc with freq
linear svc with clr




kernel svc with freq
kernel svc with clr
logistic regression with freq
logistic regression with clr
random forest with freq
random forest with clr
neural network with freq




neural network with clr


In [6]:
# Show the results table: one row per (method, transformation) pair.
df

Unnamed: 0,method,transformation,accuracy,euk_acc,pro_acc,learning time,prediction time
0,linear svc,freq,0.909844,0.912729,0.906958,3.531687,0.032559
1,linear svc,clr,0.962009,0.965767,0.95825,6.90907,0.008441
2,kernel svc,freq,0.984773,0.985053,0.984493,65.873226,21.046994
3,kernel svc,clr,0.991714,0.989392,0.994036,51.462275,16.21632
4,logistic regression,freq,0.784862,0.605111,0.964612,0.903471,0.018477
5,logistic regression,clr,0.966226,0.966249,0.966203,14.470513,0.019395
6,random forest,freq,0.97856,0.97541,0.98171,39.581723,0.21576
7,random forest,clr,0.982066,0.976856,0.987276,17.641518,0.210622
8,neural network,freq,0.9709,0.962874,0.978926,5849.713083,5.421683
9,neural network,clr,0.987797,0.985535,0.99006,37.675324,0.183914
