In [76]:
import importlib

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from helpers import *
# importlib.reload(helpers)
# from skbio.stats.composition import clr, ilr
from sklearn.model_selection import train_test_split
from sklearn.metrics import balanced_accuracy_score

from sklearn import svm
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

from time import time

In [73]:
# Shared settings: `seed` is passed as random_state to every estimator below,
# and `n_jobs` is forwarded to the estimators that take it here (logistic
# regression and random forest).
seed = 42
n_jobs = None

# One instance per classifier family under comparison.
lin_svc = LinearSVC(random_state=seed)
ker_svc = svm.SVC(random_state=seed)
log_reg = LogisticRegression(random_state=seed, max_iter=10000, n_jobs=n_jobs)
rf = RandomForestClassifier(random_state=seed, max_depth=10, n_jobs=n_jobs)
nn = MLPClassifier(random_state=seed, solver='adam', max_iter=500)

# Registry mapping a human-readable label to its estimator.
methods = {
    'linear svc': lin_svc,
    'kernel svc': ker_svc,
    'logistic regression': log_reg,
    'random forest': rf,
    'neural network': nn,
}

In [72]:
# Train/test splits per input representation, keyed by a short label.
# Each value is the tuple (X_train, X_test, y_train, y_test).
datas = {}

# The same counts file is loaded once with the loader's defaults ('freq') and
# once with CLR_scale=0.65 ('clr'); both are split with the same seed.
# NOTE(review): the CLR transform is presumably applied inside load_csv_data
# (instead of skbio's `clr`, i.e. `X = clr(X)`) -- confirm in helpers.
for label, loader_kwargs in (('freq', {}), ('clr', {'CLR_scale': 0.65})):
    y, X, _ = load_csv_data("Counts_n10000_k5_s5000.csv", **loader_kwargs)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=seed)
    datas[label] = (X_train, X_test, y_train, y_test)

# ILR variant, currently disabled:
# X= ilr(X)
# X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=seed)
# datas['ilr'] = (X_train, X_test, y_train, y_test)

Loading data...
Removing rows with less than n_min counts...
Counts to frequencies...
Data loaded!
Loading data...
Removing rows with less than n_min counts...
Counts to CLR transformed...
Data loaded!


In [None]:
verbose = True

# Column order of the results table (also its schema if no run completes).
result_columns = ['method', 'transformation',
                  'accuracy', 'euk_acc', 'pro_acc',
                  'learning time', 'prediction time']

# Benchmark every (method, transformation) pair: fit on the training split,
# predict on the test split, and record the scores plus wall-clock timings.
#
# Results are accumulated in a plain list and turned into a DataFrame once at
# the end. The previous `df.append(res, ignore_index=True)` returned a new
# frame whose value was discarded, so `df` stayed empty; DataFrame.append was
# also removed in pandas 2.0. If the loop is interrupted, the rows gathered so
# far remain available in `records`.
records = []
for m, model in methods.items():
    for t, (X_train, X_test, y_train, y_test) in datas.items():
        if verbose:
            print('{} with {}'.format(m, t))

        # Time fitting and prediction separately.
        t1 = time()
        model.fit(X_train, y_train)
        t2 = time()
        y_pred = model.predict(X_test)
        t3 = time()

        bal_acc = balanced_accuracy_score(y_test, y_pred)
        # euk_accuracy / pro_accuracy come from `helpers`.
        # NOTE(review): presumably per-class accuracies -- confirm in helpers.
        euk_acc = euk_accuracy(y_test, y_pred)
        pro_acc = pro_accuracy(y_test, y_pred)

        records.append({'method': m, 'transformation': t,
                        'accuracy': bal_acc, 'euk_acc': euk_acc, 'pro_acc': pro_acc,
                        'learning time': (t2 - t1), 'prediction time': (t3 - t2)})

df = pd.DataFrame(records, columns=result_columns)

linear svc with freq
linear svc with clr




kernel svc with freq
kernel svc with clr
logistic regression with freq
logistic regression with clr
random forest with freq
random forest with clr
neural network with freq


In [69]:
# Display the `df` results table built in the benchmark cell.
df

test
5
6
8
