# Exploração ML do readcounts 
É um dataset com a abundancia taxonomica

In [28]:
import numpy as np
import pandas as pd

readcounts_train = pd.read_csv("train/readcounts_training.csv")
readcounts_test = pd.read_csv("test/readcounts_test.csv")
pheno_train = pd.read_csv("train/pheno_training.csv")
pheno_test = pd.read_csv("test/pheno_test.csv")


#### Pré-processamento de dados

In [37]:
# get indexes of the rows without nan values
inds_train = pheno_train.dropna().index
inds_test = pheno_test.dropna().index

# data structure organization - train
pheno_train_y = pheno_train.PrevalentHFAIL[inds_train]
readcounts_train_X = readcounts_train.iloc[:, 1:].T # tranpose dataset for compablility
readcounts_train_X.columns = list(readcounts_train["Unnamed: 0"]) # renaming columns for acessibility
readcounts_train_X = readcounts_train_X.iloc[inds_train] # using only rows without nan

# data structure organization - test
pheno_test_y = pheno_test.PrevalentHFAIL[inds_test]
readcounts_test_X = readcounts_test.iloc[:, 1:].T # tranpose dataset for compablility
readcounts_test_X.columns = list(readcounts_test["Unnamed: 0"]) # renaming columns for acessibility
readcounts_test_X = readcounts_test_X.iloc[inds_test] # using only rows without nan

## Medelos de ML
#### Logistic Regression

In [38]:
from sklearn import linear_model
from sklearn.metrics import accuracy_score

logistic = linear_model.LogisticRegression(C=1e5, solver = "liblinear", multi_class = "auto")
logistic = logistic.fit(readcounts_train_X, pheno_train_y)
logistic_prev = logistic.predict(readcounts_test_X)

print("PECC (accuracy):", accuracy_score(pheno_test_y, logistic_prev))

PECC (accuracy): 0.9586167800453514




#### Linear Regression

In [42]:
from sklearn import linear_model
from sklearn.model_selection import cross_val_score

regr_model = linear_model.LinearRegression()
regr_model = regr_model.fit(readcounts_train_X, pheno_train_y)
regr_model_pred = regr_model.predict(readcounts_test_X)

print(regr_model_pred)

print("Perc. acertos: ", (regr_model_pred == pheno_test_y).sum()/len(regr_model_pred))
print("Não funciona com boleans")

[ 0.87261827 -0.35912821  1.32509594 ...  1.25286809  0.0154085
  0.28186538]
Perc. acertos:  0.0
Não funciona com boleans


#### SVM's

In [43]:
from sklearn import svm
from sklearn.model_selection import cross_val_score
SVM = svm.SVC(gamma=0.001, C=100.)
SVM = SVM.fit(readcounts_train_X, pheno_train_y)
svm_pred = SVM.predict(readcounts_test_X)
print("PECC (accuracy):", accuracy_score(pheno_test_y, svm_pred))

PECC (accuracy): 0.9784580498866213


#### KNeighborsClassifier

In [44]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier()
knn.fit(readcounts_train_X, pheno_train_y)
knn_pred = knn.predict(readcounts_test_X)
print("Perc. acertos: ", (knn_pred == pheno_test_y).sum()/len(knn_pred))

Perc. acertos:  0.9784580498866213


#### Decision Tree

In [45]:
from sklearn import tree

tree_model = tree.DecisionTreeClassifier()
tree_model = tree_model.fit(readcounts_train_X, pheno_train_y)
preds_tree = tree_model.predict(readcounts_test_X)

print("Perc. acertos: ", (preds_tree == pheno_test_y).sum()/len(preds_tree))

Perc. acertos:  0.9359410430839002


### Ensembles
#### Voting Classifier

In [46]:
from sklearn.ensemble import VotingClassifier
DTree = tree.DecisionTreeClassifier(max_depth=4)
knn_class = KNeighborsClassifier(n_neighbors=7)
svc = svm.SVC(gamma='scale', kernel='rbf', probability=True)
eclf = VotingClassifier(estimators=[('dt', DTree), ('knn', knn_class), ('svc', svc)], voting='hard')

eclf = eclf.fit(readcounts_train_X, pheno_train_y)
print(eclf.score(readcounts_test_X, pheno_test_y))

0.9784580498866213
