In [20]:
import time
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn import preprocessing, metrics
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, ExtraTreesClassifier, VotingClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
from sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.decomposition import PCA

In [22]:
TEST_SIZE = 0.2


def vectorization_TF_IDF(DATASET, text_field_name, label_name):

    X_train, X_test, y_train, y_test = train_test_split( DATASET[text_field_name] ,
                                                    DATASET[label_name],
                                                    train_size=0.80, random_state=28)

    encoder = preprocessing.LabelEncoder()
    y_train = encoder.fit_transform(y_train)
    y_test = encoder.fit_transform(y_test)
    return X_train, X_test, y_train, y_test

def saving_results(results, path, file_name):

    results.to_csv(path + file_name,  index = False, sep=";")

def train_model(classifier, X_train_v, X_test_v, y_train, y_test):
    ini = time.time()
    classifier.fit(X_train_v, y_train)
    predictions = classifier.predict(X_test_v)
    fim = time.time()

    timee = fim-ini
    acuracia = accuracy_score(y_test, predictions)
    precision = precision_score(y_test, predictions, average='weighted')
    matriz_confusao =  confusion_matrix(y_test, predictions)
    recall = recall_score(y_test, predictions, average='weighted')
    f1_score = metrics.f1_score(y_test, predictions, average='weighted')
    # print("Time: " + str(timee))
    # print("Acurácia: " + str(acuracia))
    # print("Precision: " + str(precision))
    # print("Recal: " + str(recall))
    # print("f1_score: " + str(f1_score))

    return timee , acuracia, precision, recall, f1_score, matriz_confusao

def get_tests_result(X_train_v, X_test_v, y_train, y_test):

    all_res = []
    ### MLPClassifier

    #print( "MLPClassifier")
    clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(70, ), random_state=1, verbose=True)
    timee , acuracia, precision, recall, f1_score, matriz_confusao = train_model(clf, X_train_v, X_test_v, y_train, y_test)
    #print(matriz_confusao)
    all_res.append(["MLPClassifier: ", timee , acuracia, precision, recall, f1_score])



    ### AdaBoostClassifier

    #print( "AdaBoostClassifier: ")
    clf = AdaBoostClassifier(n_estimators= 50, learning_rate=1)
    timee , acuracia, precision, recall, f1_score, matriz_confusao = train_model(clf, X_train_v, X_test_v, y_train, y_test)
    #print(matriz_confusao)
    all_res.append(["AdaBoostClassifier: ", timee , acuracia, precision, recall, f1_score])



    ### Voting_LR3_SVC1_ETC2
    #print( "Voting_LR3_SVC1_ETC2: ")
    ini = time.time()
    clf2 = LogisticRegression(random_state=0)
    clf5 = SVC(kernel='rbf', probability=True)
    clf6 = ExtraTreesClassifier(n_estimators=100, random_state=0)
    eclf = VotingClassifier(estimators=[ ('lr', clf2), ('svc', clf5), ('etc', clf6)], voting='soft', weights=[3, 1, 2])
    clf2 = clf2.fit(X_train_v, y_train)
    clf5 = clf5.fit(X_train_v, y_train)
    clf6 = clf6.fit(X_train_v, y_train)
    eclf = eclf.fit(X_train_v, y_train)
    #Y_previsto_vc1 = eclf.predict(X_test_v.toarray())
    Y_previsto_vc1 = eclf.predict(X_test_v)
    fim = time.time()

    timee = fim-ini
    acuracia = accuracy_score(y_test, Y_previsto_vc1)
    precision = precision_score(y_test, Y_previsto_vc1, average='weighted')
    matriz_confusao =  confusion_matrix(y_test, Y_previsto_vc1)
    recall = recall_score(y_test, Y_previsto_vc1, average='weighted')
    f1_score = metrics.f1_score(y_test, Y_previsto_vc1, average='weighted')
    # print("Time: " + str(timee))
    # print("Acurácia: " + str(acuracia))
    # print("Precision: " + str(precision))
    # print("Recal: " + str(recall))
    # print("f1_score: " + str(f1_score))
    # print(matriz_confusao)
    all_res.append(["Voting_LR3_SVC1_ETC2: ", timee , acuracia, precision, recall, f1_score])



    ### DecisionTreeClassifier
    #print( "DecisionTreeClassifier: ")
    clf = DecisionTreeClassifier(random_state=0)
    timee , acuracia, precision, recall, f1_score, matriz_confusao = train_model(clf, X_train_v, X_test_v, y_train, y_test)
    #print(matriz_confusao)
    all_res.append(["DecisionTreeClassifier: ", timee , acuracia, precision, recall, f1_score])



    ### RandomForestClassifier
    #print( "RandomForestClassifier: " )
    clf = RandomForestClassifier(n_estimators=50, random_state=1)
    timee , acuracia, precision, recall, f1_score, matriz_confusao = train_model(clf, X_train_v, X_test_v, y_train, y_test)
    #print(matriz_confusao)
    all_res.append(["RandomForestClassifier: ", timee , acuracia, precision, recall, f1_score])



    ### ExtraTreesClassifier
    #print( "ExtraTreesClassifier: ")
    clf = ExtraTreesClassifier(n_estimators=100, random_state=0)
    timee , acuracia, precision, recall, f1_score, matriz_confusao = train_model(clf, X_train_v, X_test_v, y_train, y_test)
    #print(matriz_confusao)
    all_res.append(["ExtraTreesClassifier: ", timee , acuracia, precision, recall, f1_score])



    ### LogisticRegression
    #print( "LogisticRegression: ")
    clf = LogisticRegression(random_state=0)
    timee , acuracia, precision, recall, f1_score, matriz_confusao = train_model(clf, X_train_v, X_test_v, y_train, y_test)
    #print(matriz_confusao)
    all_res.append(["LogisticRegression: ",  timee , acuracia, precision, recall, f1_score])



    ### svm
    #print( "svm: ")
    clf = svm.SVC()
    timee , acuracia, precision, recall, f1_score, matriz_confusao = train_model(clf, X_train_v, X_test_v, y_train, y_test)
    #print(matriz_confusao)
    all_res.append(["svm: ",  timee , acuracia, precision, recall, f1_score])



    ### Stacking_scikit
    #print( "Stacking_scikit: ")
    clf1 = RandomForestClassifier(n_estimators=50, random_state=1)
    clf2 = LogisticRegression(random_state=0)
    clf3 = SVC()
    clf4 = ExtraTreesClassifier(n_estimators=100, random_state=0)
    estimators = [('rf', clf1), ('lr', clf2), ('svc', clf3), ('etc', clf4)]
    ini = time.time()
    clf = StackingClassifier( estimators=estimators, final_estimator=LogisticRegression(), n_jobs=None)
    clf.fit(X_train_v, y_train)
    Y_Previsto_stacking = clf.predict(X_test_v)
    fim = time.time()

    timee = fim-ini
    acuracia = accuracy_score(y_test, Y_Previsto_stacking)
    precision = precision_score(y_test, Y_Previsto_stacking, average='weighted')
    matriz_confusao =  confusion_matrix(y_test, Y_Previsto_stacking)
    recall = recall_score(y_test, Y_Previsto_stacking, average='weighted')
    f1_score = metrics.f1_score(y_test, Y_Previsto_stacking, average='weighted')
    # print("Time: " + str(timee))
    # print("Acurácia: " + str(acuracia))
    # print("Precision: " + str(precision))
    # print("Recal: " + str(recall))
    # print("f1_score: " + str(f1_score))
    # print(matriz_confusao)
    all_res.append(["Stacking_scikit: ",  timee , acuracia, precision, recall, f1_score])



    ### XGBClassifier
    #print( "XGBClassifier: ")
    #clf = XGBClassifier(eval_metric='mlogloss' )
    #timee , acuracia, precision, recall, f1_score, matriz_confusao = train_model(clf, X_train_v, X_test_v, y_train, y_test)
    #print(matriz_confusao)
    #all_res.append(["XGBClassifier: ",  timee , acuracia, precision, recall, f1_score])



    ### OneVsRestClassifier_RF
    #print( "OneVsRest_RF: ")
    clf = RandomForestClassifier(n_estimators=50, random_state=1)
    ovr = OneVsRestClassifier(clf)
    ini = time.time()
    ovr.fit(X_train_v, y_train)
    Y_Previsto_OVR_RF = ovr.predict(X_test_v)
    fim = time.time()


    timee = fim-ini
    acuracia = accuracy_score(y_test, Y_Previsto_OVR_RF)
    precision = precision_score(y_test, Y_Previsto_OVR_RF, average='weighted')
    matriz_confusao =  confusion_matrix(y_test, Y_Previsto_OVR_RF)
    recall = recall_score(y_test, Y_Previsto_OVR_RF, average='weighted')
    f1_score = metrics.f1_score(y_test, Y_Previsto_OVR_RF, average='weighted')
    # print("Time: " + str(timee))
    # print("Acurácia: " + str(acuracia))
    # print("Precision: " + str(precision))
    # print("Recal: " + str(recall))
    # print("f1_score: " + str(f1_score))
    # print(matriz_confusao)
    all_res.append(["OvR_RF: ",  timee , acuracia, precision, recall, f1_score])

    return pd.DataFrame( all_res, columns=["ALGORITHM","TIME","ACCURACY","PRECISION","RECALL", "F1_SCORE"])

feat = ['Idade', 'Peso Kg', 'Cor da pele', 'ACTN3',
       'AGT', 'ECA', 'Score Força%', 'Score Resistência %',
       'Pré CK (U/L)', 'Pós CK (U/L)', 'Pré CK MB U/L', 'Pós CK MB U/L',
       'Pré Lactato (mmol/L)', 'Pós Lactato (mmol/L)', 'Pré LDH (U/L)',
       'Pós LDH (U/L)', 'Pré AST (U/L)', 'Pós AST (U/L)',
       'Pré Albumina (g/dL)', 'Pós Albumina (g/dL)', 'Pré TGP (g/dL)',
       'Pós TGP (g/dL)', 'Pré GGT U/L', 'Pós GGT U/L', 'Pré Ac. Úrico (mg/dL)',
       'Pós Ac. Úrico (mg/dL)', 'Pré TFG   (ml/min/1.73 m2)',
       'Pós TFG   (ml/min/1.73 m2)', 'Pré Ureia mg/dL', 'Pós Ureia mg/dL',
       'Pré creatinina mg/dL', 'Pós creatinina mg/dL',
       'Delta CREATININA mg/dL', 'Pré Fósforo mg/dL', 'Pós Fósforo mg/dL',
       'Pré Proteína Total (g/dL)', 'Pós Proteína Total (g/dL)',
       'Pré FE (ug/dL)', 'Pós FE (ug/dL)', 'Pré calcio mg/dL',
       'Pós calcio mg/dL', 'Pré Potássio mmol/L', 'Pós Potássio mmol/L',
       'Glicose pré', 'Glicose pós', 'Pré Sódio mmol/L', 'Pós Sódio mmol/L',
       'Pré Magnésio mg/dL', 'Pós Magnésio mg/dL', 'Pré Cloro mmol/L',
       'Pós Cloro mmol/L',
       'Osmolalidade sérica PRÉ BD4(mOsm/kg): 1,86(Na + K) + 1,15(Gli /18) + (Ureia/6) + 14\t\t\t',
       'Osmolalidade sérica PÓS BD4(mOsm/kg): 1,86(Na + K) + 1,15(Gli /18) + (Ureia/6) + 14\t\t\t',
       'Pré RCB 10^6/mm³', 'Pós RCB 10^6/mm³', 'Pré HGB g/dL', 'Pós HGB g/dL',
       'Pré HCT %', 'Pós HCT %', 'Pré MCV fL', 'Pós MCV fL', 'Pré MCH pg',
       'Pós MCH pg', 'Pré MCHC g/dL', 'Pós MCHC g/dL', 'Pré PLT 10³/mm³',
       'Pós PLT 10³/mm³', 'Pré RDW %', 'Pós RDW %', 'Pré RET %', 'Pós RET %',
       'Pré WBC 10³/mm³', 'Pós WBC 10³/mm³', 'Pré NEUT %', 'Pós NEUT %',
       'Pré LINF %', 'Pós LINF %', 'Pré MONO %', 'Pós MONO %', 'Pré EOS %',
       'Pós EOS %', 'Pré BASO %', 'Pós BASO %', 'massa magra', 'MCT KG', 'ESTATURA', 'vo2max estimado',
       'Handgrip     Dom.', 'Handgrip Cont.', '% gordura']

label_rabdomiolise = "Rabdomiólise"
label_lesao = "Lesão Renal Aguda"




In [77]:
from google.colab import drive
drive.mount('/content/drive')
path = '/content/drive/My Drive/'
file = 'dados.xlsx'
file2 = 'dados_completos.xlsx'
DATASET = pd.read_excel(path + file)
DATASET = DATASET.drop(columns=['Ano','Missão','Aluno'])
for coluna in DATASET.columns:
    if coluna == 'ACTN3' or coluna == 'AGT' or coluna == 'ECA' or coluna == 'BDKRB2':
        continue
    if DATASET[coluna].dtype == 'object':
        DATASET[coluna] = pd.to_numeric(DATASET[coluna], errors='coerce')


DATASET2 = pd.read_excel(path + file2)
DATASET


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Unnamed: 0,Idade,Peso Kg,Cor da pele,ACTN3,AGT,ECA,BDKRB2,Score Força%,Score Resistência %,Pré CK (U/L),...,Pós BASO %,Lesão Renal Aguda,Rabdomiólise,massa magra,MCT KG,ESTATURA,vo2max estimado,Handgrip Dom.,Handgrip Cont.,% gordura
0,33.0,88.0,1.0,XX,MT,ID,"(-9,-9)",25.0,75.0,337.0,...,0.1,0.0,False,75.52,84.3,181.5,50.85,35.0,37.0,10.05
1,26.0,74.0,1.0,RR,MT,II,"(+9,-9)",50.0,50.0,687.0,...,0.2,0.0,True,69.49,75.9,171.0,51.98,42.0,43.0,6.08
2,26.0,76.0,1.0,RX,MM,ID,"(+9,+9)",50.0,50.0,841.0,...,0.3,0.0,True,72.07,79.2,174.0,52.42,49.5,49.5,8.72
3,25.0,84.0,1.0,RR,TT,DD,"(+9,-9)",,,363.0,...,0.9,0.0,True,75.26,84.3,178.0,50.68,60.0,57.0,10.61
4,33.0,82.0,1.0,RR,MT,II,"(+9,-9)",50.0,50.0,192.0,...,0.1,0.0,True,66.26,79.6,182.0,47.64,51.0,49.0,14.84
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
462,29.0,,2.0,,,,,,,231.0,...,,0.0,False,,82.8,182.0,,56.0,59.0,
463,27.0,,2.0,,,,,,,649.0,...,,0.0,False,,75.1,170.0,,48.0,60.0,
464,28.0,,2.0,,,,,,,274.0,...,,0.0,False,,82.7,179.5,,58.0,52.0,
465,29.0,,2.0,,,,,,,176.0,...,,0.0,False,,80.5,181.0,,60.0,50.0,


**PREPRAÇÃO DOS DADOS**

CASO 1 - Preencher NaN

In [59]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler


DATASET_fill_mean = DATASET.copy()
DATASET_fill_mean = DATASET_fill_mean.replace('-', np.nan)

# Preencher numéricos com média
imputer_mean = SimpleImputer(strategy='mean')
X_numeric = DATASET_fill_mean.select_dtypes(include=['float64', 'int64'])
DATASET_fill_mean[X_numeric.columns] = imputer_mean.fit_transform(X_numeric)

# Preencher categóricos com moda
imputer_freq = SimpleImputer(strategy='most_frequent')
X_categorical = DATASET_fill_mean.select_dtypes(include=['object', 'category'])
DATASET_fill_mean[X_categorical.columns] = imputer_freq.fit_transform(X_categorical)

X_fill_mean = DATASET_fill_mean.drop(columns=['Rabdomiólise'])
y_fill_mean = DATASET_fill_mean['Rabdomiólise']

X_fm_Train, X_fm_Test, y_fm_Train, y_fm_Test = train_test_split(X_fill_mean,y_fill_mean,test_size=TEST_SIZE)

scaler = MinMaxScaler()
X_numeric_train = X_fm_Train.select_dtypes(include=['float64', 'int64'])
X_numeric_test = X_fm_Test.select_dtypes(include=['float64', 'int64'])

X_fm_Train[X_numeric_train.columns] = scaler.fit_transform(X_numeric_train)
X_fm_Test[X_numeric_test.columns] = scaler.transform(X_numeric_test)
X_fm_Train

Unnamed: 0,Idade,Peso Kg,Cor da pele,ACTN3,AGT,ECA,BDKRB2,Score Força%,Score Resistência %,Pré CK (U/L),...,Pré BASO %,Pós BASO %,Lesão Renal Aguda,massa magra,MCT KG,ESTATURA,vo2max estimado,Handgrip Dom.,Handgrip Cont.,% gordura
300,0.000000,0.000000,0.5,RR,MT,DD,"(+9,-9)",0.312500,0.687500,0.009106,...,0.000000,0.051724,0.0,0.782009,0.748293,0.857143,0.876809,0.65000,0.590909,0.834817
143,0.820513,0.671937,0.5,RR,MT,ID,"(+9,-9)",0.222226,0.777886,0.027554,...,0.101124,0.017241,0.0,0.777606,0.727317,0.917403,0.867455,0.72500,0.636364,0.663699
189,0.641026,0.678854,0.0,RR,TT,ID,"(+9, -9)",0.312500,0.687500,0.034177,...,0.011236,0.000000,0.0,0.757139,0.693171,0.895584,1.000000,0.72500,0.500000,0.296736
441,0.717949,0.831028,0.5,RR,MT,ID,"(+9,-9)",0.222226,0.777886,0.064740,...,0.089888,0.086207,0.0,0.830260,0.888780,0.942857,0.837956,0.87500,0.750000,0.528336
296,0.820513,0.752964,0.5,RR,MT,ID,"(-9,-9)",0.125000,0.875000,0.013600,...,0.000000,0.017241,0.0,0.832461,0.755122,0.909091,0.942287,0.60000,0.500000,0.384273
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
374,0.846154,0.715415,1.0,RR,MT,ID,"(+9,-9)",0.222226,0.777886,0.024480,...,0.191011,0.063592,0.0,0.830260,0.744878,0.883117,0.837956,0.72500,0.659091,0.528336
383,0.871795,0.738142,1.0,RR,MT,ID,"(+9,-9)",0.222226,0.777886,0.054518,...,0.111461,0.063592,0.0,0.830260,0.748293,0.909091,0.837956,0.67500,0.613636,0.528336
315,0.692308,0.820158,0.5,RR,MM,DD,"(-9,-9)",0.187500,0.812500,0.096973,...,0.015605,0.086207,0.0,0.873275,0.835610,0.919481,0.908224,0.60000,0.590909,0.540059
100,0.666667,0.731225,0.5,RX,MM,ID,"(+9,+9)",0.187500,0.812500,0.017029,...,0.000000,0.000000,0.0,0.857568,0.772683,0.903896,0.925168,0.61875,0.562500,0.431256


CASO 2 - Dropar colunas com 50% mais de NaN e depois dropar linhas com NaN

In [76]:
taxa_drop = 0.5

DATASET_drop_cols50 = DATASET.copy()

DATASET_drop_cols50 = DATASET_drop_cols50.loc[:, DATASET_drop_cols50.isnull().mean() < taxa_drop]

DATASET_drop_cols50 = DATASET_drop_cols50.dropna()


#Codificar colunas categóricas
label_encoder = preprocessing.LabelEncoder()
for coluna in DATASET_drop_cols50.select_dtypes(include=['object', 'category']).columns:
    DATASET_drop_cols50[coluna] = label_encoder.fit_transform(DATASET_drop_cols50[coluna].astype(str))

X_drop_cols50 = DATASET_drop_cols50.drop(columns=['Rabdomiólise'])
scale_obj = preprocessing.StandardScaler()
X_drop_cols50 = scale_obj.fit_transform(X_drop_cols50)

y_drop_cols50 = DATASET_drop_cols50['Rabdomiólise']

X_drop_cols50_Train, X_drop_cols50_Test, y_drop_cols50_Train, y_drop_cols50_Test = train_test_split(X_drop_cols50,y_drop_cols50,test_size=TEST_SIZE)


DATASET_drop_cols50

Unnamed: 0,Idade,Peso Kg,Cor da pele,ACTN3,AGT,ECA,BDKRB2,Score Força%,Score Resistência %,Pré CK (U/L),...,Pós BASO %,Lesão Renal Aguda,Rabdomiólise,massa magra,MCT KG,ESTATURA,vo2max estimado,Handgrip Dom.,Handgrip Cont.,% gordura
0,33.0,88.0,1.0,3,1,1,6,25.0,75.0,337.0,...,0.1,0.0,False,75.52,84.30,181.5,50.85,35.0,37.0,10.05
1,26.0,74.0,1.0,0,1,2,3,50.0,50.0,687.0,...,0.2,0.0,True,69.49,75.90,171.0,51.98,42.0,43.0,6.08
2,26.0,76.0,1.0,1,0,1,2,50.0,50.0,841.0,...,0.3,0.0,True,72.07,79.20,174.0,52.42,49.5,49.5,8.72
4,33.0,82.0,1.0,0,1,2,3,50.0,50.0,192.0,...,0.1,0.0,True,66.26,79.60,182.0,47.64,51.0,49.0,14.84
5,31.0,92.0,1.0,3,1,1,6,100.0,0.0,437.0,...,0.3,0.0,True,73.99,90.20,184.5,47.05,54.0,47.0,16.90
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
338,28.0,87.0,1.0,3,2,0,3,62.5,37.5,105.0,...,3.1,0.0,True,74.20,83.60,175.0,50.56,54.0,42.0,8.43
339,27.0,78.5,1.0,0,0,0,6,50.0,50.0,321.0,...,0.2,0.0,True,73.39,85.65,177.0,51.46,48.0,52.0,10.92
340,25.0,85.1,1.0,3,2,1,6,37.5,62.5,233.0,...,0.4,0.0,False,72.78,80.00,177.5,0.00,42.0,42.0,7.58
342,28.0,85.0,1.0,0,1,1,6,50.0,50.0,228.0,...,0.4,0.0,True,53.70,62.35,169.0,52.58,44.0,38.0,9.73


CASO 3 - fill_mean + transforma texto/categorias em números

In [75]:
DATASET_encoded = DATASET_fill_mean.copy()


label_encoder = preprocessing.LabelEncoder()
for coluna in DATASET_encoded.select_dtypes(include=['object', 'category']).columns:
    DATASET_encoded[coluna] = label_encoder.fit_transform(DATASET_encoded[coluna].astype(str))


X_encoded = DATASET_encoded.drop(columns=['Rabdomiólise'])
y_encoded = DATASET_encoded['Rabdomiólise']


X_encoded_Train, X_encoded_Test, y_encoded_Train, y_encoded_Test = train_test_split(
    X_encoded, y_encoded, test_size=TEST_SIZE)


scaler = preprocessing.StandardScaler()
X_encoded_Train = scaler.fit_transform(X_encoded_Train)
X_encoded_Test = scaler.transform(X_encoded_Test)

DATASET_encoded

Unnamed: 0,Idade,Peso Kg,Cor da pele,ACTN3,AGT,ECA,BDKRB2,Score Força%,Score Resistência %,Pré CK (U/L),...,Pós BASO %,Lesão Renal Aguda,Rabdomiólise,massa magra,MCT KG,ESTATURA,vo2max estimado,Handgrip Dom.,Handgrip Cont.,% gordura
0,33.0,88.000000,1.0,4,1,1,6,25.000000,75.000000,337.0,...,0.100000,0.0,False,75.520000,84.3,181.5,50.850000,35.0,37.0,10.050000
1,26.0,74.000000,1.0,1,1,2,3,50.000000,50.000000,687.0,...,0.200000,0.0,True,69.490000,75.9,171.0,51.980000,42.0,43.0,6.080000
2,26.0,76.000000,1.0,2,0,1,2,50.000000,50.000000,841.0,...,0.300000,0.0,True,72.070000,79.2,174.0,52.420000,49.5,49.5,8.720000
3,25.0,84.000000,1.0,1,2,0,3,56.945274,43.077114,363.0,...,0.900000,0.0,True,75.260000,84.3,178.0,50.680000,60.0,57.0,10.610000
4,33.0,82.000000,1.0,1,1,2,3,50.000000,50.000000,192.0,...,0.100000,0.0,True,66.260000,79.6,182.0,47.640000,51.0,49.0,14.840000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
462,29.0,72.469128,2.0,1,1,1,3,56.945274,43.077114,231.0,...,0.368832,0.0,False,69.775015,82.8,182.0,47.478563,56.0,59.0,10.682962
463,27.0,72.469128,2.0,1,1,1,3,56.945274,43.077114,649.0,...,0.368832,0.0,False,69.775015,75.1,170.0,47.478563,48.0,60.0,10.682962
464,28.0,72.469128,2.0,1,1,1,3,56.945274,43.077114,274.0,...,0.368832,0.0,False,69.775015,82.7,179.5,47.478563,58.0,52.0,10.682962
465,29.0,72.469128,2.0,1,1,1,3,56.945274,43.077114,176.0,...,0.368832,0.0,False,69.775015,80.5,181.0,47.478563,60.0,50.0,10.682962


**TESTANDO AS IAS SEM PCA**


PARA DATASET COM DROP NAS LINHAS COM NAN


In [63]:
results_drop_cols50 = get_tests_result(X_drop_cols50_Train, X_drop_cols50_Test, y_drop_cols50_Train, y_drop_cols50_Test)
saving_results(results_drop_cols50,
              path + "ResultadosTestesPIBITI/TesteSemPCA/",
              "resultados_rabdomiolise_dropcols.csv")

PARA ENCODED DATASET

In [64]:
results_encoded = get_tests_result(X_encoded_Train, X_encoded_Test, y_encoded_Train, y_encoded_Test)
saving_results(results_encoded,
              path + "ResultadosTestesPIBITI/TesteSemPCA/",
              "resultados_rabdomiolise_encoded.csv")

**APLICANDO PCA NOS DADOS**

In [72]:
NUMBER_COMPONENTS = X_encoded_Train.shape[1]

import copy
# Guarda cópias dos dados originais
X_drop_cols50_Train_original = copy.deepcopy(X_drop_cols50_Train)
X_drop_cols50_Test_original = copy.deepcopy(X_drop_cols50_Test)
X_encoded_Train_original = copy.deepcopy(X_encoded_Train)
X_encoded_Test_original = copy.deepcopy(X_encoded_Test)


metric_column = "PRECISION"  #"ACCURACY", "PRECISION", "RECALL", "F1_SCORE"

# Dicionários para armazenar os melhores resultados por algoritmo
best_dropcols_results = {}  # Formato: {algoritmo: {"score": valor, "components": n, "results": dataframe_row}}
best_encoded_results = {}   # Mesmo formato

# Verifica e imprime os shapes originais
print("Shapes originais:")
print(f"X_drop_cols50_Train: {X_drop_cols50_Train_original.shape}")
print(f"X_encoded_Train: {X_encoded_Train_original.shape}")

# Determina o número máximo de componentes possíveis para cada conjunto de dados
max_components_dropcols = min(X_drop_cols50_Train_original.shape[0], X_drop_cols50_Train_original.shape[1])
max_components_encoded = min(X_encoded_Train_original.shape[0], X_encoded_Train_original.shape[1])
print(f"Máximo de componentes possíveis para dropcols: {max_components_dropcols}")
print(f"Máximo de componentes possíveis para encoded: {max_components_encoded}")


for i in range(1, NUMBER_COMPONENTS):
    print(f"\n===== Processando com {i} componentes =====")

    try:
        # Para X_drop_cols50
        if i <= max_components_dropcols:

            pca_drop = PCA(n_components=i)
            X_drop_cols50_Train_pca = pca_drop.fit_transform(X_drop_cols50_Train_original)
            X_drop_cols50_Test_pca = pca_drop.transform(X_drop_cols50_Test_original)

            print(f"PCA aplicado com sucesso em X_drop_cols50: resultado shape = {X_drop_cols50_Train_pca.shape}")


            results_dropcols_df = get_tests_result(X_drop_cols50_Train_pca, X_drop_cols50_Test_pca,
                                                 y_drop_cols50_Train, y_drop_cols50_Test)


            saving_results(results_dropcols_df,
                          path + f"ResultadosTestesPIBITI/TesteComPCA/DROPCOLS/NUMBER_COMPONENTES_{i}_",
                          "resultados_rabdomiolise_dropcols.csv")


            for idx, row in results_dropcols_df.iterrows():
                algoritmo = row["ALGORITHM"]
                current_score = row[metric_column]


                if algoritmo not in best_dropcols_results or current_score > best_dropcols_results[algoritmo]["score"]:
                    best_dropcols_results[algoritmo] = {
                        "score": current_score,
                        "components": i,
                        "results": row.to_dict()
                    }
                    print(f"Novo melhor resultado para dropcols/{algoritmo}: {metric_column}={current_score} com {i} componentes")
        else:
            print(f"Pulando PCA para X_drop_cols50: {i} componentes > {max_components_dropcols}")

        # Para X_encoded
        if i <= max_components_encoded:

            pca_encoded = PCA(n_components=i)
            X_encoded_Train_pca = pca_encoded.fit_transform(X_encoded_Train_original)
            X_encoded_Test_pca = pca_encoded.transform(X_encoded_Test_original)

            print(f"PCA aplicado com sucesso em X_encoded: resultado shape = {X_encoded_Train_pca.shape}")


            results_encoded_df = get_tests_result(X_encoded_Train_pca, X_encoded_Test_pca,
                                                y_encoded_Train, y_encoded_Test)


            saving_results(results_encoded_df,
                          path + f"ResultadosTestesPIBITI/TesteComPCA/ENCODED/NUMBER_COMPONENTES_{i}_",
                          "resultados_rabdomiolise_encoded.csv")


            for idx, row in results_encoded_df.iterrows():
                algoritmo = row["ALGORITHM"]
                current_score = row[metric_column]

                # Se não temos o algoritmo ainda ou este score é melhor que o anterior
                if algoritmo not in best_encoded_results or current_score > best_encoded_results[algoritmo]["score"]:
                    best_encoded_results[algoritmo] = {
                        "score": current_score,
                        "components": i,
                        "results": row.to_dict()  # Armazena a linha completa como dicionário
                    }
                    print(f"Novo melhor resultado para encoded/{algoritmo}: {metric_column}={current_score} com {i} componentes")
        else:
            print(f"Pulando PCA para X_encoded: {i} componentes > {max_components_encoded}")

    except Exception as e:
        print(f"Erro na iteração {i}: {e}")


print("\n===== Salvando os melhores resultados por algoritmo =====")


best_dropcols_rows = []
best_encoded_rows = []

# Processa os melhores resultados para dropcols
for algoritmo, info in best_dropcols_results.items():
    results_dict = info["results"]
    results_dict["N_COMPONENTS"] = info["components"]
    best_dropcols_rows.append(results_dict)
    print(f"Melhor para dropcols/{algoritmo}: {metric_column}={info['score']} com {info['components']} componentes")

# Processa os melhores resultados para encoded
for algoritmo, info in best_encoded_results.items():
    results_dict = info["results"]
    results_dict["N_COMPONENTS"] = info["components"]
    best_encoded_rows.append(results_dict)
    print(f"Melhor para encoded/{algoritmo}: {metric_column}={info['score']} com {info['components']} componentes")

# Converte para DataFrames
if best_dropcols_rows:
    best_dropcols_df = pd.DataFrame(best_dropcols_rows)
    saving_results(best_dropcols_df,
                  path + "ResultadosTestesPIBITI/TesteComPCA/",
                  "MELHORES_POR_ALGORITMO_dropcols.csv")
    print("Arquivo de melhores resultados para dropcols salvo com sucesso!")
else:
    print("Nenhum resultado válido encontrado para dropcols")

if best_encoded_rows:
    best_encoded_df = pd.DataFrame(best_encoded_rows)
    saving_results(best_encoded_df,
                  path + "ResultadosTestesPIBITI/TesteComPCA/",
                  "MELHORES_POR_ALGORITMO_encoded.csv")
    print("Arquivo de melhores resultados para encoded salvo com sucesso!")
else:
    print("Nenhum resultado válido encontrado para encoded")

Shapes originais:
X_drop_cols50_Train: (211, 92)
X_encoded_Train: (373, 92)
Máximo de componentes possíveis para dropcols: 92
Máximo de componentes possíveis para encoded: 92

===== Processando com 1 componentes =====
PCA aplicado com sucesso em X_drop_cols50: resultado shape = (211, 1)
Novo melhor resultado para dropcols/MLPClassifier: : PRECISION=0.6981132075471698 com 1 componentes
Novo melhor resultado para dropcols/AdaBoostClassifier: : PRECISION=0.6683599419448475 com 1 componentes
Novo melhor resultado para dropcols/Voting_LR3_SVC1_ETC2: : PRECISION=0.6964775979905369 com 1 componentes
Novo melhor resultado para dropcols/DecisionTreeClassifier: : PRECISION=0.6577942497753819 com 1 componentes
Novo melhor resultado para dropcols/RandomForestClassifier: : PRECISION=0.6577942497753819 com 1 componentes
Novo melhor resultado para dropcols/ExtraTreesClassifier: : PRECISION=0.6961477987421383 com 1 componentes
Novo melhor resultado para dropcols/LogisticRegression: : PRECISION=0.79526

### Teste 1 -  'Pré CK (U/L)', 'Pós CK (U/L)', "Rabdomiólise"

In [None]:
col_label_rabdo = ["Rabdomiólise"]
col_label_lesao = ["Lesão Renal Aguda"]
col_feature = ['Pré CK (U/L)', 'Pós CK (U/L)']
col_to_be_encoded = ["Rabdomiólise", "Lesão Renal Aguda"]


col_df = ['Pré CK (U/L)', 'Pós CK (U/L)', "Rabdomiólise", "Lesão Renal Aguda"]


df = DATASET[col_df].dropna()
encoder = preprocessing.LabelEncoder()

for c in col_df:
    df = df.drop(df[df[c] == '-'].index)


for c in col_to_be_encoded:
    df[c] = encoder.fit_transform(df[c])


for d in col_feature:
    df[d] = df[d].astype(float)

df

Unnamed: 0,Pré CK (U/L),Pós CK (U/L),Rabdomiólise,Lesão Renal Aguda
0,337.0,664.0,0,0
1,687.0,2246.0,1,0
2,841.0,4616.0,1,0
3,363.0,2611.0,1,0
4,192.0,1111.0,1,0
...,...,...,...,...
423,223.0,1570.0,1,0
424,340.0,11324.0,1,1
425,312.0,4688.0,1,0
426,109.0,6710.0,1,0


Prevendo Rabdomiólise

Prevendo Lesão Renal Aguda

Salvando os resultados

# Teste 2 - Pré CK MB e Pós CK MB

In [None]:
col_label = ["Rabdomiólise"]
label_lesao = ["Lesão Renal Aguda"]
col_feature = ['Pré CK MB U/L', 'Pós CK MB U/L']
col_to_be_encoded = ["Rabdomiólise"]

col_df = ['Pré CK MB U/L', 'Pós CK MB U/L', "Rabdomiólise" ]

df = DATASET[col_df].dropna()
encoder = preprocessing.LabelEncoder()
for c in col_df:
    df = df.drop(df[df[c] == '-'].index)

for c in col_to_be_encoded:
    df[c] = encoder.fit_transform(df[c])

# convert to float
for d in col_feature:
    df[d] = df[d].astype(float)

df

Unnamed: 0,Pré CK MB U/L,Pós CK MB U/L,Rabdomiólise
0,3.0,16.0,0
1,6.0,38.0,1
2,43.0,71.0,1
3,5.0,53.0,1
4,3.0,15.0,1
...,...,...,...
339,20.0,153.0,1
340,20.0,45.0,0
341,18.0,57.0,0
342,19.0,76.0,1


In [None]:
X_train, X_test, y_train, y_test = train_test_split( df[col_feature],
                                                    df[col_label],
                                                    train_size=0.80, random_state=28)

results = get_tests_result(X_train, X_test, y_train, y_test)
results

Unnamed: 0,ALGORITHM,TIME,ACCURACY,PRECISION,RECALL,F1_SCORE
0,MLPClassifier:,0.107004,0.594203,0.616937,0.594203,0.569521
1,AdaBoostClassifier:,0.150343,0.57971,0.610306,0.57971,0.543088
2,Voting_LR3_SVC1_ETC2:,0.386746,0.637681,0.639187,0.637681,0.637224
3,DecisionTreeClassifier:,0.004338,0.608696,0.63135,0.608696,0.587849
4,RandomForestClassifier:,0.087877,0.594203,0.616937,0.594203,0.569521
5,ExtraTreesClassifier:,0.200832,0.623188,0.666256,0.623188,0.5939
6,LogisticRegression:,0.009489,0.623188,0.623188,0.623188,0.623188
7,svm:,0.019978,0.652174,0.674858,0.652174,0.642279
8,Stacking_scikit:,2.685797,0.623188,0.625501,0.623188,0.622237
9,OvR_RF:,0.156736,0.594203,0.616937,0.594203,0.569521


# Teste 3 - 'ACTN3',  'AGT', 'ECA', 'BDKRB2', 'Score Força%', 'Score Resistência %,

In [None]:
col_label = ["Rabdomiólise"]
label_lesao = ["Lesão Renal Aguda"]
col_feature = ['ACTN3', 'AGT', 'ECA', 'BDKRB2', 'Score Força%', 'Score Resistência %']
col_to_be_encoded = ['ACTN3', 'AGT', 'ECA', 'BDKRB2', "Rabdomiólise"]
col_to_float = ['Score Força%', 'Score Resistência %']

col_df = ['ACTN3', 'AGT', 'ECA', 'BDKRB2', 'Score Força%', 'Score Resistência %', "Rabdomiólise" ]

df = DATASET[col_df].dropna()
encoder = preprocessing.LabelEncoder()
for c in col_df:
    df = df.drop(df[df[c] == '-'].index)

for c in col_to_be_encoded:
    df[c] = encoder.fit_transform(df[c])

# convert to float
for d in col_to_float:
    df[d] = df[d].astype(float)

df

Unnamed: 0,ACTN3,AGT,ECA,BDKRB2,Score Força%,Score Resistência %,Rabdomiólise
0,3,1,1,6,25.0,75.0,0
1,0,1,2,3,50.0,50.0,1
2,1,0,1,2,50.0,50.0,1
4,0,1,2,3,50.0,50.0,1
5,3,1,1,6,100.0,0.0,1
...,...,...,...,...,...,...,...
338,3,2,0,3,62.5,37.5,1
339,0,0,0,6,50.0,50.0,1
340,3,2,1,6,37.5,62.5,0
342,0,1,1,6,50.0,50.0,1


In [None]:
X_train, X_test, y_train, y_test = train_test_split( df[col_feature],
                                                    df[col_label],
                                                    train_size=0.80, random_state=28)

results = get_tests_result(X_train, X_test, y_train, y_test)
results

Unnamed: 0,ALGORITHM,TIME,ACCURACY,PRECISION,RECALL,F1_SCORE
0,MLPClassifier:,1.102627,0.481481,0.460637,0.481481,0.465468
1,AdaBoostClassifier:,0.205284,0.5,0.456349,0.5,0.4591
2,Voting_LR3_SVC1_ETC2:,0.585839,0.444444,0.371153,0.444444,0.39145
3,DecisionTreeClassifier:,0.005799,0.444444,0.460077,0.444444,0.446734
4,RandomForestClassifier:,0.084891,0.425926,0.40535,0.425926,0.412342
5,ExtraTreesClassifier:,0.132086,0.444444,0.460077,0.444444,0.446734
6,LogisticRegression:,0.013518,0.5,0.418478,0.5,0.427734
7,svm:,0.013033,0.574074,0.329561,0.574074,0.418736
8,Stacking_scikit:,1.582079,0.574074,0.329561,0.574074,0.418736
9,OvR_RF:,0.119575,0.425926,0.40535,0.425926,0.412342


# Teste 4 - 'Idade', 'Peso Kg', 'Cor da pele',  'Score Força%', 'Score Resistência %'

In [None]:
col_label = ["Rabdomiólise"]
label_lesao = ["Lesão Renal Aguda"]
col_feature = ['Idade', 'Peso Kg', 'Cor da pele', 'Score Força%', 'Score Resistência %']
col_to_be_encoded = ["Cor da pele", "Rabdomiólise"]
col_to_float = ['Idade', 'Peso Kg', 'Score Força%', 'Score Resistência %']

col_df = ['Idade', 'Peso Kg', 'Cor da pele', 'Score Força%', 'Score Resistência %', "Rabdomiólise" ]

df = DATASET[col_df].dropna()
encoder = preprocessing.LabelEncoder()
for c in col_df:
    df = df.drop(df[df[c] == '-'].index)

for c in col_to_be_encoded:
    df[c] = encoder.fit_transform(df[c])

# convert to float
for d in col_to_float:
    df[d] = df[d].astype(float)

df

Unnamed: 0,Idade,Peso Kg,Cor da pele,Score Força%,Score Resistência %,Rabdomiólise
0,33.0,88.0,1,25.0,75.0,0
1,26.0,74.0,1,50.0,50.0,1
2,26.0,76.0,1,50.0,50.0,1
4,33.0,82.0,1,50.0,50.0,1
5,31.0,92.0,1,100.0,0.0,1
...,...,...,...,...,...,...
338,28.0,87.0,1,62.5,37.5,1
339,27.0,78.5,1,50.0,50.0,1
340,25.0,85.1,1,37.5,62.5,0
342,28.0,85.0,1,50.0,50.0,1


In [None]:
X_train, X_test, y_train, y_test = train_test_split( df[col_feature],
                                                    df[col_label],
                                                    train_size=0.80, random_state=28)

results = get_tests_result(X_train, X_test, y_train, y_test)
results

Unnamed: 0,ALGORITHM,TIME,ACCURACY,PRECISION,RECALL,F1_SCORE
0,MLPClassifier:,0.204043,0.5,0.467857,0.5,0.470879
1,AdaBoostClassifier:,0.134847,0.518519,0.485859,0.518519,0.485082
2,Voting_LR3_SVC1_ETC2:,0.40188,0.425926,0.40535,0.425926,0.412342
3,DecisionTreeClassifier:,0.004088,0.351852,0.36795,0.351852,0.349851
4,RandomForestClassifier:,0.091678,0.407407,0.413436,0.407407,0.409877
5,ExtraTreesClassifier:,0.13619,0.407407,0.418381,0.407407,0.410677
6,LogisticRegression:,0.009598,0.555556,0.512327,0.555556,0.482621
7,svm:,0.010687,0.574074,0.329561,0.574074,0.418736
8,Stacking_scikit:,1.66765,0.574074,0.329561,0.574074,0.418736
9,OvR_RF:,0.120142,0.407407,0.413436,0.407407,0.409877


# Teste 5 - 'Pré Lactato (mmol/L)', 'Pós Lactato (mmol/L)'

In [None]:
col_label = ["Rabdomiólise"]
label_lesao = ["Lesão Renal Aguda"]
col_feature = ['Pré Lactato (mmol/L)', 'Pós Lactato (mmol/L)']
col_to_be_encoded = ["Rabdomiólise"]
col_to_float = ['Pré Lactato (mmol/L)', 'Pós Lactato (mmol/L)']

col_df = ['Pré Lactato (mmol/L)', 'Pós Lactato (mmol/L)', "Rabdomiólise" ]

df = DATASET[col_df].dropna()
encoder = preprocessing.LabelEncoder()
for c in col_df:
    df = df.drop(df[df[c] == '-'].index)

for c in col_to_be_encoded:
    df[c] = encoder.fit_transform(df[c])

# convert to float
for d in col_to_float:
    df[d] = df[d].astype(float)

df

Unnamed: 0,Pré Lactato (mmol/L),Pós Lactato (mmol/L),Rabdomiólise
0,0.0,0.0,0
1,0.0,0.0,1
2,0.0,0.0,1
3,0.0,0.0,1
4,0.0,0.0,1
...,...,...,...
423,1.7,2.2,1
424,2.1,2.2,1
425,2.6,1.8,1
426,2.5,1.9,1


In [None]:
X_train, X_test, y_train, y_test = train_test_split( df[col_feature],
                                                    df[col_label],
                                                    train_size=0.80, random_state=28)

results = get_tests_result(X_train, X_test, y_train, y_test)
results

Unnamed: 0,ALGORITHM,TIME,ACCURACY,PRECISION,RECALL,F1_SCORE
0,MLPClassifier:,0.225249,0.670886,0.704481,0.670886,0.61106
1,AdaBoostClassifier:,0.137217,0.64557,0.776149,0.64557,0.539644
2,Voting_LR3_SVC1_ETC2:,0.405973,0.64557,0.668475,0.64557,0.569087
3,DecisionTreeClassifier:,0.004062,0.607595,0.584665,0.607595,0.578942
4,RandomForestClassifier:,0.089522,0.620253,0.598523,0.620253,0.581335
5,ExtraTreesClassifier:,0.131993,0.607595,0.579455,0.607595,0.562991
6,LogisticRegression:,0.006137,0.607595,0.369172,0.607595,0.459284
7,svm:,0.014912,0.64557,0.776149,0.64557,0.539644
8,Stacking_scikit:,1.578716,0.670886,0.786521,0.670886,0.587109
9,OvR_RF:,0.112536,0.620253,0.598523,0.620253,0.581335


# Teste 6 - 'Pré LDH (U/L)',  'Pós LDH (U/L)', 'Pré AST (U/L)', 'Pós AST (U/L)',
       'Pré Albumina (g/dL)', 'Pós Albumina (g/dL)', 'Pré TGP (g/dL)',
       'Pós TGP (g/dL)', 'Pré GGT U/L', 'Pós GGT U/L', 'Pré Ac. Úrico (mg/dL)',
       'Pós Ac. Úrico (mg/dL)', 'Pré TFG   (ml/min/1.73 m2)',
       'Pós TFG   (ml/min/1.73 m2)', 'Pré Ureia mg/dL', 'Pós Ureia mg/dL',
       'Pré creatinina mg/dL', 'Pós creatinina mg/dL',
       'Delta CREATININA mg/dL', 'Pré Fósforo mg/dL', 'Pós Fósforo mg/dL',
       'Pré Proteína Total (g/dL)', 'Pós Proteína Total (g/dL)',
       'Pré FE (ug/dL)', 'Pós FE (ug/dL)', 'Pré calcio mg/dL',
       'Pós calcio mg/dL', 'Pré Potássio mmol/L', 'Pós Potássio mmol/L',
       'Glicose pré', 'Glicose pós', 'Pré Sódio mmol/L', 'Pós Sódio mmol/L',
       'Pré Magnésio mg/dL', 'Pós Magnésio mg/dL', 'Pré Cloro mmol/L',
       'Pós Cloro mmol/L'

In [None]:
col_label = ["Rabdomiólise"]
label_lesao = ["Lesão Renal Aguda"]
col_feature = ['Pré LDH (U/L)', 'Pós LDH (U/L)', 'Pré AST (U/L)', 'Pós AST (U/L)',
       'Pré Albumina (g/dL)', 'Pós Albumina (g/dL)', 'Pré TGP (g/dL)',
       'Pós TGP (g/dL)', 'Pré GGT U/L', 'Pós GGT U/L', 'Pré Ac. Úrico (mg/dL)',
       'Pós Ac. Úrico (mg/dL)', 'Pré TFG   (ml/min/1.73 m2)',
       'Pós TFG   (ml/min/1.73 m2)', 'Pré Ureia mg/dL', 'Pós Ureia mg/dL',
       'Pré creatinina mg/dL', 'Pós creatinina mg/dL',
       'Delta CREATININA mg/dL', 'Pré Fósforo mg/dL', 'Pós Fósforo mg/dL',
       'Pré Proteína Total (g/dL)', 'Pós Proteína Total (g/dL)',
       'Pré FE (ug/dL)', 'Pós FE (ug/dL)', 'Pré calcio mg/dL',
       'Pós calcio mg/dL', 'Pré Potássio mmol/L', 'Pós Potássio mmol/L',
       'Glicose pré', 'Glicose pós', 'Pré Sódio mmol/L', 'Pós Sódio mmol/L',
       'Pré Magnésio mg/dL', 'Pós Magnésio mg/dL', 'Pré Cloro mmol/L',
       'Pós Cloro mmol/L']
col_to_be_encoded = ["Rabdomiólise"]
col_to_float = ['Pré LDH (U/L)', 'Pós LDH (U/L)', 'Pré AST (U/L)', 'Pós AST (U/L)',
                'Pré Albumina (g/dL)',
                'Pós Albumina (g/dL)', 'Pré Ac. Úrico (mg/dL)',
       'Pós Ac. Úrico (mg/dL)', 'Pré TFG   (ml/min/1.73 m2)', 'Pós TFG   (ml/min/1.73 m2)',
                'Pré Ureia mg/dL', 'Pós Ureia mg/dL', 'Pré creatinina mg/dL', 'Pós creatinina mg/dL',
               'Delta CREATININA mg/dL', 'Pré Fósforo mg/dL', 'Pós Fósforo mg/dL',
                'Pré Proteína Total (g/dL)', 'Pós Proteína Total (g/dL)',
                'Pré FE (ug/dL)', 'Pós FE (ug/dL)', 'Pré calcio mg/dL',
       'Pós calcio mg/dL', 'Pré Potássio mmol/L', 'Pós Potássio mmol/L',
                'Pré Magnésio mg/dL', 'Pós Magnésio mg/dL', 'Pré Cloro mmol/L',
       'Pós Cloro mmol/L']

col_df = ['Pré LDH (U/L)', 'Pós LDH (U/L)', 'Pré AST (U/L)', 'Pós AST (U/L)',
       'Pré Albumina (g/dL)', 'Pós Albumina (g/dL)', 'Pré TGP (g/dL)',
       'Pós TGP (g/dL)', 'Pré GGT U/L', 'Pós GGT U/L', 'Pré Ac. Úrico (mg/dL)',
       'Pós Ac. Úrico (mg/dL)', 'Pré TFG   (ml/min/1.73 m2)',
       'Pós TFG   (ml/min/1.73 m2)', 'Pré Ureia mg/dL', 'Pós Ureia mg/dL',
       'Pré creatinina mg/dL', 'Pós creatinina mg/dL',
       'Delta CREATININA mg/dL', 'Pré Fósforo mg/dL', 'Pós Fósforo mg/dL',
       'Pré Proteína Total (g/dL)', 'Pós Proteína Total (g/dL)',
       'Pré FE (ug/dL)', 'Pós FE (ug/dL)', 'Pré calcio mg/dL',
       'Pós calcio mg/dL', 'Pré Potássio mmol/L', 'Pós Potássio mmol/L',
       'Glicose pré', 'Glicose pós', 'Pré Sódio mmol/L', 'Pós Sódio mmol/L',
       'Pré Magnésio mg/dL', 'Pós Magnésio mg/dL', 'Pré Cloro mmol/L',
       'Pós Cloro mmol/L', "Rabdomiólise" ]

df = DATASET[col_df].dropna()
encoder = preprocessing.LabelEncoder()
for c in col_df:
    df = df.drop(df[df[c] == '-'].index)

for c in col_to_be_encoded:
    df[c] = encoder.fit_transform(df[c])

# convert to float
for d in col_to_float:
    df[d] = df[d].astype(float)

df

Unnamed: 0,Pré LDH (U/L),Pós LDH (U/L),Pré AST (U/L),Pós AST (U/L),Pré Albumina (g/dL),Pós Albumina (g/dL),Pré TGP (g/dL),Pós TGP (g/dL),Pré GGT U/L,Pós GGT U/L,...,Pós Potássio mmol/L,Glicose pré,Glicose pós,Pré Sódio mmol/L,Pós Sódio mmol/L,Pré Magnésio mg/dL,Pós Magnésio mg/dL,Pré Cloro mmol/L,Pós Cloro mmol/L,Rabdomiólise
0,629.0,796.0,33.0,48.0,4.6,4.0,37,63.0,16,17.0,...,4.6,103,110.0,145.0,146.0,2.0,2.0,103.0,106.0,0
1,720.0,1182.0,49.0,105.0,4.9,4.6,57,93.0,38,37.0,...,4.3,95,114.0,142.0,141.0,2.2,1.9,102.0,103.0,1
2,895.0,1531.0,57.0,145.0,4.9,4.8,58,113.0,26,23.0,...,4.5,99,98.0,145.0,146.0,2.2,2.2,105.0,100.0,1
3,349.0,666.0,46.0,138.0,4.8,4.1,42,100.0,13,10.0,...,4.6,94,144.0,144.0,143.0,2.1,2.0,106.0,108.0,1
4,643.0,1089.0,32.0,94.0,4.8,4.3,32,85.0,39,40.0,...,4.5,96,142.0,142.0,142.0,2.0,1.9,102.0,103.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
339,191.0,642.0,36.0,116.0,4.4,4.7,38,42.0,17,19.0,...,4.3,85,91.0,139.0,137.0,2.0,2.0,104.0,95.0,1
340,190.0,751.0,29.0,59.0,4.4,5.1,25,28.0,19,22.0,...,4.7,91,94.0,139.0,138.0,2.0,2.3,101.0,95.0,0
341,228.0,461.0,30.0,57.0,4.4,5.1,29,43.0,25,30.0,...,4.4,88,101.0,140.0,138.0,2.1,2.0,103.0,95.0,0
342,246.0,545.0,31.0,82.0,4.5,5.5,28,33.0,17,18.0,...,5.0,93,107.0,138.0,139.0,2.1,2.3,101.0,96.0,1


In [None]:
X_train, X_test, y_train, y_test = train_test_split( df[col_feature],
                                                    df[col_label],
                                                    train_size=0.80, random_state=28)

results = get_tests_result(X_train, X_test, y_train, y_test)
results

Unnamed: 0,ALGORITHM,TIME,ACCURACY,PRECISION,RECALL,F1_SCORE
0,MLPClassifier:,0.05672,0.492754,0.495351,0.492754,0.453656
1,AdaBoostClassifier:,0.233375,0.84058,0.855603,0.84058,0.839163
2,Voting_LR3_SVC1_ETC2:,0.482129,0.768116,0.772151,0.768116,0.76753
3,DecisionTreeClassifier:,0.010788,0.811594,0.818655,0.811594,0.8108
4,RandomForestClassifier:,0.10899,0.811594,0.818655,0.811594,0.8108
5,ExtraTreesClassifier:,0.150007,0.84058,0.848198,0.84058,0.839908
6,LogisticRegression:,0.026009,0.782609,0.782938,0.782609,0.782609
7,svm:,0.015891,0.826087,0.826087,0.826087,0.826087
8,Stacking_scikit:,3.036137,0.797101,0.806847,0.797101,0.795816
9,OvR_RF:,0.196375,0.811594,0.818655,0.811594,0.8108


# Teste 7 - Série vermelha -  'Pré RCB 10^6/mm³', 'Pós RCB 10^6/mm³', 'Pré HGB g/dL', 'Pós HGB g/dL',
       'Pré HCT %', 'Pós HCT %', 'Pré MCV fL', 'Pós MCV fL', 'Pré MCH pg',
       'Pós MCH pg', 'Pré MCHC g/dL', 'Pós MCHC g/dL', 'Pré PLT 10³/mm³',
       'Pós PLT 10³/mm³', 'Pré RDW %', 'Pós RDW %', 'Pré RET %', 'Pós RET %'

In [None]:
col_label = ["Rabdomiólise"]
label_lesao = ["Lesão Renal Aguda"]
col_feature = ['Pré RCB 10^6/mm³', 'Pós RCB 10^6/mm³', 'Pré HGB g/dL', 'Pós HGB g/dL',
       'Pré HCT %', 'Pós HCT %', 'Pré MCV fL', 'Pós MCV fL', 'Pré MCH pg',
       'Pós MCH pg', 'Pré MCHC g/dL', 'Pós MCHC g/dL', 'Pré PLT 10³/mm³',
       'Pós PLT 10³/mm³', 'Pré RDW %', 'Pós RDW %', 'Pré RET %', 'Pós RET %']
col_to_be_encoded = ["Rabdomiólise"]
col_to_float = ['Pré RCB 10^6/mm³', 'Pós RCB 10^6/mm³', 'Pré HGB g/dL', 'Pós HGB g/dL',
       'Pré HCT %', 'Pós HCT %', 'Pré MCV fL', 'Pós MCV fL', 'Pré MCH pg',
       'Pós MCH pg', 'Pré MCHC g/dL', 'Pós MCHC g/dL', 'Pré PLT 10³/mm³',
       'Pós PLT 10³/mm³', 'Pré RDW %', 'Pós RDW %', 'Pré RET %', 'Pós RET %']

col_df = ['Pré RCB 10^6/mm³', 'Pós RCB 10^6/mm³', 'Pré HGB g/dL', 'Pós HGB g/dL',
       'Pré HCT %', 'Pós HCT %', 'Pré MCV fL', 'Pós MCV fL', 'Pré MCH pg',
       'Pós MCH pg', 'Pré MCHC g/dL', 'Pós MCHC g/dL', 'Pré PLT 10³/mm³',
       'Pós PLT 10³/mm³', 'Pré RDW %', 'Pós RDW %', 'Pré RET %', 'Pós RET %', "Rabdomiólise" ]

df = DATASET[col_df].dropna()
encoder = preprocessing.LabelEncoder()
for c in col_df:
    df = df.drop(df[df[c] == '-'].index)

for c in col_to_be_encoded:
    df[c] = encoder.fit_transform(df[c])

# convert to float
for d in col_to_float:
    df[d] = df[d].astype(float)

df

Unnamed: 0,Pré RCB 10^6/mm³,Pós RCB 10^6/mm³,Pré HGB g/dL,Pós HGB g/dL,Pré HCT %,Pós HCT %,Pré MCV fL,Pós MCV fL,Pré MCH pg,Pós MCH pg,Pré MCHC g/dL,Pós MCHC g/dL,Pré PLT 10³/mm³,Pós PLT 10³/mm³,Pré RDW %,Pós RDW %,Pré RET %,Pós RET %,Rabdomiólise
0,4.38,4.37,13.5,13.3,40.9,39.4,93.4,90.2,30.8,30.4,33.0,33.8,299.0,289.0,12.8,12.2,1.83,1.52,0
1,0.00,4.64,0.0,13.9,0.0,40.6,0.0,87.5,0.0,30.0,0.0,34.2,268.0,304.0,0.0,13.4,0.00,1.47,1
2,0.00,4.77,0.0,14.5,0.0,42.7,0.0,89.5,0.0,30.4,0.0,34.0,0.0,268.0,0.0,12.1,0.00,1.20,1
3,5.22,4.52,14.9,13.1,47.7,38.7,91.4,85.6,28.5,29.0,31.2,33.9,242.0,246.0,12.9,12.4,0.00,1.11,1
4,4.37,4.06,13.3,12.5,41.9,37.5,95.9,92.4,30.4,30.8,31.7,33.3,192.0,259.0,11.7,12.1,0.00,2.27,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
339,4.58,4.48,14.4,16.2,44.2,48.4,96.4,95.4,31.4,32.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,1.56,1
340,4.49,5.61,13.7,11.8,42.2,36.5,93.8,93.7,30.4,30.3,0.0,0.0,0.0,0.0,0.0,0.0,0.00,1.10,0
341,4.61,4.70,14.0,12.5,42.7,37.9,92.5,92.0,30.3,30.3,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.96,0
342,4.64,3.24,13.8,12.3,41.7,37.3,89.8,89.6,29.8,29.5,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.57,1


In [None]:
X_train, X_test, y_train, y_test = train_test_split( df[col_feature],
                                                    df[col_label],
                                                    train_size=0.80, random_state=28)

results = get_tests_result(X_train, X_test, y_train, y_test)
results

Unnamed: 0,ALGORITHM,TIME,ACCURACY,PRECISION,RECALL,F1_SCORE
0,MLPClassifier:,0.834769,0.521739,0.524247,0.521739,0.488756
1,AdaBoostClassifier:,0.243388,0.681159,0.696457,0.681159,0.673601
2,Voting_LR3_SVC1_ETC2:,0.763454,0.666667,0.703509,0.666667,0.648909
3,DecisionTreeClassifier:,0.013717,0.594203,0.594178,0.594203,0.593691
4,RandomForestClassifier:,0.18105,0.695652,0.699213,0.695652,0.69372
5,ExtraTreesClassifier:,0.149877,0.681159,0.704538,0.681159,0.670328
6,LogisticRegression:,0.02753,0.57971,0.595271,0.57971,0.55732
7,svm:,0.015425,0.695652,0.756689,0.695652,0.674663
8,Stacking_scikit:,1.875046,0.710145,0.766221,0.710145,0.692515
9,OvR_RF:,0.139734,0.695652,0.699213,0.695652,0.69372


# Teste 8 - Série leucocitária - 'Pré WBC 10³/mm³', 'Pós WBC 10³/mm³', 'Pré NEUT %', 'Pós NEUT %',
       'Pré LINF %', 'Pós LINF %', 'Pré MONO %', 'Pós MONO %', 'Pré EOS %',
       'Pós EOS %', 'Pré BASO %', 'Pós BASO %'

In [None]:
col_label = ["Rabdomiólise"]
label_lesao = ["Lesão Renal Aguda"]
col_feature = ['Pré WBC 10³/mm³', 'Pós WBC 10³/mm³', 'Pré NEUT %', 'Pós NEUT %',
       'Pré LINF %', 'Pós LINF %', 'Pré MONO %', 'Pós MONO %', 'Pré EOS %',
       'Pós EOS %', 'Pré BASO %', 'Pós BASO %']
col_to_be_encoded = ["Rabdomiólise"]
col_to_float = ['Pré WBC 10³/mm³', 'Pós WBC 10³/mm³', 'Pré NEUT %', 'Pós NEUT %',
       'Pré LINF %', 'Pós LINF %', 'Pré MONO %', 'Pós MONO %', 'Pré EOS %',
       'Pós EOS %', 'Pré BASO %', 'Pós BASO %']

col_df = ['Pré WBC 10³/mm³', 'Pós WBC 10³/mm³', 'Pré NEUT %', 'Pós NEUT %',
       'Pré LINF %', 'Pós LINF %', 'Pré MONO %', 'Pós MONO %', 'Pré EOS %',
       'Pós EOS %', 'Pré BASO %', 'Pós BASO %', "Rabdomiólise" ]

df = DATASET[col_df].dropna()
encoder = preprocessing.LabelEncoder()
for c in col_df:
    df = df.drop(df[df[c] == '-'].index)

for c in col_to_be_encoded:
    df[c] = encoder.fit_transform(df[c])

# convert to float
for d in col_to_float:
    df[d] = df[d].astype(float)

df

Unnamed: 0,Pré WBC 10³/mm³,Pós WBC 10³/mm³,Pré NEUT %,Pós NEUT %,Pré LINF %,Pós LINF %,Pré MONO %,Pós MONO %,Pré EOS %,Pós EOS %,Pré BASO %,Pós BASO %,Rabdomiólise
0,7.38,8.17,49.1,72.6,39.0,21.1,8.3,5.3,3.3,0.9,0.3,0.1,0
1,0.00,9.38,0.0,72.0,0.0,12.5,0.0,15.0,0.0,0.3,0.0,0.2,1
2,0.00,7.48,0.0,82.8,0.0,11.2,0.0,5.6,0.0,0.1,0.0,0.3,1
3,4.91,5.79,52.0,67.8,34.6,21.6,11.2,6.2,1.2,3.5,1.0,0.9,1
4,6.31,10.56,79.1,82.6,15.2,10.9,4.1,5.5,1.1,0.9,0.5,0.1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
442,6.63,8.28,59.1,55.6,30.3,34.1,8.0,8.5,1.4,1.0,1.2,0.8,0
443,8.61,8.28,59.5,70.7,31.5,20.6,6.7,8.3,2.0,0.1,0.3,0.3,0
444,5.45,15.77,44.6,52.3,41.8,36.3,9.0,10.4,3.9,0.6,0.7,0.4,0
445,4.90,11.69,57.4,57.5,31.0,31.4,8.8,9.9,2.0,0.7,0.8,0.5,0


In [None]:
X_train, X_test, y_train, y_test = train_test_split( df[col_feature],
                                                    df[col_label],
                                                    train_size=0.80, random_state=28)

results = get_tests_result(X_train, X_test, y_train, y_test)
results

Unnamed: 0,ALGORITHM,TIME,ACCURACY,PRECISION,RECALL,F1_SCORE
0,MLPClassifier:,0.258394,0.686747,0.685893,0.686747,0.685919
1,AdaBoostClassifier:,0.156224,0.686747,0.685908,0.686747,0.68471
2,Voting_LR3_SVC1_ETC2:,0.514431,0.698795,0.712514,0.698795,0.686727
3,DecisionTreeClassifier:,0.007631,0.662651,0.661898,0.662651,0.658735
4,RandomForestClassifier:,0.117487,0.686747,0.685908,0.686747,0.68471
5,ExtraTreesClassifier:,0.157354,0.722892,0.72333,0.722892,0.720427
6,LogisticRegression:,0.024302,0.710843,0.710438,0.710843,0.708964
7,svm:,0.016432,0.590361,0.632415,0.590361,0.519485
8,Stacking_scikit:,1.900761,0.722892,0.733673,0.722892,0.714534
9,OvR_RF:,0.140574,0.686747,0.685908,0.686747,0.68471
