In [27]:
import pandas as pd
import numpy as np
import sys
sys.path.insert(0, "../../utils/")
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler
from training_models.classification_models import ClassificationModels
from joblib import dump
import os

In [28]:
def min_samples(df):
    """Return the size of the smallest of the three target classes.

    Counts the rows of ``df`` whose 'target' equals 2, 1 and 0 and returns the
    minimum of those three counts. A label that does not occur contributes a
    count of 0, so the result is 0 whenever any of the three is missing.
    """
    labels = df['target']
    return min(int((labels == cls).sum()) for cls in (2, 1, 0))

In [29]:
def split(df_data, seed, test_size=0.2, stratify=False):
    """Split the data into train/validation sets and build undersampled copies.

    Args:
        df_data: DataFrame containing a 'target' column (needed by
            ``undersampling`` and, when requested, for stratification).
        seed: Random state used for the split and forwarded to
            ``undersampling``.
        test_size: Share (or absolute number) of rows held out for validation,
            forwarded to ``train_test_split``. Defaults to 0.2.
        stratify: When True, the split preserves the class proportions of
            ``df_data['target']``. Defaults to False, which keeps the plain
            random split.

    Returns:
        Tuple ``(train_data, val_data, train_data_under, val_data_under)``:
        the plain split followed by the frames returned by ``undersampling``.
    """
    strata = df_data['target'] if stratify else None
    train_data, val_data = train_test_split(
        df_data, test_size=test_size, random_state=seed, stratify=strata
    )
    train_data_under, val_data_under = undersampling(train_data, val_data, seed)
    return train_data, val_data, train_data_under, val_data_under

In [30]:
def undersampling(train_data, val_data, seed):
    """Randomly undersample the train and validation frames to equal class sizes.

    Each frame is processed independently: every class (0, 1, 2) is reduced to
    the size returned by ``min_samples`` for that frame, using a
    ``RandomUnderSampler`` seeded with ``seed``.

    Args:
        train_data: Training DataFrame with a 'target' column.
        val_data: Validation DataFrame with a 'target' column.
        seed: Random state given to both samplers.

    Returns:
        Tuple ``(df_resampled_train, df_resampled_val)``; each frame holds the
        feature columns followed by 'target'.
    """
    def _balance(frame):
        # Target size per class = size of the smallest class in this frame.
        per_class = min_samples(frame)
        sampler = RandomUnderSampler(
            sampling_strategy={0: per_class, 1: per_class, 2: per_class},
            random_state=seed,
        )

        # Separate features from labels before resampling.
        features = frame.drop('target', axis=1)
        labels = frame['target']
        X_resampled, y_resampled = sampler.fit_resample(features, labels)

        # Reassemble a single frame: features first, 'target' last.
        return pd.concat(
            [
                pd.DataFrame(X_resampled, columns=features.columns),
                pd.Series(y_resampled, name='target'),
            ],
            axis=1,
        )

    df_resampled_train = _balance(train_data)
    df_resampled_val = _balance(val_data)
    return df_resampled_train, df_resampled_val

In [31]:
def train(model_name, method_name, train_v, validation_v, iteration, repr_name, div, seed):
    """Fit one classifier on a train/validation pair, save it, and return its metrics.

    Args:
        model_name: Label printed in the progress message and used in the
            saved file name.
        method_name: Name of the ``ClassificationModels`` method to call on
            the wrapper before ``process_model``.
        train_v: Training DataFrame with a 'target' column.
        validation_v: Validation DataFrame with a 'target' column.
        iteration: Repetition index, used only in the saved file name.
        repr_name: Name of the numerical representation, used only in the
            saved file name.
        div: Label of the data variant (e.g. 'base', 'undersampling'), used
            only in the saved file name.
        seed: Seed of the split, used only in the saved file name.

    Returns:
        The wrapper's ``performances`` attribute after processing.
    """
    target_col = "target"
    # Plain arrays: features are every column except the target.
    X_fit, y_fit = train_v.drop(columns=target_col).values, train_v[target_col].values
    X_holdout, y_holdout = validation_v.drop(columns=target_col).values, validation_v[target_col].values

    print(f"Training model: {model_name}")
    wrapper = ClassificationModels(X_train=X_fit, X_val=X_holdout, y_train=y_fit, y_val=y_holdout)

    # Call the method selected by name, then process the model with kfold=True, k=5
    # (presumably 5-fold cross-validated training - confirm in ClassificationModels).
    configure = getattr(wrapper, method_name)
    configure()
    wrapper.process_model(kfold=True, k=5)

    # Persist the fitted estimator; the file name encodes the full run identity.
    output_path = f"../../models/{model_name}_{div}_{iteration}_{repr_name}_seed{seed}.joblib"
    dump(wrapper.model, output_path)

    return wrapper.performances

In [32]:
# Classifier display name -> name of the ClassificationModels method that
# train() looks up with getattr. Insertion order is the training order.
model_funcs = dict(
    AdaBoost="instance_ada_boost",
    RandomForest="instance_random_forest",
    SVM="instance_svc",
    GradientBoosting="instance_gradient_boosting",
    # Spelled exactly as it is resolved at runtime; do not "correct" it here
    # without renaming the method it refers to.
    LogisticRegression="instance_logistic_regresion",
    XGBoost="instance_xg_boost",
    Lgbm="instance_lgbm",
    KNN="instance_neighbors",
)

In [33]:
# Metric key -> display label used for the columns of the metrics table.
rename_map = dict(
    [
        ("f1_weighted", "F1-score"),
        ("recall_weighted", "Recall"),
        ("precision_weighted", "Precision"),
        ("accuracy", "Accuracy"),
    ]
)

In [34]:
# Numerical representation to evaluate; it selects the input CSV and is reused
# in the names of the saved artifacts.
repr_name = "one_hot"

# Load the encoded dataset and discard the column that is not used downstream.
df_data = pd.read_csv(
    f"../../data/numerical_rep/{repr_name}_antiviral_homology_90.csv"
).drop(columns=["experimental_characteristics"])

In [35]:
# Repeated hold-out experiment: 30 random splits, every model trained on both
# the plain split ('base') and its undersampled counterpart ('undersampling').
# Rows are accumulated across ALL iterations; the list must live outside the
# loop, otherwise each iteration would discard the previous results.
all_metrics = []
metrics_path = f"../../models/metrics_{repr_name}.csv"

for i in range(30):
    # Drawn from NumPy's global RNG, which is not seeded in this notebook, so
    # the sequence differs between runs (and values may repeat). The seed is
    # recorded in the file names and in the metrics table for traceability.
    seed = np.random.randint(0, 100)
    df_train, df_val, df_train_under, df_val_under = split(df_data, seed)
    variants = {
        "base": (df_train, df_val),
        "undersampling": (df_train_under, df_val_under),
    }

    for model_name, method_name in model_funcs.items():
        for div, (train_frame, val_frame) in variants.items():
            perf = train(model_name, method_name, train_frame, val_frame, i, repr_name, div, seed)

            # NOTE(review): assumes `perf` exposes "training_metrics" keyed by
            # the rename_map keys and "validation_metrics" keyed by the
            # rename_map display labels - confirm against ClassificationModels.
            train_metrics = {rename_map.get(k, k): v for k, v in perf["training_metrics"].items()}
            val_metrics = perf["validation_metrics"]

            metrics_row = {"iteration": i, "division": div, "model": model_name, "seed": seed}
            for label in rename_map.values():
                metrics_row[f"Train_{label}"] = round(train_metrics[label], 4)
                metrics_row[f"Val_{label}"] = round(val_metrics[label], 4)
            all_metrics.append(metrics_row)

    # Rewrite the cumulative table after every iteration so that completed
    # results survive a failure in a later iteration.
    df_metrics = pd.DataFrame(all_metrics)
    df_metrics.to_csv(metrics_path, index=False)

Training model: AdaBoost




Training model: AdaBoost




Training model: RandomForest
Training model: RandomForest
Training model: SVM
Training model: SVM
Training model: GradientBoosting
Training model: GradientBoosting
Training model: LogisticRegression
Training model: LogisticRegression


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Training model: XGBoost


AttributeError: 'super' object has no attribute '__sklearn_tags__'

In [None]:
# Root directory that holds the numerical-representation CSV files.
folder = "../../data/numerical_rep/"
# Placeholder list; nothing visible in this notebook appends to it yet.
names = list()

In [None]:
# Visit every CSV under `folder` (recursively), load it and drop the unused
# column. NOTE(review): each frame replaces the previous one and nothing is
# stored, so only the last file read remains in `df` after the loop.
for root, dirs, files in os.walk(folder):
    for archive in files:
        if not archive.endswith(".csv"):
            continue
        route = os.path.join(root, archive)

        df = pd.read_csv(route)
        df = df.drop(columns=["experimental_characteristics"])
            

In [None]:
def build_metrics_row(perf, iteration, model_name, seed, division=None, metric_names=None):
    """Flatten one model's performance report into a single metrics-table row.

    Args:
        perf: Mapping with a "training_metrics" dict and a
            "validation_metrics" dict. Training metrics are looked up by the
            keys of ``metric_names`` (after renaming); validation metrics are
            looked up directly by its display labels. Extra entries such as
            "Confusion Matrix" are ignored and ``perf`` is not modified.
        iteration: Repetition index stored in the row.
        model_name: Model label stored in the row.
        seed: Seed stored in the row.
        division: Optional label of the data variant (e.g. 'base' or
            'undersampling'); stored under "division" when given.
        metric_names: Mapping of metric key -> display label. Defaults to the
            notebook-level ``rename_map``.

    Returns:
        dict with "iteration", "model", "seed" (and "division" when given)
        plus ``Train_<label>`` / ``Val_<label>`` values rounded to 4 decimals.

    Raises:
        KeyError: If a metric named in ``metric_names`` is missing from either
            metrics dict.
    """
    if metric_names is None:
        metric_names = rename_map

    train_renamed = {metric_names.get(k, k): v for k, v in perf["training_metrics"].items()}
    val_metrics = perf["validation_metrics"]

    metrics_row = {"iteration": iteration, "model": model_name, "seed": seed}
    if division is not None:
        metrics_row["division"] = division
    for k in metric_names.values():
        metrics_row[f"Train_{k}"] = round(train_renamed[k], 4)
        metrics_row[f"Val_{k}"] = round(val_metrics[k], 4)
    return metrics_row