In [14]:
import pandas as pd
import numpy as np
import sys
sys.path.insert(0, "../../utils/")
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler
from training_models.classification_models import ClassificationModels
from joblib import dump

In [15]:
def n_samples(df):
    """Return the row count of the least-populated target class.

    Counts the rows of ``df`` whose ``'target'`` column equals 2, 1 and 0
    and returns the smallest of those three counts. A class with no rows
    contributes a count of 0, so the result is 0 in that case.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``'target'`` column.

    Returns
    -------
    int
        Minimum number of rows among classes 2, 1 and 0.
    """
    # One filtered length per class label, in the same order as the labels tuple.
    class_sizes = [len(df[df['target'] == label]) for label in (2, 1, 0)]
    return min(class_sizes)

In [16]:
# Tag of the numerical representation; used in the input CSV name here and
# in the metrics CSV name when results are saved.
name = "one_hot"

# Read the encoded dataset and discard the "experimental_characteristics"
# column in the same expression (no in-place mutation).
df_data = pd.read_csv(
    f"../../data/numerical_rep/{name}_antiviral_homology_90.csv"
).drop(columns=["experimental_characteristics"])

In [17]:
# Display name of each classifier -> name of the method that is looked up
# with getattr() on a ClassificationModels instance in the training loop.
# Insertion order is the training order.
# NOTE(review): "instance_logistic_regresion" is spelled with a single "s";
# presumably this matches the method name in ClassificationModels — confirm.
model_funcs = dict(
    AdaBoost="instance_ada_boost",
    RandomForest="instance_random_forest",
    SVM="instance_svc",
    GradientBoosting="instance_gradient_boosting",
    LogisticRegression="instance_logistic_regresion",
    XGBoost="instance_xg_boost",
    Lgbm="instance_lgbm",
    KNN="instance_neighbors",
)

In [18]:
# Training-metric key -> display name. The display names are also the keys
# used to index the validation metrics and to build the "Train_*" / "Val_*"
# column names in the training loop.
rename_map = dict(
    f1_weighted="F1-score",
    recall_weighted="Recall",
    precision_weighted="Precision",
    accuracy="Accuracy",
)

In [19]:
# Accumulator: one metrics dict per trained model is appended to this list.
all_metrics = []

# Random integer in [0, 100) drawn once per run. It is reused as random_state
# for the train/validation split and both undersamplers, and it is written to
# the "iteration" field of every metrics row.
i = np.random.randint(low=0, high=100)

In [20]:
# Hold out 20% of the rows as the validation split; the shuffle is seeded
# with the run's random integer `i`.
train_data, val_data = train_test_split(
    df_data,
    test_size=0.2,
    random_state=i,
)

In [21]:
# Size of the smallest target class within each split; used below as the
# per-class sample count for undersampling.
n_samples_train, n_samples_val = n_samples(train_data), n_samples(val_data)

In [22]:
# One undersampler per split. Each of the classes 0, 1 and 2 is requested at
# the same count: the size of that split's smallest class.
undersampler_train = RandomUnderSampler(
    sampling_strategy=dict.fromkeys((0, 1, 2), n_samples_train),
    random_state=i,
)
undersampler_val = RandomUnderSampler(
    sampling_strategy=dict.fromkeys((0, 1, 2), n_samples_val),
    random_state=i,
)

In [23]:
# Separate the feature columns (everything except 'target') from the label
# column, for both the training and the validation split.
X_train, y_train = train_data.drop(columns='target'), train_data['target']
X_val, y_val = val_data.drop(columns='target'), val_data['target']

In [24]:
# Undersample the training split, then put features and labels back into a
# single frame with the label in a 'target' column.
X_resampled, y_resampled = undersampler_train.fit_resample(X_train, y_train)
df_resampled_train = pd.concat(
    [
        pd.DataFrame(X_resampled, columns=X_train.columns),
        pd.Series(y_resampled, name='target'),
    ],
    axis=1,
)

# Same procedure for the validation split. X_resampled / y_resampled are
# reused, so after this cell they hold the validation versions.
X_resampled, y_resampled = undersampler_val.fit_resample(X_val, y_val)
df_resampled_val = pd.concat(
    [
        pd.DataFrame(X_resampled, columns=X_val.columns),
        pd.Series(y_resampled, name='target'),
    ],
    axis=1,
)

In [25]:
# Convert the balanced frames to plain NumPy arrays: feature matrix
# (all columns except 'target') and label vector, per split.
train_values = df_resampled_train.drop(columns=["target"]).values
train_response = df_resampled_train["target"].values

validation_values = df_resampled_val.drop(columns=["target"]).values
validation_response = df_resampled_val["target"].values

In [26]:
# Train every configured classifier on the balanced data, collect its
# metrics and persist the fitted model.
for model_name, method_name in model_funcs.items():
    print(f"Entrenando modelo: {model_name}")

    # A fresh wrapper per model, always fed the same train/validation arrays.
    clf_model = ClassificationModels(
        X_train=train_values,
        X_val=validation_values,
        y_train=train_response,
        y_val=validation_response,
    )

    # model_funcs stores the *name* of the method to call; resolve it on the
    # wrapper and invoke it with no arguments.
    build_model = getattr(clf_model, method_name)
    build_model()
    # NOTE(review): kfold=True, k=5 presumably requests 5-fold
    # cross-validation — confirm against ClassificationModels.process_model.
    clf_model.process_model(kfold=True, k=5)

    train_metrics = clf_model.performances["training_metrics"]
    val_metrics = clf_model.performances["validation_metrics"]
    # Remove the confusion-matrix entry if present (mutates the dict in place).
    val_metrics.pop("Confusion Matrix", None)
    # Only the training metrics are renamed; val_metrics is indexed directly
    # with the display names below, so it is presumably already keyed by
    # them — confirm against ClassificationModels.
    train_renamed = {
        rename_map.get(key, key): value
        for key, value in train_metrics.items()
    }

    # Store metrics: one row per model, rounded to 4 decimals.
    metrics_row = {"iteration": i, "model": model_name}
    for k in rename_map.values():
        metrics_row[f"Train_{k}"] = round(train_renamed[k], 4)
        metrics_row[f"Val_{k}"] = round(val_metrics[k], 4)
    all_metrics.append(metrics_row)

    # Store the fitted model. The file name depends only on the model name,
    # not on the encoding `name`.
    dump(clf_model.model, f"../../models/{model_name}.joblib")

# Store the final metrics table for this encoding.
df_metrics = pd.DataFrame(all_metrics)
df_metrics.to_csv(f"../../models/metrics_{name}.csv", index=False)

Entrenando modelo: AdaBoost


Entrenando modelo: RandomForest
Entrenando modelo: SVM
Entrenando modelo: GradientBoosting
Entrenando modelo: LogisticRegression


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Entrenando modelo: XGBoost
Entrenando modelo: Lgbm
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000228 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 498
[LightGBM] [Info] Number of data points in the train set: 388, number of used features: 35
[LightGBM] [Info] Start training from score -1.101193
[LightGBM] [Info] Start training from score -1.093471
[LightGBM] [Info] Start training from score -1.101193
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000232 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 506
[LightGBM] [Info] Number of data points in the train set: 389, number of used features: 35
[LightGBM] [Info] Start training from score -1.103767
[LightGBM] [Info] Start training from score -1.096045
[LightGBM] [Info] Start training from score

# Train a LinearSVC through the same wrapper on the same arrays.
# This rebinds `clf_model` and `df_metrics` from the loop cell.
clf_model = ClassificationModels(
    X_train=train_values,
    X_val=validation_values,
    y_train=train_response,
    y_val=validation_response,
)
clf_model.instance_linear_svc()
clf_model.process_model(kfold=True, k=5)

train = clf_model.performances["training_metrics"]
valid = clf_model.performances["validation_metrics"]
# Remove the confusion-matrix entry if present (mutates the dict in place).
valid.pop("Confusion Matrix", None)

# Rename the training metric keys to their display names; keys without a
# mapping are kept unchanged.
train_r = dict(
    (rename_map.get(key, key), value) for key, value in train.items()
)

# Two-column table: one row per metric name, columns "Training" / "Validation".
df_metrics = pd.DataFrame({"Training": train_r, "Validation": valid})