In [1]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import plotly.graph_objects as go



MODELS = {
    "LogisticRegression": (LogisticRegression(max_iter=1000), {
        "C": [0.1, 10, 100],
        "penalty": ["l1", "l2", None],
        "solver": ["lbfgs", "liblinear"]
    }),
    "KNeighbors": (KNeighborsClassifier(), {
        "n_neighbors": [3, 7, 11],
        "weights": ["uniform", "distance"]
    }),
    "XGBClassifier": (XGBClassifier(eval_metric='logloss'), {
        "n_estimators": [50, 100],
        "max_depth": [3, 7],
        "learning_rate": [0.01, 0.1]
    }),
    "RandomForest": (RandomForestClassifier(), {
        "n_estimators": [50, 100, 150],
        "max_depth": [3, 5, 7, None],
        "criterion": ["gini", "entropy", "log_loss"]
    })
}

In [2]:
original_df = pd.read_csv("../data/adults/preprocessed/original/adults_original.csv")
X_train, y_train = original_df.drop(columns=["income"]), original_df["income"]

In [3]:
results = []

for model_name, (model, param_grid) in MODELS.items():
    try:
        # GridSearch con validación cruzada para encontrar mejores hiperparámetros
        grid = GridSearchCV(model, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
        grid.fit(X_train, y_train)
        best_params = grid.best_params_
    except Exception as e:
        print(f"Error in {model_name} with params {param_grid}: {e}")
        continue
    # Entrenar 10 veces con diferentes splits para obtener rango de accuracies
    skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    for train_idx, test_idx in skf.split(X_train, y_train):
        X_tr, X_te = X_train.iloc[train_idx], X_train.iloc[test_idx]
        y_tr, y_te = y_train.iloc[train_idx], y_train.iloc[test_idx]
        
        clf = type(model)(**best_params)
        clf.fit(X_tr, y_tr)
        y_pred = clf.predict(X_te)
        acc = accuracy_score(y_te, y_pred)
        results.append({'model': model_name, 'score': acc})

results_df = pd.DataFrame(results)

30 fits failed out of a total of 90.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
15 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\naxet\AppData\Roaming\Python\Python313\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\naxet\AppData\Roaming\Python\Python313\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "C:\Users\naxet\AppData\Roaming\Python\Python313\site-packages\sklearn\linear_model\_logistic.py", line 1193, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual

In [None]:
results_df.to_csv("../data/adults/results/models/model_results_over_original.csv", index=False)

In [15]:

# Obtener los modelos únicos
modelos = results_df['model'].unique()

# Crear una figura nueva
fig_minmax = go.Figure()

# Añadir líneas para cada modelo
for modelo in modelos:
    scores = results_df[results_df['model'] == modelo]['score']
    min_score = scores.min()
    max_score = scores.max()
    fig_minmax.add_trace(go.Scatter(
        x=[min_score, max_score],
        y=[modelo, modelo],
        mode='lines+markers',
        name=modelo,
        line=dict(width=5),
        marker=dict(size=10)
    ))

fig_minmax.update_layout(
    # title="Rango de accuracy por modelo",
    xaxis_title="Accuracy",
    yaxis_title="",
    xaxis=dict(
        tickfont=dict(size=18),
        title_font=dict(size=18)
    ),
    yaxis=dict(
        categoryorder='array',
        categoryarray=list(modelos),
        showticklabels=False
    ),
    legend=dict(
        orientation='h',
        yanchor='bottom',
        y=-0.25,
        xanchor='center',
        x=0.5,
        font=dict(size=18),
        title=dict(text='Modelos', font=dict(size=18))
    ),
    margin=dict(l=0, r=0, t=0, b=0),
    width=800,
    height=400
)
fig_minmax.show()