In [None]:
import pandas as pd
import numpy as np
import time, json, ast, os
from pathlib import Path

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor
from sklearn.linear_model import Ridge, Lasso
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import joblib
import warnings
warnings.filterwarnings("ignore")

# Root folder that holds both the input CSVs and the training artifacts.
# It defaults to the original local folder; set the WINE_QUALITY_PROJECT_DIR
# environment variable to run the notebook from another location without
# editing any path below.
PROJECT_ROOT = Path(os.environ.get(
    "WINE_QUALITY_PROJECT_DIR",
    r"C:\Users\david\Downloads\OneDrive - Asesoftware S.A.S\Pictures\Equinox AI\Plan semilla\Proyectos\Analisis de la calidad del vino",
))
OUTLIER_DIR = PROJECT_ROOT / "Tratamiento Outliers"

# Dataset label -> CSV path (kept as plain strings). The labels are reused as
# the "dataset" column of the results table and in the saved model filenames.
DATASETS = {
    "red_capped":    str(OUTLIER_DIR / "red-capped.csv"),
    "red_removed":   str(OUTLIER_DIR / "red-removed.csv"),
    "white_capped":  str(OUTLIER_DIR / "white-capped.csv"),
    "white_removed": str(OUTLIER_DIR / "white-removed.csv"),
}

# Output locations: fitted pipelines go to MODEL_DIR, the metrics table to RESULTS_CSV.
ARTIFACT_DIR = PROJECT_ROOT / "Entrenamiento modelo"
MODEL_DIR = ARTIFACT_DIR / "models"
MODEL_DIR.mkdir(parents=True, exist_ok=True)

RESULTS_CSV = ARTIFACT_DIR / "finalresults.csv"

# Train / validation / test proportions of each dataset.
TRAIN_FRAC = 0.60
VAL_FRAC   = 0.20
TEST_FRAC  = 0.20
# Fail fast on an inconsistent split instead of silently producing odd partitions.
if abs(TRAIN_FRAC + VAL_FRAC + TEST_FRAC - 1.0) > 1e-9:
    raise ValueError("TRAIN_FRAC + VAL_FRAC + TEST_FRAC must sum to 1.0")

# Single seed used for the splits, the estimators and the randomized search.
RANDOM_SEED = 111
np.random.seed(RANDOM_SEED)

# PCA options: 'passthrough' skips PCA; a float is a variance fraction to keep;
# an int is a number of components.
PCA_CHOICES = ['passthrough', 0.85, 0.90, 0.95, 7]

CV_FOLDS = 3
N_ITER   = 40   # parameter settings sampled per dataset, shared by all model families
SCORING  = "neg_mean_squared_error"


In [None]:
from sklearn.base import clone

# Search pipeline: standardise features, then PCA, then a regressor.
# The RandomForest in the 'model' slot is only a placeholder: every search
# subspace defined below supplies its own estimator for that step.
pipe = Pipeline(steps=[
    ("scale", StandardScaler()),
    ("pca", PCA()),
    ("model", RandomForestRegressor(random_state=RANDOM_SEED, n_jobs=-1)),
])

# One unfitted instance per model family.
rf = RandomForestRegressor(n_jobs=-1, random_state=RANDOM_SEED)
etr = ExtraTreesRegressor(n_jobs=-1, random_state=RANDOM_SEED)
gbr = GradientBoostingRegressor(random_state=RANDOM_SEED)
svr = SVR()
ridge = Ridge(random_state=RANDOM_SEED)
lasso = Lasso(random_state=RANDOM_SEED)


def _forest_space(estimator, n_estimators):
    """Build the grid used by both forest families.

    Only the estimator instance and the candidate tree counts differ between
    the two; the remaining hyperparameter candidates are identical.
    """
    return {
        "model": [estimator],
        "model__n_estimators": list(n_estimators),
        "model__max_depth": [None, 10, 18],
        "model__min_samples_split": [2, 5, 10],
        "model__min_samples_leaf": [1, 2, 4],
        "model__max_features": ["sqrt", "log2", 0.5],
    }


# Per-family hyperparameter candidates ('model__*' keys address the 'model' step).
rf_space = _forest_space(rf, [100, 300, 500, 800])
etr_space = _forest_space(etr, [200, 500, 800])
gbr_space = dict(
    model=[gbr],
    model__n_estimators=[100, 300],
    model__learning_rate=[0.05, 0.1],
    model__max_depth=[2, 3],
    model__subsample=[0.7, 1.0],
    model__max_features=["sqrt", "log2", None],
)
svr_space = dict(
    model=[svr],
    model__C=[0.5, 1, 5, 10],
    model__epsilon=[0.05, 0.1, 0.2],
    model__kernel=["rbf"],
    model__gamma=["scale"],
)
ridge_space = dict(model=[ridge], model__alpha=[0.1, 1.0, 10.0])
lasso_space = dict(model=[lasso], model__alpha=[0.001, 0.01, 0.1, 1.0])

# PCA settings that actually run PCA (everything except the 'passthrough' marker).
pca_component_choices = [choice for choice in PCA_CHOICES if choice != "passthrough"]

# Each family contributes two subspaces, in this order:
#   1. the PCA step replaced by 'passthrough' (no PCA at all)
#   2. the PCA step kept, with n_components drawn from the choices above
param_distributions = []
for family_space in (rf_space, etr_space, gbr_space, svr_space, ridge_space, lasso_space):
    param_distributions.append({**family_space, "pca": ["passthrough"]})
    param_distributions.append({**family_space, "pca__n_components": list(pca_component_choices)})

# Sanity check: RandomizedSearchCV accepts a LIST of dicts; each dict is a separate subspace
(len(param_distributions), type(param_distributions[0]))


(12, dict)

In [None]:
def _timed_predict(estimator, X):
    """Run ``estimator.predict(X)`` and return ``(predictions, elapsed_seconds)``.

    Uses ``time.perf_counter`` so the duration comes from a monotonic,
    high-resolution clock rather than the adjustable wall clock.
    """
    start = time.perf_counter()
    predictions = estimator.predict(X)
    return predictions, time.perf_counter() - start


def _split_metrics(y_true, y_pred):
    """Compute the metrics reported for one evaluation split.

    Parameters
    ----------
    y_true : array-like of observed target values.
    y_pred : array-like of continuous predictions.

    Returns
    -------
    dict with keys MSE, RMSE, MAE, R2, Acc_exact and Acc_±1 (plain floats).
    The two accuracies compare the predictions rounded to the nearest integer
    against the observed values: exact match, and within one unit.
    """
    observed = np.asarray(y_true)
    rounded = np.rint(y_pred)
    mse = mean_squared_error(y_true, y_pred)
    return {
        "MSE": float(mse),
        "RMSE": float(np.sqrt(mse)),
        "MAE": float(mean_absolute_error(y_true, y_pred)),
        "R2": float(r2_score(y_true, y_pred)),
        "Acc_exact": float(np.mean(observed == rounded)),
        "Acc_±1": float(np.mean(np.abs(observed - rounded) <= 1)),
    }


all_rows = []

for name, csv_path in DATASETS.items():
    print(f"\n=== Running: {name} ===")

    # --- Load data (semicolon-separated; target column is 'quality') ---
    df = pd.read_csv(csv_path, sep=";")
    if "quality" not in df.columns:
        raise ValueError(f"'quality' column not found in {csv_path}. Got: {df.columns.tolist()}")
    X = df.drop(columns=["quality"])
    y = df["quality"]

    # --- 60/20/20 split: carve off train first, then split the rest into val/test ---
    X_train, X_tmp, y_train, y_tmp = train_test_split(
        X, y, test_size=(1.0 - TRAIN_FRAC), random_state=RANDOM_SEED
    )
    tmp_test_size = TEST_FRAC / (VAL_FRAC + TEST_FRAC)
    X_val, X_test, y_val, y_test = train_test_split(
        X_tmp, y_tmp, test_size=tmp_test_size, random_state=RANDOM_SEED
    )

    # --- Randomized search: model family, its hyperparameters and PCA are searched jointly ---
    search = RandomizedSearchCV(
        estimator=pipe,
        param_distributions=param_distributions,
        n_iter=N_ITER,
        scoring=SCORING,
        cv=CV_FOLDS,
        refit=True,             # refit best params on the whole TRAIN (60%)
        n_jobs=-1,
        random_state=RANDOM_SEED,
        verbose=0,
        return_train_score=False
    )

    fit_start = time.perf_counter()
    search.fit(X_train, y_train)
    train_time = time.perf_counter() - fit_start

    best_est = search.best_estimator_      # full pipeline: scale -> (pca or passthrough) -> best model
    best_params = search.best_params_
    best_cv_mse = -search.best_score_      # scoring is negated MSE, so flip the sign back

    # --- Validation and test metrics (same metric set for both splits) ---
    y_val_pred, pred_time_val = _timed_predict(best_est, X_val)
    val_metrics = _split_metrics(y_val, y_val_pred)

    y_test_pred, pred_time_test = _timed_predict(best_est, X_test)
    test_metrics = _split_metrics(y_test, y_test_pred)

    # --- Save *searched* best estimator (trained on 60%) ---
    best_model_name = type(best_est.named_steps["model"]).__name__
    model_path = MODEL_DIR / f"best_{name}.pkl"
    joblib.dump(best_est, model_path)

    # --- Store only serializable params (exclude actual estimator object) ---
    best_params_no_model = {k: v for k, v in best_params.items() if k != "model"}

    # Column order matters for the displayed table: identity, CV score,
    # validation metrics, test metrics, timings, artifact path.
    row = {
        "dataset": name,
        "n_samples": len(df),
        "best_model": best_model_name,
        "best_params": json.dumps(best_params_no_model),   # safe JSON string
        "best_cv_MSE": float(best_cv_mse),
    }
    row.update({f"val_{metric}": value for metric, value in val_metrics.items()})
    row.update({f"test_{metric}": value for metric, value in test_metrics.items()})
    row.update({
        "train_time_s": float(train_time),
        "pred_time_val_s": float(pred_time_val),
        "pred_time_test_s": float(pred_time_test),
        "saved_model_path": str(model_path),
    })
    all_rows.append(row)

results_df = pd.DataFrame(all_rows).sort_values(by="test_MSE")
results_df



=== Running: red_capped ===

=== Running: red_removed ===

=== Running: white_capped ===

=== Running: white_removed ===


Unnamed: 0,dataset,n_samples,best_model,best_params,best_cv_MSE,val_MSE,val_RMSE,val_MAE,val_R2,val_Acc_exact,...,test_MSE,test_RMSE,test_MAE,test_R2,test_Acc_exact,test_Acc_±1,train_time_s,pred_time_val_s,pred_time_test_s,saved_model_path
1,red_removed,1249,RandomForestRegressor,"{""pca"": ""passthrough"", ""model__n_estimators"": ...",0.362932,0.32035,0.565995,0.430139,0.487931,0.66,...,0.305213,0.552461,0.413776,0.496628,0.692,0.984,26.82761,0.700986,0.697303,C:\Users\david\Downloads\OneDrive - Asesoftwar...
3,white_removed,4121,RandomForestRegressor,"{""pca"": ""passthrough"", ""model__n_estimators"": ...",0.422119,0.38757,0.622551,0.459143,0.470351,0.646845,...,0.344525,0.586962,0.437517,0.513905,0.68,0.978182,36.319022,0.149889,0.152734,C:\Users\david\Downloads\OneDrive - Asesoftwar...
2,white_capped,4898,RandomForestRegressor,"{""pca"": ""passthrough"", ""model__n_estimators"": ...",0.433163,0.422647,0.650113,0.483279,0.478878,0.616327,...,0.361254,0.601044,0.443345,0.518348,0.665306,0.97551,40.633896,0.15061,0.14554,C:\Users\david\Downloads\OneDrive - Asesoftwar...
0,red_capped,1599,ExtraTreesRegressor,"{""pca"": ""passthrough"", ""model__n_estimators"": ...",0.369965,0.369956,0.60824,0.475814,0.427534,0.640625,...,0.382783,0.618695,0.472904,0.391644,0.646875,0.978125,31.447029,0.251318,0.32269,C:\Users\david\Downloads\OneDrive - Asesoftwar...


In [None]:
# Persist the full results table (without the DataFrame index) for later cells.
results_df.to_csv(RESULTS_CSV, index=False)

# Columns shown in the console summary: identity, CV score, the metric set for
# the validation split, the same set for the test split, then the model path.
metric_names = ["MSE", "RMSE", "MAE", "R2", "Acc_exact", "Acc_±1"]
summary_columns = ["dataset", "best_model", "best_cv_MSE"]
summary_columns += [f"val_{metric}" for metric in metric_names]
summary_columns += [f"test_{metric}" for metric in metric_names]
summary_columns.append("saved_model_path")

print("=== SUMMARY (sorted by Test MSE) ===")
print(results_df.loc[:, summary_columns])
print(f"\nSaved CSV: {RESULTS_CSV}")


=== SUMMARY (sorted by Test MSE) ===
         dataset             best_model  best_cv_MSE   val_MSE  val_RMSE  \
1    red_removed  RandomForestRegressor     0.362932  0.320350  0.565995   
3  white_removed  RandomForestRegressor     0.422119  0.387570  0.622551   
2   white_capped  RandomForestRegressor     0.433163  0.422647  0.650113   
0     red_capped    ExtraTreesRegressor     0.369965  0.369956  0.608240   

    val_MAE    val_R2  val_Acc_exact  val_Acc_±1  test_MSE  test_RMSE  \
1  0.430139  0.487931       0.660000    0.980000  0.305213   0.552461   
3  0.459143  0.470351       0.646845    0.962379  0.344525   0.586962   
2  0.483279  0.478878       0.616327    0.968367  0.361254   0.601044   
0  0.475814  0.427534       0.640625    0.978125  0.382783   0.618695   

   test_MAE   test_R2  test_Acc_exact  test_Acc_±1  \
1  0.413776  0.496628        0.692000     0.984000   
3  0.437517  0.513905        0.680000     0.978182   
2  0.443345  0.518348        0.665306     0.975510   


In [None]:
# We'll refit & save for the two removed datasets
targets = ["red_removed", "white_removed"]

# Mapping from model name to a fresh estimator (so we can set params)
estimator_factory = {
    "RandomForestRegressor": lambda: RandomForestRegressor(random_state=RANDOM_SEED, n_jobs=-1),
    "ExtraTreesRegressor":  lambda: ExtraTreesRegressor(random_state=RANDOM_SEED, n_jobs=-1),
    "GradientBoostingRegressor": lambda: GradientBoostingRegressor(random_state=RANDOM_SEED),
    "SVR":   lambda: SVR(),
    "Ridge": lambda: Ridge(random_state=RANDOM_SEED),
    "Lasso": lambda: Lasso(random_state=RANDOM_SEED),
}

# Load results
res = pd.read_csv(RESULTS_CSV)

# Ensure best_params holds dicts. Each value is parsed on its own, so this works
# whether pandas reads the column as object or as a dedicated string dtype, and
# it does not index into a possibly empty frame.
res["best_params"] = res["best_params"].apply(
    lambda value: json.loads(value) if isinstance(value, str) else value
)

refit_rows = res[res["dataset"].isin(targets)].copy()

final_summaries = []

for _, row in refit_rows.iterrows():
    name = row["dataset"]
    best_model_name = row["best_model"]
    best_params = row["best_params"]  # pca OR pca__n_components + model__*

    csv_path = DATASETS[name]
    df = pd.read_csv(csv_path, sep=";")
    X = df.drop(columns=["quality"])
    y = df["quality"]

    # Same 60/20/20 split (same seed) as the search, so X_test is the identical
    # held-out set; train and val are then merged into 80% for the FINAL refit.
    X_train, X_tmp, y_train, y_tmp = train_test_split(
        X, y, test_size=(1.0 - TRAIN_FRAC), random_state=RANDOM_SEED
    )
    tmp_test_size = TEST_FRAC / (VAL_FRAC + TEST_FRAC)
    X_val, X_test, y_val, y_test = train_test_split(
        X_tmp, y_tmp, test_size=tmp_test_size, random_state=RANDOM_SEED
    )
    X_trainval = pd.concat([X_train, X_val], axis=0)
    y_trainval = pd.concat([y_train, y_val], axis=0)

    # Rebuild the pipeline with the winning model class.
    # It is deliberately NOT named `pipe`: that global is the base pipeline the
    # randomized-search cell relies on, and overwriting it with a configured,
    # fitted pipeline would corrupt any later re-run of the search.
    final_model = estimator_factory[best_model_name]()  # fresh estimator
    final_pipe = Pipeline(steps=[
        ("scale", StandardScaler()),
        ("pca", PCA()),   # can be set to 'passthrough'
        ("model", final_model)
    ])

    # Apply the saved best hyperparameters (includes PCA choice and model__* params)
    final_pipe.set_params(**best_params)

    # Fit on the 80% train+val
    final_pipe.fit(X_trainval, y_trainval)

    # One final test evaluation
    y_test_pred = final_pipe.predict(X_test)
    mse  = mean_squared_error(y_test, y_test_pred)
    rmse = float(np.sqrt(mse))
    mae  = mean_absolute_error(y_test, y_test_pred)
    r2   = r2_score(y_test, y_test_pred)
    # Accuracy of predictions rounded to the nearest integer: exact and within one unit.
    y_round = np.rint(y_test_pred)
    acc_exact = float(np.mean(y_test.values == y_round))
    acc_pm1   = float(np.mean(np.abs(y_test.values - y_round) <= 1))

    # Save final refit model
    final_path = MODEL_DIR / f"{best_model_name.lower()}_final_{name}.pkl"
    joblib.dump(final_pipe, final_path)

    final_summaries.append({
        "dataset": name,
        "best_model": best_model_name,
        "used_best_params": best_params,
        "test_MSE_after_refit_on_80%": float(mse),
        "test_RMSE": rmse,
        "test_MAE": float(mae),
        "test_R2": float(r2),
        "test_Acc_exact": acc_exact,
        "test_Acc_±1": acc_pm1,
        "saved_final_model_path": str(final_path)
    })

pd.DataFrame(final_summaries)


Unnamed: 0,dataset,best_model,used_best_params,test_MSE_after_refit_on_80%,test_RMSE,test_MAE,test_R2,test_Acc_exact,test_Acc_±1,saved_final_model_path
0,red_removed,RandomForestRegressor,"{'pca': 'passthrough', 'model__n_estimators': ...",0.284034,0.532948,0.392837,0.531557,0.728,0.984,C:\Users\david\Downloads\OneDrive - Asesoftwar...
1,white_removed,RandomForestRegressor,"{'pca': 'passthrough', 'model__n_estimators': ...",0.315505,0.561698,0.407443,0.554849,0.704242,0.978182,C:\Users\david\Downloads\OneDrive - Asesoftwar...


In [6]:
# Reload the saved search results and show, best test MSE first, the rows for
# the two "removed" datasets together with their winning hyperparameters.
res = pd.read_csv(RESULTS_CSV)
is_removed_dataset = res["dataset"].isin(["red_removed", "white_removed"])
report_columns = ["dataset", "best_model", "test_MSE", "test_RMSE", "test_MAE", "test_R2", "best_params"]
removed_top2 = res.loc[is_removed_dataset].sort_values("test_MSE")[report_columns]
removed_top2
 

Unnamed: 0,dataset,best_model,test_MSE,test_RMSE,test_MAE,test_R2,best_params
0,red_removed,RandomForestRegressor,0.305213,0.552461,0.413776,0.496628,"{""pca"": ""passthrough"", ""model__n_estimators"": ..."
1,white_removed,RandomForestRegressor,0.344525,0.586962,0.437517,0.513905,"{""pca"": ""passthrough"", ""model__n_estimators"": ..."
