In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error
import optuna

# Cargar los datos
train_df = pd.read_csv("./data/train.csv")
test_df = pd.read_csv("./data/test.csv")

# Preprocesamiento: Separación de CPU, GPU y Memoria
def process_cpu(cpu):
    parts = cpu.split()
    return parts[0], parts[2], float(parts[-1].replace('GHz', ''))

def process_gpu(gpu):
    parts = gpu.split()
    return parts[0], ' '.join(parts[1:])

def process_memory(memory):
    types = ['SSD', 'HDD', 'Flash', 'Hybrid']
    for t in types:
        if t in memory:
            numeric_part = ''.join(filter(str.isdigit, memory))  # Extrae solo los dígitos
            return int(numeric_part), t
    numeric_part = ''.join(filter(str.isdigit, memory))
    return int(numeric_part) if numeric_part else 0, "Unknown"

# Aplicar transformación
train_df[['Cpu_Brand', 'Cpu_Family', 'Cpu_Speed']] = train_df['Cpu'].apply(lambda x: pd.Series(process_cpu(x)))
test_df[['Cpu_Brand', 'Cpu_Family', 'Cpu_Speed']] = test_df['Cpu'].apply(lambda x: pd.Series(process_cpu(x)))

train_df[['Gpu_Brand', 'Gpu_Model']] = train_df['Gpu'].apply(lambda x: pd.Series(process_gpu(x)))
test_df[['Gpu_Brand', 'Gpu_Model']] = test_df['Gpu'].apply(lambda x: pd.Series(process_gpu(x)))

train_df[['Memory_Capacity', 'Memory_Type']] = train_df['Memory'].apply(lambda x: pd.Series(process_memory(x)))
test_df[['Memory_Capacity', 'Memory_Type']] = test_df['Memory'].apply(lambda x: pd.Series(process_memory(x)))

# Convertir Ram y Weight a valores numéricos
train_df['Ram'] = train_df['Ram'].apply(lambda x: int(''.join(filter(str.isdigit, str(x)))))
test_df['Ram'] = test_df['Ram'].apply(lambda x: int(''.join(filter(str.isdigit, str(x)))))

train_df['Weight'] = train_df['Weight'].apply(lambda x: float(str(x).replace('kg', '').strip()))
test_df['Weight'] = test_df['Weight'].apply(lambda x: float(str(x).replace('kg', '').strip()))

# Codificar variables categóricas con categorías alineadas
categorical_cols = ['Company', 'TypeName', 'OpSys', 'Cpu_Brand', 'Cpu_Family', 'Gpu_Brand', 'Gpu_Model', 'Memory_Type']
for col in categorical_cols:
    train_df[col] = train_df[col].astype("category").cat.codes
    test_df[col] = test_df[col].map(train_df[col].to_dict()).fillna(-1).astype(int)

# Selección de características
X = train_df.drop(columns=['Price_in_euros', 'Cpu', 'Gpu', 'Memory', 'ScreenResolution', 'Product'])
y = train_df['Price_in_euros']
X_test = test_df.drop(columns=['Cpu', 'Gpu', 'Memory', 'ScreenResolution', 'Product'])

# Dividir en train y validación con shuffle
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=True)

# Optimización de hiperparámetros con Optuna
def objective(trial):
    params = {
        'iterations': trial.suggest_int('iterations', 500, 2000),
        'depth': trial.suggest_int('depth', 4, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1, 10),
        'loss_function': 'RMSE',
        'eval_metric': 'RMSE',
        'random_seed': 42,
        'verbose': 0
    }
    model = CatBoostRegressor(**params)
    model.fit(X_train, y_train, eval_set=(X_val, y_val), early_stopping_rounds=50, verbose=0)
    y_pred = model.predict(X_val)
    rmse = mean_squared_error(y_val, y_pred, squared=False)
    print(f'Validation RMSE: {rmse}')
    return rmse

study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50)

# Entrenar modelo final
best_params = study.best_params
model = CatBoostRegressor(**best_params)
model.fit(X, y, verbose=100)

# Calcular RMSE final en conjunto de validación
y_val_pred = model.predict(X_val)
final_rmse = mean_squared_error(y_val, y_val_pred, squared=False)
print(f'Final Validation RMSE: {final_rmse}')

# Predecir precios en test.csv
test_df['Price_in_euros'] = model.predict(X_test)
test_df[['laptop_ID', 'Price_in_euros']].to_csv("submission.csv", index=False)




[I 2025-02-10 15:52:03,023] A new study created in memory with name: no-name-4d83d6ad-e35b-4341-a85a-a51ac18b3623
[I 2025-02-10 15:52:03,994] Trial 0 finished with value: 330.2160478300763 and parameters: {'iterations': 1949, 'depth': 8, 'learning_rate': 0.23010246519193533, 'l2_leaf_reg': 3.7969574160521935}. Best is trial 0 with value: 330.2160478300763.


Validation RMSE: 330.2160478300763


[I 2025-02-10 15:52:04,705] Trial 1 finished with value: 295.85004065173797 and parameters: {'iterations': 591, 'depth': 7, 'learning_rate': 0.17764380674484131, 'l2_leaf_reg': 5.863803929157864}. Best is trial 1 with value: 295.85004065173797.


Validation RMSE: 295.85004065173797


[I 2025-02-10 15:52:06,159] Trial 2 finished with value: 302.12125753530296 and parameters: {'iterations': 1609, 'depth': 6, 'learning_rate': 0.022107264040416755, 'l2_leaf_reg': 1.8041666664353961}. Best is trial 1 with value: 295.85004065173797.


Validation RMSE: 302.12125753530296


[I 2025-02-10 15:52:08,643] Trial 3 finished with value: 338.94175099272553 and parameters: {'iterations': 1787, 'depth': 10, 'learning_rate': 0.20993766477829146, 'l2_leaf_reg': 2.3842733701642027}. Best is trial 1 with value: 295.85004065173797.


Validation RMSE: 338.94175099272553


[I 2025-02-10 15:52:09,660] Trial 4 finished with value: 296.28098906489794 and parameters: {'iterations': 790, 'depth': 5, 'learning_rate': 0.03671090246148503, 'l2_leaf_reg': 8.31094367145263}. Best is trial 1 with value: 295.85004065173797.


Validation RMSE: 296.28098906489794


[I 2025-02-10 15:52:10,122] Trial 5 finished with value: 295.19628227806254 and parameters: {'iterations': 1774, 'depth': 6, 'learning_rate': 0.22695598005376477, 'l2_leaf_reg': 6.928420644180674}. Best is trial 5 with value: 295.19628227806254.


Validation RMSE: 295.19628227806254


[I 2025-02-10 15:52:10,439] Trial 6 finished with value: 301.54541715861455 and parameters: {'iterations': 1774, 'depth': 5, 'learning_rate': 0.1978781793323871, 'l2_leaf_reg': 3.9968989870348053}. Best is trial 5 with value: 295.19628227806254.


Validation RMSE: 301.54541715861455


[I 2025-02-10 15:52:10,865] Trial 7 finished with value: 296.5557732263157 and parameters: {'iterations': 1860, 'depth': 4, 'learning_rate': 0.1054939887750296, 'l2_leaf_reg': 4.543019797454522}. Best is trial 5 with value: 295.19628227806254.


Validation RMSE: 296.5557732263157


[I 2025-02-10 15:52:11,306] Trial 8 finished with value: 288.088840755514 and parameters: {'iterations': 1466, 'depth': 5, 'learning_rate': 0.21516787597263462, 'l2_leaf_reg': 3.1144773783989192}. Best is trial 8 with value: 288.088840755514.


Validation RMSE: 288.088840755514


[I 2025-02-10 15:52:11,762] Trial 9 finished with value: 292.2580536176456 and parameters: {'iterations': 1122, 'depth': 4, 'learning_rate': 0.14895888596129905, 'l2_leaf_reg': 5.532026522239959}. Best is trial 8 with value: 288.088840755514.


Validation RMSE: 292.2580536176456


[I 2025-02-10 15:52:13,090] Trial 10 finished with value: 326.7043710778934 and parameters: {'iterations': 1413, 'depth': 9, 'learning_rate': 0.28081543682690535, 'l2_leaf_reg': 9.892489782897616}. Best is trial 8 with value: 288.088840755514.


Validation RMSE: 326.7043710778934


[I 2025-02-10 15:52:13,617] Trial 11 finished with value: 287.70285354317565 and parameters: {'iterations': 1078, 'depth': 4, 'learning_rate': 0.12708733303964226, 'l2_leaf_reg': 6.020483667918315}. Best is trial 11 with value: 287.70285354317565.


Validation RMSE: 287.70285354317565


[I 2025-02-10 15:52:14,170] Trial 12 finished with value: 295.4514416952019 and parameters: {'iterations': 1171, 'depth': 4, 'learning_rate': 0.11333013988515026, 'l2_leaf_reg': 2.8080613776791443}. Best is trial 11 with value: 287.70285354317565.


Validation RMSE: 295.4514416952019


[I 2025-02-10 15:52:14,624] Trial 13 finished with value: 292.81223236462046 and parameters: {'iterations': 919, 'depth': 5, 'learning_rate': 0.2932613627326148, 'l2_leaf_reg': 7.057062590567689}. Best is trial 11 with value: 287.70285354317565.


Validation RMSE: 292.81223236462046


[I 2025-02-10 15:52:15,362] Trial 14 finished with value: 299.4747305607304 and parameters: {'iterations': 1416, 'depth': 6, 'learning_rate': 0.08214065493610744, 'l2_leaf_reg': 1.3072268158274554}. Best is trial 11 with value: 287.70285354317565.


Validation RMSE: 299.4747305607304


[I 2025-02-10 15:52:16,491] Trial 15 finished with value: 307.1535114020801 and parameters: {'iterations': 1357, 'depth': 7, 'learning_rate': 0.15490172547567557, 'l2_leaf_reg': 3.110623399196634}. Best is trial 11 with value: 287.70285354317565.


Validation RMSE: 307.1535114020801


[I 2025-02-10 15:52:17,319] Trial 16 finished with value: 290.3073416265305 and parameters: {'iterations': 1009, 'depth': 5, 'learning_rate': 0.06890130178556853, 'l2_leaf_reg': 5.017111162520141}. Best is trial 11 with value: 287.70285354317565.


Validation RMSE: 290.3073416265305


[I 2025-02-10 15:52:17,696] Trial 17 finished with value: 293.75436287513173 and parameters: {'iterations': 1543, 'depth': 4, 'learning_rate': 0.2635583281106759, 'l2_leaf_reg': 6.535977555529804}. Best is trial 11 with value: 287.70285354317565.


Validation RMSE: 293.75436287513173


[I 2025-02-10 15:52:18,339] Trial 18 finished with value: 299.0176652881118 and parameters: {'iterations': 670, 'depth': 7, 'learning_rate': 0.1284793707658341, 'l2_leaf_reg': 8.162395731154113}. Best is trial 11 with value: 287.70285354317565.


Validation RMSE: 299.0176652881118


[I 2025-02-10 15:52:18,709] Trial 19 finished with value: 300.4363672920188 and parameters: {'iterations': 1245, 'depth': 5, 'learning_rate': 0.25066833446124215, 'l2_leaf_reg': 3.786170966084281}. Best is trial 11 with value: 287.70285354317565.


Validation RMSE: 300.4363672920188


[I 2025-02-10 15:52:19,216] Trial 20 finished with value: 298.7391001530665 and parameters: {'iterations': 941, 'depth': 6, 'learning_rate': 0.1830535055073661, 'l2_leaf_reg': 6.051399127758241}. Best is trial 11 with value: 287.70285354317565.


Validation RMSE: 298.7391001530665


[I 2025-02-10 15:52:19,854] Trial 21 finished with value: 289.7566856269345 and parameters: {'iterations': 1045, 'depth': 5, 'learning_rate': 0.09362176120864638, 'l2_leaf_reg': 4.869782364743458}. Best is trial 11 with value: 287.70285354317565.


Validation RMSE: 289.7566856269345


[I 2025-02-10 15:52:20,691] Trial 22 finished with value: 288.64514753574224 and parameters: {'iterations': 1050, 'depth': 4, 'learning_rate': 0.06867977539562932, 'l2_leaf_reg': 4.842669095939226}. Best is trial 11 with value: 287.70285354317565.


Validation RMSE: 288.64514753574224


[I 2025-02-10 15:52:21,380] Trial 23 finished with value: 292.552769309522 and parameters: {'iterations': 849, 'depth': 4, 'learning_rate': 0.06688593133163903, 'l2_leaf_reg': 3.3759335126957657}. Best is trial 11 with value: 287.70285354317565.


Validation RMSE: 292.552769309522


[I 2025-02-10 15:52:22,061] Trial 24 finished with value: 297.05203948343245 and parameters: {'iterations': 1272, 'depth': 4, 'learning_rate': 0.05206323086891075, 'l2_leaf_reg': 4.442073542912976}. Best is trial 11 with value: 287.70285354317565.


Validation RMSE: 297.05203948343245


[I 2025-02-10 15:52:22,793] Trial 25 finished with value: 284.52186387120963 and parameters: {'iterations': 1598, 'depth': 4, 'learning_rate': 0.13035343832420987, 'l2_leaf_reg': 8.001260469316373}. Best is trial 25 with value: 284.52186387120963.


Validation RMSE: 284.52186387120963


[I 2025-02-10 15:52:23,296] Trial 26 finished with value: 282.9829075386805 and parameters: {'iterations': 1592, 'depth': 5, 'learning_rate': 0.13921888679636715, 'l2_leaf_reg': 7.909867308844091}. Best is trial 26 with value: 282.9829075386805.


Validation RMSE: 282.9829075386805


[I 2025-02-10 15:52:24,455] Trial 27 finished with value: 311.5017012441645 and parameters: {'iterations': 1595, 'depth': 8, 'learning_rate': 0.13722260235200137, 'l2_leaf_reg': 8.056243948391213}. Best is trial 26 with value: 282.9829075386805.


Validation RMSE: 311.5017012441645


[I 2025-02-10 15:52:25,013] Trial 28 finished with value: 305.54691966277363 and parameters: {'iterations': 1678, 'depth': 6, 'learning_rate': 0.16924854900597044, 'l2_leaf_reg': 9.284450798487496}. Best is trial 26 with value: 282.9829075386805.


Validation RMSE: 305.54691966277363


[I 2025-02-10 15:52:26,504] Trial 29 finished with value: 306.01640484089467 and parameters: {'iterations': 1304, 'depth': 8, 'learning_rate': 0.12224565096339156, 'l2_leaf_reg': 7.574577013327994}. Best is trial 26 with value: 282.9829075386805.


Validation RMSE: 306.01640484089467


[I 2025-02-10 15:52:27,039] Trial 30 finished with value: 287.26570294014317 and parameters: {'iterations': 1916, 'depth': 4, 'learning_rate': 0.10129740175756988, 'l2_leaf_reg': 8.986010928878342}. Best is trial 26 with value: 282.9829075386805.


Validation RMSE: 287.26570294014317


[I 2025-02-10 15:52:27,844] Trial 31 finished with value: 285.00527723519286 and parameters: {'iterations': 1947, 'depth': 4, 'learning_rate': 0.10120105606451563, 'l2_leaf_reg': 9.1054144710057}. Best is trial 26 with value: 282.9829075386805.


Validation RMSE: 285.00527723519286


[I 2025-02-10 15:52:28,720] Trial 32 finished with value: 293.62227300041627 and parameters: {'iterations': 1965, 'depth': 4, 'learning_rate': 0.10077954020804941, 'l2_leaf_reg': 8.874597645537722}. Best is trial 26 with value: 282.9829075386805.


Validation RMSE: 293.62227300041627


[I 2025-02-10 15:52:29,300] Trial 33 finished with value: 290.8717461804573 and parameters: {'iterations': 1916, 'depth': 5, 'learning_rate': 0.14321651490152507, 'l2_leaf_reg': 9.912914023909078}. Best is trial 26 with value: 282.9829075386805.


Validation RMSE: 290.8717461804573


[I 2025-02-10 15:52:29,867] Trial 34 finished with value: 283.7139967400199 and parameters: {'iterations': 1996, 'depth': 4, 'learning_rate': 0.16977806378636723, 'l2_leaf_reg': 9.002384490885788}. Best is trial 26 with value: 282.9829075386805.


Validation RMSE: 283.7139967400199


[I 2025-02-10 15:52:30,357] Trial 35 finished with value: 297.75637459043674 and parameters: {'iterations': 1693, 'depth': 5, 'learning_rate': 0.16896696989256932, 'l2_leaf_reg': 7.680458857326739}. Best is trial 26 with value: 282.9829075386805.


Validation RMSE: 297.75637459043674


[I 2025-02-10 15:52:35,844] Trial 36 finished with value: 329.06798375605115 and parameters: {'iterations': 1997, 'depth': 10, 'learning_rate': 0.1859492225991574, 'l2_leaf_reg': 8.847742919290658}. Best is trial 26 with value: 282.9829075386805.


Validation RMSE: 329.06798375605115


[I 2025-02-10 15:52:36,553] Trial 37 finished with value: 300.9453766556836 and parameters: {'iterations': 1840, 'depth': 6, 'learning_rate': 0.17042246223372876, 'l2_leaf_reg': 9.428324512361739}. Best is trial 26 with value: 282.9829075386805.


Validation RMSE: 300.9453766556836


[I 2025-02-10 15:52:46,298] Trial 38 finished with value: 322.62613381368345 and parameters: {'iterations': 1703, 'depth': 9, 'learning_rate': 0.011841528190227185, 'l2_leaf_reg': 8.627526924340247}. Best is trial 26 with value: 282.9829075386805.


Validation RMSE: 322.62613381368345


[I 2025-02-10 15:52:46,691] Trial 39 finished with value: 287.2451879460649 and parameters: {'iterations': 1517, 'depth': 5, 'learning_rate': 0.20102717213470433, 'l2_leaf_reg': 7.337107378039496}. Best is trial 26 with value: 282.9829075386805.


Validation RMSE: 287.2451879460649


[I 2025-02-10 15:52:47,198] Trial 40 finished with value: 296.02776307347904 and parameters: {'iterations': 1846, 'depth': 4, 'learning_rate': 0.15841671398602594, 'l2_leaf_reg': 7.969803796919278}. Best is trial 26 with value: 282.9829075386805.


Validation RMSE: 296.02776307347904


[I 2025-02-10 15:52:47,774] Trial 41 finished with value: 286.8341192933995 and parameters: {'iterations': 1515, 'depth': 5, 'learning_rate': 0.1930059126309891, 'l2_leaf_reg': 7.341263217792593}. Best is trial 26 with value: 282.9829075386805.


Validation RMSE: 286.8341192933995


[I 2025-02-10 15:52:48,452] Trial 42 finished with value: 294.51014935234247 and parameters: {'iterations': 1627, 'depth': 5, 'learning_rate': 0.24019633912989666, 'l2_leaf_reg': 6.601559740069968}. Best is trial 26 with value: 282.9829075386805.


Validation RMSE: 294.51014935234247


[I 2025-02-10 15:52:48,959] Trial 43 finished with value: 281.8754941710709 and parameters: {'iterations': 1762, 'depth': 4, 'learning_rate': 0.1925243001910627, 'l2_leaf_reg': 8.348713537303649}. Best is trial 43 with value: 281.8754941710709.


Validation RMSE: 281.8754941710709


[I 2025-02-10 15:52:49,372] Trial 44 finished with value: 298.0861930215124 and parameters: {'iterations': 1767, 'depth': 4, 'learning_rate': 0.21749084346361158, 'l2_leaf_reg': 8.632865594883613}. Best is trial 43 with value: 281.8754941710709.


Validation RMSE: 298.0861930215124


[I 2025-02-10 15:52:49,927] Trial 45 finished with value: 291.6467659912286 and parameters: {'iterations': 1773, 'depth': 4, 'learning_rate': 0.11239979069020171, 'l2_leaf_reg': 9.423166260924894}. Best is trial 43 with value: 281.8754941710709.


Validation RMSE: 291.6467659912286


[I 2025-02-10 15:52:50,446] Trial 46 finished with value: 283.52646819895756 and parameters: {'iterations': 1893, 'depth': 4, 'learning_rate': 0.1320028010277751, 'l2_leaf_reg': 9.471881752011972}. Best is trial 43 with value: 281.8754941710709.


Validation RMSE: 283.52646819895756


[I 2025-02-10 15:52:51,131] Trial 47 finished with value: 288.594308667656 and parameters: {'iterations': 1881, 'depth': 4, 'learning_rate': 0.13528081767325503, 'l2_leaf_reg': 8.35097469796052}. Best is trial 43 with value: 281.8754941710709.


Validation RMSE: 288.594308667656


[I 2025-02-10 15:52:51,806] Trial 48 finished with value: 294.51479261550145 and parameters: {'iterations': 1809, 'depth': 5, 'learning_rate': 0.15190092986722023, 'l2_leaf_reg': 9.895164914964605}. Best is trial 43 with value: 281.8754941710709.


Validation RMSE: 294.51479261550145


[I 2025-02-10 15:52:52,229] Trial 49 finished with value: 282.6037976714553 and parameters: {'iterations': 1673, 'depth': 4, 'learning_rate': 0.20778265752621128, 'l2_leaf_reg': 9.55875455129166}. Best is trial 43 with value: 281.8754941710709.


Validation RMSE: 282.6037976714553
0:	learn: 618.4417323	total: 1.07ms	remaining: 1.88s
100:	learn: 209.9910951	total: 74.5ms	remaining: 1.22s
200:	learn: 175.9140168	total: 143ms	remaining: 1.11s
300:	learn: 150.7448356	total: 215ms	remaining: 1.04s
400:	learn: 129.2872958	total: 291ms	remaining: 986ms
500:	learn: 115.6836719	total: 377ms	remaining: 949ms
600:	learn: 105.0506885	total: 452ms	remaining: 873ms
700:	learn: 96.6706314	total: 518ms	remaining: 784ms
800:	learn: 89.1149574	total: 583ms	remaining: 699ms
900:	learn: 82.3654421	total: 675ms	remaining: 645ms
1000:	learn: 76.1633634	total: 751ms	remaining: 571ms
1100:	learn: 70.8755972	total: 824ms	remaining: 495ms
1200:	learn: 66.4307606	total: 905ms	remaining: 423ms
1300:	learn: 62.3018416	total: 975ms	remaining: 346ms
1400:	learn: 58.6931942	total: 1.05s	remaining: 271ms
1500:	learn: 55.5308728	total: 1.12s	remaining: 195ms
1600:	learn: 52.4756794	total: 1.19s	remaining: 120ms
1700:	learn: 49.7525590	total: 1.27s	remaining: 45

