In [3]:
import numpy as np
import pandas as pd
import re
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor
from sklearn.linear_model import LinearRegression
from sklearn.base import BaseEstimator, RegressorMixin

############################################
# 1. Funciones de feature-engineering
############################################

def extract_product_keyword(product):
    words = product.split()
    if words[-1].isalnum() and any(c.isalpha() for c in words[-1]) and any(c.isdigit() for c in words[-1]):
        return words[-1]
    return words[0]

def extract_screen_res(resolution):
    matches = re.findall(r'\d+x\d+', resolution)
    return matches[-1] if matches else "Unknown"

def extract_cpu_parts(cpu):
    parts = cpu.split()
    first_word = parts[0] if len(parts) > 0 else "Unknown"
    third_word = parts[2] if len(parts) > 2 else "Unknown"
    last_word = parts[-1] if len(parts) > 0 else "Unknown"
    return first_word, third_word, last_word

def categorize_cpu(cpu):
    if "i7" in cpu:
        return 3
    elif "i5" in cpu:
        return 2
    elif "i3" in cpu:
        return 1
    else:
        return 2

def extract_memory_parts(memory):
    parts = memory.split()
    first_word = parts[0] if len(parts) > 0 else "Unknown"
    last_word = parts[-1] if len(parts) > 1 else "Unknown"
    return first_word, last_word

def extract_gpu_parts(gpu):
    parts = gpu.split()
    first_word = parts[0] if len(parts) > 0 else "Unknown"
    second_word = parts[1] if len(parts) > 1 else "Unknown"
    last_word = parts[-1] if len(parts) > 2 else "Unknown"
    return first_word, second_word, last_word

def categorize_os(os_str):
    os_str = os_str.lower()
    if "windows" in os_str:
        return "Windows"
    elif "linux" in os_str:
        return "Linux"
    elif "mac" in os_str or "macos" in os_str:
        return "Mac"
    elif "android" in os_str:
        return "Android"
    elif "chrome" in os_str or "chromebook" in os_str:
        return "Chrome"
    else:
        return "Others"

############################################
# 2. Preprocesado de train y test
############################################

def preprocess_train(df):
    df = df.copy()
    # --- Transformaciones básicas ---
    df["Product"] = df["Product"].apply(extract_product_keyword)
    df["ScreenResolution"] = df["ScreenResolution"].apply(extract_screen_res)
    
    df[["CPU_Brand", "CPU_Model", "CPU_Speed"]] = df["Cpu"].apply(lambda x: pd.Series(extract_cpu_parts(x)))
    df["CPU_Model_number"] = df["CPU_Model"].apply(categorize_cpu)
    
    df[["Memory_Capacity", "Memory_Type"]] = df["Memory"].apply(lambda x: pd.Series(extract_memory_parts(x)))
    
    df[["GPU_Brand", "GPU_Type", "GPU_Model"]] = df["Gpu"].apply(lambda x: pd.Series(extract_gpu_parts(x)))
    
    df["OpSys_type"] = df["OpSys"].apply(categorize_os)
    df["OpSys_number"] = df["OpSys"].str.extract(r'(\d+)')
    df["OpSys_number"] = df["OpSys_number"].fillna(0).astype(int)
    
    # --- Conversión de variables numéricas ---
    df["Ram"] = df["Ram"].str.replace("GB", "", regex=True).astype(int)
    df["Weight"] = df["Weight"].str.replace("kg", "", regex=True).astype(float)
    # Rellenamos faltantes en variables categóricas
    for col in df.select_dtypes(include=["object"]).columns:
        df[col].fillna(df[col].mode()[0], inplace=True)
    df["CPU_Speed"] = df["CPU_Speed"].str[:-3].astype(float)
    df["Memory_Capacity"] = df["Memory_Capacity"].str.replace('TB', '000', regex=True)\
                                               .str.replace('GB', '', regex=True).astype(float)
    
    # --- Nuevas features ---
    df["Pixel_Count"] = df["ScreenResolution"].apply(lambda x: int(x.split('x')[0]) * int(x.split('x')[1]) if 'x' in x else 0)
    df["DPI"] = df["Pixel_Count"] / (df["Inches"] ** 2)
    df["Weight_per_Inch"] = df["Weight"] / df["Inches"]
    df["CPU_Ratio"] = df["CPU_Brand"].astype(str) + "_" + df["Ram"].astype(str)
    df["Ram_per_CPU"] = df["Ram"] / (df["CPU_Speed"] + 0.1)
    df["Pixel_Density"] = df["Pixel_Count"] / (df["Inches"] ** 2)
    df["Storage_Efficiency"] = df["Memory_Capacity"] / (df["Weight"] + 0.1)
    
    # --- Aseguramos el tipo string en las variables categóricas importantes ---
    cat_cols = ["Company", "Product", "TypeName", "CPU_Brand", "Memory_Type", 
                "GPU_Brand", "GPU_Type", "GPU_Model", "OpSys_type"]
    for col in cat_cols:
        df[col] = df[col].astype(str)
        
    # --- Eliminamos las columnas originales que ya fueron transformadas ---
    df.drop(columns=["OpSys", "Cpu", "CPU_Model", "Memory", "Gpu", "ScreenResolution", "Weight", "Inches"], inplace=True)
    return df

def preprocess_test(df):
    df = df.copy()
    df["Product"] = df["Product"].apply(extract_product_keyword)
    df["ScreenResolution"] = df["ScreenResolution"].apply(extract_screen_res)
    
    df[["CPU_Brand", "CPU_Model", "CPU_Speed"]] = df["Cpu"].apply(lambda x: pd.Series(extract_cpu_parts(x)))
    df["CPU_Model_number"] = df["CPU_Model"].apply(categorize_cpu)
    
    df[["Memory_Capacity", "Memory_Type"]] = df["Memory"].apply(lambda x: pd.Series(extract_memory_parts(x)))
    
    df[["GPU_Brand", "GPU_Type", "GPU_Model"]] = df["Gpu"].apply(lambda x: pd.Series(extract_gpu_parts(x)))
    
    df["OpSys_type"] = df["OpSys"].apply(categorize_os)
    df["OpSys_number"] = df["OpSys"].str.extract(r'(\d+)')
    df["OpSys_number"] = df["OpSys_number"].fillna(0).astype(int)
    
    df["Ram"] = df["Ram"].str.replace("GB", "", regex=True).astype(int)
    df["Weight"] = df["Weight"].str.replace("kg", "", regex=True).astype(float)
    for col in df.select_dtypes(include=["object"]).columns:
        df[col].fillna(df[col].mode()[0], inplace=True)
    df["CPU_Speed"] = df["CPU_Speed"].str[:-3].astype(float)
    df["Memory_Capacity"] = df["Memory_Capacity"].str.replace('TB', '000', regex=True)\
                                               .str.replace('GB', '', regex=True).astype(float)
    
    df["Pixel_Count"] = df["ScreenResolution"].apply(lambda x: int(x.split('x')[0]) * int(x.split('x')[1]) if 'x' in x else 0)
    df["DPI"] = df["Pixel_Count"] / (df["Inches"] ** 2)
    df["Weight_per_Inch"] = df["Weight"] / df["Inches"]
    df["CPU_Ratio"] = df["CPU_Brand"].astype(str) + "_" + df["Ram"].astype(str)
    df["Ram_per_CPU"] = df["Ram"] / (df["CPU_Speed"] + 0.1)
    df["Pixel_Density"] = df["Pixel_Count"] / (df["Inches"] ** 2)
    df["Storage_Efficiency"] = df["Memory_Capacity"] / (df["Weight"] + 0.1)
    
    cat_cols = ["Company", "Product", "TypeName", "CPU_Brand", "Memory_Type", 
                "GPU_Brand", "GPU_Type", "GPU_Model", "OpSys_type"]
    for col in cat_cols:
        df[col] = df[col].astype(str)
    df.drop(columns=["OpSys", "Cpu", "CPU_Model", "Memory", "Gpu", "ScreenResolution", "Weight", "Inches"], inplace=True)
    return df

############################################
# 3. Lectura y preprocesado de datos
############################################

# --- Datos de train ---
df_train = pd.read_csv("./data/train.csv", index_col=0)
df_train.index.name = None
df_train_processed = preprocess_train(df_train)

# --- Datos de test ---
df_test = pd.read_csv("./data/test.csv", index_col=0)
df_test.index.name = None
df_test_processed = preprocess_test(df_test)

############################################
# 4. Preparar dos versiones de los datos:
#    a) Versión para CatBoost (con categoricals sin transformar)
#    b) Versión para otros modelos (One-Hot encoding de categoricals)
############################################

# Lista de columnas categóricas importantes (para explicabilidad)
cat_cols = ["Company", "Product", "TypeName", "CPU_Brand", "Memory_Type", 
            "GPU_Brand", "GPU_Type", "GPU_Model", "OpSys_type", "CPU_Ratio"]

# Versión para CatBoost: se mantiene la versión original con categoricals
df_train_cb = df_train_processed.copy()
df_test_cb  = df_test_processed.copy()

# Versión para "otros" modelos: se aplican One-Hot Encoding a las categoricals
df_train_ohe = pd.get_dummies(df_train_processed, columns=cat_cols, drop_first=True)
df_test_ohe  = pd.get_dummies(df_test_processed, columns=cat_cols, drop_first=True)

# Align the test DataFrame with the train DataFrame:
df_test_ohe = df_test_ohe.reindex(columns=df_train_ohe.columns, fill_value=0)

# Remove forbidden characters from column names
df_train_ohe.columns = df_train_ohe.columns.str.replace('[\[\]<>]', '', regex=True)
df_test_ohe.columns = df_test_ohe.columns.str.replace('[\[\]<>]', '', regex=True)

############################################
# 5. Escalado de variables numéricas
############################################

# Determinar las columnas numéricas (excluyendo la variable objetivo)
num_cols = [col for col in df_train_cb.columns 
            if df_train_cb[col].dtype in [np.int64, np.float64] and col != "Price_in_euros"]

# Se usa el mismo scaler para ambas versiones (ya que las columnas numéricas son iguales)
scaler = StandardScaler()
df_train_cb[num_cols] = scaler.fit_transform(df_train_cb[num_cols])
df_test_cb[num_cols] = scaler.transform(df_test_cb[num_cols])
df_train_ohe[num_cols] = scaler.transform(df_train_ohe[num_cols])
df_test_ohe[num_cols] = scaler.transform(df_test_ohe[num_cols])

############################################
# 6. División en train/validation
############################################

# Para CatBoost version:
X_cb = df_train_cb.drop(columns=["Price_in_euros"])
y = df_train_cb["Price_in_euros"]
X_train_cb, X_val_cb, y_train, y_val = train_test_split(X_cb, y, test_size=0.2, random_state=42)

# Para one-hot version (otros modelos):
X_ohe = df_train_ohe.drop(columns=["Price_in_euros"])
# y es the same as above (same order)
X_train_ohe, X_val_ohe, _, _ = train_test_split(X_ohe, y, test_size=0.2, random_state=42)

############################################
# 7. Definir un wrapper para CatBoost que permita pasar cat_features
############################################

class CatBoostWrapper(BaseEstimator, RegressorMixin):
    def __init__(self, cat_features=None, **params):
        self.cat_features = cat_features
        self.params = params
        self.model = CatBoostRegressor(**params)
    def fit(self, X, y):
        self.model.fit(X, y, cat_features=self.cat_features, verbose=0)
        return self
    def predict(self, X):
        return self.model.predict(X)
    def get_params(self, deep=True):
        out = self.params.copy()
        out['cat_features'] = self.cat_features
        return out
    def set_params(self, **params):
        self.cat_features = params.pop('cat_features', self.cat_features)
        self.params.update(params)
        self.model = CatBoostRegressor(**self.params)
        return self

# Para CatBoost, obtener los índices de las columnas categóricas
cat_features_indices = [X_cb.columns.get_loc(col) for col in cat_cols if col in X_cb.columns]

############################################
# 8. Definir modelos y espacios de hiperparámetros
############################################

# Usaremos los datos one-hot para modelos que requieren numeric inputs
# y la versión original (con categoricals) para CatBoost.
models = {
    "Linear Regression": LinearRegression(),
    "XGBoost": XGBRegressor(random_state=42),
    "Random Forest": RandomForestRegressor(random_state=42),
    "LightGBM": LGBMRegressor(random_state=42),
    "CatBoost": CatBoostWrapper(cat_features=cat_features_indices, loss_function="RMSE", random_seed=42)
}

param_grids = {
    "CatBoost": {"depth": [6, 8, 10], "learning_rate": [0.01, 0.05, 0.1], "iterations": [500, 1000, 1500]},
    "XGBoost": {"n_estimators": [50, 100, 200], "learning_rate": [0.01, 0.05, 0.1], "max_depth": [3, 5, 7]},
    "Random Forest": {"n_estimators": [50, 100, 200], "max_depth": [None, 10, 20], "min_samples_split": [2, 5, 10]},
    "LightGBM": {"n_estimators": [50, 100, 200], "learning_rate": [0.01, 0.05, 0.1], "max_depth": [3, 5, 7]}
}

############################################
# 9. Entrenamiento y optimización de modelos
############################################

best_models = {}

for name, model in models.items():
    print(f"\nEntrenando y optimizando {name}...")
    if name in param_grids:
        # Para CatBoost se usa la versión con categoricals; para otros, la versión one-hot
        if name == "CatBoost":
            X_train_model, X_val_model = X_train_cb, X_val_cb
        else:
            X_train_model, X_val_model = X_train_ohe, X_val_ohe
            
        search = RandomizedSearchCV(model, param_grids[name], n_iter=15, cv=3,
                                      scoring="neg_root_mean_squared_error", verbose=1, n_jobs=-1, random_state=42)
        search.fit(X_train_model, y_train)
        best_models[name] = search.best_estimator_
        print(f"{name} mejores hiperparámetros: {search.best_params_}")
    else:
        # Para Linear Regression sin tuning
        if name == "CatBoost":
            X_train_model, X_val_model = X_train_cb, X_val_cb
        else:
            X_train_model, X_val_model = X_train_ohe, X_val_ohe
        model.fit(X_train_model, y_train)
        best_models[name] = model

# Evaluamos cada modelo en el conjunto de validación
for name, model in best_models.items():
    if name == "CatBoost":
        X_val_model = X_val_cb
    else:
        X_val_model = X_val_ohe
    y_pred = model.predict(X_val_model)
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    print(f"{name} RMSE en validación: {rmse}")

# Seleccionamos el mejor modelo (según RMSE en validación)
best_model_name = min(best_models, key=lambda m: np.sqrt(mean_squared_error(y_val, best_models[m].predict(X_val_cb if m=="CatBoost" else X_val_ohe))))
best_model = best_models[best_model_name]
print(f"\nMejor modelo seleccionado: {best_model_name}")

############################################
# 10. Predicciones y generación del submission
############################################

# Para generar predicciones, usamos la versión correspondiente del test:
if best_model_name == "CatBoost":
    X_test_final = df_test_cb.drop(columns=["Price_in_euros"], errors='ignore')
else:
    X_test_final = df_test_ohe.drop(columns=["Price_in_euros"], errors='ignore')

y_test_pred = best_model.predict(X_test_final)
df_submission = pd.DataFrame({"laptop_ID": df_test.index, "Price_in_euros": y_test_pred})
df_submission.to_csv("submission_best_model.csv", index=False)
print(f"\nPredicciones guardadas en 'submission_best_model.csv' usando {best_model_name}.")


  df_train_ohe.columns = df_train_ohe.columns.str.replace('[\[\]<>]', '', regex=True)
  df_test_ohe.columns = df_test_ohe.columns.str.replace('[\[\]<>]', '', regex=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna


Entrenando y optimizando Linear Regression...

Entrenando y optimizando XGBoost...
Fitting 3 folds for each of 15 candidates, totalling 45 fits
XGBoost mejores hiperparámetros: {'n_estimators': 200, 'max_depth': 3, 'learning_rate': 0.05}

Entrenando y optimizando Random Forest...
Fitting 3 folds for each of 15 candidates, totalling 45 fits
Random Forest mejores hiperparámetros: {'n_estimators': 200, 'min_samples_split': 2, 'max_depth': None}

Entrenando y optimizando LightGBM...
Fitting 3 folds for each of 15 candidates, totalling 45 fits
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000347 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 538
[LightGBM] [Info] Number of data points in the train set: 729, number of used features: 51
[LightGBM] [Info] Start training from score 1103.789314
LightGBM mejores hiperparámetros: {'n_estimators