In [None]:
import numpy as np
import pandas as pd
import re
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor
from sklearn.linear_model import LinearRegression

############################################
# 1. Feature-engineering functions
############################################

def extract_product_keyword(product):
    """Reduce a product description to a single keyword.

    The last whitespace-separated token is returned when it looks like a
    model code, i.e. it is purely alphanumeric and contains at least one
    letter and at least one digit. Otherwise the first token is returned.

    Args:
        product: Product description string.

    Returns:
        The chosen token, or "Unknown" when the string has no tokens
        (empty or whitespace only).
    """
    words = product.split()
    if not words:
        # Previously this case raised IndexError on words[-1].
        return "Unknown"

    last = words[-1]
    looks_like_model_code = (
        last.isalnum()
        and any(c.isalpha() for c in last)
        and any(c.isdigit() for c in last)
    )
    return last if looks_like_model_code else words[0]

def extract_screen_res(resolution):
    """Return the last ``<digits>x<digits>`` pattern found in *resolution*.

    Returns "Unknown" when the string contains no such pattern.
    """
    found = re.findall(r'\d+x\d+', resolution)
    if not found:
        return "Unknown"
    return found[-1]

def extract_cpu_parts(cpu):
    """Split a CPU description into (first word, third word, last word).

    Any position that does not exist in the whitespace-split string is
    reported as "Unknown".
    """
    tokens = cpu.split()
    if not tokens:
        return "Unknown", "Unknown", "Unknown"

    third = tokens[2] if len(tokens) >= 3 else "Unknown"
    return tokens[0], third, tokens[-1]

def categorize_cpu(cpu):
    """Map a CPU string to an integer tier.

    Substrings are checked in the order "i7" (3), "i5" (2), "i3" (1); the
    first one found wins. Any other value falls back to 2, the same tier
    as "i5".
    """
    tiers = (("i7", 3), ("i5", 2), ("i3", 1))
    for marker, tier in tiers:
        if marker in cpu:
            return tier
    return 2

def extract_memory_parts(memory):
    """Split a storage description into (first word, last word).

    The last word is only reported when the string has at least two
    whitespace-separated words; missing positions become "Unknown".
    """
    tokens = memory.split()
    if not tokens:
        return "Unknown", "Unknown"
    if len(tokens) == 1:
        return tokens[0], "Unknown"
    return tokens[0], tokens[-1]

def extract_gpu_parts(gpu):
    """Split a GPU description into (first word, second word, last word).

    The last word is only reported when the string has at least three
    whitespace-separated words; missing positions become "Unknown".
    """
    tokens = gpu.split()
    head = tokens[:2]
    # Pad the leading pair so both positions always exist.
    head = head + ["Unknown"] * (2 - len(head))
    tail = tokens[-1] if len(tokens) >= 3 else "Unknown"
    return head[0], head[1], tail

def categorize_os(os):
    """Classify an operating-system string into a coarse family.

    Matching is case-insensitive and substring based, checked in the order
    windows, linux, mac, android, chrome. Anything else is "Others".
    """
    lowered = os.lower()
    families = (
        ("windows", "Windows"),
        ("linux", "Linux"),
        ("mac", "Mac"),          # also covers "macos"
        ("android", "Android"),
        ("chrome", "Chrome"),    # also covers "chromebook"
    )
    return next(
        (family for needle, family in families if needle in lowered),
        "Others",
    )

############################################
# 2. Train and test preprocessing
############################################

def _memory_capacity_to_gb(capacity):
    """Convert strings such as "256GB", "1TB" or "1.0TB" to a float in GB.

    A bare number is taken as GB and TB values are multiplied by 1000.
    Raises ValueError when any value does not match that shape.
    """
    parsed = capacity.str.extract(r"^\s*(\d*\.?\d+)\s*(TB|GB)?\s*$", expand=True)
    unparsed = parsed[0].isna()
    if unparsed.any():
        bad_values = capacity[unparsed].unique().tolist()
        raise ValueError(f"Unrecognised memory capacity value(s): {bad_values}")
    # Scale numerically: replacing the text "TB" with "000" turned "1.0TB"
    # into "1.0000", i.e. 1 GB instead of 1000 GB.
    scale = np.where(parsed[1] == "TB", 1000.0, 1.0)
    return parsed[0].astype(float) * scale


def _pixel_count(resolution):
    """Return width * height for a "<w>x<h>" string, or 0 if there is no 'x'."""
    if 'x' not in resolution:
        return 0
    width, height = resolution.split('x')[:2]
    return int(width) * int(height)


def preprocess_train(df):
    """Turn raw laptop rows into the engineered feature table.

    Reads the columns Company, Product, TypeName, Inches, ScreenResolution,
    Cpu, Ram, Memory, Gpu, OpSys and Weight. Any other column (for example
    the target) is passed through untouched.

    Args:
        df: Raw DataFrame. It is not modified; a copy is processed.

    Returns:
        A new DataFrame with the derived features added and the raw columns
        OpSys, Cpu, CPU_Model, Memory, Gpu, ScreenResolution, Weight and
        Inches removed.

    Raises:
        ValueError: If a Ram, Weight, CPU speed or memory-capacity value
            cannot be converted to a number.
    """
    df = df.copy()

    # --- Text columns split into their parts ---
    df["Product"] = df["Product"].apply(extract_product_keyword)
    df["ScreenResolution"] = df["ScreenResolution"].apply(extract_screen_res)

    df[["CPU_Brand", "CPU_Model", "CPU_Speed"]] = df["Cpu"].apply(lambda x: pd.Series(extract_cpu_parts(x)))
    df["CPU_Model_number"] = df["CPU_Model"].apply(categorize_cpu)

    df[["Memory_Capacity", "Memory_Type"]] = df["Memory"].apply(lambda x: pd.Series(extract_memory_parts(x)))

    df[["GPU_Brand", "GPU_Type", "GPU_Model"]] = df["Gpu"].apply(lambda x: pd.Series(extract_gpu_parts(x)))

    df["OpSys_type"] = df["OpSys"].apply(categorize_os)
    # First run of digits in the OS name (e.g. "Windows 10" -> 10), 0 if none.
    df["OpSys_number"] = df["OpSys"].str.extract(r'(\d+)', expand=False).fillna(0).astype(int)

    # --- Numeric conversions ---
    df["Ram"] = df["Ram"].str.replace("GB", "", regex=False).astype(int)
    df["Weight"] = df["Weight"].str.replace("kg", "", regex=False).astype(float)

    # Fill gaps in text columns with the most frequent value. Assign the
    # result back instead of calling fillna(inplace=True) on a column
    # selection, which pandas warns will stop updating the frame in 3.0.
    for col in df.select_dtypes(include=["object"]).columns:
        modes = df[col].mode()
        if not modes.empty:
            df[col] = df[col].fillna(modes.iloc[0])

    # Drops the last three characters of the speed token.
    # NOTE(review): assumes every CPU string ends in "<number>GHz" - confirm.
    df["CPU_Speed"] = df["CPU_Speed"].str[:-3].astype(float)
    df["Memory_Capacity"] = _memory_capacity_to_gb(df["Memory_Capacity"])

    # --- Derived features ---
    df["Pixel_Count"] = df["ScreenResolution"].apply(_pixel_count)
    # "DPI" and "Pixel_Density" are the same quantity (pixels / inches^2);
    # both names are kept so the output columns do not change.
    pixels_per_sq_inch = df["Pixel_Count"] / (df["Inches"] ** 2)
    df["DPI"] = pixels_per_sq_inch
    df["Weight_per_Inch"] = df["Weight"] / df["Inches"]
    df["CPU_Ratio"] = df["CPU_Brand"].astype(str) + "_" + df["Ram"].astype(str)
    # The +0.1 offsets keep the denominators away from zero.
    df["Ram_per_CPU"] = df["Ram"] / (df["CPU_Speed"] + 0.1)
    df["Pixel_Density"] = pixels_per_sq_inch
    df["Storage_Efficiency"] = df["Memory_Capacity"] / (df["Weight"] + 0.1)

    # --- Make sure the retained categorical columns are strings ---
    for col in ["Company", "Product", "TypeName", "CPU_Brand",
                "Memory_Type", "GPU_Brand", "GPU_Model"]:
        df[col] = df[col].astype(str)

    # --- Drop the raw columns that have been replaced by features ---
    df.drop(columns=["OpSys", "Cpu", "CPU_Model", "Memory", "Gpu", "ScreenResolution", "Weight", "Inches"], inplace=True)
    return df

def transform_categoricals_train(df, top_n=50, max_categories=10):
    """Rank-encode low-cardinality categorical columns by price.

    The frame is sorted by ``Price_in_euros`` in descending order. Every
    object-dtype column with at most ``max_categories`` distinct values is
    then replaced by a ``<col>_transformed`` column: for each category the
    ``top_n`` highest prices are summed, and categories are numbered by that
    sum so the largest sum gets the number of categories and the smallest
    gets 1.

    Args:
        df: DataFrame containing ``Price_in_euros``. It is not modified.
        top_n: Number of highest-priced rows per category that are summed.
        max_categories: Maximum number of distinct values for a column to
            be encoded.

    Returns:
        A tuple ``(df_numeric, category_mappings, valid_categorical_cols)``:
        the price-sorted frame with the encoded columns swapped in, a dict
        ``{column: {category: rank}}``, and the list of encoded column names.
    """
    df_sorted = df.sort_values(by="Price_in_euros", ascending=False).copy()
    valid_categorical_cols = [
        col for col in df_sorted.select_dtypes(include=["object"]).columns
        if df_sorted[col].nunique() <= max_categories
    ]

    category_mappings = {}
    for col in valid_categorical_cols:
        # Select the price column before applying: the rows are already in
        # descending price order, so head(top_n) is each category's top_n.
        top_price_sum = df_sorted.groupby(col)["Price_in_euros"].apply(
            lambda prices: prices.head(top_n).sum()
        )
        ranked_categories = top_price_sum.sort_values(ascending=False).index.tolist()
        n_categories = len(ranked_categories)
        # Highest sum -> n_categories, lowest sum -> 1.
        mapping = {cat: n_categories - pos for pos, cat in enumerate(ranked_categories)}
        category_mappings[col] = mapping
        df_sorted[col + "_transformed"] = df_sorted[col].map(mapping)

    # Drop the original categorical columns.
    df_numeric = df_sorted.drop(columns=valid_categorical_cols)
    return df_numeric, category_mappings, valid_categorical_cols

def preprocess_test(df):
    """Turn raw test rows into the engineered feature table.

    The test pipeline is step-for-step the same as the training one and
    never touches the target column, so it delegates to
    ``preprocess_train`` instead of keeping a second copy that could drift
    out of sync.

    Args:
        df: Raw test DataFrame. It is not modified.

    Returns:
        A new DataFrame with the same derived features and dropped columns
        as ``preprocess_train`` produces.
    """
    return preprocess_train(df)

def transform_categoricals_test(df, category_mappings, valid_categorical_cols, fill_value=None):
    """Apply rank mappings learned on the training data to another frame.

    For every column in ``valid_categorical_cols`` that is present in *df*,
    a ``<col>_transformed`` column is added using the corresponding mapping
    and the original column is dropped. Columns that are absent from *df*
    are skipped.

    Args:
        df: DataFrame to transform. It is not modified.
        category_mappings: ``{column: {category: rank}}``.
        valid_categorical_cols: Names of the columns to encode.
        fill_value: Value used for categories that have no entry in the
            mapping. With the default ``None`` they are left as NaN.

    Returns:
        A new DataFrame with the encoded columns swapped in.
    """
    df_transformed = df.copy()
    # Skip absent columns for the drop as well as the mapping; previously the
    # drop raised KeyError for columns the mapping loop had skipped.
    present_cols = [col for col in valid_categorical_cols if col in df_transformed.columns]
    for col in present_cols:
        encoded = df_transformed[col].map(category_mappings.get(col, {}))
        if fill_value is not None:
            encoded = encoded.fillna(fill_value)
        df_transformed[col + "_transformed"] = encoded
    df_numeric = df_transformed.drop(columns=present_cols)
    return df_numeric

############################################
# 3. Data loading and preprocessing
############################################

# --- Training data ---
df_train = pd.read_csv("./data/train.csv", index_col=0)
df_train.index.name = None

# Preprocess and build the engineered features
df_train_processed = preprocess_train(df_train)
# Rank-encode the low-cardinality categorical columns
df_final, category_mappings, valid_categorical_cols = transform_categoricals_train(df_train_processed)
# The encoder returns the rows sorted by price, so shuffle them before the
# train/validation split.
df_final = df_final.sample(frac=1, random_state=42).reset_index(drop=True)
print("Train data (después de transformación):")
print(df_final.head())
# DataFrame.info() prints its report itself and returns None, so it is not
# wrapped in print().
df_final.info()

# --- Test data ---
df_test = pd.read_csv("./data/test.csv", index_col=0)
df_test.index.name = None

df_test_processed = preprocess_test(df_test)
df_final_test = transform_categoricals_test(df_test_processed, category_mappings, valid_categorical_cols)
print("Test data (después de transformación):")
print(df_final_test.head())
df_final_test.info()

############################################
# 4. Encoding and scaling
############################################

# Label-encode whatever object columns are left (the ones with more than 10
# categories). The label -> code table of each column is kept so the test set
# is encoded with exactly the same codes.
df_encoded = df_final.copy()
label_codes = {}
for col in df_encoded.select_dtypes(include=["object"]).columns:
    le = LabelEncoder()
    df_encoded[col] = le.fit_transform(df_encoded[col])
    label_codes[col] = {label: code for code, label in enumerate(le.classes_)}

# Labels that were not seen in training get -1. A dict lookup replaces the
# earlier approach of refitting the encoder and calling transform per cell.
df_test_encoded = df_final_test.copy()
for col in df_test_encoded.select_dtypes(include=["object"]).columns:
    if col in label_codes:
        df_test_encoded[col] = df_test_encoded[col].map(label_codes[col]).fillna(-1).astype(int)

# Numeric feature columns (everything except the target)
numerical_cols = df_encoded.select_dtypes(include=["number"]).columns.tolist()
if "Price_in_euros" in numerical_cols:
    numerical_cols.remove("Price_in_euros")

# Standardise the features.
# NOTE(review): the category ranks and this scaler are fitted on all training
# rows before the train/validation split below, so the validation RMSE is
# probably optimistic.
scaler = StandardScaler()
df_encoded[numerical_cols] = scaler.fit_transform(df_encoded[numerical_cols])
df_test_encoded[numerical_cols] = scaler.transform(df_test_encoded[numerical_cols])
print("Datos numéricos normalizados correctamente.")

# Split into predictors and target
X = df_encoded.drop(columns=["Price_in_euros"])
y = df_encoded["Price_in_euros"]

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

############################################
# 5. Model training and tuning
############################################

# Candidate models
models = {
    "Linear Regression": LinearRegression(),
    "CatBoost": CatBoostRegressor(loss_function="RMSE", verbose=0, random_state=42),
    "XGBoost": XGBRegressor(random_state=42),
    "Random Forest": RandomForestRegressor(random_state=42),
    "LightGBM": LGBMRegressor(random_state=42)
}

# Hyper-parameter search spaces for the models that are tuned
param_grids = {
    "CatBoost": {"depth": [6, 8, 10], "learning_rate": [0.01, 0.05, 0.1], "iterations": [500, 1000, 1500]},
    "XGBoost": {"n_estimators": [50, 100, 200], "learning_rate": [0.01, 0.05, 0.1], "max_depth": [3, 5, 7]},
    "Random Forest": {"n_estimators": [50, 100, 200], "max_depth": [None, 10, 20], "min_samples_split": [2, 5, 10]},
    "LightGBM": {"n_estimators": [50, 100, 200], "learning_rate": [0.01, 0.05, 0.1], "max_depth": [3, 5, 7]}
}

best_models = {}

for name, model in models.items():
    if name in param_grids:
        search = RandomizedSearchCV(model, param_grids[name], n_iter=15, cv=3,
                                      scoring="neg_root_mean_squared_error", verbose=1, n_jobs=-1, random_state=42)
        search.fit(X_train, y_train)
        best_models[name] = search.best_estimator_
        print(f"{name} mejores hiperparámetros: {search.best_params_}")
    else:
        model.fit(X_train, y_train)
        best_models[name] = model

# Score every model once on the validation set and keep the results
val_rmse = {}
for name, model in best_models.items():
    y_pred = model.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    val_rmse[name] = rmse
    print(f"{name} RMSE en validación: {rmse}")

# Pick the model with the lowest validation RMSE, reusing the scores above
best_model_name = min(val_rmse, key=val_rmse.get)
best_model = best_models[best_model_name]
print(f"Mejor modelo seleccionado: {best_model_name}")

############################################
# 6. Predictions and submission file
############################################

# Use the encoded and scaled test frame, with its columns put in the same
# order as the training features (raises KeyError if one is missing).
X_test_final = df_test_encoded[X.columns.tolist()]
y_test_pred = best_model.predict(X_test_final)
df_submission = pd.DataFrame({"laptop_ID": df_test.index, "Price_in_euros": y_test_pred})
df_submission.to_csv("submission_best_model.csv", index=False)
print(f"Predicciones guardadas en 'submission_best_model.csv' usando {best_model_name}.")

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0], inplace=True)
  top20_sum = df_sorted.groupby(col).apply(lambda x: x["Price_in_euros"].head(50).sum())
  top20_sum = df_sorted.groupby(col).apply(lambda x: x["Price_in_euros"].head(50).sum())
  top20_sum = df_sorted.groupby(col).apply(lambda x: x["Price_in_euros"].head(50).sum())
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].m

Train data (después de transformación):
  Company       Product            TypeName  Ram  Price_in_euros  CPU_Speed  \
0  Lenovo          Yoga  2 in 1 Convertible    4           638.0        2.5   
1    Acer  ES1-523-84K7            Notebook    8           469.0        2.2   
2  Lenovo        Legion              Gaming    8           809.0        2.5   
3    Asus      VivoBook            Notebook    8          1145.0        1.8   
4      HP       ProBook            Notebook    4          1149.0        2.3   

   CPU_Model_number  Memory_Capacity GPU_Type GPU_Model  ... Pixel_Count  \
0                 3            128.0       HD       520  ...     2073600   
1                 2            256.0   Radeon        R5  ...     2073600   
2                 2            128.0  GeForce      1050  ...     2073600   
3                 3            128.0  GeForce     150MX  ...     2073600   
4                 2            500.0       HD       520  ...     1049088   

            DPI  Weight_per_