In [16]:
# Importaciones básicas
import numpy as np
import pandas as pd

# Visualización
import matplotlib.pyplot as plt
import seaborn as sns

# Modelos
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor

# Evaluación y división de datos
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Preprocesamiento
from sklearn.preprocessing import PolynomialFeatures
import re  # Para procesar cadenas

# Manejo de advertencias
import warnings
warnings.filterwarnings('ignore')

# LightGBM (si se usa en otro punto del código)
from lightgbm import LGBMRegressor


In [17]:
# Cargar los datos
train_df = pd.read_csv('data/train.csv')
test_df = pd.read_csv('data/test.csv')
sample_df = pd.read_csv('data/sample_submission.csv')

In [18]:
# Procesar la columna 'Weight' y convertirla a numérico
train_df['Weight'] = train_df['Weight'].str.replace('kg', '').astype(float)
test_df['Weight'] = test_df['Weight'].str.replace('kg', '').astype(float)

# Procesar la columna 'Ram' y convertirla a numérico
train_df['Ram'] = train_df['Ram'].str.replace('GB', '').astype(int)
test_df['Ram'] = test_df['Ram'].str.replace('GB', '').astype(int)

# Extraer la resolución de pantalla y combinarla en una sola columna 'Resolution'
train_df[['res_width', 'res_height']] = train_df['ScreenResolution'].str.extract(r'(\d{3,4})x(\d{3,4})').astype(float)
test_df[['res_width', 'res_height']] = test_df['ScreenResolution'].str.extract(r'(\d{3,4})x(\d{3,4})').astype(float)

# Combinar 'res_width' y 'res_height' en una sola columna 'Resolution'
train_df['Resolution'] = train_df['res_width'] * train_df['res_height']
test_df['Resolution'] = test_df['res_width'] * test_df['res_height']

# Eliminar las columnas 'res_width' y 'res_height'
train_df = train_df.drop(columns=['res_width', 'res_height'])
test_df = test_df.drop(columns=['res_width', 'res_height'])

# Procesar la columna 'Memory' para separar SSD, HDD, Flash Storage y Hybrid
def parse_memory_details(memory_str):
    if isinstance(memory_str, str):
        memory_str = memory_str.strip().lower()
        ssd, hdd, flash, hybrid = 0, 0, 0, 0
        
        if 'ssd' in memory_str:
            matches = re.findall(r'(\d+)(tb|gb) ssd', memory_str)
            for size, unit in matches:
                ssd += int(size) * (1024 if unit == 'tb' else 1)
        
        if 'hdd' in memory_str:
            matches = re.findall(r'(\d+)(tb|gb) hdd', memory_str)
            for size, unit in matches:
                hdd += int(size) * (1024 if unit == 'tb' else 1)
        
        if 'flash storage' in memory_str:
            flash = 1
        
        if 'hybrid' in memory_str:
            hybrid = 1
        
        return ssd, hdd, flash, hybrid
    
    return 0, 0, 0, 0

train_df[['Memory_SSD', 'Memory_HDD', 'Flash_Storage', 'Hybrid']] = train_df['Memory'].apply(
    lambda x: pd.Series(parse_memory_details(x))
)
test_df[['Memory_SSD', 'Memory_HDD', 'Flash_Storage', 'Hybrid']] = test_df['Memory'].apply(
    lambda x: pd.Series(parse_memory_details(x))
)

train_df = train_df.drop(columns=['Memory'])
test_df = test_df.drop(columns=['Memory'])


In [19]:
# Procesar la columna 'Cpu' para extraer detalles
def parse_cpu_details(cpu_str):
    if isinstance(cpu_str, str):
        cpu_str = cpu_str.strip().lower()
        cores = re.search(r'(\d+)\s*core', cpu_str)
        cores = int(cores.group(1)) if cores else 0
        frequency = re.search(r'(\d+\.?\d*)\s*ghz', cpu_str)
        frequency = float(frequency.group(1)) if frequency else 0.0
        family = 'Intel' if 'intel' in cpu_str else 'AMD' if 'amd' in cpu_str else 'Other'
        series = re.search(r'(i\d|ryzen \d)', cpu_str)
        series = series.group(1) if series else 'Other'
        return cores, frequency, family, series
    return 0, 0.0, 'Other', 'Other'

train_df[['Cpu_Cores', 'Cpu_Frequency', 'Cpu_Family', 'Cpu_Series']] = train_df['Cpu'].apply(
    lambda x: pd.Series(parse_cpu_details(x))
)
test_df[['Cpu_Cores', 'Cpu_Frequency', 'Cpu_Family', 'Cpu_Series']] = test_df['Cpu'].apply(
    lambda x: pd.Series(parse_cpu_details(x))
)

train_df = train_df.drop(columns=['Cpu'])
test_df = test_df.drop(columns=['Cpu'])

train_df = pd.get_dummies(train_df, columns=['Cpu_Family', 'Cpu_Series'], drop_first=True)
test_df = pd.get_dummies(test_df, columns=['Cpu_Family', 'Cpu_Series'], drop_first=True)

# Procesar la columna 'Gpu' para extraer detalles
def parse_gpu_details(gpu_str):
    if isinstance(gpu_str, str):
        gpu_str = gpu_str.strip().lower()
        brand = 'NVIDIA' if 'nvidia' in gpu_str else 'AMD' if 'amd' in gpu_str else 'Intel' if 'intel' in gpu_str else 'Other'
        family = re.search(r'(gtx|rtx|radeon|iris|hd graphics)', gpu_str)
        family = family.group(1) if family else 'Other'
        model = re.search(r'\b(\d+)\b', gpu_str)
        model = int(model.group(1)) if model else 0
        return brand, family, model
    return 'Other', 'Other', 0

train_df[['Gpu_Brand', 'Gpu_Family', 'Gpu_Model']] = train_df['Gpu'].apply(
    lambda x: pd.Series(parse_gpu_details(x))
)
test_df[['Gpu_Brand', 'Gpu_Family', 'Gpu_Model']] = test_df['Gpu'].apply(
    lambda x: pd.Series(parse_gpu_details(x))
)

train_df = pd.get_dummies(train_df, columns=['Gpu_Brand', 'Gpu_Family'], drop_first=True)
test_df = pd.get_dummies(test_df, columns=['Gpu_Brand', 'Gpu_Family'], drop_first=True)


In [20]:
# Procesar la columna 'Company' como variable categórica (One-Hot Encoding)
train_companies = pd.get_dummies(train_df['Company'], prefix='Company')
test_companies = pd.get_dummies(test_df['Company'], prefix='Company')

# Asegurar que ambas tablas tengan las mismas columnas
common_cols = train_companies.columns.union(test_companies.columns)
train_companies = train_companies.reindex(columns=common_cols, fill_value=0)
test_companies = test_companies.reindex(columns=common_cols, fill_value=0)

# Agregar las columnas procesadas de 'Company' al conjunto principal
train_df = pd.concat([train_df, train_companies], axis=1)
test_df = pd.concat([test_df, test_companies], axis=1)

# Procesar las columnas 'TypeName' y 'OpSys' como variables categóricas (One-Hot Encoding)
for col in ['TypeName', 'OpSys']:
    train_encoded = pd.get_dummies(train_df[col], prefix=col)
    test_encoded = pd.get_dummies(test_df[col], prefix=col)
    
    # Asegurar columnas comunes
    common_cols = train_encoded.columns.union(test_encoded.columns)
    train_encoded = train_encoded.reindex(columns=common_cols, fill_value=0)
    test_encoded = test_encoded.reindex(columns=common_cols, fill_value=0)
    
    # Agregar al conjunto principal
    train_df = pd.concat([train_df, train_encoded], axis=1)
    test_df = pd.concat([test_df, test_encoded], axis=1)
    
    # Eliminar la columna original
    train_df = train_df.drop(columns=[col])
    test_df = test_df.drop(columns=[col])


In [21]:
# Actualizar las características finales después de asegurar columnas comunes
features_to_use_improved = (
    ['Inches', 'Ram', 'Weight', 'Resolution', 'Memory_SSD', 'Memory_HDD', 'Flash_Storage', 'Hybrid', 'Cpu_Cores', 'Cpu_Frequency', 'Gpu_Model']
    + [col for col in train_df.columns if col.startswith('TypeName_') or col.startswith('OpSys_') or col.startswith('Cpu_Family_') or col.startswith('Cpu_Series_') or col.startswith('Gpu_Brand_') or col.startswith('Gpu_Family_') or col.startswith('Company_')]
)

# Crear los conjuntos de entrenamiento y prueba
X_train_improved = train_df[features_to_use_improved]
X_test_improved = test_df[features_to_use_improved]

# Manejar columnas faltantes en el conjunto de prueba
missing_cols_test = set(X_train_improved.columns) - set(X_test_improved.columns)
for col in missing_cols_test:
    X_test_improved[col] = 0
X_test_improved = X_test_improved[X_train_improved.columns]

# Separar la variable objetivo (target)
y_train = train_df['Price_euros']

# Dividir los datos en conjunto de entrenamiento y validación
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
    X_train_improved, y_train, test_size=0.2, random_state=42
)

# Aplicar la transformación logarítmica para la variable objetivo
y_train_log = np.log1p(y_train_split)
y_val_log = np.log1p(y_val_split)

print("Preprocesamiento completo. Datos listos para modelar.")


Preprocesamiento completo. Datos listos para modelar.


In [22]:
# Función para asignar rango de precios
def assign_price_range(price):
    if price <= 500:
        return 'Low'
    elif 500 < price <= 3000:
        return 'Medium'
    else:
        return 'High'

# Asignar rangos de precios en el conjunto de entrenamiento
train_df['Price_Range'] = y_train.apply(assign_price_range)
val_df = X_val_split.copy()
val_df['Price_Range'] = y_val_split.apply(assign_price_range)

# Crear diccionario para almacenar modelos y métricas por rango
range_results = {}
price_ranges = ['Low', 'Medium', 'High']

# Hiperparámetros para ajustar por rango
hyperparams = {
    'n_estimators': [100, 150, 200],
    'max_depth': [4, 6, 8],
    'learning_rate': [0.01, 0.05, 0.1],
    'subsample': [0.6, 0.8, 1.0],
}

# Entrenar modelos por rango de precios
for price_range in price_ranges:
    print(f"\nTraining models for {price_range} range...")
    # Filtrar datos por rango
    X_train_range = X_train_improved[train_df['Price_Range'] == price_range]
    y_train_range = y_train[train_df['Price_Range'] == price_range]
    X_val_range = X_val_split[val_df['Price_Range'] == price_range]
    y_val_range = y_val_split[val_df['Price_Range'] == price_range]

    # Validación de datos
    if X_train_range.empty or X_val_range.empty:
        print(f"Skipping {price_range} range due to insufficient data.")
        continue

    # Ajuste de hiperparámetros con GridSearchCV
    model = GradientBoostingRegressor(random_state=42)
    grid_search = GridSearchCV(
        estimator=model,
        param_grid=hyperparams,
        scoring='neg_mean_absolute_error',
        cv=3,
        n_jobs=-1,
        verbose=1
    )
    grid_search.fit(X_train_range, np.log1p(y_train_range))

    # Seleccionar el mejor modelo
    best_model = grid_search.best_estimator_

    # Predecir en el conjunto de validación
    y_pred_log = best_model.predict(X_val_range)
    y_pred = np.expm1(y_pred_log)

    # Calcular métricas de evaluación
    mae = mean_absolute_error(y_val_range, y_pred)
    rmse = np.sqrt(mean_squared_error(y_val_range, y_pred))
    r2 = best_model.score(X_val_range, np.log1p(y_val_range))
    print(f"{price_range} range - MAE: {mae:.2f}, RMSE: {rmse:.2f}, R²: {r2:.2f}")

    # Guardar resultados
    range_results[price_range] = {
        'model': best_model,
        'best_params': grid_search.best_params_,
        'mae': mae,
        'rmse': rmse,
        'r2': r2
    }



Training models for Low range...
Fitting 3 folds for each of 81 candidates, totalling 243 fits
Low range - MAE: 15.41, RMSE: 20.38, R²: 0.95

Training models for Medium range...
Fitting 3 folds for each of 81 candidates, totalling 243 fits
Medium range - MAE: 55.00, RMSE: 70.99, R²: 0.98

Training models for High range...
Fitting 3 folds for each of 81 candidates, totalling 243 fits
High range - MAE: 188.20, RMSE: 192.54, R²: -12.48


In [23]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor

# Diccionario de modelos a probar
models_to_test = {
    "Random Forest": RandomForestRegressor(random_state=42, n_jobs=-1),
    "Gradient Boosting": GradientBoostingRegressor(random_state=42),
    "XGBoost": XGBRegressor(random_state=42, n_jobs=-1)
}

# Hiperparámetros para cada modelo
hyperparams = {
    "Random Forest": {
        'n_estimators': [100, 150],
        'max_depth': [6, 8, 10],
        'min_samples_split': [2, 5],
        'min_samples_leaf': [1, 2]
    },
    "Gradient Boosting": {
        'n_estimators': [100, 150],
        'max_depth': [4, 6],
        'learning_rate': [0.05, 0.1],
        'subsample': [0.8, 1.0]
    },
    "XGBoost": {
        'n_estimators': [150, 200],
        'max_depth': [6, 8],
        'learning_rate': [0.05, 0.1],
        'subsample': [0.8, 1.0],
        'colsample_bytree': [0.8, 1.0]
    }
}

# Crear diccionario para almacenar modelos y métricas por rango y algoritmo
range_results = {}
price_ranges = ['Low', 'Medium', 'High']

# Entrenar modelos por rango de precios y algoritmo
for price_range in price_ranges:
    print(f"\nTraining models for {price_range} range...")
    # Filtrar datos por rango
    X_train_range = X_train_improved[train_df['Price_Range'] == price_range]
    y_train_range = y_train[train_df['Price_Range'] == price_range]
    X_val_range = X_val_split[val_df['Price_Range'] == price_range]
    y_val_range = y_val_split[val_df['Price_Range'] == price_range]

    # Validación de datos
    if X_train_range.empty or X_val_range.empty:
        print(f"Skipping {price_range} range due to insufficient data.")
        continue

    best_model = None
    best_mae = float('inf')
    best_algo = None
    best_params = None

    # Probar cada algoritmo
    for algo_name, model in models_to_test.items():
        print(f"\nTesting {algo_name} for {price_range} range...")
        grid_search = GridSearchCV(
            estimator=model,
            param_grid=hyperparams[algo_name],
            scoring='neg_mean_absolute_error',
            cv=3,
            n_jobs=-1,
            verbose=1
        )
        grid_search.fit(X_train_range, np.log1p(y_train_range))

        # Seleccionar el mejor modelo
        current_model = grid_search.best_estimator_
        y_pred_log = current_model.predict(X_val_range)
        y_pred = np.expm1(y_pred_log)
        current_mae = mean_absolute_error(y_val_range, y_pred)

        print(f"MAE for {algo_name} in {price_range} range: {current_mae:.2f}")

        # Guardar el mejor modelo
        if current_mae < best_mae:
            best_model = current_model
            best_mae = current_mae
            best_algo = algo_name
            best_params = grid_search.best_params_

    print(f"\nBest model for {price_range} range: {best_algo}")
    print(f"Best MAE: {best_mae:.2f}")
    print(f"Best Parameters: {best_params}")

    # Guardar resultados
    range_results[price_range] = {
        'model': best_model,
        'algorithm': best_algo,
        'best_params': best_params,
        'mae': best_mae
    }

# Mostrar resultados por rango
for price_range, metrics in range_results.items():
    print(f"\n{price_range} Range Results:")
    print(f"  Algorithm: {metrics['algorithm']}")
    print(f"  Best Parameters: {metrics['best_params']}")
    print(f"  MAE: {metrics['mae']:.2f}")



Training models for Low range...

Testing Random Forest for Low range...
Fitting 3 folds for each of 24 candidates, totalling 72 fits
MAE for Random Forest in Low range: 18.73

Testing Gradient Boosting for Low range...
Fitting 3 folds for each of 16 candidates, totalling 48 fits
MAE for Gradient Boosting in Low range: 17.89

Testing XGBoost for Low range...
Fitting 3 folds for each of 32 candidates, totalling 96 fits
MAE for XGBoost in Low range: 2.20

Best model for Low range: XGBoost
Best MAE: 2.20
Best Parameters: {'colsample_bytree': 0.8, 'learning_rate': 0.1, 'max_depth': 8, 'n_estimators': 200, 'subsample': 0.8}

Training models for Medium range...

Testing Random Forest for Medium range...
Fitting 3 folds for each of 24 candidates, totalling 72 fits
MAE for Random Forest in Medium range: 111.86

Testing Gradient Boosting for Medium range...
Fitting 3 folds for each of 16 candidates, totalling 48 fits
MAE for Gradient Boosting in Medium range: 35.87

Testing XGBoost for Medium 

In [26]:
# Generar predicciones iniciales si aún no se han generado
if 'Price_euros' not in test_df.columns:
    print("Generating initial predictions...")
    y_pred_test_log = best_model.predict(X_test_improved)  # Asegúrate de que `best_model` esté definido
    y_pred_test = np.expm1(y_pred_test_log)  # Invertir la transformación logarítmica
    test_df['Price_euros'] = y_pred_test  # Agregar columna de precios predichos al conjunto de prueba
else:
    print("'Price_euros' already exists in test_df. Using existing values.")

# Asignar rangos de precios en el conjunto de prueba
print("Assigning price ranges to the test set...")
test_df['Price_Range'] = test_df['Price_euros'].apply(assign_price_range)

# Crear predicciones por rango
y_pred_test_final = []
for price_range in price_ranges:
    print(f"Predicting for {price_range} range...")
    # Filtrar datos de prueba para el rango
    X_test_range = X_test_improved[test_df['Price_Range'] == price_range]

    # Validación de datos
    if X_test_range.empty:
        print(f"No data found for {price_range} range in test set. Skipping.")
        continue

    # Usar el modelo correspondiente
    model = range_results[price_range]['model']
    y_pred_test_log = model.predict(X_test_range)
    y_pred_test_range = np.expm1(y_pred_test_log)

    # Guardar predicciones
    y_pred_test_final.append(pd.Series(y_pred_test_range, index=X_test_range.index))

# Combinar resultados
if y_pred_test_final:
    y_pred_test_combined = pd.concat(y_pred_test_final).sort_index()
else:
    print("Warning: No predictions were made for any range. Check your data or models.")
    y_pred_test_combined = pd.Series(dtype=float)

# Guardar predicciones finales en un archivo CSV consolidado
if 'id' in test_df.columns:
    output_df = test_df[['id']].copy()
    output_df['Price_euros'] = y_pred_test_combined
else:
    output_df = pd.DataFrame({'Price_euros': y_pred_test_combined})

output_filename = 'laptop_price_predictions_ranges.csv'
output_df.to_csv(output_filename, index=False)
print(f"Archivo de predicciones guardado como {output_filename}")

# Mostrar resumen de errores
if not y_pred_test_combined.empty:
    print("\nResumen de Predicciones por Rango:")
    for price_range in price_ranges:
        count = (test_df['Price_Range'] == price_range).sum()
        print(f"{price_range} range: {count} laptops predicted")
else:
    print("No predictions to summarize.")


Generating initial predictions...
Assigning price ranges to the test set...
Predicting for Low range...
No data found for Low range in test set. Skipping.
Predicting for Medium range...
Predicting for High range...
Archivo de predicciones guardado como laptop_price_predictions_ranges.csv

Resumen de Predicciones por Rango:
Low range: 0 laptops predicted
Medium range: 2 laptops predicted
High range: 389 laptops predicted
