## Predicción de precios de portátiles 24-03 FT
https://www.kaggle.com/competitions/prediccion-de-precios-de-portatiles-24-03-ft/data

In [114]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [115]:
data= pd.read_csv("./data/train.csv")
data.isnull().sum()

id                  0
laptop_ID           0
Company             0
Product             0
TypeName            0
Inches              0
ScreenResolution    0
Cpu                 0
Ram                 0
Memory              0
Gpu                 0
OpSys               0
Weight              0
Price_euros         0
dtype: int64

In [116]:
data.shape

(912, 14)

In [117]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 912 entries, 0 to 911
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   id                912 non-null    int64  
 1   laptop_ID         912 non-null    int64  
 2   Company           912 non-null    object 
 3   Product           912 non-null    object 
 4   TypeName          912 non-null    object 
 5   Inches            912 non-null    float64
 6   ScreenResolution  912 non-null    object 
 7   Cpu               912 non-null    object 
 8   Ram               912 non-null    object 
 9   Memory            912 non-null    object 
 10  Gpu               912 non-null    object 
 11  OpSys             912 non-null    object 
 12  Weight            912 non-null    object 
 13  Price_euros       912 non-null    float64
dtypes: float64(2), int64(2), object(10)
memory usage: 99.9+ KB


In [118]:
data.head(2)

Unnamed: 0,id,laptop_ID,Company,Product,TypeName,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price_euros
0,697,705,Asus,Chromebook Flip,2 in 1 Convertible,12.5,Full HD / Touchscreen 1920x1080,Intel Core M M3-6Y30 0.9GHz,4GB,64GB Flash Storage,Intel HD Graphics 515,Chrome OS,1.2kg,669.0
1,435,442,Asus,Rog Strix,Gaming,17.3,Full HD 1920x1080,AMD Ryzen 1600 3.2GHz,8GB,256GB SSD + 1TB HDD,AMD Radeon RX 580,Windows 10,3.2kg,1695.0


In [119]:
# Quitamos las unidades kg en Weithg
data['Weight'] = data['Weight'].str.replace('kg', '').astype(float)


In [120]:
# Quitamos las unidades GB en Ram
data['Ram'] = data['Ram'].str.replace('GB', '').astype(float)

In [121]:
data.head(3)

Unnamed: 0,id,laptop_ID,Company,Product,TypeName,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price_euros
0,697,705,Asus,Chromebook Flip,2 in 1 Convertible,12.5,Full HD / Touchscreen 1920x1080,Intel Core M M3-6Y30 0.9GHz,4.0,64GB Flash Storage,Intel HD Graphics 515,Chrome OS,1.2,669.0
1,435,442,Asus,Rog Strix,Gaming,17.3,Full HD 1920x1080,AMD Ryzen 1600 3.2GHz,8.0,256GB SSD + 1TB HDD,AMD Radeon RX 580,Windows 10,3.2,1695.0
2,735,743,Lenovo,V310-15IKB (i7-7500U/4GB/1TB/FHD/W10),Notebook,15.6,Full HD 1920x1080,Intel Core i7 7500U 2.7GHz,4.0,1TB HDD,Intel HD Graphics 620,Windows 10,1.85,779.0


In [123]:
# Dividir características y objetivo
X_train = data.drop(['Price_euros'], axis=1)
y_train = data['Price_euros']

# Dividir datos en conjuntos de entrenamiento y prueba
X_test = pd.read_csv("./data/test.csv")

In [125]:
# Quitamos las unidades kg en Weithg
X_test['Weight'] = X_test['Weight'].str.replace('kg', '').astype(float)

# Quitamos las unidades GB en Ram
X_test['Ram'] = X_test['Ram'].str.replace('GB', '').astype(float)

In [126]:
# Definir las columnas numéricas y categóricas
numeric_features = X_test.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X_test.select_dtypes(include=['object']).columns

# Crear un transformador para manejar características numéricas y categóricas
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

In [128]:
# Definir las columnas numéricas y categóricas
numeric_features = X_train.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X_train.select_dtypes(include=['object']).columns

# Crear un transformador para manejar características numéricas y categóricas
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

# Crear un pipeline con el preprocesamiento y el modelo
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('regressor', GradientBoostingRegressor())])

# Definir la cuadrícula de hiperparámetros a buscar
param_grid = {
    'regressor__n_estimators': [50, 100, 200],
    'regressor__learning_rate': [0.05, 0.1, 0.2],
    'regressor__max_depth': [3, 4, 5]
}

# Realizar la búsqueda de hiperparámetros utilizando GridSearchCV
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

# Obtener el mejor modelo y sus hiperparámetros
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_
print("Best Parameters:", best_params)

# Predecir en el conjunto de entrenamiento y prueba usando el mejor modelo
y_train_pred = best_model.predict(X_train)
y_test_pred = best_model.predict(X_test)

# Calcular el MAE en los conjuntos de entrenamiento y prueba
mae_train = mean_absolute_error(y_train, y_train_pred)

print("Mean Absolute Error on Training Set (After Hyperparameter Tuning):", mae_train)


Best Parameters: {'regressor__learning_rate': 0.2, 'regressor__max_depth': 3, 'regressor__n_estimators': 200}
Mean Absolute Error on Training Set (After Hyperparameter Tuning): 105.95970828070061


In [129]:
# Crea una copia de X_test para evitar cambios no deseados en el original
X_test_copy = X_test.copy()

# Agrega las predicciones como una nueva columna al DataFrame
X_test_copy['Price_euros'] = y_test_pred

# Selecciona las columnas 'id' y 'Price_euros'
result_df = X_test_copy[['id', 'Price_euros']]

# Guarda el DataFrame resultante a un archivo CSV
result_df.to_csv('predictions_last.csv', index=False)

# Muestra las primeras filas del DataFrame resultante para verificar
result_df.shape

(391, 2)

In [None]:
# - MAE FINAL DE COMPETI- 179
# HACIENDO CASI NADA
# Al final de la copetición este modelo fue el ganador - 160 MAE con el 100% de los datos