In [23]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Ridge

In [2]:
# Remote location of the laptops dataset (CSV).
url = 'https://raw.githubusercontent.com/alexeygrigorev/datasets/master/laptops.csv'
df = pd.read_csv(filepath_or_buffer=url)

# Persist a local copy of the downloaded data, without writing the index.
df.to_csv(path_or_buf='laptops.csv', index=False)

In [3]:
# Normalize column names: lowercase, with spaces replaced by underscores.
df.columns = [column_name.lower().replace(' ', '_') for column_name in df.columns]

In [9]:
# Display the loaded frame as the cell output (the rendering below is a
# truncated view showing the first and last rows).
df

Unnamed: 0,laptop,status,brand,model,cpu,ram,storage,storage_type,gpu,screen,touch,final_price
0,ASUS ExpertBook B1 B1502CBA-EJ0436X Intel Core...,New,Asus,ExpertBook,Intel Core i5,8,512,SSD,,15.6,No,1009.00
1,Alurin Go Start Intel Celeron N4020/8GB/256GB ...,New,Alurin,Go,Intel Celeron,8,256,SSD,,15.6,No,299.00
2,ASUS ExpertBook B1 B1502CBA-EJ0424X Intel Core...,New,Asus,ExpertBook,Intel Core i3,8,256,SSD,,15.6,No,789.00
3,MSI Katana GF66 12UC-082XES Intel Core i7-1270...,New,MSI,Katana,Intel Core i7,16,1000,SSD,RTX 3050,15.6,No,1199.00
4,HP 15S-FQ5085NS Intel Core i5-1235U/16GB/512GB...,New,HP,15S,Intel Core i5,16,512,SSD,,15.6,No,669.01
...,...,...,...,...,...,...,...,...,...,...,...,...
2155,Razer Blade 17 FHD 360Hz Intel Core i7-11800H/...,Refurbished,Razer,Blade,Intel Core i7,16,1000,SSD,RTX 3060,17.3,No,2699.99
2156,Razer Blade 17 FHD 360Hz Intel Core i7-11800H/...,Refurbished,Razer,Blade,Intel Core i7,16,1000,SSD,RTX 3070,17.3,No,2899.99
2157,Razer Blade 17 FHD 360Hz Intel Core i7-11800H/...,Refurbished,Razer,Blade,Intel Core i7,32,1000,SSD,RTX 3080,17.3,No,3399.99
2158,Razer Book 13 Intel Evo Core i7-1165G7/16GB/1T...,Refurbished,Razer,Book,Intel Evo Core i7,16,1000,SSD,,13.4,Yes,1899.99


### Q1. 
There's one column with missing values. What is it?

In [11]:
# Count missing entries per column, then keep only the columns that have any.
missing_values = df.isna().sum()
missing_columns = missing_values.loc[missing_values.gt(0)]

print(missing_columns)

storage_type      42
gpu             1371
screen             4
dtype: int64


### Q2. 
What's the median (50% percentile) for variable 'ram'?

In [13]:
# Median (50th percentile) of the 'ram' column.
ram_values = df['ram']
ram_median = ram_values.median()
print(f"La mediana de la columna 'RAM' es: {ram_median}")

La mediana de la columna 'RAM' es: 16.0


### Q3.
We need to deal with missing values for the column from Q1.
We have two options: fill it with 0 or with the mean of this variable.
Try both options. For each, train a linear regression model without regularization using the code from the lessons.
For computing the mean, use the training only!
Use the validation dataset to evaluate the models and compare the RMSE of each option.
Round the RMSE scores to 2 decimal digits using round(score, 2)
Which option gives better RMSE?

In [21]:
# Build the modelling dataset in one pass:
#   1. keep only the three features and the target,
#   2. drop rows whose target (final_price) is missing,
#   3. shuffle all rows with seed 42 and renumber the index from 0.
df_filtered = (
    df.loc[:, ['ram', 'storage', 'screen', 'final_price']]
    .dropna(subset=['final_price'])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)


In [22]:
# Q3: compare two ways of imputing missing feature values (0 vs. training mean)
# using an unregularized linear regression evaluated on the validation split.
# (The former column-renaming step was dropped: it reassigned the exact names
# the frame is built with, so it had no effect.)

# Split into feature matrix (X) and target vector (y).
X = df_filtered[['ram', 'storage', 'screen']]
y = df_filtered['final_price']

# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Option 1: fill missing values with 0.
X_train_0 = X_train.fillna(0)
X_val_0 = X_val.fillna(0)

# Option 2: fill missing values with the per-column mean. The means come from
# the training split only, so no validation information leaks into the model.
mean_values = X_train.mean()
X_train_mean = X_train.fillna(mean_values)
X_val_mean = X_val.fillna(mean_values)

# Train one linear regression per option.
# RMSE is computed as sqrt(MSE): the `squared=False` flag of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6.

# Option 1: zero-filled features.
model_0 = LinearRegression()
model_0.fit(X_train_0, y_train)
y_pred_0 = model_0.predict(X_val_0)
rmse_0 = np.sqrt(mean_squared_error(y_val, y_pred_0))
rmse_0_rounded = round(rmse_0, 2)

# Option 2: mean-filled features.
model_mean = LinearRegression()
model_mean.fit(X_train_mean, y_train)
y_pred_mean = model_mean.predict(X_val_mean)
rmse_mean = np.sqrt(mean_squared_error(y_val, y_pred_mean))
rmse_mean_rounded = round(rmse_mean, 2)

# Report both scores.
print(f"RMSE con 0: {rmse_0_rounded}")
print(f"RMSE con la media: {rmse_mean_rounded}")

# Compare the rounded scores (lower RMSE is better).
if rmse_0_rounded < rmse_mean_rounded:
    print("La opción de llenar con 0 da un mejor RMSE.")
elif rmse_0_rounded > rmse_mean_rounded:
    print("La opción de llenar con la media da un mejor RMSE.")
else:
    print("Ambas opciones son igualmente buenas.")


RMSE con 0: 579.53
RMSE con la media: 580.34
La opción de llenar con 0 da un mejor RMSE.


### Q4.
Now let's train a regularized linear regression.
For this question, fill the NAs with 0.
Try different values of r from this list: [0, 0.01, 0.1, 1, 5, 10, 100].
Use RMSE to evaluate the model on the validation dataset.
Round the RMSE scores to 2 decimal digits.
Which r gives the best RMSE?
If there are multiple options, select the smallest 'r'.


In [24]:
# Q4: sweep the Ridge regularization strength on the zero-filled features
# and pick the value with the lowest validation RMSE.

# Candidate regularization strengths, in ascending order.
r_values = [0, 0.01, 0.1, 1, 5, 10, 100]

# Maps each r to its validation RMSE, rounded to 2 decimals.
rmse_scores = {}

for r in r_values:
    # Ridge regression with regularization strength r.
    model = Ridge(alpha=r)
    model.fit(X_train_0, y_train)

    # Score on the validation split.
    y_pred = model.predict(X_val_0)

    # RMSE as sqrt(MSE): the `squared=False` flag of mean_squared_error was
    # deprecated in scikit-learn 1.4 and removed in 1.6.
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))

    rmse_scores[r] = round(rmse, 2)

# Lowest RMSE wins; on ties the smallest r is selected explicitly instead of
# relying on the dictionary's insertion order.
best_r = min(rmse_scores, key=lambda candidate: (rmse_scores[candidate], candidate))
best_rmse = rmse_scores[best_r]

# Report every score, then the winner.
print("RMSE para cada valor de r:")
for r_value, r_rmse in rmse_scores.items():
    print(f"r = {r_value}: RMSE = {r_rmse}")

print(f"\nEl mejor valor de r es {best_r} con un RMSE de {best_rmse}")


RMSE para cada valor de r:
r = 0: RMSE = 579.53
r = 0.01: RMSE = 579.53
r = 0.1: RMSE = 579.53
r = 1: RMSE = 579.53
r = 5: RMSE = 579.53
r = 10: RMSE = 579.53
r = 100: RMSE = 579.47

El mejor valor de r es 100 con un RMSE de 579.47


### Q5.
We used seed 42 for splitting the data. Let's find out how selecting the seed influences our score.
Try different seed values: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9].
For each seed, do the train/validation/test split with 60%/20%/20% distribution.
Fill the missing values with 0 and train a model without regularization.
For each seed, evaluate the model on the validation dataset and collect the RMSE scores.
What's the standard deviation of all the scores? To compute the standard deviation, use np.std.
Round the result to 3 decimal digits (round(std, 3))

In [25]:
# Q5: measure how much the validation RMSE varies with the split seed.

# Seeds to try.
seeds = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

# Validation RMSE per seed, each rounded to 2 decimals.
rmse_scores = []

for seed in seeds:
    # 60/20/20 split: first hold out 20% for test, then take 25% of the
    # remaining 80% (0.25 x 0.8 = 0.2) for validation.
    # Loop-local names are used so the global X_train / X_val / y_train / y_val
    # are not overwritten; cells that pair them with X_train_0 / X_val_0 stay
    # consistent when re-run after this cell.
    X_full_seed, X_test_seed, y_full_seed, y_test_seed = train_test_split(
        X, y, test_size=0.2, random_state=seed
    )
    X_train_seed, X_val_seed, y_train_seed, y_val_seed = train_test_split(
        X_full_seed, y_full_seed, test_size=0.25, random_state=seed
    )

    # Fill missing values with 0 (the test part is not used in this question).
    X_train_seed = X_train_seed.fillna(0)
    X_val_seed = X_val_seed.fillna(0)

    # Linear regression without regularization.
    model = LinearRegression()
    model.fit(X_train_seed, y_train_seed)

    # Score on the validation split.
    y_pred = model.predict(X_val_seed)

    # RMSE as sqrt(MSE): the `squared=False` flag of mean_squared_error was
    # deprecated in scikit-learn 1.4 and removed in 1.6.
    rmse = np.sqrt(mean_squared_error(y_val_seed, y_pred))

    # Scores are rounded to 2 decimals before aggregation.
    rmse_scores.append(round(rmse, 2))

# Spread of the scores across seeds (population standard deviation via np.std).
std_rmse = np.std(rmse_scores)

# Round to 3 decimals.
std_rmse_rounded = round(std_rmse, 3)

# Report the result.
print(f"Desviación estándar de los RMSEs: {std_rmse_rounded}")


Desviación estándar de los RMSEs: 25.269


### Q6.
Split the dataset like previously, use seed 9.
Combine train and validation datasets.
Fill the missing values with 0 and train a model with r=0.001.
What's the RMSE on the test dataset?

In [26]:
# Q6: train on train+validation with a small regularization and report the
# RMSE on the held-out test split (seed 9).

# Split into feature matrix (X) and target vector (y).
X = df_filtered[['ram', 'storage', 'screen']]
y = df_filtered['final_price']

# 60/20/20 split with seed 9: hold out 20% for test, then take 25% of the
# remaining 80% (0.25 x 0.8 = 0.2) for validation.
X_train_full, X_test, y_train_full, y_test = train_test_split(X, y, test_size=0.2, random_state=9)
X_train, X_val, y_train, y_val = train_test_split(X_train_full, y_train_full, test_size=0.25, random_state=9)

# Combine the training and validation parts into one training set.
X_combined = pd.concat([X_train, X_val])
y_combined = pd.concat([y_train, y_val])

# Fill missing values with 0.
X_combined = X_combined.fillna(0)
X_test = X_test.fillna(0)

# Ridge regression with r = 0.001.
model = Ridge(alpha=0.001)
model.fit(X_combined, y_combined)

# Predict on the test split.
y_pred = model.predict(X_test)

# RMSE as sqrt(MSE): the `squared=False` flag of mean_squared_error was
# deprecated in scikit-learn 1.4 and removed in 1.6.
rmse_test = np.sqrt(mean_squared_error(y_test, y_pred))

# Round to 2 decimals.
rmse_test_rounded = round(rmse_test, 2)

# Report the test RMSE.
print(f"RMSE en el conjunto de test: {rmse_test_rounded}")


RMSE en el conjunto de test: 549.95
