# Bibliotecas


In [1]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error

# Funciones

In [2]:
def calcular_memoria(df, columna_memoria="Memory"):
    
    df['HDD'] = 0
    df['SSD'] = 0
    df['Flash_Storage'] = 0

    for index, row in df.iterrows():

        memory_str = row[columna_memoria]

        ssd_match = re.search(r'(\d+)\s*GB\s*SSD', memory_str)
        if ssd_match:
            df.at[index, 'SSD'] = int(ssd_match.group(1))  # Convertir a entero

        flash_match = re.search(r'(\d+)\s*GB\s*(Flash Storage|Flash)', memory_str)
        if flash_match:
            df.at[index, 'Flash_Storage'] = int(flash_match.group(1))  

        hdd_match_tb = re.search(r'(\d+\.?\d*)\s*TB\s*(Hybrid|HDD)?', memory_str)
        if hdd_match_tb:
            df.at[index, 'HDD'] += int(float(hdd_match_tb.group(1)) * 1024)  
        
        hdd_match_gb = re.search(r'(\d+)\s*GB\s*HDD', memory_str)
        if hdd_match_gb:
            df.at[index, 'HDD'] += int(hdd_match_gb.group(1))  

    df["TotalMemory"] = df["HDD"] + df["SSD"] + df["Flash_Storage"]
    
    return df

In [3]:
def clasificar_cpu(df, columna_cpu="Cpu", columna_categoria="Cpu_Categoria"):
    
    df[columna_categoria] = ""
    
    for index, row in df.iterrows():
        cpu_str = row[columna_cpu]
        
        if "Core i3" in cpu_str:
            df.at[index, columna_categoria] = 'Intel Core i3'
        elif "Core i5" in cpu_str:
            df.at[index, columna_categoria] = "Intel Core i5"
        elif "Core i7" in cpu_str:
            df.at[index, columna_categoria] = "Intel Core i7"
        elif "AMD" in cpu_str:
            df.at[index, columna_categoria] = "AMD"
        else:
            df.at[index, columna_categoria] = "Otros"  
    
    return df

In [4]:
def rank_cpu(df_laptop):
    
    mediana_precios_cpu = df_laptop.groupby('Cpu_Categoria')['Price_euros'].median().reset_index()
    mediana_precios_cpu.columns = ['Cpu_Categoria', 'Media_Price_Cpu']
    mediana_precios_cpu = mediana_precios_cpu.sort_values(by='Media_Price_Cpu')
    mediana_precios_cpu['Cpu_Rank'] = range(len(mediana_precios_cpu))

    df_laptop = df_laptop.merge(mediana_precios_cpu[['Cpu_Categoria', 'Media_Price_Cpu', 'Cpu_Rank']], on='Cpu_Categoria', how='left')

    return df_laptop

In [5]:
# Función para clasificar GPU
def clasificar_gpu(df, columna_gpu="Gpu", columna_categoria="Gpu_Categoria"):
    
    df[columna_categoria] = ""

    for index, row in df.iterrows():
        gpu_str = row[columna_gpu]
        
        if "Intel" in gpu_str:
            df.at[index, columna_categoria] = "Intel"
        elif "Nvidia" in gpu_str:
            df.at[index, columna_categoria] = "NVIDIA"
        elif "AMD" in gpu_str:
            df.at[index, columna_categoria] = "AMD"
        else:
            df.at[index, columna_categoria] = "Otros" 
    
    return df

In [6]:
# Función para asignar rankings de GPU
def rank_gpu(df_laptop):

    mediana_precios_gpu = df_laptop.groupby('Gpu_Categoria')['Price_euros'].median().reset_index()
    mediana_precios_gpu.columns = ['Gpu_Categoria', 'Media_Precio_Gpu']
    mediana_precios_gpu = mediana_precios_gpu.sort_values(by='Media_Precio_Gpu')
    mediana_precios_gpu['Gpu_Rank'] = range(1, len(mediana_precios_gpu) + 1)

    df_laptop = df_laptop.merge(mediana_precios_gpu[['Gpu_Categoria', 'Media_Precio_Gpu', 'Gpu_Rank']], on='Gpu_Categoria', how='left')
    
    return df_laptop

In [7]:
# Función para entrenar el modelo
def modelo_ln(df, variables_independientes, variable_dependiente, test_size=0.2, random_state=35):

    X = df[variables_independientes]
    y = df[variable_dependiente]
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)
    ln = LinearRegression()
    ln.fit(X_train, y_train)
    
    pred = ln.predict(X_test)
    
    print("MAE:", mean_absolute_error(y_test, pred))
    return ln, pred, X_train, X_test, y_train, y_test

# Limpieza DataSet-Train

In [8]:
# Cargar y procesar dataset de entrenamiento
df = pd.read_csv("./train.csv", index_col="id")
df.drop(columns=['laptop_ID'], inplace=True)

df['Ram'] = df['Ram'].str.replace('GB', '').astype(int)
df['Weight'] = df['Weight'].str.replace('kg', '').astype(float)

# Aplicar funciones de limpieza y transformación al dataset de entrenamiento
df = calcular_memoria(df)
df = clasificar_cpu(df)
df = rank_cpu(df)
df = clasificar_gpu(df)
df = rank_gpu(df)


In [9]:
# Definir variables para el modelo
variables_independientes = ["Weight", "Ram", "Media_Precio_Gpu", "Media_Price_Cpu", "TotalMemory"]
variable_dependiente = "Price_euros"

# Entrenar el modelo y almacenar el modelo entrenado
modelo_entrenado, predicciones, X_train, X_test, y_train, y_test = modelo_ln(df, variables_independientes, variable_dependiente, test_size=0.2, random_state=35)

MAE: 323.4322778903786


In [10]:
# Crear diccionarios de medianas y rankings de CPU y GPU
cpu_price_dict = dict(zip(df['Cpu_Categoria'], df['Media_Price_Cpu']))
cpu_rank_dict = dict(zip(df['Cpu_Categoria'], df['Cpu_Rank']))

gpu_price_dict = dict(zip(df['Gpu_Categoria'], df['Media_Precio_Gpu']))
gpu_rank_dict = dict(zip(df['Gpu_Categoria'], df['Gpu_Rank']))

In [11]:
# Procesar el dataset de test
df_test = pd.read_csv("test.csv")
df_test.drop(columns=['laptop_ID'], inplace=True)

df_test['Ram'] = df_test['Ram'].str.replace('GB', '').astype(int)
df_test['Weight'] = df_test['Weight'].str.replace('kg', '').astype(float)

# Aplicar funciones de limpieza y transformación al dataset de test
df_test = calcular_memoria(df_test)
df_test = clasificar_cpu(df_test)
df_test = clasificar_gpu(df_test)

In [12]:
# Asignar medianas y rankings usando los diccionarios del conjunto de entrenamiento
df_test['Media_Price_Cpu'] = df_test['Cpu_Categoria'].map(cpu_price_dict)
df_test['Cpu_Rank'] = df_test['Cpu_Categoria'].map(cpu_rank_dict)
df_test['Media_Precio_Gpu'] = df_test['Gpu_Categoria'].map(gpu_price_dict)
df_test['Gpu_Rank'] = df_test['Gpu_Categoria'].map(gpu_rank_dict)

In [13]:
df[["Weight","Ram","Media_Precio_Gpu","Media_Price_Cpu","TotalMemory"]]

Unnamed: 0,Weight,Ram,Media_Precio_Gpu,Media_Price_Cpu,TotalMemory
0,2.36,8,686.995,1458.0,1024
1,2.00,4,848.000,349.0,1024
2,1.20,8,848.000,943.0,256
3,4.42,16,1254.500,1458.0,1280
4,1.26,8,848.000,943.0,256
...,...,...,...,...,...
907,1.25,4,848.000,349.0,32
908,1.20,4,848.000,943.0,128
909,2.40,8,686.995,450.0,1024
910,1.36,8,848.000,1458.0,256


In [None]:
#Esto lo hago por que tenia un null value justo en una fila:
if df_test['Media_Precio_Gpu'].isna().sum() > 0:
    mediana_precio_gpu = df['Media_Precio_Gpu'].median()
    df_test['Media_Precio_Gpu'].fillna(mediana_precio_gpu, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_test['Media_Precio_Gpu'].fillna(mediana_precio_gpu, inplace=True)


In [15]:
data = df_test[["Weight","Ram","Media_Precio_Gpu","Media_Price_Cpu","TotalMemory"]]
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 391 entries, 0 to 390
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Weight            391 non-null    float64
 1   Ram               391 non-null    int64  
 2   Media_Precio_Gpu  391 non-null    float64
 3   Media_Price_Cpu   391 non-null    float64
 4   TotalMemory       391 non-null    int64  
dtypes: float64(3), int64(2)
memory usage: 15.4 KB


In [22]:
# Verificar que el conjunto de test tenga las columnas necesarias
X_test_final = df_test[variables_independientes]

# Hacer las predicciones en el dataset de test
predicciones_test = modelo_entrenado.predict(X_test_final)
predicciones_test


array([2063.10843909,  821.62075018,  459.89451971, 1127.70358366,
       1133.67153075, 2016.69107287, 2134.9611845 ,  697.80901318,
       6441.90384355, 1347.20820685, 1353.83925917,  638.88504729,
        542.3165897 ,  573.74568256,  387.50974796,  897.17467443,
        548.94764202, 1343.22957546, 1131.68221505, 1322.60619411,
       1397.2016235 ,  893.19604304, 2013.77668178, 1142.29189876,
       1946.43794117,  721.2171495 , 1079.99602985,  429.39209903,
       1231.86391939,  343.30517605, 1129.69289936, 1125.05116274,
       2019.37951667,  698.70485262,  530.69259301, 1939.80688885,
       1031.11369844, 1176.58880478, 3345.21738239,  696.09250387,
        668.5013041 , 1443.59584584, 1199.48920991, 1977.60388706,
       1065.04260255, 1329.90035166,  579.53890155, 3284.80768713,
        323.81199351, 1352.51304871, 1403.80953193, 1265.01918098,
       1117.45901215,  409.4588699 , 1333.94610222,  708.95072324,
        773.0689937 ,  339.69323807, 1132.34532029, 1191.59906

In [25]:
resultados = pd.DataFrame({'id':df_test["id"],
                           "Price_euros":predicciones_test})
resultados
#Imprimir el CSV:
resultados.to_csv("Resultados/Prueba_4_linear.csv", index= False)

# Modelo Polinomico

In [28]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Cargar y procesar el dataset de entrenamiento
df = pd.read_csv("./train.csv", index_col="id")
df.drop(columns=['laptop_ID'], inplace=True)

df['Ram'] = df['Ram'].str.replace('GB', '').astype(int)
df['Weight'] = df['Weight'].str.replace('kg', '').astype(float)

# Aplicar funciones de limpieza y transformación al dataset de entrenamiento
df = calcular_memoria(df)
df = clasificar_cpu(df)
df = rank_cpu(df)
df = clasificar_gpu(df)
df = rank_gpu(df)

# Definir variables para el modelo
variables_independientes = ["Weight", "Ram", "Media_Precio_Gpu", "Media_Price_Cpu", "TotalMemory"]
variable_dependiente = "Price_euros"

# Crear términos polinómicos de segundo grado
poly = PolynomialFeatures(degree=2, include_bias=False)
X_poly = poly.fit_transform(df[variables_independientes])

# Separar los datos en conjuntos de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X_poly, df[variable_dependiente], test_size=0.2, random_state=35)

# Entrenar el modelo de regresión lineal con los términos polinómicos
modelo_polinomico = LinearRegression()
modelo_polinomico.fit(X_train, y_train)

# Predecir sobre el conjunto de prueba y calcular el error cuadrático medio
predicciones_train = modelo_polinomico.predict(X_train)
predicciones_test = modelo_polinomico.predict(X_test)
error_train = mean_squared_error(y_train, predicciones_train, squared=False)
error_test = mean_squared_error(y_test, predicciones_test, squared=False)

print(f"RMSE en entrenamiento: {error_train}")
print(f"RMSE en prueba: {error_test}")

# Procesar el dataset de prueba para predecir nuevos valores
df_test = pd.read_csv("test.csv")
df_test.drop(columns=['laptop_ID'], inplace=True)

df_test['Ram'] = df_test['Ram'].str.replace('GB', '').astype(int)
df_test['Weight'] = df_test['Weight'].str.replace('kg', '').astype(float)

# Aplicar funciones de limpieza y transformación al dataset de prueba
df_test = calcular_memoria(df_test)
df_test = clasificar_cpu(df_test)
df_test = clasificar_gpu(df_test)

# Asignar medianas y rankings usando los diccionarios del conjunto de entrenamiento
cpu_price_dict = dict(zip(df['Cpu_Categoria'], df['Media_Price_Cpu']))
cpu_rank_dict = dict(zip(df['Cpu_Categoria'], df['Cpu_Rank']))
gpu_price_dict = dict(zip(df['Gpu_Categoria'], df['Media_Precio_Gpu']))
gpu_rank_dict = dict(zip(df['Gpu_Categoria'], df['Gpu_Rank']))

df_test['Media_Price_Cpu'] = df_test['Cpu_Categoria'].map(cpu_price_dict)
df_test['Cpu_Rank'] = df_test['Cpu_Categoria'].map(cpu_rank_dict)
df_test['Media_Precio_Gpu'] = df_test['Gpu_Categoria'].map(gpu_price_dict)
df_test['Gpu_Rank'] = df_test['Gpu_Categoria'].map(gpu_rank_dict)

# Rellenar cualquier NaN en 'Media_Precio_Gpu' con la mediana
if df_test['Media_Precio_Gpu'].isna().sum() > 0:
    mediana_precio_gpu = df['Media_Precio_Gpu'].median()
    df_test['Media_Precio_Gpu'].fillna(mediana_precio_gpu, inplace=True)

# Generar términos polinómicos para el conjunto de prueba
X_test_final_poly = poly.transform(df_test[variables_independientes])

# Realizar predicciones en el conjunto de prueba
predicciones_test = modelo_polinomico.predict(X_test_final_poly)

# Crear archivo de resultados con las predicciones
resultados = pd.DataFrame({
    'id': df_test["id"],
    'Price_euros': predicciones_test
})

# Guardar el archivo en formato CSV para Kaggle
resultados.to_csv("Resultados/Prueba_5_polinomial.csv", index=False)


RMSE en entrenamiento: 383.24022199737436
RMSE en prueba: 445.1189939988661


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_test['Media_Precio_Gpu'].fillna(mediana_precio_gpu, inplace=True)
