In [2]:
import pandas as pd

import urllib.request
from PIL import Image

In [3]:
train = pd.read_csv('data/train.csv')

In [4]:
train.head(5)

Unnamed: 0,laptop_ID,Company,Product,TypeName,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price_euros
0,1104,Acer,Aspire ES1-523,Notebook,15.6,1366x768,AMD A8-Series 7410 2.2GHz,4GB,500GB HDD,AMD Radeon R5,Windows 10,2.4kg,387.0
1,114,Dell,XPS 13,Ultrabook,13.3,Quad HD+ / Touchscreen 3200x1800,Intel Core i7 7560U 2.4GHz,8GB,256GB SSD,Intel Iris Plus Graphics 640,Windows 10,1.23kg,1379.0
2,172,Acer,Aspire A517-51G,Notebook,17.3,IPS Panel Full HD 1920x1080,Intel Core i5 8250U 1.6GHz,8GB,256GB SSD,Nvidia GeForce MX150,Windows 10,3kg,854.0
3,918,HP,Elitebook 820,Netbook,12.5,Full HD 1920x1080,Intel Core i7 7500U 2.7GHz,8GB,512GB SSD,Intel HD Graphics 620,Windows 10,1.26kg,1483.0
4,447,Lenovo,IdeaPad 320-15AST,Notebook,17.3,1600x900,AMD A6-Series 9220 2.5GHz,8GB,1TB HDD,AMD Radeon R4,Windows 10,2.8kg,519.0


In [5]:
train.shape

(912, 13)

In [6]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 912 entries, 0 to 911
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   laptop_ID         912 non-null    int64  
 1   Company           912 non-null    object 
 2   Product           912 non-null    object 
 3   TypeName          912 non-null    object 
 4   Inches            912 non-null    float64
 5   ScreenResolution  912 non-null    object 
 6   Cpu               912 non-null    object 
 7   Ram               912 non-null    object 
 8   Memory            912 non-null    object 
 9   Gpu               912 non-null    object 
 10  OpSys             912 non-null    object 
 11  Weight            912 non-null    object 
 12  Price_euros       912 non-null    float64
dtypes: float64(2), int64(1), object(10)
memory usage: 92.8+ KB


## Data processing

In [7]:
train.columns

Index(['laptop_ID', 'Company', 'Product', 'TypeName', 'Inches',
       'ScreenResolution', 'Cpu', 'Ram', 'Memory', 'Gpu', 'OpSys', 'Weight',
       'Price_euros'],
      dtype='object')

In [86]:
# Funciones

def apply_onehot_encoder(train:pd.DataFrame, columns_to_encode:list, test:pd.DataFrame=None):
    
    # Resetear índices para evitar desalineación
    train = train.reset_index(drop=True)
    
    # Crear el OneHotEncoder
    encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')

    # Ajustar y transformar las columnas seleccionadas
    transformed_data = encoder.fit_transform(train[columns_to_encode])

    # Crear un DataFrame con las columnas transformadas
    transformed_df = pd.DataFrame(transformed_data, columns=encoder.get_feature_names_out(columns_to_encode))
    
    # Concatenar con el DataFrame original excluyendo las columnas transformadas
    df_concatenated = pd.concat([train.drop(columns_to_encode, axis=1), transformed_df], axis=1)

    # Si se proporciona un segundo DataFrame, aplicar la misma transformación
    if test is not None:
        transformed_data_to_transform = encoder.transform(test[columns_to_encode])
        transformed_df_to_transform = pd.DataFrame(transformed_data_to_transform, columns=encoder.get_feature_names_out(columns_to_encode))
        df_to_transform_concatenated = pd.concat([test.drop(columns_to_encode, axis=1), transformed_df_to_transform], axis=1)
        return df_concatenated, df_to_transform_concatenated

    return df_concatenated

def data_report(df):
    '''Esta funcion describe los campos de un dataframe de pandas de forma bastante clara, crack'''
    # Sacamos los NOMBRES
    cols = pd.DataFrame(df.columns.values, columns=["COL_N"])

    # Sacamos los TIPOS
    types = pd.DataFrame(df.dtypes.values, columns=["DATA_TYPE"])

    # Sacamos los MISSINGS
    percent_missing = round(df.isnull().sum() * 100 / len(df), 2)
    percent_missing_df = pd.DataFrame(percent_missing.values, columns=["MISSINGS (%)"])

    # Sacamos los VALORES UNICOS
    unicos = pd.DataFrame(df.nunique().values, columns=["UNIQUE_VALUES"])
    
    percent_cardin = round(unicos['UNIQUE_VALUES']*100/len(df), 2)
    percent_cardin_df = pd.DataFrame(percent_cardin.values, columns=["CARDIN (%)"])

    concatenado = pd.concat([cols, types, percent_missing_df, unicos, percent_cardin_df], axis=1, sort=False)
    concatenado.set_index('COL_N', drop=True, inplace=True)


    return concatenado.T

import re
def convertir_a_GB(expression):
    '''
Convierte un numero a GB a partir de una columna que nos entrega un STR compuesto por un valor numerico y una unidad en GB o TB. 

'''
    if pd.isna(expression):
        return None
    match = re.search(r'(\d+(?:\.\d*)?)\s*([GBT]+)', expression)
    if match:
        num, unid = match.groups()
        num = float(num)
        if unid == 'TB':
            return num * 1024
        
        elif unid == 'GB':
            return num
    return None


def chequeator(df_to_submit):
    """
    Esta función se asegura de que tu submission tenga la forma requerida por Kaggle.
    
    Si es así, se guardará el dataframe en un `csv` y estará listo para subir a Kaggle.
    
    Si no, LEE EL MENSAJE Y HAZLE CASO.
    
    Si aún no:
    - apaga tu ordenador, 
    - date una vuelta, 
    - enciendelo otra vez, 
    - abre este notebook y 
    - leelo todo de nuevo. 
    Todos nos merecemos una segunda oportunidad. También tú.
    """
    if df_to_submit.shape == sample.shape:
        if df_to_submit.columns.all() == sample.columns.all():
            if df_to_submit.laptop_ID.all() == sample.laptop_ID.all():
                print("You're ready to submit!")
                submission.to_csv("submission.csv", index = False) #muy importante el index = False
                urllib.request.urlretrieve("https://i.kym-cdn.com/photos/images/facebook/000/747/556/27a.jpg", "gfg.png")     
                img = Image.open("gfg.png")
                img.show()   
            else:
                print("Check the ids and try again")
        else:
            print("Check the names of the columns and try again")
    else:
        print("Check the number of rows and/or columns and try again")
        print("\nMensaje secreto de Clara: No me puedo creer que después de todo este notebook hayas hecho algún cambio en las filas de `diamonds_test.csv`. Lloro.")


In [9]:
data_report(train)

COL_N,laptop_ID,Company,Product,TypeName,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price_euros
DATA_TYPE,int64,object,object,object,float64,object,object,object,object,object,object,object,float64
MISSINGS (%),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
UNIQUE_VALUES,912,19,467,6,17,34,96,8,32,94,9,166,603
CARDIN (%),100.0,2.08,51.21,0.66,1.86,3.73,10.53,0.88,3.51,10.31,0.99,18.2,66.12


### Variables Ram y Weight

Realizamos variacion en variables Ram y Weight para quitar las unidades y convertirlas a variables numericas, eliminamos las variables originales

In [10]:
train['Ram_1'] = train ['Ram'].map (lambda x: x.rstrip('GB')).astype(int)
train['Weight_1'] = train ['Weight'].map (lambda x: x.rstrip('kg')).astype(float)
train.drop(columns=['Ram', 'Weight'], inplace=True)

### Variable Company

In [11]:
train['Company'].unique()

array(['Acer', 'Dell', 'HP', 'Lenovo', 'Asus', 'Xiaomi', 'Apple',
       'Samsung', 'Razer', 'Toshiba', 'Fujitsu', 'Microsoft', 'MSI', 'LG',
       'Mediacom', 'Google', 'Vero', 'Huawei', 'Chuwi'], dtype=object)

### Variable Product

Esta variable se deja como esta y no se incluye en el analisis.

In [12]:
train['Product'].unique()

array(['Aspire ES1-523', 'XPS 13', 'Aspire A517-51G', 'Elitebook 820',
       'IdeaPad 320-15AST', 'IdeaPad 500-15ISK', 'IdeaPad 520S-14IKB',
       'Pavilion 14-BK001nv', 'Omen 17-w212nv', 'Rog GL553VE-FY052T',
       'Latitude 3180', 'Yoga 720-13IKB', 'IdeaPad 320-15IKBN',
       'Mi Notebook', 'EliteBook 850', 'Aspire ES1-531', 'MacBook 12"',
       'Latitude E5570', 'XPS 15', 'Alienware 17', 'Inspiron 7567',
       'B51-80 (i7-6500U/8GB/1008GB/Radeon', 'Spectre 13-V111dx',
       'Inspiron 3567', 'Spectre Pro', 'Latitude 7480', 'ProBook 440',
       'Swift 3', 'Latitude 5480',
       'X556UJ-XO044T (i7-6500U/4GB/500GB/GeForce', 'EliteBook x360',
       'Notebook Odyssey', 'Precision 3520', 'ThinkPad L470',
       'Probook 450', 'Aspire 5', 'Rog GL753VE-GC070T', 'Blade Pro',
       'Chromebook 14', 'Chromebook Flip', 'Tecra Z50-C-144',
       'Insprion 5767', 'Macbook Air', '250 G6',
       'E402WA-GA007T (E2-6110/4GB/64GB/W10', 'VivoBook S14',
       'Latitude 5590', 'Yoga 900-13IS

### Variable TypeName

Se realizará el proceso de OHE para esta variable

In [13]:
train['TypeName'].value_counts()

TypeName
Notebook              515
Ultrabook             141
Gaming                134
2 in 1 Convertible     83
Workstation            20
Netbook                19
Name: count, dtype: int64

### Variable Inches

Variable numerica, se deja como está para incluirla en el analisis

In [14]:
train['Inches'].value_counts()

Inches
15.6    454
14.0    151
13.3    116
17.3    110
12.5     29
11.6     22
12.0      6
13.5      6
13.9      4
15.4      3
12.3      3
15.0      2
10.1      2
18.4      1
14.1      1
17.0      1
13.0      1
Name: count, dtype: int64

### Variable ScreenResulution 

Se crearon dos variables, una llamada Resolution que aloja los valores de resolucion y otra llamada Screen que contiene la descripcion de la pantalla. Finalmente se elimina la variable ScreenResolution

In [15]:
train['ScreenResolution'].unique()

array(['1366x768', 'Quad HD+ / Touchscreen 3200x1800',
       'IPS Panel Full HD 1920x1080', 'Full HD 1920x1080', '1600x900',
       'IPS Panel Full HD / Touchscreen 1920x1080',
       'IPS Panel Retina Display 2304x1440',
       '4K Ultra HD / Touchscreen 3840x2160', 'Touchscreen 2560x1440',
       'Full HD / Touchscreen 1920x1080', '1440x900',
       'IPS Panel Quad HD+ / Touchscreen 3200x1800',
       'Touchscreen 2256x1504',
       'IPS Panel 4K Ultra HD / Touchscreen 3840x2160',
       'IPS Panel Touchscreen / 4K Ultra HD 3840x2160',
       'IPS Panel 1366x768', '4K Ultra HD 3840x2160',
       'IPS Panel 4K Ultra HD 3840x2160',
       'IPS Panel Retina Display 2880x1800',
       'IPS Panel Touchscreen 2560x1440', 'Touchscreen 1366x768',
       '2560x1440', 'IPS Panel Retina Display 2560x1600',
       'Quad HD+ 3200x1800', 'IPS Panel Full HD 2560x1440', '1920x1080',
       'Touchscreen 2400x1600', 'IPS Panel 2560x1440',
       'IPS Panel Quad HD+ 3200x1800', 'IPS Panel Touchscreen 

In [16]:
train['Resolution'] = train['ScreenResolution'].str.extract(r'(\d+x\d+)$', expand=True) 
train['Screen'] = train['ScreenResolution'].str.extract(r'^(.*?)(?=\d+x\d+$)', expand=True)
train.drop(columns=['ScreenResolution'], inplace=True)

### Variable Cpu

Se genera una nueva variable llamada cpu_GHz que contiene las velocidades en GHz de los procesadores. Se crea otra vaariable que guarda el nombre de la marca, esta se llama cpu_Marca. Finalemnte eliminamos la columna Cpu

In [17]:
train['Cpu'].unique()

array(['AMD A8-Series 7410 2.2GHz', 'Intel Core i7 7560U 2.4GHz',
       'Intel Core i5 8250U 1.6GHz', 'Intel Core i7 7500U 2.7GHz',
       'AMD A6-Series 9220 2.5GHz', 'Intel Core i7 6500U 2.5GHz',
       'Intel Core i3 7130U 2.7GHz', 'Intel Core i5 7200U 2.5GHz',
       'Intel Core i7 7700HQ 2.8GHz',
       'Intel Pentium Quad Core N4200 1.1GHz',
       'Intel Core i5 6300U 2.4GHz',
       'Intel Celeron Dual Core N3060 1.6GHz', 'Intel Core M 1.1GHz',
       'Intel Core i5 7300HQ 2.5GHz', 'Intel Core i3 6006U 2GHz',
       'Intel Core i7 6600U 2.6GHz', 'Intel Core i3 7100U 2.4GHz',
       'Intel Core i5 7440HQ 2.8GHz', 'Intel Core i7 7600U 2.8GHz',
       'Intel Core i7 6820HQ 2.7GHz', 'Intel Core i5 6200U 2.3GHz',
       'AMD A6-Series 9220 2.9GHz', 'Intel Core M M3-6Y30 0.9GHz',
       'Intel Core i3 6006U 2.0GHz', 'Intel Core i5 1.8GHz',
       'AMD E-Series 6110 1.5GHz', 'Intel Core i7 7Y75 1.3GHz',
       'Intel Core i7 8650U 1.9GHz', 'Intel Core i7 6560U 2.2GHz',
       'Intel 

In [18]:
train['cpu_GHz'] = train['Cpu'].str.extract(r'(\d+(?:\.\d+)?)GHz', expand=True).astype(float) 
train['cpu_Marca'] = train['Cpu'].str.extract(r'([a-zA-Z]+)(?![a-zA-Z0-9])', expand=True)
train.drop(columns=['Cpu'], inplace=True)

In [19]:
train['cpu_Marca'].value_counts()

cpu_Marca
Intel    871
AMD       41
Name: count, dtype: int64

### Variable Memory

Se generaron 3 variables Capacidad_1_GB y Capacidad_2_GB guardan las capacidades de los discos duros del equipo y una tercera llamada Tipo_Almacenamiento que contiene el tipo de almacenamiento del primer disco duro.
se elimino la cariable Memory

In [20]:
train['Memory'].unique()

array(['500GB HDD', '256GB SSD', '512GB SSD', '1TB HDD', '1.0TB Hybrid',
       '128GB SSD +  1TB HDD', '256GB SSD +  1TB HDD', '128GB SSD',
       '256GB SSD +  500GB HDD', '256GB Flash Storage',
       '512GB SSD +  1TB HDD', '32GB Flash Storage', '64GB Flash Storage',
       '64GB SSD', '512GB SSD +  256GB SSD', '1TB SSD', '180GB SSD',
       '128GB SSD +  2TB HDD', '128GB Flash Storage', '2TB HDD',
       '1TB HDD +  1TB HDD', '32GB SSD', '16GB Flash Storage',
       '256GB SSD +  256GB SSD', '512GB SSD +  2TB HDD', '16GB SSD',
       '256GB SSD +  2TB HDD', '512GB Flash Storage', '508GB Hybrid',
       '64GB Flash Storage +  1TB HDD', '512GB SSD +  512GB SSD',
       '1TB SSD +  1TB HDD'], dtype=object)

In [21]:
train['Capacidad_1'] = train['Memory'].str.extract(r'(\d+\.*\d*[GT]B)', expand=True)
train['Capacidad_1_GB']= train ['Capacidad_1'].apply(lambda x: convertir_a_GB(x) )
train['Tipo_Almacenamiento_1'] = train['Memory'].str.extract(r'\b(SSD|HDD|Flash Storage|Hybrid)\b', expand=False)
train['Capacidad_2'] = train['Memory'].str.extract(r'\+(\s*\d+(?:\.\d*)?\s*[GBT]+)', expand=True)
train['Capacidad_2_GB']= train ['Capacidad_2'].apply(lambda x: convertir_a_GB(x) )

train.drop(columns=['Memory'], inplace=True)

### Variable Gpu

Se genera una unica variable con el fabricante de las tarjetas, se elimina la variable Gpu

In [22]:
train['Fabricante_Gpu'] = train['Gpu'].str.extract(r'([a-zA-Z]+)', expand=True)
train.drop(columns=['Gpu'], inplace=True)

In [23]:
train['Fabricante_Gpu'].value_counts()

Fabricante_Gpu
Intel     516
Nvidia    271
AMD       125
Name: count, dtype: int64

### Variable OpSys

Se deja igual para hacer OHE a los 9 unicos valores

In [24]:
train['OpSys'].nunique()

9

In [31]:
train.columns

Index(['laptop_ID', 'Company', 'Product', 'TypeName', 'Inches', 'OpSys',
       'Price_euros', 'Ram_1', 'Weight_1', 'Resolution', 'Screen', 'cpu_GHz',
       'cpu_Marca', 'Capacidad_1', 'Capacidad_1_GB', 'Tipo_Almacenamiento_1',
       'Capacidad_2', 'Capacidad_2_GB', 'Fabricante_Gpu'],
      dtype='object')

In [32]:
#train.drop(columns=['Resolucition', 'Capacidad_1', 'Capacidad_2', 'Tipo_Almacenamiento'], inplace=True)
train.drop(columns=['Capacidad_1', 'Capacidad_2'], inplace=True)

-------------------------------------------------------------------------------------------------------------------------------------------

In [33]:
data_report(train)

COL_N,laptop_ID,Company,Product,TypeName,Inches,OpSys,Price_euros,Ram_1,Weight_1,Resolution,Screen,cpu_GHz,cpu_Marca,Capacidad_1_GB,Tipo_Almacenamiento_1,Capacidad_2_GB,Fabricante_Gpu
DATA_TYPE,int64,object,object,object,float64,object,float64,int32,float64,object,object,float64,object,float64,object,float64,object
MISSINGS (%),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,84.76,0.0
UNIQUE_VALUES,912,19,467,6,17,9,603,8,160,14,19,23,2,11,4,5,3
CARDIN (%),100.0,2.08,51.21,0.66,1.86,0.99,66.12,0.88,17.54,1.54,2.08,2.52,0.22,1.21,0.44,0.55,0.33


Agrupamos las variables numericas y categoricas por separado

In [35]:
train_cat = ['Company', 'TypeName', 'OpSys', 'Screen', 'Resolution', 'cpu_Marca',
              'Tipo_Almacenamiento_1', 'Fabricante_Gpu']
train_num = ['laptop_ID', 'Inches', 'Price_euros', 'Ram_1',	'Weight_1']

Aplicamos la funcion de OHE a las varaibles categoricas

In [36]:
from sklearn.preprocessing import OneHotEncoder
train_ohe = apply_onehot_encoder(train=train, columns_to_encode=train_cat)



In [37]:
train_ohe.shape

(912, 85)

In [38]:
train_ohe.columns

Index(['laptop_ID', 'Product', 'Inches', 'Price_euros', 'Ram_1', 'Weight_1',
       'cpu_GHz', 'Capacidad_1_GB', 'Capacidad_2_GB', 'Company_Acer',
       'Company_Apple', 'Company_Asus', 'Company_Chuwi', 'Company_Dell',
       'Company_Fujitsu', 'Company_Google', 'Company_HP', 'Company_Huawei',
       'Company_LG', 'Company_Lenovo', 'Company_MSI', 'Company_Mediacom',
       'Company_Microsoft', 'Company_Razer', 'Company_Samsung',
       'Company_Toshiba', 'Company_Vero', 'Company_Xiaomi',
       'TypeName_2 in 1 Convertible', 'TypeName_Gaming', 'TypeName_Netbook',
       'TypeName_Notebook', 'TypeName_Ultrabook', 'TypeName_Workstation',
       'OpSys_Android', 'OpSys_Chrome OS', 'OpSys_Linux', 'OpSys_Mac OS X',
       'OpSys_No OS', 'OpSys_Windows 10', 'OpSys_Windows 10 S',
       'OpSys_Windows 7', 'OpSys_macOS', 'Screen_', 'Screen_4K Ultra HD ',
       'Screen_4K Ultra HD / Touchscreen ', 'Screen_Full HD ',
       'Screen_Full HD / Touchscreen ', 'Screen_IPS Panel ',
       'Screen_I

In [39]:
train_ohe['Capacidad_2_GB'] = train_ohe['Capacidad_2_GB'].fillna(0)

definimos X e Y

In [40]:
x = train_ohe.drop(['Product', 'Price_euros'], axis=1)
y = train_ohe['Price_euros'].copy()
y.shape

(912,)

Dividir X_train, X_test, y_train, y_test

In [41]:
from sklearn.model_selection import train_test_split

In [42]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.20, random_state = 42)

Asignamos el modelo 

In [43]:
from sklearn.linear_model import LinearRegression

model = LinearRegression()

In [44]:
model.fit(x,y)

In [45]:
pred = model.predict(X_test)
pred

array([1110.13065297, -174.94892112,  772.51049195, 1480.55762637,
       2003.48876808,  529.00157221, 1018.12704577, 1692.57407318,
        812.2498657 , 1351.25022983,  892.6885151 ,  451.25567962,
        369.3900077 , 1443.83581726, 1783.92876973, 1691.5125403 ,
       1790.17205204, 1219.10330844, 1019.45430113,  660.97053287,
        891.51634026,  496.30854004, 1065.62444031, 1068.05764855,
       1052.41014373,  454.21965472,  956.36872678, 1844.20968196,
        829.02934821,  838.22657131, 1502.43089085,  986.48716005,
        698.15817123, 1549.02792416, 1178.16834093,  168.46450192,
        612.79506997,  553.0497078 , 1391.96451881, 1048.58366515,
       1306.14809204, 1210.76262763, 1077.90815548, 2755.96391077,
       1084.94218982,  302.55126859, 1168.62115047,  498.68559289,
       1198.12945977,  727.7841916 , 1574.28185499,  785.28393646,
        200.78337618, 2559.07185253, 1259.11449511, 1777.73445455,
       1679.33643429, 1047.3300026 , 2658.0785059 , 1708.90179

Sacamos metricas y evaluamos 

In [46]:
from sklearn.metrics import mean_squared_error
import numpy as np

print('RMSE :',np.sqrt(mean_squared_error(y_test,pred)))

RMSE : 316.72146941213595


#### Ahora trabajamos los datos del archivo TEST

### Procesamiento

In [47]:
X_pred = pd.read_csv("data/test.csv")
X_pred.head()

Unnamed: 0,laptop_ID,Company,Product,TypeName,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,OpSys,Weight
0,750,Toshiba,Tecra X40-D-10G,Notebook,14.0,IPS Panel Full HD / Touchscreen 1920x1080,Intel Core i5 7200U 2.5GHz,8GB,256GB SSD,Intel HD Graphics 620,Windows 10,1.25kg
1,726,Mediacom,Smartbook 142,Notebook,14.0,IPS Panel Full HD 1920x1200,Intel Atom Z8350 1.92GHz,4GB,32GB HDD,Intel HD Graphics 400,Windows 10,1.4kg
2,633,Mediacom,SmartBook 140,Notebook,14.0,Full HD 1920x1080,Intel Atom x5-Z8350 1.44GHz,2GB,32GB Flash Storage,Intel HD Graphics,Windows 10,1.4kg
3,363,Dell,Inspiron 7577,Gaming,15.6,Full HD 1920x1080,Intel Core i5 7300HQ 2.5GHz,8GB,1TB HDD,Nvidia GeForce GTX 1050,Windows 10,2.65kg
4,319,Asus,VivoBook Flip,2 in 1 Convertible,11.6,Touchscreen 1366x768,Intel Celeron Dual Core N3350 1.1GHz,2GB,32GB Flash Storage,Intel HD Graphics 500,Windows 10,1.1kg


### Variables Ram y Weight

In [48]:
X_pred['Ram_1'] = X_pred ['Ram'].map (lambda x: x.rstrip('GB')).astype(int)
X_pred['Weight_1'] = X_pred ['Weight'].map (lambda x: x.rstrip('kg')).astype(float)
X_pred.drop(columns=['Ram', 'Weight'], inplace=True)

### Variable ScreenResulution 

In [49]:
X_pred['Resolution'] = X_pred['ScreenResolution'].str.extract(r'(\d+x\d+)$', expand=True) 
X_pred['Screen'] = X_pred['ScreenResolution'].str.extract(r'^(.*?)(?=\d+x\d+$)', expand=True)
X_pred.drop(columns=['ScreenResolution'], inplace=True)

### Variable Cpu

In [50]:
X_pred['cpu_GHz'] = X_pred['Cpu'].str.extract(r'(\d+(?:\.\d+)?)GHz', expand=True).astype(float) 
X_pred['cpu_Marca'] = X_pred['Cpu'].str.extract(r'([a-zA-Z]+)(?![a-zA-Z0-9])', expand=True)
X_pred.drop(columns=['Cpu'], inplace=True)

### Variable Memory

In [51]:
X_pred['Capacidad_1'] = X_pred['Memory'].str.extract(r'(\d+\.*\d*[GT]B)', expand=True)
X_pred['Capacidad_1_GB']= X_pred ['Capacidad_1'].apply(lambda x: convertir_a_GB(x) )
X_pred['Tipo_Almacenamiento_1'] = X_pred['Memory'].str.extract(r'\b(SSD|HDD|Flash Storage|Hybrid)\b', expand=False)
X_pred['Capacidad_2'] = X_pred['Memory'].str.extract(r'\+(\s*\d+(?:\.\d*)?\s*[GBT]+)', expand=True)
X_pred['Capacidad_2_GB']= X_pred ['Capacidad_2'].apply(lambda x: convertir_a_GB(x) )

X_pred.drop(columns=['Memory'], inplace=True)

### Variable Gpu

In [52]:
X_pred['Fabricante_Gpu'] = X_pred['Gpu'].str.extract(r'([a-zA-Z]+)', expand=True)
X_pred.drop(columns=['Gpu'], inplace=True)

In [54]:
X_pred.drop(columns=['Capacidad_1', 'Capacidad_2'], inplace=True)

In [55]:
data_report(X_pred)

COL_N,laptop_ID,Company,Product,TypeName,Inches,OpSys,Ram_1,Weight_1,Resolution,Screen,cpu_GHz,cpu_Marca,Capacidad_1_GB,Tipo_Almacenamiento_1,Capacidad_2_GB,Fabricante_Gpu
DATA_TYPE,int64,object,object,object,float64,object,int32,float64,object,object,float64,object,float64,object,float64,object
MISSINGS (%),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,82.35,0.0
UNIQUE_VALUES,391,16,256,6,13,8,9,116,12,19,22,3,12,4,2,4
CARDIN (%),100.0,4.09,65.47,1.53,3.32,2.05,2.3,29.67,3.07,4.86,5.63,0.77,3.07,1.02,0.51,1.02


In [56]:
X_pred_cat = ['Company', 'TypeName', 'OpSys', 'Screen', 'Resolution', 'cpu_Marca',
              'Tipo_Almacenamiento_1', 'Fabricante_Gpu']

In [64]:
train_one, test_one = apply_onehot_encoder(train=train, columns_to_encode=train_cat, test=X_pred)



In [69]:
train_one.shape

(912, 84)

In [68]:
train_one=train_one.drop(columns='Price_euros')

In [66]:
test_one.shape

(391, 84)

In [67]:
columnas_df1 = set(train_one.columns)
columnas_df2 = set(test_one.columns)

# Comparar las columnas
columnas_comunes = columnas_df1.intersection(columnas_df2)
columnas_solo_df1 = columnas_df1 - columnas_comunes
columnas_solo_df2 = columnas_df2 - columnas_comunes

# Imprimir resultados
print(f"Columnas comunes: {columnas_comunes}")
print(f"Columnas solo en df1: {columnas_solo_df1}")
print(f"Columnas solo en df2: {columnas_solo_df2}")

Columnas comunes: {'Resolution_2560x1600', 'Tipo_Almacenamiento_1_Hybrid', 'TypeName_Notebook', 'OpSys_Mac OS X', 'Company_Fujitsu', 'TypeName_Workstation', 'Tipo_Almacenamiento_1_Flash Storage', 'Company_Mediacom', 'Weight_1', 'Company_Toshiba', 'Screen_Full HD / Touchscreen ', 'Screen_IPS Panel 4K Ultra HD ', 'Screen_IPS Panel Touchscreen / 4K Ultra HD ', 'OpSys_Windows 7', 'Company_Chuwi', 'Company_Lenovo', 'Screen_IPS Panel Full HD / Touchscreen ', 'Company_Vero', 'Screen_IPS Panel Quad HD+ / Touchscreen ', 'cpu_Marca_Intel', 'Fabricante_Gpu_AMD', 'Capacidad_2_GB', 'Resolution_2880x1800', 'Resolution_2160x1440', 'Resolution_1920x1080', 'Screen_Full HD ', 'Screen_Touchscreen ', 'OpSys_Android', 'Screen_IPS Panel 4K Ultra HD / Touchscreen ', 'Screen_IPS Panel Full HD ', 'Screen_IPS Panel ', 'Resolution_1920x1200', 'Fabricante_Gpu_Intel', 'Company_Xiaomi', 'Inches', 'cpu_GHz', 'Resolution_3840x2160', 'Company_MSI', 'Resolution_1440x900', 'Company_Acer', 'Screen_Quad HD+ / Touchscreen 

In [74]:
train_one=train_one.drop(columns='Product')

In [75]:
test_one=test_one.drop(columns='Product')

In [78]:
test_one['Capacidad_2_GB'] = test_one['Capacidad_2_GB'].fillna(0)

In [79]:
predictions_submit = model.predict(test_one)
predictions_submit

array([1101.45580923,  259.15924065,   96.54927757, 1013.57879724,
        338.88955845,  607.66222315,  467.20555268, 1741.7681606 ,
       1740.12811989, 1306.30355823,  905.77983403,  981.95188535,
       1158.24560946,  957.08830066,  807.00887095, 1168.26939393,
       1151.57061927, 2606.0673267 ,  432.42244722,  775.28905057,
       1412.61653583,  867.56626201,  549.64578981, 1095.23299431,
        442.36524295, 1606.20872807,  719.46609575,  612.07900896,
        474.92407335, 1286.02289497,  917.88903432, 1099.48169986,
       1032.27912759, 1034.4197896 , 2094.21617745, 1195.67095761,
       1094.24351142, 1468.15800323,  768.0305716 , 1920.27735148,
       1887.04670193, 1167.14534429,  332.39400682, 1749.47527672,
        955.14076709, 2016.25473472,  724.100577  , 1702.7892884 ,
       1233.80714006, 1854.98716988, 1132.12669662,  382.30233501,
        793.17313208, 1494.0979789 ,  399.31249612,  987.81994824,
        968.42144882,  292.56048705, 1226.91411459, 1271.67626

In [80]:
sample = pd.read_csv("data/sample_submission.csv")

In [81]:
sample.head()

Unnamed: 0,laptop_ID,Price_euros
0,750,500
1,726,500
2,633,500
3,363,650
4,319,650


In [82]:
sample.shape

(391, 2)

In [83]:
submission = pd.DataFrame({"laptop_ID": sample['laptop_ID'], "Price_euros": predictions_submit})

In [84]:
submission.head()

Unnamed: 0,laptop_ID,Price_euros
0,750,1101.455809
1,726,259.159241
2,633,96.549278
3,363,1013.578797
4,319,338.889558


In [85]:
submission.shape

(391, 2)

In [87]:
chequeator(submission)

You're ready to submit!
