## Process data

In [183]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import regex as re
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.decomposition import PCA
#import pygwalker as pyg

data = pd.read_csv("dai_combined.csv")
data.head()

Unnamed: 0,brand,price,old,new,cpu,cpu_brand,ram_capacity,ram_brand,hard_drive_type,hard_drive_capacity,card,card_brand,screen_size,screen_type
0,Asus,26990000.0,0,1,Intel Core Ultra 5 125H,Intel,16.0,DDR5,SSD,512GB,Intel Arc Graphics,Intel,14.0,OLED
1,Lenovo,16390000.0,0,1,AMD Ryzen 77730U,AMD,16.0,DDR4,SSD,512GB,AMD Radeon Graphics,AMD,15.6,HD
2,ASUS,28990000.0,0,1,Intel Core Ultra 5 125H,Intel,16.0,DDR5,SSD,512GB,Intel Arc Graphics,Intel,14.0,OLED
3,LG,36290000.0,0,1,Intel Core Ultra 5 125H,Intel,16.0,DDR5,SSD,512GB,Intel Arc graphics,Intel,14.0,IPS
4,Dell,18990000.0,0,1,Intel Core 5 processor 120U,Intel,8.0,DDR5,SSD,512GB,Intel Graphics,Intel,14.0,HD


In [184]:
data['card'].unique()

array(['Intel Arc Graphics', 'AMD Radeon Graphics', 'Intel Arc graphics',
       'Intel Graphics', 'AMD Radeon 890M Graphics',
       'Intel Iris Xᵉ Graphics', 'NVIDIA GeForce RTX 3050 6GB GDDR6',
       'Intel Iris Xe Graphics', 'NVIDIA GeForce RTX 4050 6GB GDDR6',
       'Intel UHD Graphics', 'Intel Arc Graphics 140V',
       'Integrated Intel Graphics', 'Intel Arc 140V GPU',
       'Intel Arc 130V GPU', 'Intel Arc Graphics 130V',
       'NVIDIA RTX 500 Ada Generation 4GB GDDR6',
       'AMD Radeon 760M Graphics', 'NVIDIA GeForce RTX 3050 4GB GDDR6',
       'AMD 780M Graphics', 'AMD Radeon 880M Graphics',
       'NVIDIA RTX A500 4GB GDDR6', 'Integrated Qualcomm Adreno GPU',
       'NVIDIA Geforce RTX 3050 6GB GDDR6', 'Intel Arc 140V Graphics',
       'NVIDIA RTX A1000 6GB GDDR6',
       'NVIDIA RTX 1000 Ada Generation 6GB GDDR6', 'Qualcomm Adreno GPU',
       'AMD Radeon graphics', 'NVIDIA GeForce MX570 2GB GDDR6',
       'NVIDIA GeForce RTX 4060 8GB GDDR6',
       'NVIDIA GeForce RT

In [185]:
ssd_counts = data['card'].value_counts()
print(ssd_counts)

card
Intel Iris Xe Graphics                      185
Intel UHD Graphics                          123
Intel Arc Graphics                          105
Intel Graphics                              102
NVIDIA GeForce RTX 4060 8GB GDDR6            76
                                           ... 
Card rời- NVIDIA GeForce RTX A500, 4 GB       1
Card rời- NVIDIA GeForce RTX 4090, 16 GB      1
AMD Radeon RX7600S 8GB GDDR6                  1
NVIDIA GeForce RTX  4060 8GB GDDR6            1
NVIDIA GeForce RTX 3050Ti 4GB GDDR6           1
Name: count, Length: 102, dtype: int64


In [186]:
brand_counts = train['brand'].value_counts()
print(brand_counts)

brand
Laptop Lenovo      348
Laptop HP          199
Laptop Dell        174
Laptop Asus        147
Laptop Acer        141
Laptop MSI          77
HP                  65
Laptop ASUS         41
Laptop MacBook      22
Lenovo              22
Dell                19
Laptop LG           13
Acer                12
Laptop Gigabyte      9
MSI                  5
Gigabyte             5
LG                   3
Laptop Samsung       2
Máy                  2
Laptop Creator       2
Laptop               1
Laptop Stealth       1
Laptop Gaming        1
Laptop Aspire        1
Name: count, dtype: int64


In [187]:
data['hard_drive_capacity'] = data['hard_drive_capacity'].str.replace(' ', '', regex=False)

In [188]:
cat_cols = ['brand', 'cpu', 'cpu_brand', 'ram_capacity',
       'ram_brand', 'hard_drive_type', 'hard_drive_capacity', 'card',
       'card_brand', 'screen_size', 'screen_type']

In [189]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1311 entries, 0 to 1310
Data columns (total 14 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   brand                1311 non-null   object 
 1   price                1250 non-null   float64
 2   old                  1311 non-null   int64  
 3   new                  1311 non-null   int64  
 4   cpu                  1311 non-null   object 
 5   cpu_brand            1310 non-null   object 
 6   ram_capacity         1310 non-null   float64
 7   ram_brand            1251 non-null   object 
 8   hard_drive_type      1297 non-null   object 
 9   hard_drive_capacity  1301 non-null   object 
 10  card                 1311 non-null   object 
 11  card_brand           1311 non-null   object 
 12  screen_size          1284 non-null   float64
 13  screen_type          1265 non-null   object 
dtypes: float64(3), int64(2), object(9)
memory usage: 143.5+ KB


In [190]:
data[cat_cols] = data[cat_cols].fillna('missing')
data.head()
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1311 entries, 0 to 1310
Data columns (total 14 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   brand                1311 non-null   object 
 1   price                1250 non-null   float64
 2   old                  1311 non-null   int64  
 3   new                  1311 non-null   int64  
 4   cpu                  1311 non-null   object 
 5   cpu_brand            1311 non-null   object 
 6   ram_capacity         1311 non-null   object 
 7   ram_brand            1311 non-null   object 
 8   hard_drive_type      1311 non-null   object 
 9   hard_drive_capacity  1311 non-null   object 
 10  card                 1311 non-null   object 
 11  card_brand           1311 non-null   object 
 12  screen_size          1311 non-null   object 
 13  screen_type          1311 non-null   object 
dtypes: float64(1), int64(2), object(11)
memory usage: 143.5+ KB


In [192]:
en = LabelEncoder()

for cols in cat_cols:
    data[cols] = en.fit_transform(data[cols])

print('Dataframe encoded by Label encoding dimension : ', encoded_data.shape)

TypeError: Encoders require their input argument must be uniformly strings or numbers. Got ['float', 'str']