In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
data_path = '../data/'
lap_data = 'Laptop-partially-cleaned.csv'
pd1 = pd.read_csv(data_path + lap_data, dtype=str)

In [5]:
# Process laptop data

pd_out = pd1.copy()

In [6]:
# The CPU and GPU will be processed later with cpu.csv and gpu.csv

# Display: including size, resolution, refresh rate, panel type, refresh sync
display = pd1['Display'].str.split(',', expand=True).to_numpy()
for row in display:
    if row[3] is None:
        if row[2] is not None and not((row[2].find('Hz') != -1 or row[2].isnumeric())):
            row[3] = row[2]
            row[2] = None
display = pd.DataFrame(display)
# dealing with column 0, minor error in some collumns, convert them all to float
display0 = display[0].str.extract(r'(\d+\.?\d+)', expand=True).astype(float)
display0.rename(columns={0: 'Display Size'}, inplace=True)
# dealing with column 1, since resolution quality approximates its length*height, convert them all to float
display1 = display[1].str.extract(r'(\d+)\s?x\s?(\d+)', expand=True).astype(float)
display1 = display1[0] * display1[1]
display1 = display1.rename('Resolution')
# dealing with column 2, extract refresh rate and convert them all to float
display2 = display[2].str.extract(r'(\d+)', expand=True).astype(float)
display2.rename(columns={0: 'Refresh Rate'}, inplace=True)
# fill NaN with 60, since 90 is the min in the known data, I assmue 60 is the missing since it's usually ignore in information sites
display2 = display2.fillna(60)
# dealing with column 3, leave alone
display3 = display[3]
# gather them back
display = pd.concat([display0, display1, display2, display3], axis=1)
display.rename(columns={3: 'Panel Type'}, inplace=True)

display

Unnamed: 0,Display Size,Resolution,Refresh Rate,Panel Type
0,17.3,3686400.0,240.0,IPS + G-Sync
1,16.0,4096000.0,240.0,IPS + FreeSync
2,16.0,4096000.0,240.0,IPS + FreeSync
3,16.0,4096000.0,240.0,IPS + FreeSync
4,17.3,3686400.0,240.0,IPS + G-Sync
...,...,...,...,...
289166,17.3,2073600.0,60.0,IPS
289167,17.3,1440000.0,60.0,TN
289168,17.3,1440000.0,60.0,TN
289169,17.3,1440000.0,60.0,TN


In [7]:
import re

# HDD/SDD: including capacity, type
disk = pd1['HDD/SSD'].str.split('+' ,expand=True)[[0, 1]]
part_1 = disk[0].astype(str)
part_2 = disk[1].astype(str)
result = np.zeros((len(part_1), 5))

def extract(lst):
    if lst.find('tb') != -1:
        res = re.findall(r'(\d+)\s?tb', lst)
        num = float(res[0]) * 1000
    elif lst.find('gb') != -1:
        res = re.findall(r'(\d+)\s?gb', lst)
        num = float(res[0])
    else:
        num = -1
    if lst.find('ssd') != -1:
        tp = 'SSD'
    elif lst.find('hdd') != -1:
        tp = 'HDD'
    elif lst.find('sshd') != -1:
        tp = 'SSHD'
    elif lst.find('optance') != -1:
        tp = 'Optane'
    else:
        tp = -1
    return num, tp

for i in range(len(part_1)):
    n = 0
    t = []
    lst = part_1.iloc[i].lower()
    if lst != 'None':
        a, b = extract(lst)
        if a != -1:
            n += a
        if b != -1:
            t.append(b)
    lst = part_2.iloc[i].lower()
    if lst != 'None':
        a, b = extract(lst)
        if a != -1:
            n += a
        if b != -1:
            t.append(b)
    result[i][0] = n
    for j in t:
        match j:
            case 'SSD':
                result[i][1] = 1
            case 'HDD':
                result[i][2] = 1
            case 'SSHD':
                result[i][3] = 1
            case 'Optane':
                result[i][4] = 1
    if not t:
        result[i][1] = np.nan
        result[i][2] = np.nan
        result[i][3] = np.nan
        result[i][4] = np.nan
    
result = pd.DataFrame(result, columns=['Disk Capacity', 'SSD', 'HDD', 'SSHD', 'Optane'])
result

Unnamed: 0,Disk Capacity,SSD,HDD,SSHD,Optane
0,2000.0,1.0,0.0,0.0,0.0
1,16000.0,1.0,0.0,0.0,0.0
2,2000.0,1.0,0.0,0.0,0.0
3,2000.0,1.0,0.0,0.0,0.0
4,2000.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...
289166,256.0,1.0,0.0,0.0,0.0
289167,256.0,1.0,0.0,0.0,0.0
289168,2256.0,1.0,1.0,0.0,0.0
289169,1512.0,1.0,1.0,0.0,0.0


In [8]:
# RAM: including capacity, type
ram = pd1['RAM'].astype('str').str.split(',', expand=True)

capacity = ram[0].str.extract(r'(\d+)', expand=True).astype(float)
capacity.rename(columns={0: 'RAM Capacity'}, inplace=True)

ram_type = ram[1]
ram_type.rename('RAM Type', inplace=True)

frequency = ram[2].str.extract(r'(\d+)', expand=True).astype(float)
frequency.rename(columns={0: 'RAM Frequency'}, inplace=True)

ram = pd.concat([capacity, ram_type, frequency], axis=1)
ram

Unnamed: 0,RAM Capacity,RAM Type,RAM Frequency
0,64.0,DDR5,
1,64.0,DDR5,
2,32.0,DDR5,
3,64.0,DDR5,
4,16.0,DDR5,
...,...,...,...
289166,8.0,DDR4,2666.0
289167,16.0,DDR4,
289168,16.0,DDR4,
289169,8.0,DDR4,


In [9]:
# OS
os = pd1['OS'].str.lower()
os

0          windows 11 pro
1          windows 10 pro
2          windows 11 pro
3          windows 10 pro
4         windows 11 home
               ...       
289166    windows 10 home
289167    windows 10 home
289168     windows 10 pro
289169    windows 10 home
289170    windows 10 home
Name: OS, Length: 289171, dtype: object

In [10]:
# Body material
material = pd1['Body Material'].str.lower().astype('str')
res = {}

for row in material:
    r = row.split(',')
    for w in r:
        if w not in res:
            res[w] = len(res)

aaaaa = np.zeros((len(material), len(res)))
for i in range(len(material)):
    r = material.iloc[i].split(',')
    for w in r:
        aaaaa[i][res[w]] = 1
material = pd.DataFrame(aaaaa, columns=['Body material: ' + x for x in res.keys()])
material = material.drop('Body material: nan', axis=1)
material

Unnamed: 0,Body material: plastic / polycarbonate,Body material: aluminum,Body material: aluminum.1,Body material: magnesium alloy,Body material: glass fiber,Body material: carbon,Body material: carbon.1,Body material: magnesium alloy.1,Body material: magnesium-lithium alloy,Body material: plastic,Body material: matte,Body material: black synthetic material,Body material: plastic.1,Body material: magnesium-lithium alloy.1
0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
289166,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
289167,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
289168,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
289169,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
# weight: convert to float in kg
weight = pd1['Weight'].str.extract(r'(\d+\.?\d+)', expand=True).astype(float)
weight.rename(columns={0: 'Weight'}, inplace=True)
weight

Unnamed: 0,Weight
0,2.80
1,2.67
2,2.67
3,2.67
4,2.80
...,...
289166,2.60
289167,2.05
289168,2.45
289169,2.05


In [12]:
# USB type C: number of USB type C ports
usb_c = pd1['USB Type-C'].astype('float')
usb_c

0         2.0
1         2.0
2         2.0
3         2.0
4         2.0
         ... 
289166    1.0
289167    1.0
289168    0.0
289169    1.0
289170    0.0
Name: USB Type-C, Length: 289171, dtype: float64

In [13]:
# USB type A: number of USB type A ports
usb_a = pd1['USB Type-A'].astype('float')
usb_a

0         2.0
1         2.0
2         2.0
3         2.0
4         2.0
         ... 
289166    3.0
289167    2.0
289168    3.0
289169    2.0
289170    3.0
Name: USB Type-A, Length: 289171, dtype: float64

In [14]:
# HDMI: generation of HDMI port
hdmi = pd1['HDMI'].astype('float')
hdmi

0         2.1
1         2.1
2         2.1
3         2.1
4         2.1
         ... 
289166    NaN
289167    1.4
289168    1.4
289169    1.4
289170    1.4
Name: HDMI, Length: 289171, dtype: float64

In [15]:
# Bluetooth: version of Bluetooth
bluetooth = pd1['Bluetooth'].astype('float')
bluetooth

0         5.2
1         5.2
2         5.2
3         5.2
4         5.2
         ... 
289166    5.0
289167    5.2
289168    4.2
289169    5.2
289170    4.2
Name: Bluetooth, Length: 289171, dtype: float64

In [16]:
# Ethernet/LAN: if there is a LAN port, then get the fastest speed, else 0
ethernet = pd1['Ethernet LAN'].astype('str')
for i in range(len(ethernet)):
    if ethernet.iloc[i] == 'None' or ethernet.iloc[i] == 'nan':
        ethernet.iloc[i] = 0
    else:
        res = re.findall(r'(\d+)', ethernet.iloc[i])
        ethernet.iloc[i] = res[-1]

ethernet = ethernet.astype(float)
ethernet

0         2500.0
1         2500.0
2         2500.0
3         2500.0
4         2500.0
           ...  
289166       0.0
289167       0.0
289168    1000.0
289169       0.0
289170    1000.0
Name: Ethernet LAN, Length: 289171, dtype: float64

In [17]:
# Security lock:
security = pd1['Security Lock slot'].astype('str')
res = {}

for row in security:
    if row == 'None' or row == 'nan' or row == '0':
        continue
    if row not in res:
        res[row] = len(res)

aaaaa = np.zeros((len(security), len(res)))
for i in range(len(security)):
    if security.iloc[i] == 'None' or security.iloc[i] == 'nan':
        aaaaa[i] = np.nan
        continue
    if security.iloc[i] == '0':
        continue
    r = security.iloc[i]
    aaaaa[i][res[r]] = 1

security = pd.DataFrame(aaaaa, columns=['Security lock: ' + x for x in res.keys()]).astype('float')
security

Unnamed: 0,Security lock: Kensington,Security lock: Kensington Nano,Security lock: 1,Security lock: Wedge,Security lock: Nano security,Security lock: HP Tamper,Security lock: Function key,Security lock: Noble
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...
289166,,,,,,,,
289167,,,,,,,,
289168,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
289169,,,,,,,,


In [18]:
# Fingerprint reader
fingerprint = pd1['Fingerprint reader'].astype('float')
fingerprint

0         0.0
1         0.0
2         0.0
3         0.0
4         0.0
         ... 
289166    0.0
289167    NaN
289168    NaN
289169    NaN
289170    NaN
Name: Fingerprint reader, Length: 289171, dtype: float64

In [19]:
# Backlit keyboard
backlit = pd1['Backlit keyboard'].astype('str')

for i in range(len(backlit)):
    if backlit.iloc[i] == 'None' or backlit.iloc[i] == 'nan':
        backlit.iloc[i] = np.nan
    elif backlit.iloc[i] == '0':
        backlit.iloc[i] = 0
    else:
        backlit.iloc[i] = 1
        
backlit = backlit.astype(float)
backlit

0         1.0
1         1.0
2         1.0
3         1.0
4         1.0
         ... 
289166    0.0
289167    NaN
289168    0.0
289169    NaN
289170    0.0
Name: Backlit keyboard, Length: 289171, dtype: float64

In [20]:
infos = pd1[['link', 'name']]
cols = [infos, display, result, ram, os, material, weight, usb_c, usb_a, hdmi, bluetooth, ethernet, security, fingerprint, backlit]

pd_out = pd.concat(cols, axis=1)
pd_out

Unnamed: 0,link,name,Display Size,Resolution,Refresh Rate,Panel Type,Disk Capacity,SSD,HDD,SSHD,...,Security lock: Kensington,Security lock: Kensington Nano,Security lock: 1,Security lock: Wedge,Security lock: Nano security,Security lock: HP Tamper,Security lock: Function key,Security lock: Noble,Fingerprint reader,Backlit keyboard
0,https://laptopmedia.com/laptop-specs/asus-rog-...,ASUS ROG Strix G17,17.3,3686400.0,240.0,IPS + G-Sync,2000.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,https://laptopmedia.com/laptop-specs/asus-rog-...,ASUS ROG Zephyrus Duo 16,16.0,4096000.0,240.0,IPS + FreeSync,16000.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,https://laptopmedia.com/laptop-specs/asus-rog-...,ASUS ROG Zephyrus Duo 16,16.0,4096000.0,240.0,IPS + FreeSync,2000.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,https://laptopmedia.com/laptop-specs/asus-rog-...,ASUS ROG Zephyrus Duo 16,16.0,4096000.0,240.0,IPS + FreeSync,2000.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,https://laptopmedia.com/laptop-specs/asus-rog-...,ASUS ROG Strix G17,17.3,3686400.0,240.0,IPS + G-Sync,2000.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
289166,https://laptopmedia.com/laptop-specs/acer-aspi...,Acer Aspire 5,17.3,2073600.0,60.0,IPS,256.0,1.0,0.0,0.0,...,,,,,,,,,0.0,0.0
289167,https://laptopmedia.com/laptop-specs/hp-17-5749/,HP 17,17.3,1440000.0,60.0,TN,256.0,1.0,0.0,0.0,...,,,,,,,,,,
289168,https://laptopmedia.com/laptop-specs/hp-17-17-...,HP 17 (17-by4000),17.3,1440000.0,60.0,TN,2256.0,1.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,,0.0
289169,https://laptopmedia.com/laptop-specs/hp-17-4594/,HP 17,17.3,1440000.0,60.0,TN,1512.0,1.0,1.0,0.0,...,,,,,,,,,,
