In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
data_path = 'data/'
cpu_data = 'cpu.csv'
pd1 = pd.read_csv(data_path + cpu_data, dtype=str)

In [3]:
# Drop useless columns
pd1 = pd1.drop(columns=['Unnamed: 0', 'cpu_link', 'name'])

In [4]:
# Ranking: extract number
rank = pd1['Ranking'].str.extract(r'(\d+)').astype(float)
rank.rename(columns={0: 'Ranking'}, inplace=True)
rank

Unnamed: 0,Ranking
0,1.0
1,2.0
2,3.0
3,4.0
4,5.0
...,...
251,252.0
252,253.0
253,254.0
254,255.0


In [5]:
# CPU frequency: extract number, include base and max
freq = pd1['Base / Max CPU frequency'].str.extract(r'(\d+\.\d+)\s-\s(\d+\.\d+)').astype(float)
freq.rename(columns={0: 'Base CPU freq', 1: 'Max CPU freq'}, inplace=True)
freq

Unnamed: 0,Base CPU freq,Max CPU freq
0,2.5,5.45
1,2.4,5.20
2,2.3,5.40
3,1.6,5.60
4,1.6,5.80
...,...,...
251,2.3,3.20
252,1.1,2.80
253,1.1,2.80
254,1.1,2.40


In [6]:
# Core/Thread: extract number, include core and thread
core_thread = pd1['Cores / Threads'].str.extract(r'(\d+)\s/\s(\d+)').astype(float)
core_thread.rename(columns={0: 'Cores', 1: 'Threads'}, inplace=True)
core_thread

Unnamed: 0,Cores,Threads
0,16.0,32.0
1,12.0,24.0
2,16.0,32.0
3,24.0,32.0
4,24.0,32.0
...,...,...
251,2.0,2.0
252,2.0,2.0
253,4.0,4.0
254,2.0,2.0


In [7]:
# Power consumption: extract number
power = pd1['TDP'].str.extract(r'(\d+)').astype(float)
power.rename(columns={0: 'TDP'}, inplace=True)
power

Unnamed: 0,TDP
0,55.0
1,55.0
2,55.0
3,55.0
4,55.0
...,...
251,15.0
252,6.0
253,6.0
254,6.0


In [8]:
# Max operating temperature: extract number
temp = pd1['Max. operating temperature'].str.extract(r'(\d+)').astype(float)
temp.rename(columns={0: 'Max operating temperature'}, inplace=True)
temp

Unnamed: 0,Max operating temperature
0,100.0
1,100.0
2,89.0
3,100.0
4,100.0
...,...
251,95.0
252,105.0
253,100.0
254,105.0


In [9]:
# Release quarter: extract number as convention: Q1 2024 = 2024.00, Q2 2024 = 2024.25, ...
release = pd1['Released'].str.extract(r'(\d+)\s(\d+)').astype(float)
release = release[1] + (release[0] - 1) / 4
release = pd.DataFrame(release)
release.rename(columns={0: 'Released'}, inplace=True)
release

Unnamed: 0,Released
0,2023.0
1,2024.0
2,2023.5
3,2023.0
4,2024.0
...,...
251,2020.0
252,2019.5
253,2021.0
254,2016.5


In [10]:
# Core architecture: ohe later, for now leave it as it is
arch = pd1['Core / Architecture']
arch

0             Zen 4 / Dragon Range
1      AMD Dragon Range-HX / Zen 4
2          Dragon Range-HX / Zen 4
3                   Raptor Lake-HX
4           Raptor Lake-HX Refresh
                  ...             
251                            Zen
252            Gemini Lake Refresh
253                    Jasper Lake
254                    Apollo Lake
255                   Stoney Ridge
Name: Core / Architecture, Length: 256, dtype: object

In [11]:
# Lithography: extract number
litho = pd1['Lithography'].str.extract(r'(\d+)').astype(float)
litho.rename(columns={0: 'Lithography'}, inplace=True)
litho

Unnamed: 0,Lithography
0,5.0
1,5.0
2,5.0
3,10.0
4,10.0
...,...
251,14.0
252,14.0
253,10.0
254,14.0


In [12]:
# Official website: get the main producer from the link
website = pd1['Official website'].str.extract(r'\w+?\.?(\w+)\.com')
website.rename(columns={0: 'Official website'}, inplace=True)
website

Unnamed: 0,Official website
0,amd
1,amd
2,amd
3,intel
4,intel
...,...
251,amd
252,intel
253,
254,intel


In [13]:
# Max GPU frequency: extract number
gpu_freq = pd1['Max. GPU frequency'].str.extract(r'(\d+)').astype(float)
gpu_freq.rename(columns={0: 'Max GPU frequency'}, inplace=True)
gpu_freq

Unnamed: 0,Max GPU frequency
0,2200.0
1,2200.0
2,2200.0
3,1650.0
4,
...,...
251,1100.0
252,650.0
253,800.0
254,650.0


In [14]:
# RAM type: should clean it down to type like DDR4, DDR3, ... but don't know how
# TODO: clean RAM type

In [14]:
# Max RAM: extract number
mram = pd1['Max Memory'].str.extract(r'(\d+)').astype(float)
mram.rename(columns={0: 'Max RAM'}, inplace=True)
mram

Unnamed: 0,Max RAM
0,64.0
1,64.0
2,64.0
3,128.0
4,192.0
...,...
251,
252,8.0
253,32.0
254,8.0


In [16]:
# Integrated GPU: ohe later, for now leave it as it is
igpu = pd1['Integrated GPU']
igpu

0                             AMD Radeon 610M
1                             AMD Radeon 610M
2                             AMD Radeon 610M
3                      Intel UHD Graphics 770
4                                         NaN
                        ...                  
251                      AMD Radeon RX Vega 3
252                    Intel UHD Graphics 600
253    Intel UHD Graphics (Jasper Lake 24 EU)
254       Intel HD Graphics 500 (Apollo Lake)
255                     AMD Radeon R3 (Beema)
Name: Integrated GPU, Length: 256, dtype: object

In [17]:
# Base GPU frequency: extract number
bgpu_freq = pd1['Base GPU frequency'].str.extract(r'(\d+)').astype(float)
bgpu_freq.rename(columns={0: 'Base GPU frequency'}, inplace=True)
bgpu_freq

Unnamed: 0,Base GPU frequency
0,
1,400.0
2,400.0
3,
4,
...,...
251,
252,200.0
253,350.0
254,200.0


In [18]:
# LL cache: extract number
llcache = pd1['LL cache'].str.extract(r'(\d+)').astype(float)
llcache.rename(columns={0: 'LL Cache'}, inplace=True)
llcache

Unnamed: 0,LL Cache
0,81920.0
1,65536.0
2,131072.0
3,36.0
4,36.0
...,...
251,4096.0
252,4096.0
253,4096.0
254,2048.0


In [19]:
# AI performance: extract number from base and total
ai_base = pd1['NPU AI Performance'].str.extract(r'(\d+)').astype(float)
ai_total = pd1['Total AI Performance'].str.extract(r'(\d+)').astype(float)
total = pd.concat([ai_base, ai_total], axis=1)
total = total.max(axis=1)
total = pd.DataFrame(total)
total.rename(columns={0: 'AI Performance'}, inplace=True)
total

Unnamed: 0,AI Performance
0,
1,
2,
3,
4,
...,...
251,
252,
253,
254,
