In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
data_path = '../data/'
cpu_data = 'best_best_cpu.csv'
pd1 = pd.read_csv(data_path + cpu_data, dtype=str)

In [5]:
pd1 = pd1.drop(['Column'], axis=1)
pd1['Name'] = pd1['Name'].str.lower()

In [6]:
# Socket
socket = pd1['Socket'].astype(str)
socket

0       AMD Socket FP6
1       AMD Socket FP7
2       AMD Socket FP7
3       AMD Socket FP7
4       AMD Socket FP6
            ...       
498    Intel Socket G3
499     Intel BGA 1364
500     Intel BGA 1364
501    Intel Socket G3
502    Intel Socket G3
Name: Socket, Length: 503, dtype: object

In [7]:
# Foundry
foundry = pd1['Foundry'].astype(str)
foundry

0       TSMC
1       TSMC
2       TSMC
3       TSMC
4       TSMC
       ...  
498    Intel
499    Intel
500    Intel
501    Intel
502    Intel
Name: Foundry, Length: 503, dtype: object

In [8]:
# Process size
process_size = pd1['Process Size:']
process_size = process_size.str.extract(r'(\d+)').astype(int)
process_size.rename(columns={0: 'Process Size'}, inplace=True)
process_size

Unnamed: 0,Process Size
0,7
1,6
2,4
3,4
4,7
...,...
498,22
499,22
500,22
501,22


In [9]:
# transistors
transistors = pd1['Transistors']
transistors = transistors.str.extract(r'(\d+)').astype(float).apply(lambda x: x * 1e6)
transistors.rename(columns={0: 'Transistors'}, inplace=True)
transistors

Unnamed: 0,Transistors
0,10000000.0
1,
2,20000000.0
3,20000000.0
4,10000000.0
...,...
498,1000000.0
499,1000000.0
500,1000000.0
501,1000000.0


In [10]:
# die size
die_size = pd1['Die Size']
die_size = die_size.str.extract(r'(\d+.?\d+)\s?mm').astype(float)
die_size.rename(columns={0: 'Die Size'}, inplace=True)
die_size

Unnamed: 0,Die Size
0,180.0
1,208.0
2,137.0
3,137.0
4,180.0
...,...
498,177.0
499,264.0
500,264.0
501,177.0


In [11]:
# package
package = pd1['Package'].astype(str)
package

0             FP6
1      FP7, FP7r2
2      FP7, FP7r2
3      FP7, FP7r2
4             FP6
          ...    
498     FC-PGA946
499    FC-BGA1364
500    FC-BGA1364
501     FC-PGA946
502     FC-PGA946
Name: Package, Length: 503, dtype: object

In [12]:
# tjMax
tjMax = pd1['tJMax']
tjMax = tjMax.str.extract(r'(\d+)').astype(float)
tjMax.rename(columns={0: 'tJMax'}, inplace=True)
tjMax

Unnamed: 0,tJMax
0,95.0
1,95.0
2,100.0
3,100.0
4,95.0
...,...
498,
499,
500,
501,


In [13]:
# Market
market = pd1['Market'].astype(str)
market

0      Mobile
1      Mobile
2      Mobile
3      Mobile
4      Mobile
        ...  
498    Mobile
499    Mobile
500    Mobile
501    Mobile
502    Mobile
Name: Market, Length: 503, dtype: object

In [14]:
# Production status
production_status = pd1['Production Status'].astype(str)
production_status

0           Active
1           Active
2           Active
3           Active
4           Active
          ...     
498    End-of-life
499    End-of-life
500    End-of-life
501    End-of-life
502    End-of-life
Name: Production Status, Length: 503, dtype: object

In [15]:
import re

# Release date
release_date = pd1['Release Date']

MONTH = {'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4, 'may': 5, 'jun': 6,
         'jul': 7, 'aug': 8, 'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12}

out = release_date.copy()

for i in range(len(release_date)):
    row = release_date[i]
    res = row.split(' ')
    day = 1
    month = 1
    year = 2014
    flag = False
    for element in res:
        if element == 'Unknown':
            out[i] = np.nan
            flag = True
        elif element.isdigit() and len(element) == 4:
            year = int(element)
        elif element.isalpha() and element.lower() in MONTH:
            month = MONTH[element.lower()]
        else:
            day = int(re.search(r'\d+', element).group())

    if not flag:
        out[i] = pd.Timestamp(year, month, day)

release_date = out.rename('Release Date')
release_date


0      2023-01-04 00:00:00
1      2023-01-04 00:00:00
2      2023-05-03 00:00:00
3      2023-12-06 00:00:00
4      2023-01-04 00:00:00
              ...         
498    2014-01-20 00:00:00
499    2014-02-01 00:00:00
500    2014-07-01 00:00:00
501    2014-01-20 00:00:00
502    2014-01-20 00:00:00
Name: Release Date, Length: 503, dtype: object

In [16]:
# Part#
part = pd1['Part#'].astype(str)
part

0      100-000000944
1      100-000000537
2      100-000001068
3      100-000001325
4      100-000000950
           ...      
498            SR1PV
499            SR1BP
500            SR1ZX
501            SR1PT
502            SR1PP
Name: Part#, Length: 503, dtype: object

In [17]:
# frequency
frequency = pd1['Frequency']

for i in range(len(frequency)):
    row = frequency[i]
    if row == 'Unknown':
        frequency[i] = np.nan
    else:
        res = row.split(' ')
        if res[1].lower() == 'ghz':
            frequency[i] = float(res[0]) * 1000
        else:
            frequency[i] = float(res[0])

frequency = frequency.astype(float).rename('Frequency')
frequency

0      2300.0
1      3000.0
2      3000.0
3      3000.0
4      2300.0
        ...  
498    2800.0
499    2400.0
500    2500.0
501    2900.0
502    3100.0
Name: Frequency, Length: 503, dtype: float64

In [18]:
# turbo clock
turbo_clock = pd1['Turbo Clock'].astype(str)

out = turbo_clock.copy()

for i in range(len(turbo_clock)):
    row = turbo_clock[i]
    if row == 'Unknown' or row == 'nan':
        out[i] = np.nan
    else:
        res = row.split(' ')
        if res[-1].lower() == 'ghz':
            out[i] = float(res[-2]) * 1000
        else:
            out[i] = float(res[-2])

turbo_clock = out.astype(float).rename('Turbo Clock')
turbo_clock

0      4300.0
1      4300.0
2      4700.0
3      4700.0
4      4300.0
        ...  
498    3800.0
499    3600.0
500    3700.0
501    3900.0
502    4000.0
Name: Turbo Clock, Length: 503, dtype: float64

In [19]:
# base clock
base_clock = pd1['Base Clock'].astype(str)
base_clock = base_clock.str.extract(r'(\d+\.?\d+)').astype(float)
base_clock.rename(columns={0: 'Base Clock'}, inplace=True)
base_clock

Unnamed: 0,Base Clock
0,100.0
1,100.0
2,100.0
3,100.0
4,100.0
...,...
498,100.0
499,100.0
500,100.0
501,100.0


In [20]:
# multiplier
multiplier = pd1['Multiplier'].astype(str)
multiplier = multiplier.str.extract(r'(\d+\.?\d+)').astype(float)
multiplier.rename(columns={0: 'Multiplier'}, inplace=True)
multiplier

Unnamed: 0,Multiplier
0,23.0
1,30.0
2,30.0
3,30.0
4,23.0
...,...
498,28.0
499,24.0
500,25.0
501,29.0


In [21]:
# multiplier unlocked
multiplier_unlocked = pd1['Multiplier Unlocked'].astype(str)
multiplier_unlocked = multiplier_unlocked.str.lower().str.contains('yes').astype(float)
multiplier_unlocked.rename('Multiplier Unlocked', inplace=True)
multiplier_unlocked

0      0.0
1      0.0
2      0.0
3      0.0
4      0.0
      ... 
498    0.0
499    0.0
500    0.0
501    0.0
502    1.0
Name: Multiplier Unlocked, Length: 503, dtype: float64

In [22]:
# TDP
tdp = pd1['TDP'].astype(str)
tdp = tdp.str.extract(r'(\d+)').astype(float)
tdp.rename(columns={0: 'TDP'}, inplace=True)
tdp

Unnamed: 0,TDP
0,15.0
1,28.0
2,28.0
3,28.0
4,15.0
...,...
498,47.0
499,47.0
500,47.0
501,47.0


In [23]:
# Code Name
code_name = pd1['Codename'].astype(str)
code_name

0        Barcelo-R
1      Rembrandt-R
2         Phoenix2
3       Hawk Point
4        Barcelo-R
          ...     
498        Haswell
499    Crystalwell
500    Crystalwell
501        Haswell
502        Haswell
Name: Codename, Length: 503, dtype: object

In [24]:
# generation
generation = pd1['Generation'].astype(str)
generation

0              Ryzen 3
1              Ryzen 3
2              Ryzen 3
3              Ryzen 3
4              Ryzen 3
            ...       
498            Core i7
499            Core i7
500            Core i7
501            Core i7
502    Core i7 Extreme
Name: Generation, Length: 503, dtype: object

In [25]:
# memory support
memory_support = pd1['Memory Support'].astype(str)

for i in range(len(memory_support)):
    row = memory_support[i]
    res = row.split(' ')
    memory_support[i] = res[-1]
    if row.lower() == 'unknown':
        memory_support[i] = np.nan

memory_support = memory_support.rename('Memory Support')
memory_support

0      DDR4
1      DDR5
2      DDR5
3      DDR5
4      DDR4
       ... 
498    DDR3
499    DDR3
500    DDR3
501    DDR3
502    DDR3
Name: Memory Support, Length: 503, dtype: object

In [26]:
# rated speed
rated_speed = pd1['Rated Speed'].astype(str)
rated_speed = rated_speed.str.extract(r'(\d+)').astype(float)
rated_speed.rename(columns={0: 'Rated Speed'}, inplace=True)
rated_speed

Unnamed: 0,Rated Speed
0,4267.0
1,6400.0
2,7500.0
3,7500.0
4,4267.0
...,...
498,1600.0
499,1600.0
500,1600.0
501,1600.0


In [27]:
# memory bus
CHANNEL = {'Single': 1, 'Dual': 2, 'Quad': 4}

memory_bus = pd1['Memory Bus'].astype(str)

for i in range(len(memory_bus)):
    row = memory_bus[i]
    res = row.split('-')
    if row.lower() == 'unknown' or row.lower() == 'nan':
        memory_bus[i] = np.nan
        continue
    memory_bus[i] = CHANNEL[res[0]]

memory_bus = memory_bus.rename('Memory Bus')
memory_bus

0      2
1      2
2      2
3      2
4      2
      ..
498    2
499    2
500    2
501    2
502    2
Name: Memory Bus, Length: 503, dtype: object

In [28]:
# ECC memory
ecc_memory = pd1['ECC Memory'].astype(str)
ecc_memory = ecc_memory.str.lower().str.contains('yes').astype(float)
ecc_memory.rename('ECC Memory', inplace=True)
ecc_memory

0      1.0
1      1.0
2      1.0
3      0.0
4      1.0
      ... 
498    0.0
499    0.0
500    0.0
501    0.0
502    0.0
Name: ECC Memory, Length: 503, dtype: float64

In [29]:
# PCI Express
pci_express = pd1['PCI-Express'].astype(str).str.split(',', expand=True)

# 0: generation, 1: lanes
pci_express[0] = pci_express[0].str.extract(r'(\d+)').astype(float)
pci_express[1] = pci_express[1].str.extract(r'(\d+)').astype(float)

pci_express.rename(columns={0: 'PCI Express Generation', 1: 'PCI Express Lanes'}, inplace=True)
pci_express

Unnamed: 0,PCI Express Generation,PCI Express Lanes
0,3.0,16.0
1,4.0,20.0
2,4.0,14.0
3,4.0,14.0
4,3.0,16.0
...,...,...
498,3.0,16.0
499,3.0,16.0
500,3.0,16.0
501,3.0,16.0


In [30]:
# Num of core
num_of_core = pd1['# of Cores'].astype(str)
num_of_core = num_of_core.str.extract(r'(\d+)').astype(float)
num_of_core.rename(columns={0: '# of Cores'}, inplace=True)
num_of_core

Unnamed: 0,# of Cores
0,4.0
1,4.0
2,4.0
3,4.0
4,4.0
...,...
498,4.0
499,4.0
500,4.0
501,4.0


In [31]:
# Num of threads
num_of_threads = pd1['# of Threads'].astype(str)
num_of_threads = num_of_threads.str.extract(r'(\d+)').astype(float)
num_of_threads.rename(columns={0: '# of Threads'}, inplace=True)
num_of_threads

Unnamed: 0,# of Threads
0,8.0
1,8.0
2,8.0
3,8.0
4,8.0
...,...
498,8.0
499,8.0
500,8.0
501,8.0


In [32]:
# Cache L1
cache_l1 = pd1['Cache L1'].astype(str)
cache_l1 = cache_l1.str.extract(r'(\d+)').astype(float)
cache_l1.rename(columns={0: 'Cache L1'}, inplace=True)
cache_l1

Unnamed: 0,Cache L1
0,64.0
1,64.0
2,64.0
3,64.0
4,64.0
...,...
498,64.0
499,64.0
500,64.0
501,64.0


In [33]:
# Cache L2
cache_l2 = pd1['Cache L2'].astype(str)

for i in range(len(cache_l2)):
    row = cache_l2[i]
    if row == 'Unknown' or row == 'nan':
        cache_l2[i] = np.nan
    else:
        res = row.split(' ')
        if res[1].lower() == 'kb':
            cache_l2[i] = float(res[0]) / 1024
        else:
            cache_l2[i] = float(res[0])

cache_l2 = cache_l2.astype(float).rename('Cache L2')
cache_l2

0      0.50
1      0.50
2      1.00
3      1.00
4      0.50
       ... 
498    0.25
499    0.25
500    0.25
501    0.25
502    0.25
Name: Cache L2, Length: 503, dtype: float64

In [34]:
# Cache L3
cache_l3 = pd1['Cache L3'].astype(str)
cache_l3 = cache_l3.str.extract(r'(\d+)').astype(float)
cache_l3.rename(columns={0: 'Cache L3'}, inplace=True)
cache_l3

Unnamed: 0,Cache L3
0,8.0
1,8.0
2,8.0
3,8.0
4,8.0
...,...
498,6.0
499,6.0
500,6.0
501,8.0


In [35]:
# Cache L4
cache_l4 = pd1['Cache L4'].astype(str)
cache_l4 = cache_l4.str.extract(r'(\d+)').astype(float)
cache_l4.rename(columns={0: 'Cache L4'}, inplace=True)
cache_l4

Unnamed: 0,Cache L4
0,
1,
2,
3,
4,
...,...
498,
499,
500,
501,


In [36]:
# p core base
p_core_base = pd1['P-Core Base'].astype(str)
p_core_base = p_core_base.str.extract(r'(\d+\.?\d+)').astype(float)
p_core_base.rename(columns={0: 'P-Core Base'}, inplace=True)
p_core_base

Unnamed: 0,P-Core Base
0,
1,
2,3.6
3,
4,
...,...
498,
499,
500,
501,


In [37]:
# e core base
e_core_base = pd1['E-Core Base'].astype(str)

for i in range(len(e_core_base)):
    row = e_core_base[i]
    if row == 'Unknown' or row == 'nan':
        e_core_base[i] = np.nan
    else:
        res = row.split(' ')
        if res[1].lower() == 'ghz':
            e_core_base[i] = float(res[0]) * 1000
        else:
            e_core_base[i] = float(res[0])

e_core_base = e_core_base.astype(float).rename('E-Core Base')
e_core_base

0         NaN
1         NaN
2      2800.0
3         NaN
4         NaN
        ...  
498       NaN
499       NaN
500       NaN
501       NaN
502       NaN
Name: E-Core Base, Length: 503, dtype: float64

In [38]:
# hybrid core
hybrid_core = pd1['Hybrid Cores'].astype(str)

for i in range(len(hybrid_core)):
    row = hybrid_core[i]
    if row == 'Unknown' or row == 'nan':
        hybrid_core[i] = np.nan
    else:
        res = re.findall(r'(\d+)', row)
        res = [float(x) for x in res]
        hybrid_core[i] = sum(res)

hybrid_core = hybrid_core.astype(float).rename('Hybrid Core')
hybrid_core

0      NaN
1      NaN
2      4.0
3      4.0
4      NaN
      ... 
498    NaN
499    NaN
500    NaN
501    NaN
502    NaN
Name: Hybrid Core, Length: 503, dtype: float64

In [39]:
# name
name = pd1['Name'].astype(str).str.lower()
name

0          ryzen 3 7330u
1          ryzen 3 7335u
2          ryzen 3 7440u
3          ryzen 3 8440u
4      ryzen 3 pro 7330u
             ...        
498       core i7-4810mq
499       core i7-4860hq
500       core i7-4870hq
501       core i7-4910mq
502       core i7-4940mx
Name: Name, Length: 503, dtype: object

In [40]:
collection = [name, socket, foundry, process_size, transistors, die_size, package, tjMax, market, production_status, release_date, part, frequency, turbo_clock, base_clock, multiplier, multiplier_unlocked, tdp, code_name, generation, memory_support, rated_speed, memory_bus, ecc_memory, pci_express, num_of_core, num_of_threads, cache_l1, cache_l2, cache_l3, cache_l4, p_core_base, e_core_base, hybrid_core]

pd_out = pd.concat(collection, axis=1)
pd_out

Unnamed: 0,Name,Socket,Foundry,Process Size,Transistors,Die Size,Package,tJMax,Market,Production Status,...,PCI Express Lanes,# of Cores,# of Threads,Cache L1,Cache L2,Cache L3,Cache L4,P-Core Base,E-Core Base,Hybrid Core
0,ryzen 3 7330u,AMD Socket FP6,TSMC,7,10000000.0,180.0,FP6,95.0,Mobile,Active,...,16.0,4.0,8.0,64.0,0.50,8.0,,,,
1,ryzen 3 7335u,AMD Socket FP7,TSMC,6,,208.0,"FP7, FP7r2",95.0,Mobile,Active,...,20.0,4.0,8.0,64.0,0.50,8.0,,,,
2,ryzen 3 7440u,AMD Socket FP7,TSMC,4,20000000.0,137.0,"FP7, FP7r2",100.0,Mobile,Active,...,14.0,4.0,8.0,64.0,1.00,8.0,,3.6,2800.0,4.0
3,ryzen 3 8440u,AMD Socket FP7,TSMC,4,20000000.0,137.0,"FP7, FP7r2",100.0,Mobile,Active,...,14.0,4.0,8.0,64.0,1.00,8.0,,,,4.0
4,ryzen 3 pro 7330u,AMD Socket FP6,TSMC,7,10000000.0,180.0,FP6,95.0,Mobile,Active,...,16.0,4.0,8.0,64.0,0.50,8.0,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
498,core i7-4810mq,Intel Socket G3,Intel,22,1000000.0,177.0,FC-PGA946,,Mobile,End-of-life,...,16.0,4.0,8.0,64.0,0.25,6.0,,,,
499,core i7-4860hq,Intel BGA 1364,Intel,22,1000000.0,264.0,FC-BGA1364,,Mobile,End-of-life,...,16.0,4.0,8.0,64.0,0.25,6.0,,,,
500,core i7-4870hq,Intel BGA 1364,Intel,22,1000000.0,264.0,FC-BGA1364,,Mobile,End-of-life,...,16.0,4.0,8.0,64.0,0.25,6.0,,,,
501,core i7-4910mq,Intel Socket G3,Intel,22,1000000.0,177.0,FC-PGA946,,Mobile,End-of-life,...,16.0,4.0,8.0,64.0,0.25,8.0,,,,


In [41]:
pd_out.to_csv('../data/ignore/best_cpu_cleaned.csv', index=False)