In [1]:
import pandas as pd
import numpy as np

In [2]:
import warnings
# Silences every Python warning for the rest of the session, including any
# pandas warnings raised by the cells below.
# NOTE(review): consider narrowing this filter so real problems stay visible.
warnings.filterwarnings("ignore")

In [3]:
# Directory holding the cleaned CPU tables; each table is indexed by its name column.
data_path = '../data/ignore/'
old_cpu = pd.read_csv(f'{data_path}cpu_cleaned.csv', index_col='name')
new_cpu = pd.read_csv(f'{data_path}best_cpu_cleaned.csv', index_col='Name')

In [4]:
# Normalise the name index of both tables: lower-case, trimmed, vendor names removed
trim = {'amd':'', 'intel':'', 'apple':'', 'qualcomm':'', 'nvidia':''}
for frame in (old_cpu, new_cpu):
    names = frame.index.astype(str).str.lower().str.strip()
    for key in trim:
        # The keys are plain substrings, so match them literally.
        names = names.str.replace(key, trim[key], regex=False).str.strip()
    frame.index = names

In [5]:
# Load the name-matching key and discard its 'Column' column.
cpu_key = pd.read_csv(f'{data_path}cpu_key.csv').drop(columns=['Column'])

In [6]:
# merge the name according to the cpu_key file
#
# Each table's index is replaced by the canonical name stored in cpu_key['0'].
# 'a_1' holds the names used by the old file and 'a_2' those used by the new
# file; a name with no entry in the key becomes NaN.
#
# The index is rebuilt with Index.map instead of writing into index.values:
# pandas treats an Index as immutable, so mutating its backing array is
# unsupported. The lookup dict also avoids re-scanning cpu_key for every row.
for frame, key_col in ((old_cpu, 'a_1'), (new_cpu, 'a_2')):
    # drop_duplicates keeps the first occurrence, matching the original
    # "take the first matching row" behaviour.
    canonical_by_name = (
        cpu_key
        .dropna(subset=[key_col])
        .drop_duplicates(subset=key_col)
        .set_index(key_col)['0']
        .to_dict()
    )
    frame.index = frame.index.map(canonical_by_name)

In [7]:
# Empty frame that the following cells fill column by column with the merged CPU data.
pd_out = pd.DataFrame()

Merge the columns that have important information according to the key file

In [8]:
# Name: copied as-is from column '0' of the key file; every later cell looks
# CPUs up in old_cpu / new_cpu by this name.
pd_out['Name'] = cpu_key['0']

In [9]:
# Architecture / Code name
#
# For every merged name take the codename from whichever file lists the CPU.
# When both files list it and disagree, the conflict is printed and the value
# from the new file wins.
#
# The values are collected in a list and assigned to the column once. Writing
# through pd_out['Codename'][i] is chained assignment, which pandas does not
# guarantee to propagate back to pd_out, and it also pushed strings into a
# float64 column.
codenames = []
num = 0
for cur_name in pd_out['Name']:
    left = old_cpu.loc[old_cpu.index == cur_name, 'Core / Architecture'].astype(str)
    right = new_cpu.loc[new_cpu.index == cur_name, 'Codename'].astype(str)
    if len(left) == 0 and len(right) == 0:
        value = np.nan
    elif len(right) == 0:
        value = left.values[0]
    elif len(left) == 0:
        value = right.values[0]
    elif left.values[0] == 'nan' and right.values[0] != 'nan':
        value = right.values[0]
    elif right.values[0] == 'nan' and left.values[0] != 'nan':
        value = left.values[0]
    elif left.values[0] in right.values:
        value = left.values[0]
    else:
        num += 1
        print('Conflict in Codename for', cur_name)
        print('Old:', left.values[0])
        print('New:', right.values[0])
        value = right.values[0]
    codenames.append(value)

print('Total conflicts in Codename:', num)
# Stored as text; a missing codename ends up as the literal string 'nan'.
pd_out['Codename'] = [str(value) for value in codenames]

Conflict in Codename for ryzen 9 7845hx
Old: Dragon Range-HX
New: Dragon Range
Conflict in Codename for ryzen 9 7945hx3d
Old: Dragon Range-HX
New: Dragon Range
Conflict in Codename for ryzen 9 pro 6950h
Old: Rembrandt H
New: Rembrandt
Conflict in Codename for ryzen 9 pro 6950hs
Old: Rembrandt H
New: Rembrandt
Conflict in Codename for celeron j4125
Old: Gemini Lake Refresh
New: Gemini Lake
Conflict in Codename for core 3 100u
Old: Raptor Lake-U Refresh
New: Raptor Lake-U
Conflict in Codename for core 5 120u
Old: Raptor Lake-U Refresh
New: Raptor Lake-U
Conflict in Codename for core 7 150u
Old: Raptor Lake-U Refresh
New: Raptor Lake-U
Conflict in Codename for core i3-1110g4
Old: Tiger Lake-UP4
New: Tiger Lake-U
Conflict in Codename for core i3-1115g4
Old: Tiger Lake-UP3
New: Tiger Lake-U
Conflict in Codename for core i3-1125g4
Old: Tiger Lake-UP3
New: Tiger Lake-U
Conflict in Codename for core i5-10210u
Old: Comet Lake
New: Comet Lake-U
Conflict in Codename for core i5-1035g1
Old: Ice La

In [10]:
# Max operating temperature / tJMax
#
# Same merge rule as the other fields: use whichever file has a value; when
# both have one and they differ, print the conflict and keep the new file's.
#
# Values are gathered in a list and assigned once instead of writing through
# pd_out[...][i] (chained assignment is not guaranteed to reach pd_out).
max_temps = []
num = 0

for cur_name in pd_out['Name']:
    left = old_cpu.loc[old_cpu.index == cur_name, 'Max operating temperature'].astype(str)
    right = new_cpu.loc[new_cpu.index == cur_name, 'tJMax'].astype(str)
    if len(left) == 0 and len(right) == 0:
        value = np.nan
    elif len(right) == 0:
        value = left.values[0]
    elif len(left) == 0:
        value = right.values[0]
    elif left.values[0] == 'nan' and right.values[0] != 'nan':
        value = right.values[0]
    elif right.values[0] == 'nan' and left.values[0] != 'nan':
        value = left.values[0]
    elif left.values[0] in right.values:
        value = left.values[0]
    else:
        num += 1
        print('Conflict in Max operating temperature for', cur_name)
        print('Old:', left.values[0])
        print('New:', right.values[0])
        value = right.values[0]
    max_temps.append(value)

print('Total conflicts in Max operating temperature:', num)
# The collected values are strings (or NaN); convert the whole column to float.
pd_out['Max operating temperature'] = pd.Series(max_temps, index=pd_out.index, dtype=object).astype(float)

Conflict in Max operating temperature for ryzen 9 7945hx3d
Old: 89.0
New: 100.0
Conflict in Max operating temperature for core i3-n300
Old: 100.0
New: 105.0
Conflict in Max operating temperature for core i3-n305
Old: 100.0
New: 105.0
Conflict in Max operating temperature for core i5-10300h
Old: 105.0
New: 100.0
Conflict in Max operating temperature for core i7-10750h
Old: 105.0
New: 100.0
Conflict in Max operating temperature for processor n100
Old: 100.0
New: 105.0
Conflict in Max operating temperature for processor n200
Old: 100.0
New: 105.0
Total conflicts in Max operating temperature: 7


In [11]:
# Core / Thread
#
# Both counts follow the same merge rule, so one loop handles them in turn
# (Core first, then Thread): use whichever file has a value; when both have
# one and they differ, print the conflict and keep the new file's value.
#
# Values are gathered in a list and assigned once per column instead of
# writing through pd_out[...][i] (chained assignment is not guaranteed to
# reach pd_out).
for out_col, old_col, new_col in (
    ('Core', 'Cores', '# of Cores'),
    ('Thread', 'Threads', '# of Threads'),
):
    merged = []
    num = 0
    for cur_name in pd_out['Name']:
        left = old_cpu.loc[old_cpu.index == cur_name, old_col].astype(str)
        right = new_cpu.loc[new_cpu.index == cur_name, new_col].astype(str)

        if len(left) == 0 and len(right) == 0:
            value = np.nan
        elif len(right) == 0:
            value = left.values[0]
        elif len(left) == 0:
            value = right.values[0]
        elif left.values[0] == 'nan' and right.values[0] != 'nan':
            value = right.values[0]
        elif right.values[0] == 'nan' and left.values[0] != 'nan':
            value = left.values[0]
        elif left.values[0] in right.values:
            value = left.values[0]
        else:
            num += 1
            print(f'Conflict in {out_col} for', cur_name)
            print('Old:', left.values[0])
            print('New:', right.values[0])
            value = right.values[0]
        merged.append(value)

    print(f'Total conflicts in {out_col}:', num)
    # The collected values are strings (or NaN); convert the column to float.
    pd_out[out_col] = pd.Series(merged, index=pd_out.index, dtype=object).astype(float)

Total conflicts in Core: 0
Conflict in Thread for celeron 7305
Old: 5.0
New: 6.0
Total conflicts in Thread: 1


In [12]:
# Base frequency
#
# The old file's 'Base CPU freq' is multiplied by 1000 before comparison with
# the new file's 'Frequency' (presumably GHz -> MHz; confirm against the
# source data). An old value of 0 is treated as missing.
#
# Values are gathered in a list and assigned once instead of writing through
# pd_out[...][i] (chained assignment is not guaranteed to reach pd_out), and
# the zero-to-missing step uses Series.replace rather than mutating
# Series.values in place.
base_freqs = []
num = 0

for cur_name in pd_out['Name']:
    left = old_cpu.loc[old_cpu.index == cur_name, 'Base CPU freq'].astype(str)
    right = new_cpu.loc[new_cpu.index == cur_name, 'Frequency'].astype(str)
    # 'nan' survives the multiplication as 'nan'; '0.0' is mapped to 'nan'.
    left = left.apply(lambda x: str(float(x) * 1000)).replace('0.0', 'nan')

    if len(left) == 0 and len(right) == 0:
        value = np.nan
    elif len(right) == 0:
        value = left.values[0]
    elif len(left) == 0:
        value = right.values[0]
    elif left.values[0] == 'nan' and right.values[0] != 'nan':
        value = right.values[0]
    elif right.values[0] == 'nan' and left.values[0] != 'nan':
        value = left.values[0]
    elif left.values[0] in right.values:
        value = left.values[0]
    else:
        num += 1
        print('Conflict in Base frequency for', cur_name)
        print('Old:', left.values[0])
        print('New:', right.values[0])
        value = right.values[0]
    base_freqs.append(value)

print('Total conflicts in Base frequency:', num)
# The collected values are strings (or NaN); convert the whole column to float.
pd_out['Base frequency'] = pd.Series(base_freqs, index=pd_out.index, dtype=object).astype(float)

Conflict in Base frequency for ryzen 3 pro 4450u
Old: 2500.0
New: 2400.0
Conflict in Base frequency for ryzen 7 4700u
Old: 2000.0
New: 1800.0
Conflict in Base frequency for celeron 7305
Old: 900.0
New: 1100.0
Conflict in Base frequency for core 3 100u
Old: 900.0
New: 1200.0
Conflict in Base frequency for core 5 120u
Old: 900.0
New: 1400.0
Conflict in Base frequency for core 7 150u
Old: 1200.0
New: 1800.0
Conflict in Base frequency for core i3-1110g4
Old: 2500.0
New: 1800.0
Conflict in Base frequency for core i3-1215u
Old: 900.0
New: 1200.0
Conflict in Base frequency for core i3-1220p
Old: 1100.0
New: 1500.0
Conflict in Base frequency for core i3-1305u
Old: 1200.0
New: 1600.0
Conflict in Base frequency for core i3-1315u
Old: 900.0
New: 1200.0
Conflict in Base frequency for core i5-11260h
Old: 2600.0
New: 2100.0
Conflict in Base frequency for core i5-11300h
Old: 3100.0
New: 2600.0
Conflict in Base frequency for core i5-1130g7
Old: 1800.0
New: 1100.0
Conflict in Base frequency for core i5

In [13]:
# Max frequency
#
# The old file's 'Max CPU freq' is multiplied by 1000 before comparison with
# the new file's 'Turbo Clock' (presumably GHz -> MHz; confirm against the
# source data). An old value of 0 is treated as missing.
#
# Values are gathered in a list and assigned once instead of writing through
# pd_out[...][i] (chained assignment is not guaranteed to reach pd_out), and
# the zero-to-missing step uses Series.replace rather than mutating
# Series.values in place.
max_freqs = []
num = 0

for cur_name in pd_out['Name']:
    left = old_cpu.loc[old_cpu.index == cur_name, 'Max CPU freq'].astype(str)
    right = new_cpu.loc[new_cpu.index == cur_name, 'Turbo Clock'].astype(str)
    # 'nan' survives the multiplication as 'nan'; '0.0' is mapped to 'nan'.
    left = left.apply(lambda x: str(float(x) * 1000)).replace('0.0', 'nan')

    if len(left) == 0 and len(right) == 0:
        value = np.nan
    elif len(right) == 0:
        value = left.values[0]
    elif len(left) == 0:
        value = right.values[0]
    elif left.values[0] == 'nan' and right.values[0] != 'nan':
        value = right.values[0]
    elif right.values[0] == 'nan' and left.values[0] != 'nan':
        value = left.values[0]
    elif left.values[0] in right.values:
        value = left.values[0]
    else:
        num += 1
        print('Conflict in Max frequency for', cur_name)
        print('Old:', left.values[0])
        print('New:', right.values[0])
        value = right.values[0]
    max_freqs.append(value)

print('Total conflicts in Max frequency:', num)
# The collected values are strings (or NaN); convert the whole column to float.
pd_out['Max frequency'] = pd.Series(max_freqs, index=pd_out.index, dtype=object).astype(float)

Conflict in Max frequency for ryzen 7 4700u
Old: 4100.0
New: 4200.0
Conflict in Max frequency for ryzen 9 7945hx
Old: 5450.0
New: 5400.0
Conflict in Max frequency for core i3-n300
Old: 3800.0
New: 3700.0
Conflict in Max frequency for core i5-1145g7
Old: 4200.0
New: 4400.0
Conflict in Max frequency for core i7-1165g7
Old: 4400.0
New: 4700.0
Conflict in Max frequency for core i9-10980hk
Old: 5100.0
New: 5300.0
Total conflicts in Max frequency: 6


In [14]:
# Release quarter
#
# The new file's 'Release Date' is a '-'-separated date whose first two parts
# are year and month. It is converted to "year + quarter fraction" text
# (months 1-3 -> .0, 4-6 -> .25, 7-9 -> .5, 10-12 -> .75) so it can be
# compared with the old file's 'Released' value.
#
# Values are gathered in a list and assigned once instead of writing through
# pd_out[...][i] (chained assignment is not guaranteed to reach pd_out), and
# the date conversion no longer mutates Series.values in place.
def _release_year_fraction(date_text):
    """Return the year-plus-quarter-fraction text for a date string; 'nan' passes through."""
    if date_text == 'nan':
        return date_text
    parts = date_text.split('-')
    quarter_index = (int(parts[1]) - 1) // 3  # 0 for Q1 ... 3 for Q4
    return str(float(parts[0]) + quarter_index / 4)


release_quarters = []
num = 0

for cur_name in pd_out['Name']:
    left = old_cpu.loc[old_cpu.index == cur_name, 'Released'].astype(str)
    right = new_cpu.loc[new_cpu.index == cur_name, 'Release Date'].astype(str)
    right = right.map(_release_year_fraction)

    if len(left) == 0 and len(right) == 0:
        value = np.nan
    elif len(right) == 0:
        value = left.values[0]
    elif len(left) == 0:
        value = right.values[0]
    elif left.values[0] == 'nan' and right.values[0] != 'nan':
        value = right.values[0]
    elif right.values[0] == 'nan' and left.values[0] != 'nan':
        value = left.values[0]
    elif left.values[0] in right.values:
        value = left.values[0]
    else:
        num += 1
        print('Conflict in Release quarter for', cur_name)
        print('Old:', left.values[0])
        print('New:', right.values[0])
        value = right.values[0]
    release_quarters.append(value)

print('Total conflicts in Release quarter:', num)
# The collected values are strings (or NaN); convert the whole column to float.
pd_out['Release quarter'] = pd.Series(release_quarters, index=pd_out.index, dtype=object).astype(float)

Conflict in Release quarter for athlon gold 7220u
Old: 2023.0
New: 2022.5
Conflict in Release quarter for ryzen 3 5300u
Old: 2020.0
New: 2021.0
Conflict in Release quarter for ryzen 5 5500u
Old: 2012.0
New: 2021.0
Conflict in Release quarter for ryzen 5 pro 5675u
Old: 2022.0
New: 2022.25
Conflict in Release quarter for ryzen 7 pro 5875u
Old: 2022.0
New: 2022.25
Conflict in Release quarter for ryzen z1 extreme
Old: 2023.0
New: 2023.25
Conflict in Release quarter for celeron j4125
Old: 2022.0
New: 2019.75
Conflict in Release quarter for core i3-1005g1
Old: 2019.25
New: 2019.5
Conflict in Release quarter for core i5-10300h
Old: 2020.0
New: 2020.5
Conflict in Release quarter for core i5-11320h
Old: 2021.5
New: 2021.25
Conflict in Release quarter for core i7-11390h
Old: 2021.5
New: 2021.25
Conflict in Release quarter for core i9-12900hx
Old: 2022.0
New: 2022.25
Total conflicts in Release quarter: 12


In [15]:
# LL Cache
#
# New file: 'Cache L3' multiplied by 1024. Old file: 'LL Cache', where values
# below 1024 are also multiplied by 1024 (presumably a MB -> KB unit fix;
# confirm against the source data).
# NOTE(review): the previous header comment described
# "l1 cache * cores + l2 cache", but only 'Cache L3' is used for the new file.
#
# Values are gathered in a list and assigned once instead of writing through
# pd_out[...][i] (chained assignment is not guaranteed to reach pd_out), and
# the unit scaling no longer mutates Series.values in place.
def _ll_cache_scaled(size_text):
    """Multiply an old-file cache size below 1024 by 1024; leave 'nan' and larger values as they are."""
    if size_text == 'nan':
        return size_text
    size = float(size_text)
    return str(size * 1024) if size < 1024 else size_text


ll_caches = []
num = 0

for cur_name in pd_out['Name']:
    left = old_cpu.loc[old_cpu.index == cur_name, 'LL Cache'].astype(str)
    left = pd.Series([_ll_cache_scaled(v) for v in left], dtype=object).astype(str).str.strip()
    # Missing L3 sizes stay NaN through the multiplication and become 'nan'.
    right = (new_cpu.loc[new_cpu.index == cur_name, 'Cache L3'].astype(float) * 1024).astype(str)

    if len(left) == 0 and len(right) == 0:
        value = np.nan
    elif len(right) == 0:
        value = left.values[0]
    elif len(left) == 0:
        value = right.values[0]
    elif left.values[0] == 'nan' and right.values[0] != 'nan':
        value = right.values[0]
    elif right.values[0] == 'nan' and left.values[0] != 'nan':
        value = left.values[0]
    elif left.values[0] in right.values:
        value = left.values[0]
    else:
        num += 1
        print('Conflict in LL Cache for', cur_name)
        print('Old:', left.values[0])
        print('New:', right.values[0])
        value = right.values[0]
    ll_caches.append(value)

print('Total conflicts in LL Cache:', num)
# The collected values are strings (or NaN); convert the whole column to float.
pd_out['LL Cache'] = pd.Series(ll_caches, index=pd_out.index, dtype=object).astype(float)

Conflict in LL Cache for athlon gold 7220u
Old: 5120.0
New: 4096.0
Conflict in LL Cache for ryzen 3 5300u
Old: 6144.0
New: 4096.0
Conflict in LL Cache for ryzen 3 5425u
Old: 10240.0
New: 8192.0
Conflict in LL Cache for ryzen 3 7320u
Old: 6144.0
New: 4096.0
Conflict in LL Cache for ryzen 3 7330u
Old: 10240.0
New: 8192.0
Conflict in LL Cache for ryzen 3 7335u
Old: 8000.0
New: 8192.0
Conflict in LL Cache for ryzen 5 4600h
Old: 8192.0
New: 11264.0
Conflict in LL Cache for ryzen 5 4600hs
Old: 8192.0
New: 11264.0
Conflict in LL Cache for ryzen 5 4600u
Old: 8192.0
New: 11264.0
Conflict in LL Cache for ryzen 5 5500u
Old: 11264.0
New: 8192.0
Conflict in LL Cache for ryzen 5 5600u
Old: 19456.0
New: 16384.0
Conflict in LL Cache for ryzen 5 5625u
Old: 19456.0
New: 16384.0
Conflict in LL Cache for ryzen 5 5625u
Old: 19456.0
New: 16384.0
Conflict in LL Cache for ryzen 5 6600h
Old: 19456.0
New: 16384.0
Conflict in LL Cache for ryzen 5 6600hs
Old: 19456.0
New: 16384.0
Conflict in LL Cache for ryzen 5 

In [16]:
# Lithography / Process size
#
# Same merge rule as the other fields: use whichever file has a value; when
# both have one and they differ, print the conflict and keep the new file's.
# 'Process Size' goes through float first so its text form matches the old
# file's (e.g. '7.0').
#
# Values are gathered in a list and assigned once instead of writing through
# pd_out[...][i] (chained assignment is not guaranteed to reach pd_out).
lithographies = []
num = 0

for cur_name in pd_out['Name']:
    left = old_cpu.loc[old_cpu.index == cur_name, 'Lithography'].astype(str)
    right = new_cpu.loc[new_cpu.index == cur_name, 'Process Size'].astype(float).astype(str)

    if len(left) == 0 and len(right) == 0:
        value = np.nan
    elif len(right) == 0:
        value = left.values[0]
    elif len(left) == 0:
        value = right.values[0]
    elif left.values[0] == 'nan' and right.values[0] != 'nan':
        value = right.values[0]
    elif right.values[0] == 'nan' and left.values[0] != 'nan':
        value = left.values[0]
    elif left.values[0] in right.values:
        value = left.values[0]
    else:
        num += 1
        print('Conflict in Lithography for', cur_name)
        print('Old:', left.values[0])
        print('New:', right.values[0])
        value = right.values[0]
    lithographies.append(value)

print('Total conflicts in Lithography:', num)
# The collected values are strings (or NaN); convert the whole column to float.
pd_out['Lithography'] = pd.Series(lithographies, index=pd_out.index, dtype=object).astype(float)

Conflict in Lithography for ryzen 3 5425u
Old: 6.0
New: 7.0
Conflict in Lithography for ryzen 5 5625u
Old: 6.0
New: 7.0
Conflict in Lithography for ryzen 5 5625u
Old: 6.0
New: 7.0
Conflict in Lithography for ryzen 7 5825u
Old: 6.0
New: 7.0
Conflict in Lithography for core i9-11980hk
Old: 14.0
New: 10.0
Total conflicts in Lithography: 5


In [17]:
# Drop the rows for which none of the merged columns carries any information.
cols = ['Codename', 'Max operating temperature', 'Core', 'Thread', 'Base frequency', 'Max frequency', 'Release quarter', 'LL Cache', 'Lithography']
has_value = pd_out[cols].notna()
# 'Codename' is stored as text, so a missing codename is the literal string
# 'nan' rather than NaN. Without this check no row would ever count as
# all-missing and nothing would be dropped.
has_value['Codename'] &= pd_out['Codename'] != 'nan'
pd_out = pd_out[has_value.any(axis=1)].copy()

In [18]:
# Give every column name the 'CPU: ' prefix
pd_out.columns = ['CPU: ' + column for column in pd_out.columns]

In [19]:
# Write the merged table next to the inputs, without the row index.
pd_out.to_csv(f'{data_path}cpu_merged.csv', index=False)