In [1]:
# python 3.8.10 environment
import pandas as pd
import numpy as np
import glob
import os

In [2]:
# Folder that holds the exported list*.txt files
folder_path = "."

# Get all list*.txt files in the folder. glob returns paths in arbitrary order,
# so sort them: the row order of the combined frame (and which file supplies the
# header) is then the same on every run.
file_paths = sorted(glob.glob(os.path.join(folder_path, "list*.txt")))
if not file_paths:
    # Without this guard pd.concat fails later with "No objects to concatenate"
    raise FileNotFoundError(
        f"No list*.txt files found in {os.path.abspath(folder_path)!r}"
    )

dfs = []
for i, path in enumerate(file_paths):
    if i == 0:
        # The first file provides the column names
        df = pd.read_csv(path, sep=',')
    else:
        # Later files: skip their first line and reuse the first file's column names
        df = pd.read_csv(path, sep=',', skiprows=1, header=None)
        df.columns = dfs[0].columns
    dfs.append(df)

df_combined = pd.concat(dfs, ignore_index=True)

print(df_combined)

   Number                      Model  \
0       1  ROG Strix G513IE_G513IE     
1       2             Custom Build     

                                         CPU  CPUCores    RAMAmount  RAMType  \
0   AMD Ryzen 7 4800H with Radeon Graphics           8  16558182400       26   
1  Intel(R) Core(TM) i7-10700 CPU @ 2.90GHz          8  16558182400       26   

   StorageAmount StorageType         Windows WindowsActivated  
0   512105932800   SSD        Windows 11 Pro                y  
1   512105932800   SSD        Windows 11 Pro                y  


In [3]:
# RAM size as text: bytes / 1024**3, rounded UP to a whole number, suffixed with 'GB'
ram_whole_gb = np.ceil(df_combined['RAMAmount'].div(1024**3)).astype(int)
df_combined['RAM'] = ram_whole_gb.astype(str) + 'GB'
# Display-only preview: the result is not assigned, so df_combined keeps 'RAMAmount'
df_combined.drop(columns='RAMAmount')

Unnamed: 0,Number,Model,CPU,CPUCores,RAMType,StorageAmount,StorageType,Windows,WindowsActivated,RAM
0,1,ROG Strix G513IE_G513IE,AMD Ryzen 7 4800H with Radeon Graphics,8,26,512105932800,SSD,Windows 11 Pro,y,16GB
1,2,Custom Build,Intel(R) Core(TM) i7-10700 CPU @ 2.90GHz,8,26,512105932800,SSD,Windows 11 Pro,y,16GB


In [4]:
# Lookup table from the numeric RAMType code to a readable memory type name.
# NOTE(review): codes 24 and above line up with the SMBIOS "Memory Device - Type"
# values, while the lower codes follow a different (older) numbering — confirm
# which field the export script writes into RAMType.
ram_lookup_table = {
    0: "Unknown",
    1: "Other",
    2: "DRAM",
    3: "Synchronous DRAM",
    4: "Cache DRAM",
    5: "EDO",
    6: "EDRAM",
    7: "VRAM",
    8: "SRAM",
    9: "RAM",
    10: "ROM",
    11: "Flash",
    12: "EEPROM",
    13: "FEPROM",
    14: "EPROM",
    15: "CDRAM",
    16: "3DRAM",
    17: "SDRAM",
    18: "SGRAM",
    19: "RDRAM",
    20: "DDR",
    21: "DDR2",
    22: "DDR2 FB-DIMM",
    23: "Reserved",
    24: "DDR3",
    25: "FBD2",
    26: "DDR4",
    27: "LPDDR",
    28: "LPDDR2",
    29: "LPDDR3",
    30: "LPDDR4",
    31: "Logical non-volatile device",
    32: "HBM",
    33: "HBM2",
    34: "DDR5",
    # Newer SMBIOS memory types, added so recent machines do not end up unmapped
    35: "LPDDR5",
    36: "HBM3",
}

In [5]:
# Map the numeric RAMType codes to names. A code that is missing from the lookup
# table maps to NaN, and str + NaN is NaN, which would blank out the RAM amount
# as well — so fall back to 'Unknown' for unmapped codes.
ram_type_names = df_combined['RAMType'].map(ram_lookup_table).fillna('Unknown')

# New column combining amount and type, e.g. '16GB DDR4'
df_combined['RAM Amount and Type'] = df_combined['RAM'] + ' ' + ram_type_names

# The source columns are no longer needed once the combined column exists
columns_to_drop = ['RAM', 'RAMType', 'RAMAmount']

# drop() returns a new frame; df_combined itself keeps all columns
df_new = df_combined.drop(columns=columns_to_drop)
print(df_new)

   Number                      Model  \
0       1  ROG Strix G513IE_G513IE     
1       2             Custom Build     

                                         CPU  CPUCores  StorageAmount  \
0   AMD Ryzen 7 4800H with Radeon Graphics           8   512105932800   
1  Intel(R) Core(TM) i7-10700 CPU @ 2.90GHz          8   512105932800   

  StorageType         Windows WindowsActivated RAM Amount and Type  
0   SSD        Windows 11 Pro                y           16GB DDR4  
1   SSD        Windows 11 Pro                y           16GB DDR4  


In [6]:
# Storage capacity in GB, computed as bytes / 1024**3
sizes_gb = df_new['StorageAmount'].div(1024**3)

# Capacities (in GB) that storage is commonly marketed as
common_sizes = np.array([64, 128, 256, 512, 1024, 2048, 4096])


def snap_to_common(x):
    """Return the entry of ``common_sizes`` that is closest to ``x``.

    On a tie the smaller capacity is returned, because ``argmin`` picks the
    first minimum.
    """
    distances = np.abs(common_sizes - x)
    nearest_idx = distances.argmin()
    return common_sizes[nearest_idx]

# Snap every converted size to its nearest common capacity (element by element)
df_new['Storage_Snapped_GB'] = sizes_gb.map(snap_to_common)

# Render a capacity given in GB as text, switching to TB from 1024 GB upward
def format_size(x):
    """Return ``x`` (a size in GB) as ``'<n>TB'`` if ``x >= 1024``, else ``'<n>GB'``.

    The number is truncated by ``int()``, not rounded.
    """
    unit, amount = ("TB", x / 1024) if x >= 1024 else ("GB", x)
    return f"{int(amount)}{unit}"

# Text form of the snapped capacity, then capacity and drive type joined by a space
storage_labels = df_new['Storage_Snapped_GB'].map(format_size)
df_new['Storage_Display'] = storage_labels
df_new['Storage Amount and Type'] = storage_labels + ' ' + df_new['StorageType']

In [7]:
# Raw and intermediate storage columns to remove
columns_to_drop = ['StorageAmount', 'StorageType', 'Storage_Snapped_GB', 'Storage_Display']

# drop() returns a new frame without those columns; df_new itself is left untouched
df_newer = df_new.drop(columns_to_drop, axis=1)

print(df_newer)

   Number                      Model  \
0       1  ROG Strix G513IE_G513IE     
1       2             Custom Build     

                                         CPU  CPUCores         Windows  \
0   AMD Ryzen 7 4800H with Radeon Graphics           8  Windows 11 Pro   
1  Intel(R) Core(TM) i7-10700 CPU @ 2.90GHz          8  Windows 11 Pro   

  WindowsActivated RAM Amount and Type Storage Amount and Type  
0                y           16GB DDR4         512GB SSD        
1                y           16GB DDR4         512GB SSD        


In [8]:
# Cast every column to str (numeric columns such as CPUCores become text)
df_converted = df_newer.astype(dtype=str)
print(df_converted)

  Number                      Model  \
0      1  ROG Strix G513IE_G513IE     
1      2             Custom Build     

                                         CPU CPUCores         Windows  \
0   AMD Ryzen 7 4800H with Radeon Graphics          8  Windows 11 Pro   
1  Intel(R) Core(TM) i7-10700 CPU @ 2.90GHz         8  Windows 11 Pro   

  WindowsActivated RAM Amount and Type Storage Amount and Type  
0                y           16GB DDR4         512GB SSD        
1                y           16GB DDR4         512GB SSD        


In [9]:
# Add a placeholder 'ID' column as the first column (change `value` to pre-fill it).
# DataFrame.insert raises ValueError when the column already exists, so the guard
# keeps this cell safe to run more than once.
if 'ID' not in df_converted.columns:
    df_converted.insert(loc=0, column='ID', value=None) # can change value
print(df_converted)

     ID Number                      Model  \
0  None      1  ROG Strix G513IE_G513IE     
1  None      2             Custom Build     

                                         CPU CPUCores         Windows  \
0   AMD Ryzen 7 4800H with Radeon Graphics          8  Windows 11 Pro   
1  Intel(R) Core(TM) i7-10700 CPU @ 2.90GHz         8  Windows 11 Pro   

  WindowsActivated RAM Amount and Type Storage Amount and Type  
0                y           16GB DDR4         512GB SSD        
1                y           16GB DDR4         512GB SSD        


In [10]:
# Trim leading/trailing whitespace from every object-dtype column
object_columns = df_converted.select_dtypes(include=['object']).columns
for column_name in object_columns:
    stripped = df_converted[column_name].str.strip()
    df_converted[column_name] = stripped
print(df_converted)

     ID Number                    Model  \
0  None      1  ROG Strix G513IE_G513IE   
1  None      2             Custom Build   

                                        CPU CPUCores         Windows  \
0    AMD Ryzen 7 4800H with Radeon Graphics        8  Windows 11 Pro   
1  Intel(R) Core(TM) i7-10700 CPU @ 2.90GHz        8  Windows 11 Pro   

  WindowsActivated RAM Amount and Type Storage Amount and Type  
0                y           16GB DDR4               512GB SSD  
1                y           16GB DDR4               512GB SSD  


In [11]:
# Only keep the first 4 words for Ryzen CPUs, e.g.
# 'AMD Ryzen 7 4800H with Radeon Graphics' -> 'AMD Ryzen 7 4800H'.
# Only the matching entries of the CPU column are rewritten (no row-wise
# DataFrame.apply), which also works when the frame has no rows.
keyword = 'AMD Ryzen'
mask = df_converted['CPU'].str.contains(keyword, case=False, na=False)
df_converted.loc[mask, 'CPU'] = df_converted.loc[mask, 'CPU'].map(
    lambda name: ' '.join(name.split()[:4])
)

# Replace Intel(R) Core(TM) with Intel Core and keep the next word (model);
# everything after the model (e.g. 'CPU @ 2.90GHz') is discarded.
# NOTE(review): the pattern is anchored at the start of the string, so names with
# any text before 'Intel(R)' are left unchanged — confirm that is intended.
df_converted['CPU'] = df_converted['CPU'].str.replace(
    r'^Intel\(R\) Core\(TM\)\s+(\S+).*',
    r'Intel Core \1',
    regex=True
)
print(df_converted)

     ID Number                    Model                  CPU CPUCores  \
0  None      1  ROG Strix G513IE_G513IE    AMD Ryzen 7 4800H        8   
1  None      2             Custom Build  Intel Core i7-10700        8   

          Windows WindowsActivated RAM Amount and Type Storage Amount and Type  
0  Windows 11 Pro                y           16GB DDR4               512GB SSD  
1  Windows 11 Pro                y           16GB DDR4               512GB SSD  


In [12]:
# Append the core count to the CPU name, e.g. 'AMD Ryzen 7 4800H 8 Core Processor'
core_suffix = ' ' + df_converted['CPUCores'] + ' Core Processor'
df_converted['CPU'] = df_converted['CPU'] + core_suffix
# CPUCores now lives inside the CPU text, so leave it out of the resulting frame
df = df_converted.drop(columns='CPUCores')
print(df)

     ID Number                    Model                                   CPU  \
0  None      1  ROG Strix G513IE_G513IE    AMD Ryzen 7 4800H 8 Core Processor   
1  None      2             Custom Build  Intel Core i7-10700 8 Core Processor   

          Windows WindowsActivated RAM Amount and Type Storage Amount and Type  
0  Windows 11 Pro                y           16GB DDR4               512GB SSD  
1  Windows 11 Pro                y           16GB DDR4               512GB SSD  


In [13]:
# Copy 'Windows' into a new last column named 'Windows Type and Version',
# then remove the original column
df = df.assign(**{'Windows Type and Version': df['Windows']}).drop(columns='Windows')
print(df)

     ID Number                    Model                                   CPU  \
0  None      1  ROG Strix G513IE_G513IE    AMD Ryzen 7 4800H 8 Core Processor   
1  None      2             Custom Build  Intel Core i7-10700 8 Core Processor   

  WindowsActivated RAM Amount and Type Storage Amount and Type  \
0                y           16GB DDR4               512GB SSD   
1                y           16GB DDR4               512GB SSD   

  Windows Type and Version  
0           Windows 11 Pro  
1           Windows 11 Pro  


In [14]:
# Spell out the activation flag: exact 'y' becomes 'Yes', exact 'n' becomes 'No';
# any other value is kept as it is
activation_labels = {'y': 'Yes', 'n': 'No'}
activation_text = df['WindowsActivated'].replace(activation_labels)
df = df.assign(**{'Is Windows Activated?': activation_text}).drop(columns='WindowsActivated')
print(df)

     ID Number                    Model                                   CPU  \
0  None      1  ROG Strix G513IE_G513IE    AMD Ryzen 7 4800H 8 Core Processor   
1  None      2             Custom Build  Intel Core i7-10700 8 Core Processor   

  RAM Amount and Type Storage Amount and Type Windows Type and Version  \
0           16GB DDR4               512GB SSD           Windows 11 Pro   
1           16GB DDR4               512GB SSD           Windows 11 Pro   

  Is Windows Activated?  
0                   Yes  
1                   Yes  


In [None]:
# Write the final frame to CSV; index=False leaves the row index out of the file
df.to_csv(path_or_buf='output_listed.csv', index=False)