In [None]:
# python 3.8.10 environment
import pandas as pd
import numpy as np
import glob
import os
import re

In [76]:
# Read in all text files containing raw info and add to dataframe
folder_path = "."

# glob returns matches in arbitrary, filesystem-dependent order; sort them so
# the row order of the combined frame is the same on every run.
file_paths = sorted(glob.glob(os.path.join(folder_path, "*vol*.txt")))

# Fail with an actionable message instead of pandas' generic
# "No objects to concatenate" when no input file matches the pattern.
if not file_paths:
    raise FileNotFoundError(
        f"No '*vol*.txt' files found in {os.path.abspath(folder_path)!r}"
    )

# Each file is comma-separated with no header row; its first line is skipped.
dfs = [pd.read_csv(path, sep=',', header=None, skiprows=1) for path in file_paths]

df_combined = pd.concat(dfs, ignore_index=True)

print(df_combined)

   0      1                        2                          3   \
0   1  test1  ASUSTeK COMPUTER INC.    ROG Strix G513IE_G513IE     
1   0  test2  ASUSTeK COMPUTER INC.    ROG Strix G513IE_G513IE     
2   0  test3  ASUSTeK COMPUTER INC.    ROG Strix G513IE_G513IE     

                  4                                         5   6   \
0  N3NRKD03324310B    AMD Ryzen 7 4800H with Radeon Graphics     8   
1  N3NRKD03324310B    AMD Ryzen 7 4800H with Radeon Graphics     8   
2  N3NRKD03324310B    AMD Ryzen 7 4800H with Radeon Graphics     8   

            7   8   9                            10  \
0  16558182400  26 NaN  Microsoft Windows 11 Home     
1  16558182400  26 NaN  Microsoft Windows 11 Home     
2  16558182400  26 NaN  Microsoft Windows 11 Home     

                                      11  \
0  The machine is permanently activated.   
1  The machine is permanently activated.   
2  The machine is permanently activated.   

                                                

In [77]:
# Express column 7 (bytes) as whole GB, always rounding UP, with a 'GB' suffix.
# The divisor is 1024**3, i.e. binary gigabytes.
ram_gb_rounded_up = np.ceil(df_combined[7] / 2**30).astype(int)
df_combined['RAM'] = ram_gb_rounded_up.astype(str) + 'GB'
display(df_combined)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,RAM
0,1,test1,ASUSTeK COMPUTER INC.,ROG Strix G513IE_G513IE,N3NRKD03324310B,AMD Ryzen 7 4800H with Radeon Graphics,8,16558182400,26,,Microsoft Windows 11 Home,The machine is permanently activated.,SSD 512110190592 0000_0000_0100_0000_E4D...,16GB
1,0,test2,ASUSTeK COMPUTER INC.,ROG Strix G513IE_G513IE,N3NRKD03324310B,AMD Ryzen 7 4800H with Radeon Graphics,8,16558182400,26,,Microsoft Windows 11 Home,The machine is permanently activated.,Unspecified 1500301909504 NA4BD4C1 ...,16GB
2,0,test3,ASUSTeK COMPUTER INC.,ROG Strix G513IE_G513IE,N3NRKD03324310B,AMD Ryzen 7 4800H with Radeon Graphics,8,16558182400,26,,Microsoft Windows 11 Home,The machine is permanently activated.,Unspecified 1500301909504 0000000000000000 ...,16GB


In [78]:
# Define RAM lookup table: numeric memory-type code -> human-readable name.
# Used to translate the code stored in column 8 of the raw data.
ram_lookup_table = {
    0: "Unknown",
    1: "Other",
    2: "DRAM",
    3: "Synchronous DRAM",
    4: "Cache DRAM",
    5: "EDO",
    6: "EDRAM",
    7: "VRAM",
    8: "SRAM",
    9: "RAM",
    10: "ROM",
    11: "Flash",
    12: "EEPROM",
    13: "FEPROM",
    14: "EPROM",
    15: "CDRAM",
    16: "3DRAM",
    17: "SDRAM",
    18: "SGRAM",
    19: "RDRAM",
    20: "DDR",
    21: "DDR2",
    22: "DDR2 FB-DIMM",
    23: "Reserved",
    24: "DDR3",
    25: "FBD2",
    26: "DDR4",
    27: "LPDDR",
    28: "LPDDR2",
    29: "LPDDR3",
    30: "LPDDR4",
    31: "Logical non-volatile device",
    32: "HBM",
    33: "HBM2",
    34: "DDR5",
    # 0x23 in the SMBIOS Memory Device type list; without it LPDDR5 machines
    # fall through the lookup and come out as NaN.
    35: "LPDDR5",
}

In [79]:
# Map the numeric codes to names and add to new column for ram amount and type.
# A code that is missing from ram_lookup_table maps to NaN, and NaN would blank
# out the whole combined string; fall back to "Unknown" so the amount survives.
ram_type_names = df_combined[8].map(ram_lookup_table).fillna('Unknown')
df_combined['RAM Amount and Type'] = df_combined['RAM'] + ' ' + ram_type_names
display(df_combined)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,RAM,RAM Amount and Type
0,1,test1,ASUSTeK COMPUTER INC.,ROG Strix G513IE_G513IE,N3NRKD03324310B,AMD Ryzen 7 4800H with Radeon Graphics,8,16558182400,26,,Microsoft Windows 11 Home,The machine is permanently activated.,SSD 512110190592 0000_0000_0100_0000_E4D...,16GB,16GB DDR4
1,0,test2,ASUSTeK COMPUTER INC.,ROG Strix G513IE_G513IE,N3NRKD03324310B,AMD Ryzen 7 4800H with Radeon Graphics,8,16558182400,26,,Microsoft Windows 11 Home,The machine is permanently activated.,Unspecified 1500301909504 NA4BD4C1 ...,16GB,16GB DDR4
2,0,test3,ASUSTeK COMPUTER INC.,ROG Strix G513IE_G513IE,N3NRKD03324310B,AMD Ryzen 7 4800H with Radeon Graphics,8,16558182400,26,,Microsoft Windows 11 Home,The machine is permanently activated.,Unspecified 1500301909504 0000000000000000 ...,16GB,16GB DDR4


In [80]:
# CPU Generation
def ordinal(n: int) -> str:
    """Return an integer as an ordinal string (1 -> 1st, 2 -> 2nd, etc.)."""
    # 11, 12 and 13 (also 111, 112, ...) take "th" instead of "st"/"nd"/"rd".
    if 11 <= n % 100 <= 13:
        suffix = "th"
    else:
        suffix = {1: "st", 2: "nd", 3: "rd"}.get(n % 10, "th")
    return f"{n}{suffix}"

def get_cpu_generation(cpu_name: str) -> str:
    """
    Determine CPU generation from CPU name string.

    Supports:
      - Intel Core i3/i5/i7/i9 (desktop & mobile)
      - Intel Core Ultra (desktop & mobile, with mobile suffixes)
      - AMD Ryzen (desktop & mobile), including "PRO" models

    Parameters
    ----------
    cpu_name : str
        CPU name as reported by the machine. Matching is case-insensitive.

    Returns
    -------
    str
        A label such as "8th Gen", or "Unknown generation" when the input is
        not a string or matches none of the supported naming schemes.
    """
    if not isinstance(cpu_name, str):
        return "Unknown generation"

    cpu_lower = cpu_name.lower()

    # ---- Intel Core i3/i5/i7/i9 (desktop & mobile) ----
    # The pattern captures 3 to 5 digits, so only those lengths occur below.
    intel_match = re.search(r'i[3579]-([0-9]{3,5})[a-z]*', cpu_lower)
    if intel_match:
        digits = intel_match.group(1)
        if len(digits) == 3:
            # 1st Gen (e.g. i5-750)
            gen = 1
        elif len(digits) == 4 and digits[0] == "1":
            # 10th Gen+ low-power mobile parts use four digits (i7-1065G7,
            # i5-1135G7, i5-1235U). First-generation parts use three digits,
            # so a leading "1" here is the first half of a two-digit generation.
            gen = int(digits[:2])
        elif len(digits) == 4:
            # 2nd-9th Gen (e.g. i5-8250U)
            gen = int(digits[0])
        else:
            # Five digits: 10th Gen+ (e.g. i7-10750H)
            gen = int(digits[:2])
        return f"{ordinal(gen)} Gen"

    # ---- Intel Core Ultra (desktop & mobile) ----
    # Handles: "Core(TM) Ultra" or "Core Ultra"
    # Allows dash or space, and optional suffixes like U, H, KF, K, etc.
    # NOTE(review): every Core Ultra model is labelled "15th Gen" regardless of
    # its series number - confirm this is the intended labelling.
    ultra_match = re.search(
        r'core(\(tm\))?\s+ultra\s+(\d)[-\s](\d+)([a-z]{0,3})?', cpu_lower
    )
    if ultra_match:
        return "15th Gen"

    # ---- AMD Ryzen (desktop & mobile) ----
    # An optional "pro" token may sit between the tier digit and the model
    # number (e.g. "Ryzen 7 PRO 4750U"). The generation is taken from the first
    # digit of the model number.
    ryzen_match = re.search(
        r'ryzen\s+\d+\s+(?:pro\s+)?([0-9]{3,5})[a-z]*', cpu_lower
    )
    if ryzen_match:
        digits = ryzen_match.group(1)
        gen = int(digits[0])
        return f"{ordinal(gen)} Gen"

    return "Unknown generation"

In [81]:
# Derive a generation label for every row from its raw CPU name (column 5)
df_combined["CPU Generation"] = df_combined[5].map(get_cpu_generation)
display(df_combined)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,RAM,RAM Amount and Type,CPU Generation
0,1,test1,ASUSTeK COMPUTER INC.,ROG Strix G513IE_G513IE,N3NRKD03324310B,AMD Ryzen 7 4800H with Radeon Graphics,8,16558182400,26,,Microsoft Windows 11 Home,The machine is permanently activated.,SSD 512110190592 0000_0000_0100_0000_E4D...,16GB,16GB DDR4,4th Gen
1,0,test2,ASUSTeK COMPUTER INC.,ROG Strix G513IE_G513IE,N3NRKD03324310B,AMD Ryzen 7 4800H with Radeon Graphics,8,16558182400,26,,Microsoft Windows 11 Home,The machine is permanently activated.,Unspecified 1500301909504 NA4BD4C1 ...,16GB,16GB DDR4,4th Gen
2,0,test3,ASUSTeK COMPUTER INC.,ROG Strix G513IE_G513IE,N3NRKD03324310B,AMD Ryzen 7 4800H with Radeon Graphics,8,16558182400,26,,Microsoft Windows 11 Home,The machine is permanently activated.,Unspecified 1500301909504 0000000000000000 ...,16GB,16GB DDR4,4th Gen


In [82]:
# Trim leading/trailing whitespace in every object-dtype (text) column
text_columns = df_combined.select_dtypes(include=['object']).columns
for col in text_columns:
    stripped = df_combined[col].str.strip()
    df_combined[col] = stripped
display(df_combined)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,RAM,RAM Amount and Type,CPU Generation
0,1,test1,ASUSTeK COMPUTER INC.,ROG Strix G513IE_G513IE,N3NRKD03324310B,AMD Ryzen 7 4800H with Radeon Graphics,8,16558182400,26,,Microsoft Windows 11 Home,The machine is permanently activated.,SSD 512110190592 0000_0000_0100_0000_E4D...,16GB,16GB DDR4,4th Gen
1,0,test2,ASUSTeK COMPUTER INC.,ROG Strix G513IE_G513IE,N3NRKD03324310B,AMD Ryzen 7 4800H with Radeon Graphics,8,16558182400,26,,Microsoft Windows 11 Home,The machine is permanently activated.,Unspecified 1500301909504 NA4BD4C1 ...,16GB,16GB DDR4,4th Gen
2,0,test3,ASUSTeK COMPUTER INC.,ROG Strix G513IE_G513IE,N3NRKD03324310B,AMD Ryzen 7 4800H with Radeon Graphics,8,16558182400,26,,Microsoft Windows 11 Home,The machine is permanently activated.,Unspecified 1500301909504 0000000000000000 ...,16GB,16GB DDR4,4th Gen


In [83]:
# Only keep first 4 words for Ryzen
keyword = 'AMD Ryzen'
# True for rows whose CPU string (column 5) contains the keyword, ignoring case;
# missing values count as no match.
mask = df_combined[5].str.contains(keyword, case=False, na=False)

def _shorten_ryzen(row):
    """Return the first four words of a Ryzen CPU name; other rows pass through unchanged."""
    if mask[row.name]:
        return ' '.join(row[5].split()[:4])
    return row[5]

df_combined['CPU'] = df_combined.apply(_shorten_ryzen, axis=1)
display(df_combined)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,RAM,RAM Amount and Type,CPU Generation,CPU
0,1,test1,ASUSTeK COMPUTER INC.,ROG Strix G513IE_G513IE,N3NRKD03324310B,AMD Ryzen 7 4800H with Radeon Graphics,8,16558182400,26,,Microsoft Windows 11 Home,The machine is permanently activated.,SSD 512110190592 0000_0000_0100_0000_E4D...,16GB,16GB DDR4,4th Gen,AMD Ryzen 7 4800H
1,0,test2,ASUSTeK COMPUTER INC.,ROG Strix G513IE_G513IE,N3NRKD03324310B,AMD Ryzen 7 4800H with Radeon Graphics,8,16558182400,26,,Microsoft Windows 11 Home,The machine is permanently activated.,Unspecified 1500301909504 NA4BD4C1 ...,16GB,16GB DDR4,4th Gen,AMD Ryzen 7 4800H
2,0,test3,ASUSTeK COMPUTER INC.,ROG Strix G513IE_G513IE,N3NRKD03324310B,AMD Ryzen 7 4800H with Radeon Graphics,8,16558182400,26,,Microsoft Windows 11 Home,The machine is permanently activated.,Unspecified 1500301909504 0000000000000000 ...,16GB,16GB DDR4,4th Gen,AMD Ryzen 7 4800H


In [84]:
# Rewrite any value containing "Intel(R) Core(TM) <model>" as "Intel Core <model>".
# The pattern consumes the text before the marker and after the model token,
# so only the marker and the next whitespace-delimited word survive.
df_combined['CPU'] = df_combined['CPU'].str.replace(
    pat=r'.*Intel\(R\) Core\(TM\)\s+(\S+).*',
    repl=r'Intel Core \1',
    regex=True,
)
display(df_combined)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,RAM,RAM Amount and Type,CPU Generation,CPU
0,1,test1,ASUSTeK COMPUTER INC.,ROG Strix G513IE_G513IE,N3NRKD03324310B,AMD Ryzen 7 4800H with Radeon Graphics,8,16558182400,26,,Microsoft Windows 11 Home,The machine is permanently activated.,SSD 512110190592 0000_0000_0100_0000_E4D...,16GB,16GB DDR4,4th Gen,AMD Ryzen 7 4800H
1,0,test2,ASUSTeK COMPUTER INC.,ROG Strix G513IE_G513IE,N3NRKD03324310B,AMD Ryzen 7 4800H with Radeon Graphics,8,16558182400,26,,Microsoft Windows 11 Home,The machine is permanently activated.,Unspecified 1500301909504 NA4BD4C1 ...,16GB,16GB DDR4,4th Gen,AMD Ryzen 7 4800H
2,0,test3,ASUSTeK COMPUTER INC.,ROG Strix G513IE_G513IE,N3NRKD03324310B,AMD Ryzen 7 4800H with Radeon Graphics,8,16558182400,26,,Microsoft Windows 11 Home,The machine is permanently activated.,Unspecified 1500301909504 0000000000000000 ...,16GB,16GB DDR4,4th Gen,AMD Ryzen 7 4800H


In [85]:

# Append column 6 to the CPU label as "<n> Core Processor".
# NOTE: 'CPU' is rebuilt from its own value, so re-running this cell appends
# the suffix a second time.
core_suffix = ' ' + df_combined[6].astype(str) + ' Core Processor'
df_combined['CPU'] = df_combined['CPU'] + core_suffix
display(df_combined)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,RAM,RAM Amount and Type,CPU Generation,CPU
0,1,test1,ASUSTeK COMPUTER INC.,ROG Strix G513IE_G513IE,N3NRKD03324310B,AMD Ryzen 7 4800H with Radeon Graphics,8,16558182400,26,,Microsoft Windows 11 Home,The machine is permanently activated.,SSD 512110190592 0000_0000_0100_0000_E4D...,16GB,16GB DDR4,4th Gen,AMD Ryzen 7 4800H 8 Core Processor
1,0,test2,ASUSTeK COMPUTER INC.,ROG Strix G513IE_G513IE,N3NRKD03324310B,AMD Ryzen 7 4800H with Radeon Graphics,8,16558182400,26,,Microsoft Windows 11 Home,The machine is permanently activated.,Unspecified 1500301909504 NA4BD4C1 ...,16GB,16GB DDR4,4th Gen,AMD Ryzen 7 4800H 8 Core Processor
2,0,test3,ASUSTeK COMPUTER INC.,ROG Strix G513IE_G513IE,N3NRKD03324310B,AMD Ryzen 7 4800H with Radeon Graphics,8,16558182400,26,,Microsoft Windows 11 Home,The machine is permanently activated.,Unspecified 1500301909504 0000000000000000 ...,16GB,16GB DDR4,4th Gen,AMD Ryzen 7 4800H 8 Core Processor


In [86]:
# Drop the 'Microsoft' prefix from the OS name in column 10. Removing the word
# leaves the space that followed it at the start of the string, and the
# whitespace-stripping cell has already run, so strip again here.
df_combined['Windows Type and Version'] = (
    df_combined[10].str.replace('Microsoft', '', regex=False).str.strip()
)
display(df_combined)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,RAM,RAM Amount and Type,CPU Generation,CPU,Windows Type and Version
0,1,test1,ASUSTeK COMPUTER INC.,ROG Strix G513IE_G513IE,N3NRKD03324310B,AMD Ryzen 7 4800H with Radeon Graphics,8,16558182400,26,,Microsoft Windows 11 Home,The machine is permanently activated.,SSD 512110190592 0000_0000_0100_0000_E4D...,16GB,16GB DDR4,4th Gen,AMD Ryzen 7 4800H 8 Core Processor,Windows 11 Home
1,0,test2,ASUSTeK COMPUTER INC.,ROG Strix G513IE_G513IE,N3NRKD03324310B,AMD Ryzen 7 4800H with Radeon Graphics,8,16558182400,26,,Microsoft Windows 11 Home,The machine is permanently activated.,Unspecified 1500301909504 NA4BD4C1 ...,16GB,16GB DDR4,4th Gen,AMD Ryzen 7 4800H 8 Core Processor,Windows 11 Home
2,0,test3,ASUSTeK COMPUTER INC.,ROG Strix G513IE_G513IE,N3NRKD03324310B,AMD Ryzen 7 4800H with Radeon Graphics,8,16558182400,26,,Microsoft Windows 11 Home,The machine is permanently activated.,Unspecified 1500301909504 0000000000000000 ...,16GB,16GB DDR4,4th Gen,AMD Ryzen 7 4800H 8 Core Processor,Windows 11 Home


In [87]:
# Windows Activated column
# Whole-value replacement on column 11: only cells exactly equal to one of the
# keys are rewritten; any other text is carried over unchanged.
# NOTE(review): presumably 'n' is what the collection script writes for a
# non-activated machine - confirm against the raw files.
activation_labels = {
    'The machine is permanently activated.': 'Yes',
    'n': 'No',
}
df_combined['Is Windows Activated?'] = df_combined[11].replace(activation_labels)
display(df_combined)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,RAM,RAM Amount and Type,CPU Generation,CPU,Windows Type and Version,Is Windows Activated?
0,1,test1,ASUSTeK COMPUTER INC.,ROG Strix G513IE_G513IE,N3NRKD03324310B,AMD Ryzen 7 4800H with Radeon Graphics,8,16558182400,26,,Microsoft Windows 11 Home,The machine is permanently activated.,SSD 512110190592 0000_0000_0100_0000_E4D...,16GB,16GB DDR4,4th Gen,AMD Ryzen 7 4800H 8 Core Processor,Windows 11 Home,Yes
1,0,test2,ASUSTeK COMPUTER INC.,ROG Strix G513IE_G513IE,N3NRKD03324310B,AMD Ryzen 7 4800H with Radeon Graphics,8,16558182400,26,,Microsoft Windows 11 Home,The machine is permanently activated.,Unspecified 1500301909504 NA4BD4C1 ...,16GB,16GB DDR4,4th Gen,AMD Ryzen 7 4800H 8 Core Processor,Windows 11 Home,Yes
2,0,test3,ASUSTeK COMPUTER INC.,ROG Strix G513IE_G513IE,N3NRKD03324310B,AMD Ryzen 7 4800H with Radeon Graphics,8,16558182400,26,,Microsoft Windows 11 Home,The machine is permanently activated.,Unspecified 1500301909504 0000000000000000 ...,16GB,16GB DDR4,4th Gen,AMD Ryzen 7 4800H 8 Core Processor,Windows 11 Home,Yes


In [105]:
# Split the combined frame on the flag in column 0:
#   0 -> df_system, 1 -> df_listing (each re-indexed from zero)
is_system_row = df_combined[0] == 0
is_listing_row = df_combined[0] == 1

df_system = df_combined.loc[is_system_row].reset_index(drop=True)
df_listing = df_combined.loc[is_listing_row].reset_index(drop=True)

In [106]:
# Give the columns kept for the system export readable names, then drop the
# raw and derived columns that are not part of that export.
system_column_names = {
    1: 'Barcode',
    2: 'Manufacturer',
    3: 'Model',
    4: 'Serial',
    5: 'CPU_Raw',
    9: 'Corporate Supplier',
}
system_unused_columns = [
    0, 6, 7, 8, 10, 11, 12,
    'RAM', 'RAM Amount and Type', 'CPU',
    'Windows Type and Version', 'Is Windows Activated?',
]
df_system = (
    df_system
    .rename(columns=system_column_names)
    .drop(columns=system_unused_columns)
)
display(df_system)

Unnamed: 0,Barcode,Manufacturer,Model,Serial,CPU_Raw,Corporate Supplier,CPU Generation
0,test2,ASUSTeK COMPUTER INC.,ROG Strix G513IE_G513IE,N3NRKD03324310B,AMD Ryzen 7 4800H with Radeon Graphics,,4th Gen
1,test3,ASUSTeK COMPUTER INC.,ROG Strix G513IE_G513IE,N3NRKD03324310B,AMD Ryzen 7 4800H with Radeon Graphics,,4th Gen


In [107]:
# Clean up NaN and add ID column
# Blank the missing values first so the string cast does not produce 'nan'.
df_system = df_system.fillna('').astype(str)
# Placeholder ID as the first column; inserted after the cast, so it stays None.
df_system.insert(loc=0, column='ID', value=None) # can change value
display(df_system)

Unnamed: 0,ID,Barcode,Manufacturer,Model,Serial,CPU_Raw,Corporate Supplier,CPU Generation
0,,test2,ASUSTeK COMPUTER INC.,ROG Strix G513IE_G513IE,N3NRKD03324310B,AMD Ryzen 7 4800H with Radeon Graphics,,4th Gen
1,,test3,ASUSTeK COMPUTER INC.,ROG Strix G513IE_G513IE,N3NRKD03324310B,AMD Ryzen 7 4800H with Radeon Graphics,,4th Gen


In [108]:
# separate disks frame
# Column 12 holds the '|'-separated disk entries; split it out before the
# column is dropped from the listing frame.
disks = df_listing[12].str.split('|', expand=True)

# Name the kept columns and discard the raw/intermediate ones.
df_listing = (
    df_listing
    .rename(columns={1: 'Listing Number', 3: 'Model'})
    .drop(columns=[0, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 'RAM', 'CPU Generation'])
)
display(df_listing)

Unnamed: 0,Listing Number,Model,RAM Amount and Type,CPU,Windows Type and Version,Is Windows Activated?
0,test1,ROG Strix G513IE_G513IE,16GB DDR4,AMD Ryzen 7 4800H 8 Core Processor,Windows 11 Home,Yes


In [109]:
# create dataframe of storage disk info and remove USB drive (serial number is zeros) info
# Any cell containing 16 consecutive zeros is replaced wholesale with NaN.
disks = disks.replace(to_replace=r'.*0{16}.*', value=np.nan, regex=True)
display(disks)

Unnamed: 0,0
0,SSD 512110190592 0000_0000_0100_0000_E4D...


In [110]:
def push_nans_right(row):
    """Return the row's values as a list with non-missing entries first.

    The original order of the non-missing values is kept, and the list is
    padded with NaN on the right back to the row's original length.
    """
    kept = row.dropna().tolist()
    padding = [np.nan] * (len(row) - len(kept))
    return kept + padding

# clean up NaNs: left-pack each row's entries, then drop the columns that end
# up empty for every row
disks = (
    disks
    .apply(push_nans_right, axis=1, result_type='expand')
    .dropna(axis=1, how='all')
)

display(disks)

Unnamed: 0,0
0,SSD 512110190592 0000_0000_0100_0000_E4D...


In [111]:
# define nice sizes in GB
nice_sizes = [128, 256, 512, 1024, 2048, 4096, 8192]

def snap_to_nice(size_gb):
    """Return the entry of ``nice_sizes`` closest to ``size_gb``.

    On a tie the entry that appears first in ``nice_sizes`` wins.
    """
    return min(nice_sizes, key=lambda x: abs(x - size_gb))

def format_storage(cell):
    """Turn a raw disk entry into a label such as ``"512GB SSD"``.

    The entry is split on whitespace; the first token is used as the storage
    type and the second as the size in bytes. Any further tokens are ignored.

    Parameters
    ----------
    cell : str or NaN
        One cell of the disks frame.

    Returns
    -------
    str or NaN
        ``"<nice size>GB <type>"`` for a well-formed entry; NaN for a missing
        cell; the cell unchanged when it has fewer than two tokens or its size
        token is not an integer.
    """
    if pd.isna(cell):
        return np.nan
    parts = str(cell).split()
    if len(parts) < 2:
        return cell

    storage_type, raw_size = parts[0], parts[1]
    try:
        size_bytes = int(raw_size)
    except ValueError:
        # One malformed entry should not abort formatting of the whole frame;
        # pass it through untouched, like the too-short case above.
        return cell

    # Convert bytes to GB (binary: 1024**3 bytes per GB)
    size_gb = size_bytes / (1024**3)

    # Snap to nearest nice value
    snapped = snap_to_nice(size_gb)

    return f"{snapped}GB {storage_type}"

# Format every cell of the disks frame
disks_formatted = disks.applymap(format_storage)
# Label the columns Disk1, Disk2, ... in their existing left-to-right order
disk_count = disks_formatted.shape[1]
disks_formatted.columns = [f"Disk{n}" for n in range(1, disk_count + 1)]
display(disks_formatted)

Unnamed: 0,Disk1
0,512GB SSD


In [113]:
# Attach the disk columns to the listing frame side by side.
df_listing_2 = pd.concat([df_listing, disks_formatted], axis=1)
# Blank missing values before the string cast so an absent disk is exported as
# an empty cell rather than the literal text 'nan' (df_system is cleaned the
# same way before it is cast).
df_listing_2 = df_listing_2.fillna('').astype(str)
display(df_listing_2)

Unnamed: 0,Listing Number,Model,RAM Amount and Type,CPU,Windows Type and Version,Is Windows Activated?,Disk1
0,test1,ROG Strix G513IE_G513IE,16GB DDR4,AMD Ryzen 7 4800H 8 Core Processor,Windows 11 Home,Yes,512GB SSD


In [114]:
# Write both result frames to CSV in the working directory, without the index column
for frame, csv_name in (
    (df_system, 'output_system.csv'),
    (df_listing_2, 'output_listing.csv'),
):
    frame.to_csv(csv_name, index=False)