## 1. Import Required Libraries

In [2]:
!pip install tqdm

Collecting tqdm
  Using cached tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Downloading tqdm-4.67.1-py3-none-any.whl (78 kB)
Installing collected packages: tqdm
Successfully installed tqdm-4.67.1



[notice] A new release of pip is available: 25.1.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
# Third-party dependencies used by the cells below.
import pandas as pd  # DataFrame construction and CSV export
import numpy as np  # NOTE(review): not referenced in the visible cells - confirm before removing
from tqdm import tqdm  # progress bar around the trace-parsing loop

# Printed only if every import above succeeded.
print("Libraries imported successfully!")

Libraries imported successfully!


## 2. Define Helper Functions

In [4]:
def hex_to_binary_list(hex_address, num_bits=32):
    """
    Convert a hexadecimal address to a fixed-width list of binary bits.

    The result holds the num_bits least-significant bits of the address,
    most-significant bit first. Values narrower than num_bits are left-padded
    with zeros; wider values lose their high-order bits.

    Args:
        hex_address: Hex string (e.g., '7f2d05dc1089'); any form accepted by
            int(hex_address, 16).
        num_bits: Number of bits to output (default 32). Must be >= 0.

    Returns:
        List of integers (0 or 1) of length num_bits.

    Raises:
        ValueError: If hex_address is not a valid non-negative hex number,
            or if num_bits is negative.
    """
    if num_bits < 0:
        raise ValueError(f"num_bits must be non-negative, got {num_bits}")

    # Parse first so an invalid address is rejected even when num_bits == 0
    int_value = int(hex_address, 16)
    if int_value < 0:
        raise ValueError(f"hex_address must be non-negative, got {hex_address!r}")

    # A zero-width request needs its own branch: format(0, '00b') would still
    # produce one digit, and a [-0:] slice would return the whole string.
    if num_bits == 0:
        return []

    # Mask off everything above the requested width, then render the value
    # zero-padded to exactly num_bits binary digits.
    low_bits = int_value & ((1 << num_bits) - 1)
    binary_str = format(low_bits, f'0{num_bits}b')

    # Convert to list of integers
    return [int(bit) for bit in binary_str]


def parse_branch_history(history_str, num_bits=32):
    """
    Parse a branch history string into a fixed-length list of bits.

    Args:
        history_str: Whitespace-separated string of 0s and 1s (may be empty
            or blank).
        num_bits: Number of bits to output (default 32). Must be >= 0.

    Returns:
        List of integers of length num_bits. A shorter history is padded with
        trailing 0s; a longer history keeps only its last num_bits entries.

    Raises:
        ValueError: If a token is not an integer, or if num_bits is negative.
    """
    if num_bits < 0:
        raise ValueError(f"num_bits must be non-negative, got {num_bits}")

    # split() with no argument already ignores leading/trailing whitespace and
    # yields [] for an empty or blank string, so no separate strip() guard is needed.
    history = [int(bit) for bit in history_str.split()]

    overflow = len(history) - num_bits
    if overflow > 0:
        # Keep the last num_bits entries. Slicing from a positive start index
        # (instead of history[-num_bits:]) stays correct when num_bits == 0.
        return history[overflow:]

    # Pad with zeros at the end up to num_bits (no-op when already full length)
    return history + [0] * (-overflow)


# Smoke-test both helpers on a known input and show the leading 10 bits of each
print("Testing helper functions:")

sample_pc_bits = hex_to_binary_list('7f2d05dc1089', 32)
print(f"Hex to binary: {sample_pc_bits[:10]}... (first 10 bits)")

sample_history_bits = parse_branch_history('0 1 1 0', 32)
print(f"Branch history: {sample_history_bits[:10]}... (first 10 bits)")

print("âœ“ Helper functions defined successfully!")

Testing helper functions:
Hex to binary: [0, 0, 0, 0, 0, 1, 0, 1, 1, 1]... (first 10 bits)
Branch history: [0, 1, 1, 0, 0, 0, 0, 0, 0, 0]... (first 10 bits)
âœ“ Helper functions defined successfully!


## 3. Read and Parse the Input File

In [12]:
# Trace file to read, and the CSV path used later by the save cell
input_file = 'calculator(switch case).out'
output_file = 'branch_data_processed_01.csv'

print(f"Reading file: {input_file}")
print("This may take a few minutes...\n")

# One entry per branch record: 32 PC bits + 32 history bits + taken flag
data_rows = []

# Read the whole trace into memory so lines can be examined in pairs
with open(input_file, 'r') as f:
    lines = f.readlines()

print(f"Total lines to process: {len(lines):,}")

# A record is a 'Branch Address:' line immediately followed by a
# 'Branch History:' line. Any other line is skipped one at a time.
skipped_addresses = 0  # address lines that had no history line right after them
i = 0
with tqdm(total=len(lines)//2, desc="Processing branches") as pbar:
    while i < len(lines) - 1:
        # Parse branch address line
        address_line = lines[i].strip()
        if not address_line.startswith('Branch Address:'):
            i += 1
            continue

        # Extract address and taken status ("Branch Address: <hex>, Taken: <int>")
        parts = address_line.split(',')
        hex_address = parts[0].replace('Branch Address:', '').strip()
        taken = int(parts[1].replace('Taken:', '').strip())

        # Peek at the following line without consuming it: if it is not a
        # history line it may itself be the next address line, so advance by
        # one only and let the loop re-examine it instead of dropping it.
        history_line = lines[i + 1].strip()
        if not history_line.startswith('Branch History:'):
            skipped_addresses += 1
            i += 1
            continue

        history_str = history_line.replace('Branch History:', '').strip()

        # Convert to binary lists
        pc_bits = hex_to_binary_list(hex_address, 32)
        history_bits = parse_branch_history(history_str, 32)

        # Combine into single row: PC (32) + History (32) + Taken (1)
        row = pc_bits + history_bits + [taken]
        data_rows.append(row)

        pbar.update(1)
        i += 2  # both lines of the pair were consumed

# Report dropped records instead of skipping them silently
if skipped_addresses:
    print(f"Warning: skipped {skipped_addresses:,} 'Branch Address:' line(s) "
          f"not followed by a 'Branch History:' line")

print(f"\nâœ“ Processed {len(data_rows):,} branch records")

Reading file: calculator(switch case).out
This may take a few minutes...

Total lines to process: 638,406


Processing branches: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 319203/319203 [00:08<00:00, 37672.76it/s]


âœ“ Processed 319,203 branch records





## 4. Create DataFrame

In [13]:
# Column layout mirrors each parsed row: 32 PC bit columns, 32 branch-history
# bit columns, then the single target column.
pc_columns = ['PC_' + str(bit_idx) for bit_idx in range(32)]
history_columns = ['BH_' + str(bit_idx) for bit_idx in range(32)]
target_column = ['Taken']

column_names = [*pc_columns, *history_columns, *target_column]

# Build the frame in one shot from the accumulated rows
print("Creating DataFrame...")
df = pd.DataFrame(data_rows, columns=column_names)

# Report the resulting shape and preview the first rows
n_columns = len(df.columns)
print("\nDataFrame created successfully!")
print(f"Shape: {df.shape}")
print(f"Columns: {n_columns} (32 PC + 32 Branch History + 1 Target)")
print("\nFirst few rows:")
print(df.head())

Creating DataFrame...

DataFrame created successfully!
Shape: (319203, 65)
Columns: 65 (32 PC + 32 Branch History + 1 Target)

First few rows:
   PC_0  PC_1  PC_2  PC_3  PC_4  PC_5  PC_6  PC_7  PC_8  PC_9  ...  BH_23  \
0     1     1     0     1     0     0     1     1     1     0  ...      0   
1     1     1     0     1     0     0     1     1     1     0  ...      0   
2     1     1     0     1     0     0     1     1     1     0  ...      0   
3     1     1     0     1     0     0     1     1     1     0  ...      0   
4     1     1     0     1     0     0     1     1     1     0  ...      0   

   BH_24  BH_25  BH_26  BH_27  BH_28  BH_29  BH_30  BH_31  Taken  
0      0      0      0      0      0      0      0      0      0  
1      0      0      0      0      0      0      0      0      1  
2      0      0      0      0      0      0      0      0      1  
3      0      0      0      0      0      0      0      0      0  
4      0      0      0      0      0      0      0      0  

## 5. Verify Data Quality

In [14]:
separator = "=" * 60

print("Data Quality Check:")
print(separator)

# Total count of null cells across the whole frame
missing_total = df.isnull().sum().sum()
print(f"Missing values: {missing_total}")

# How many columns have each dtype
dtype_counts = df.dtypes.value_counts()
print("\nData types:")
print(dtype_counts)

# Global minimum and maximum (should be 0 or 1 for all columns)
print("\nValue ranges:")
lowest_value = df.min().min()
print(f"  Min value: {lowest_value}")
highest_value = df.max().max()
print(f"  Max value: {highest_value}")

# Class balance of the target column, as counts and as percentages
taken_column = df['Taken']
print("\nTarget distribution:")
print(taken_column.value_counts())
print("\nPercentage:")
print(taken_column.value_counts(normalize=True) * 100)

print("\n" + separator)
print("âœ“ Data quality check complete!")

Data Quality Check:
Missing values: 0

Data types:
int64    65
Name: count, dtype: int64

Value ranges:
  Min value: 0
  Max value: 1

Target distribution:
Taken
1    179024
0    140179
Name: count, dtype: int64

Percentage:
Taken
1    56.084686
0    43.915314
Name: proportion, dtype: float64

âœ“ Data quality check complete!


## 6. Display Sample Rows

In [15]:
print("Sample rows from the processed data:\n")

# Show the first 3 and the last 3 rows, each under its own heading and rule
for heading, take_rows in (("First 3 rows:", df.head), ("\nLast 3 rows:", df.tail)):
    print(heading)
    print("=" * 100)
    display(take_rows(3))

Sample rows from the processed data:

First 3 rows:


Unnamed: 0,PC_0,PC_1,PC_2,PC_3,PC_4,PC_5,PC_6,PC_7,PC_8,PC_9,...,BH_23,BH_24,BH_25,BH_26,BH_27,BH_28,BH_29,BH_30,BH_31,Taken
0,1,1,0,1,0,0,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
1,1,1,0,1,0,0,1,1,1,0,...,0,0,0,0,0,0,0,0,0,1
2,1,1,0,1,0,0,1,1,1,0,...,0,0,0,0,0,0,0,0,0,1



Last 3 rows:


Unnamed: 0,PC_0,PC_1,PC_2,PC_3,PC_4,PC_5,PC_6,PC_7,PC_8,PC_9,...,BH_23,BH_24,BH_25,BH_26,BH_27,BH_28,BH_29,BH_30,BH_31,Taken
319200,1,1,0,0,0,0,0,0,0,0,...,0,0,0,1,1,0,0,0,0,0
319201,1,1,0,0,0,0,0,0,0,0,...,0,0,1,1,0,0,0,0,0,0
319202,1,1,0,0,0,0,0,0,0,0,...,0,1,1,0,0,0,0,0,1,1


## 7. Save to CSV File

In [None]:
# Announce the write up front
print(f"Saving to CSV file: {output_file}", "This may take a moment...\n", sep="\n")

# Write the frame without the index column
df.to_csv(path_or_buf=output_file, index=False)

print(f"âœ“ Data saved successfully to {output_file}")

Saving to CSV file: branch_data_processed_01.csv
This may take a moment...

âœ“ Data saved successfully to branch_data_processed_01.csv

File details:
  Rows: 319,203
  Columns: 65


AttributeError: module 'pandas.io.common' has no attribute 'get_filepath_or_buffer'

## 8. Summary Statistics

In [11]:
banner = "=" * 70

# Header
print(banner)
print(" " * 20 + "PROCESSING COMPLETE")
print(banner)

# Where the data came from, where it went, and how many rows it has
total_records = len(df)
print("\nðŸ“Š Dataset Summary:")
print(f"   Input file: {input_file}")
print(f"   Output file: {output_file}")
print(f"   Total records: {total_records:,}")

# Fixed description of the 65-column layout
print("\nðŸ“ˆ Column Structure:")
for layout_line in (
    "   PC bits (0-31): 32 columns",
    "   Branch History (32-63): 32 columns",
    "   Target (64): 1 column",
    "   Total: 65 columns",
):
    print(layout_line)

# Count and percentage for each value of the target column
print("\nðŸŽ¯ Target Distribution:")
for val, count in df['Taken'].value_counts().items():
    pct = (count / total_records) * 100
    print(f"   Taken={val}: {count:,} ({pct:.2f}%)")

print("\nâœ“ CSV file is ready for machine learning!")
print(banner)

                    PROCESSING COMPLETE

ðŸ“Š Dataset Summary:
   Input file: cp-pin.exe.out
   Output file: branch_data_processed.csv
   Total records: 350,783

ðŸ“ˆ Column Structure:
   PC bits (0-31): 32 columns
   Branch History (32-63): 32 columns
   Target (64): 1 column
   Total: 65 columns

ðŸŽ¯ Target Distribution:
   Taken=1: 197,297 (56.24%)
   Taken=0: 153,486 (43.76%)

âœ“ CSV file is ready for machine learning!
