# Transform the data

## Objective: 
Create a pipeline to transform the log data into a dataframe we can use for predictive modelling.

## Table format: Features

### Input Features

- $ n $: Number of elements (e.g., 16, 31).
- $ k $: Number of partitions (e.g., 5, 4).
- Total sum: $ \sum S $ (requires input numbers).
- Variance: $ \text{var}(S) $.
- Skewness: $ \text{skew}(S) $, the asymmetry of the distribution of the input numbers.
- Max/min number.
- Average subset sum: $ \text{total sum} / k $.

### Solver Features (First $ k $ Logs at stackDepth=3):

**For each log (up to $ k $):**

- evts: Events at stackDepth=3.
- expandEvts: Expansions.
- pruneBacktrackEvts: Pruning backtracks.
- backtrackEvts: Non-pruning backtracks.
- strengthenEvts: Constraint tightenings.
- maxStackDepth: Maximum depth reached.
- Subset sums: Sum of numbers assigned to each subset based on path (requires input numbers).
- Subset sum variance: Variance of subset sums.
- Aggregated: Average or max evts, expandEvts, pruneBacktrackEvts across the $ k $ logs.
- num_stackdepth3_logs: Number of stackDepth=3 logs (proxy for search difficulty).

### Termination/Timeout Features
- expandEvts of the final log entry (target variable; stored as `final_expandEvts`).
- Censored flag: 1 for timeouts, 0 for completions.
- Objective value: maxsum - minsum (if available, e.g., 2 for $ n=10, k=3 $).

In [19]:
import json
import re
import os
import pandas as pd
import numpy as np
from scipy import stats


## 1. Create a DataFrame with solver features
The solver features are as listed above. The index will be the file names.

In [30]:

# Numeric fields copied from each of the first k log entries (suffixed _1 .. _k).
_PER_LOG_FIELDS = ('evts', 'expandEvts', 'pruneBacktrackEvts',
                   'backtrackEvts', 'strengthenEvts', 'maxStackDepth')

# Fields summarised as avg_<field> / max_<field> over the first k log entries.
_AGGREGATED_FIELDS = ('evts', 'expandEvts', 'pruneBacktrackEvts')


def _features_from_logs(filename, logs):
    """Build the flat feature dict for one instance from its non-empty list of log entries."""
    # n and k are encoded in the file name, e.g. "n10k3_v1.txt" -> n=10, k=3.
    match = re.search(r'n(\d+)k(\d+)', filename)
    if match:
        n = int(match.group(1))  # Number of values
        k = int(match.group(2))  # Number of partitions
    else:
        # Fallback when the name carries no n/k: clamp the log count to [3, 5]
        # and mark n as unknown (0).
        k = max(3, min(5, len(logs)))
        n = 0

    features = {
        'filename': filename,
        'n': n,
        'k': k,
        'num_stackdepth3_logs': 0,
    }

    # NOTE(review): only the first k entries are inspected, so
    # num_stackdepth3_logs is capped at k, and when an instance has <= k log
    # entries the final (termination/timeout) entry is also emitted as a
    # per-log feature -- confirm this is intended, since that entry carries
    # the target.
    first_logs = logs[:k]
    for position, log in enumerate(first_logs, start=1):
        if log.get('stackDepth', 0) == 3:
            features['num_stackdepth3_logs'] += 1
        for field in _PER_LOG_FIELDS:
            if field in log:
                features[f'{field}_{position}'] = log[field]

    # The last entry is treated as the termination / timeout record.
    last_log = logs[-1]
    features['censored'] = 1 if last_log.get('event') == 'TIMEOUT' else 0
    features['final_expandEvts'] = last_log.get('expandEvts', 0)
    features['final_maxStackDepth'] = last_log.get('maxStackDepth', 0)

    # Aggregates over the same first-k window, ignoring entries lacking the field.
    for field in _AGGREGATED_FIELDS:
        values = [log[field] for log in first_logs if field in log]
        if values:
            features[f'avg_{field}'] = sum(values) / len(values)
            features[f'max_{field}'] = max(values)

    return features


def extract_ml_features(jsonl_path):
    """
    Extract ML features from the ml_features.jsonl file.

    Each non-blank line is parsed as a JSON object mapping a file name to a
    list of log entries. Lines that fail to parse or to process are reported
    with a printed message and skipped; instances with an empty log list are
    skipped silently.

    Args:
        jsonl_path: Path to the ml_features.jsonl file

    Returns:
        pandas.DataFrame: One row per instance, indexed by ``filename``. When
        no instance could be extracted, an empty DataFrame with an empty
        ``filename`` index is returned (previously this raised ``KeyError``).
    """
    data_list = []

    with open(jsonl_path, 'r') as f:
        for line in f:
            stripped = line.strip()
            if not stripped:
                # Blank lines (e.g. a trailing newline) are not parse errors.
                continue
            try:
                line_data = json.loads(stripped)

                # Each line contains a single key (filename) with an array of log entries
                for filename, logs in line_data.items():
                    if not logs:  # Skip if no logs
                        continue
                    data_list.append(_features_from_logs(filename, logs))

            except json.JSONDecodeError as e:
                print(f"Error parsing JSON line: {e}")
                continue
            except Exception as e:
                # Best effort: one malformed record must not abort the whole file.
                print(f"Error processing line: {e}")
                continue

    if data_list:
        df = pd.DataFrame(data_list).set_index("filename")
    else:
        # No records: there is no "filename" column to promote, so build the
        # empty frame with the expected index name directly.
        df = pd.DataFrame(index=pd.Index([], name="filename"))

    print(f"Extracted features for {len(df)} instances")
    print(f"DataFrame shape: {df.shape}")

    return df





# Build the solver-feature table (indexed by filename) from the JSONL log dump.
solver_features_df = extract_ml_features("ml_features.jsonl")
# NOTE(review): disabled export below. No `df` is assigned above this cell, and
# index=False would drop the filename index from the CSV.
# df.to_csv("ml_features.csv", index=False)


Extracted features for 620 instances
DataFrame shape: (620, 42)


In [31]:
solver_features_df

Unnamed: 0_level_0,n,k,num_stackdepth3_logs,evts_1,expandEvts_1,pruneBacktrackEvts_1,backtrackEvts_1,strengthenEvts_1,maxStackDepth_1,evts_2,...,pruneBacktrackEvts_4,backtrackEvts_4,strengthenEvts_4,maxStackDepth_4,evts_5,expandEvts_5,pruneBacktrackEvts_5,backtrackEvts_5,strengthenEvts_5,maxStackDepth_5
filename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
n10k3_v1.txt,10,3,3,4,4,0,0,0,3,60,...,,,,,,,,,,
n10k3_v2.txt,10,3,3,4,4,0,0,0,3,59,...,,,,,,,,,,
n10k3_v3.txt,10,3,3,4,4,0,0,0,3,66,...,,,,,,,,,,
n10k3_v4.txt,10,3,3,4,4,0,0,0,3,106,...,,,,,,,,,,
n10k3_v5.txt,10,3,3,4,4,0,0,0,3,21,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
n49k4_v1.txt,49,4,1,4,4,0,0,0,3,2318009416,...,,,,,,,,,,
n49k4_v2.txt,49,4,1,4,4,0,0,0,3,2511054050,...,,,,,,,,,,
n49k4_v3.txt,49,4,1,4,4,0,0,0,3,2402044766,...,,,,,,,,,,
n49k4_v4.txt,49,4,1,4,4,0,0,0,3,2319516173,...,,,,,,,,,,


## 2. Create a DataFrame with input features

In [32]:

# Column order of the frame returned by process_path_features.
_INPUT_FEATURE_COLUMNS = [
    'n', 'k', 'total_sum', 'variance', 'skewness', 'max_num', 'min_num',
    'avg_subset_sum', 'perfect_partition_sum', 'max_to_avg_ratio',
    'range_to_avg_ratio', 'coef_of_variation',
]


def _read_instance_features(file_path):
    """Parse one instance file and return its input features as a dict."""
    with open(file_path, 'r') as f:
        lines = f.readlines()

    # File layout as read here: line 0 = solution (-1 if none), line 1 = k,
    # remaining lines = one integer per line.
    int(lines[0].strip())              # parsed only to validate the header; not used as a feature
    k = int(lines[1].strip())          # number of partitions
    # Skip blank lines (e.g. trailing newlines) instead of failing on int('').
    numbers = [int(text) for text in (line.strip() for line in lines[2:]) if text]
    n = len(numbers)                   # number of elements

    total_sum = sum(numbers)
    max_num = max(numbers) if numbers else 0
    min_num = min(numbers) if numbers else 0
    # Guarded once and reused below, so k == 0 can no longer raise part-way
    # through a row.
    avg_subset_sum = total_sum / k if k > 0 else 0

    # Avoid numpy's empty-slice warnings / NaN on an empty number list.
    mean = np.mean(numbers) if numbers else 0
    std_dev = np.std(numbers) if numbers else 0

    return {
        'n': n,
        'k': k,
        'total_sum': total_sum,
        'variance': np.var(numbers) if n > 1 else 0,
        'skewness': stats.skew(numbers) if n > 2 else 0,
        'max_num': max_num,
        'min_num': min_num,
        'avg_subset_sum': avg_subset_sum,
        # In a perfect partitioning every subset has this sum (same value as
        # avg_subset_sum; kept as a separate column for compatibility).
        'perfect_partition_sum': avg_subset_sum,
        # How large is the biggest number relative to the average subset sum?
        'max_to_avg_ratio': max_num / avg_subset_sum if avg_subset_sum > 0 else float('inf'),
        'range_to_avg_ratio': (max_num - min_num) / avg_subset_sum if avg_subset_sum > 0 else float('inf'),
        # Coefficient of variation (standardized measure of dispersion)
        'coef_of_variation': std_dev / mean if mean > 0 else 0,
    }


def process_path_features(df, instance_dir):
    """
    Process input features from the original instance files.

    For every label in ``df.index`` the file ``instance_dir/<label>`` is read
    and summarised. Files that do not exist, or that fail to parse, are
    reported and leave an all-NaN row, so the result always aligns with ``df``.

    Args:
        df: DataFrame with existing features from logs (only its index is used)
        instance_dir: Directory containing the original instance files

    Returns:
        pandas.DataFrame: New float-valued DataFrame of input features with the
        same index as ``df``; ``df`` itself is not modified.
    """
    records = {}          # filename -> feature dict, for successfully parsed files
    missing_files = []    # files that were absent or failed to parse
    processed_files = 0

    for filename in df.index:
        file_path = os.path.join(instance_dir, filename)

        try:
            if not os.path.exists(file_path):
                missing_files.append(filename)
                continue

            # The whole row is computed before it is stored, so a failure
            # never leaves a partially filled row behind.
            records[filename] = _read_instance_features(file_path)
            processed_files += 1

        except Exception as e:
            print(f"Error processing file {filename}: {e}")
            missing_files.append(filename)

    print(f"Processed {processed_files} instance files")
    print(f"Missing {len(missing_files)} files")

    if missing_files:
        print(f"First few missing files: {missing_files[:5]}")

    # Build the frame once (instead of cell-by-cell .loc writes), then align it
    # to df's index so missing instances appear as NaN rows.
    input_features = pd.DataFrame(
        list(records.values()),
        index=list(records.keys()),
        columns=_INPUT_FEATURE_COLUMNS,
    )
    return input_features.reindex(df.index).astype(float)



# Example usage: compute input features for every instance listed in
# solver_features_df. The directory is relative to the notebook's working
# directory; instances whose file is not found there keep an all-NaN row.
instance_dir = "solver/numpart/instances/feature_collected"
input_features_df = process_path_features(solver_features_df, instance_dir)
# NOTE(review): disabled one-shot alternative; extract_and_analyze_ml_features
# is only defined in a later cell, and index=False would drop the filename index.
# df = extract_and_analyze_ml_features("ml_features.jsonl", instance_dir)
# df.to_csv("ml_features_complete.csv", index=False)

# Display the resulting table.
input_features_df

Processed 450 instance files
Missing 170 files
First few missing files: ['n10k3_v1.txt', 'n10k3_v2.txt', 'n10k3_v3.txt', 'n10k3_v4.txt', 'n10k3_v5.txt']


Unnamed: 0_level_0,n,k,total_sum,variance,skewness,max_num,min_num,avg_subset_sum,perfect_partition_sum,max_to_avg_ratio,range_to_avg_ratio,coef_of_variation
filename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
n10k3_v1.txt,,,,,,,,,,,,
n10k3_v2.txt,,,,,,,,,,,,
n10k3_v3.txt,,,,,,,,,,,,
n10k3_v4.txt,,,,,,,,,,,,
n10k3_v5.txt,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
n49k4_v1.txt,49.0,4.0,2084.0,784.085798,0.528426,98.0,2.0,521.00,521.00,0.188100,0.184261,0.658385
n49k4_v2.txt,49.0,4.0,1848.0,729.959184,0.279890,92.0,1.0,462.00,462.00,0.199134,0.196970,0.716380
n49k4_v3.txt,49.0,4.0,2055.0,724.016660,0.322909,97.0,1.0,513.75,513.75,0.188808,0.186861,0.641591
n49k4_v4.txt,49.0,4.0,2504.0,720.295710,-0.022441,96.0,2.0,626.00,626.00,0.153355,0.150160,0.525191


In [44]:
def merge_features(solver_features_df, input_features_df):
    """
    Combine the solver-feature table with the input-feature table.

    Both frames are expected to be indexed by filename; rows are matched on
    that index with an inner join, and the index is kept as-is in the result.
    Columns present in both frames are disambiguated with the suffixes
    ``_solver`` (left) and ``_input`` (right).

    Args:
        solver_features_df: DataFrame with solver features
        input_features_df: DataFrame with input features

    Returns:
        pandas.DataFrame: Merged DataFrame
    """
    combined = solver_features_df.merge(
        input_features_df,
        how='inner',
        left_index=True,
        right_index=True,
        suffixes=('_solver', '_input'),
    )

    row_count = len(combined)
    column_count = len(combined.columns)
    print(f"Merged features: {row_count} rows, {column_count} columns")

    return combined

# Join solver and input features on the filename index. input_features_df was
# built on solver_features_df's index, so the inner join keeps every instance,
# including those whose input features are all NaN.
df = merge_features(solver_features_df, input_features_df)

Merged features: 620 rows, 54 columns


In [45]:
df

Unnamed: 0_level_0,n_solver,k_solver,num_stackdepth3_logs,evts_1,expandEvts_1,pruneBacktrackEvts_1,backtrackEvts_1,strengthenEvts_1,maxStackDepth_1,evts_2,...,total_sum,variance,skewness,max_num,min_num,avg_subset_sum,perfect_partition_sum,max_to_avg_ratio,range_to_avg_ratio,coef_of_variation
filename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
n10k3_v1.txt,10,3,3,4,4,0,0,0,3,60,...,,,,,,,,,,
n10k3_v2.txt,10,3,3,4,4,0,0,0,3,59,...,,,,,,,,,,
n10k3_v3.txt,10,3,3,4,4,0,0,0,3,66,...,,,,,,,,,,
n10k3_v4.txt,10,3,3,4,4,0,0,0,3,106,...,,,,,,,,,,
n10k3_v5.txt,10,3,3,4,4,0,0,0,3,21,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
n49k4_v1.txt,49,4,1,4,4,0,0,0,3,2318009416,...,2084.0,784.085798,0.528426,98.0,2.0,521.00,521.00,0.188100,0.184261,0.658385
n49k4_v2.txt,49,4,1,4,4,0,0,0,3,2511054050,...,1848.0,729.959184,0.279890,92.0,1.0,462.00,462.00,0.199134,0.196970,0.716380
n49k4_v3.txt,49,4,1,4,4,0,0,0,3,2402044766,...,2055.0,724.016660,0.322909,97.0,1.0,513.75,513.75,0.188808,0.186861,0.641591
n49k4_v4.txt,49,4,1,4,4,0,0,0,3,2319516173,...,2504.0,720.295710,-0.022441,96.0,2.0,626.00,626.00,0.153355,0.150160,0.525191


In [None]:
def analyze_features(df):
    """
    Print and return a small summary of the feature table.

    The summary always holds the instance count. Timeout / completion counts
    are derived from the ``censored`` column when it exists and are ``'N/A'``
    otherwise. For each key numeric column that is present, its mean and
    maximum are added as ``avg_<col>`` and ``max_<col>``.

    Args:
        df: DataFrame with extracted features

    Returns:
        dict: Basic statistics about the features
    """
    instance_count = len(df)

    if 'censored' in df.columns:
        timeouts = df['censored'].sum()
        completions = instance_count - df['censored'].sum()
    else:
        timeouts = 'N/A'
        completions = 'N/A'

    summary = {
        'total_instances': instance_count,
        'timeout_instances': timeouts,
        'complete_instances': completions,
    }

    # Key numeric columns to summarise; absent ones are skipped.
    key_columns = ('final_expandEvts', 'final_maxStackDepth', 'total_sum',
                   'variance', 'max_num', 'avg_subset_sum')
    for name in key_columns:
        if name not in df.columns:
            continue
        column = df[name]
        summary[f'avg_{name}'] = column.mean()
        summary[f'max_{name}'] = column.max()

    print("\nFeature Statistics:")
    for label in summary:
        print(f"{label}: {summary[label]}")

    return summary

def extract_and_analyze_ml_features(jsonl_path, instance_dir):
    """
    Run the full pipeline: solver features, input features, merge, summary.

    Note: a later cell in this notebook defines a function with the same
    name; when cells run top to bottom that later definition replaces this one.

    Args:
        jsonl_path: Path to the ml_features.jsonl file
        instance_dir: Directory containing original instance files

    Returns:
        pandas.DataFrame: DataFrame with extracted features
    """
    print(f"Processing {jsonl_path}...")

    # Stage 1: per-instance solver features from the JSONL logs.
    solver_part = extract_ml_features(jsonl_path)

    # Stage 2: input features from the instance files, aligned to stage 1.
    print(f"\nProcessing instance files from {instance_dir}...")
    input_part = process_path_features(solver_part, instance_dir)

    # Stage 3: combine both tables, then print a summary of the result.
    combined = merge_features(solver_part, input_part)
    analyze_features(combined)

    return combined

In [None]:

def extract_and_analyze_ml_features(jsonl_path, instance_dir=None):
    """
    Main function to extract and analyze ML features.

    Solver features are always extracted from the JSONL file. When
    ``instance_dir`` is given, input features are computed from the instance
    files and merged onto the solver features.

    Args:
        jsonl_path: Path to the ml_features.jsonl file
        instance_dir: Optional directory containing original instance files

    Returns:
        pandas.DataFrame: DataFrame with extracted features (solver features,
        plus input features when ``instance_dir`` is provided)
    """
    print(f"Processing {jsonl_path}...")

    # Extract basic (solver) features
    df = extract_ml_features(jsonl_path)

    if instance_dir:
        print(f"\nProcessing instance files from {instance_dir}...")
        # process_path_features returns a NEW frame holding only the input
        # features. It must be merged back; assigning it to `df` directly
        # would discard the solver features and the target columns.
        input_features_df = process_path_features(df, instance_dir)
        df = merge_features(df, input_features_df)

    # Analyze features
    analyze_features(df)

    return df

In [6]:
pd.read_json?


[0;31mSignature:[0m
[0mpd[0m[0;34m.[0m[0mread_json[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mpath_or_buf[0m[0;34m:[0m [0;34m'FilePath | ReadBuffer[str] | ReadBuffer[bytes]'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0;34m*[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0morient[0m[0;34m:[0m [0;34m'str | None'[0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mtyp[0m[0;34m:[0m [0;34m"Literal['frame', 'series']"[0m [0;34m=[0m [0;34m'frame'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mdtype[0m[0;34m:[0m [0;34m'DtypeArg | None'[0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mconvert_axes[0m[0;34m:[0m [0;34m'bool | None'[0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mconvert_dates[0m[0;34m:[0m [0;34m'bool | list[str]'[0m [0;34m=[0m [0;32mTrue[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mkeep_default_dates[0m[0;34m:[0m [0;34m'bool'[0m [0;34m=[0m [0;32mTrue[0m[0;

In [41]:
input_features_df.index



Index(['n10k3_v1.txt', 'n10k3_v2.txt', 'n10k3_v3.txt', 'n10k3_v4.txt',
       'n10k3_v5.txt', 'n11k3_v1.txt', 'n11k3_v2.txt', 'n11k3_v3.txt',
       'n11k3_v4.txt', 'n11k3_v5.txt',
       ...
       'n49k3_v1.txt', 'n49k3_v2.txt', 'n49k3_v3.txt', 'n49k3_v4.txt',
       'n49k3_v5.txt', 'n49k4_v1.txt', 'n49k4_v2.txt', 'n49k4_v3.txt',
       'n49k4_v4.txt', 'n49k4_v5.txt'],
      dtype='object', name='filename', length=620)