In [56]:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import cm
import numpy as np
import math
from scipy import stats


from result_analysis.helper_functions import (
  process_csv_data,
  add_seed_magnitude_column,
  add_generator_iter_column,
  add_seed_index_column
)

# Set the default matplotlib font size used by every figure in this notebook
plt.rcParams.update({'font.size': 12})

In [None]:
# Load the feature table and enrich it with the derived columns. The helpers are
# applied innermost-first: seed magnitude, then seed index, then generator iteration.
# NOTE(review): "feaures.csv" looks like a typo of "features.csv"; it is kept
# unchanged because it has to match the file name on disk.
satzilla_features = add_generator_iter_column(
    add_seed_index_column(
        add_seed_magnitude_column(
            process_csv_data("/home/csoare/experiments/feature_analysis2/feaures.csv")
        )
    )
)

# Dataset-level summary numbers
total_instances = len(satzilla_features)  # one row per instance
avg_instances_per_generator = math.ceil(  # mean group size, rounded up
    satzilla_features['generator'].value_counts().mean()
)
num_unique_seeds = len(satzilla_features['seed_magnitude'].unique())
nr_of_instances_per_generator_per_seed = satzilla_features['generator_iter_number'].nunique()

# Display statistics (one line per figure)
print(
    f"Total number of instances: {total_instances}",
    f"Average instances per generator: {avg_instances_per_generator}",
    f"Number of unique seeds: {num_unique_seeds}",
    f"Instances per generator per seed: {nr_of_instances_per_generator_per_seed}",
    sep="\n",
)

def find_missing_instances(cnf_dir: str, features_csv: str) -> list:
    """
    Report CNF files that have no corresponding row in the features CSV.

    A CNF file counts as processed when its file name without the ".cnf"
    extension appears in the CSV's 'instance_name' column.

    Args:
        cnf_dir (str): Directory scanned (non-recursively) for "*.cnf" files
        features_csv (str): Path to the features CSV; it must contain an
            'instance_name' column

    Returns:
        list: Sorted names (without extension) of the CNF files missing from the CSV
    """
    import pandas as pd
    from pathlib import Path

    # Names (without extension) of every CNF file found in the directory
    available = {path.stem for path in Path(cnf_dir).glob("*.cnf")}

    # Instance names that made it into the feature table
    processed = set(pd.read_csv(features_csv)['instance_name'])

    return sorted(available.difference(processed))

# Cross-check the CNF directory against the feature table and report any gaps
missing = find_missing_instances(
    "/home/csoare/experiments/feature_analysis2/instances/cnf",
    "/home/csoare/experiments/feature_analysis2/feaures.csv",
)
if not missing:
    print("All CNF files were processed successfully")
else:
    print(f"Found {len(missing)} unprocessed instances:")
    for instance in missing:
        print(f"  - {instance}")

In [None]:
def extract_generator_properties(df):
    """
    Derive generator metadata columns from the 'generator' column.

    The generator string is split on '-' and its first three parts become the
    new columns 'base_generator', 'difficulty' and 'randomness' (the latter is
    cast to int). The input frame is not modified.

    Args:
        df (pd.DataFrame): Input DataFrame with a 'generator' column whose values
            look like '<base>-<difficulty>-<randomness>'

    Returns:
        pd.DataFrame: Copy of ``df`` with the three derived columns added
    """
    parts = df['generator'].str.split('-', expand=True)
    return df.assign(
        base_generator=parts[0],          # e.g. 'FuzzSAT' or 'PairSAT'
        difficulty=parts[1],              # e.g. 'easy', 'medium', 'hard'
        randomness=parts[2].astype(int),  # e.g. 0, 50, 100
    )

# Rebind satzilla_features to a copy that also carries the derived
# 'base_generator', 'difficulty' and 'randomness' columns
satzilla_features = extract_generator_properties(satzilla_features)

# Sanity check: show the distinct values found in the derived columns
print("\nUnique difficulties:", satzilla_features['difficulty'].unique())
print("Unique randomness values:", satzilla_features['randomness'].unique())

In [59]:
# Helper function for all analyses
def get_numeric_features(df):
    """
    List the int64/float64 columns of ``df`` that are not metadata columns.

    The seed-related columns, 'randomness', 'generator_iter_number',
    'instance_name' and 'solved' are always left out. The returned names keep
    the column order of ``df``.
    """
    metadata_columns = (
        'seed', 'seed_magnitude', 'seed_index', 'randomness',
        'generator_iter_number', 'instance_name', 'solved',
    )
    numeric_columns = df.select_dtypes(include=['int64', 'float64']).columns
    is_feature = ~numeric_columns.isin(metadata_columns)
    return numeric_columns[is_feature].tolist()

In [60]:
def create_stats_dataframe(df):
    """
    Summarise every numeric feature per generator.

    The result has one row per generator (in order of first appearance in
    ``df``) and one column per feature returned by ``get_numeric_features``.
    Each cell is a dict with the keys 'mean', 'std', 'var', 'min', 'max' and
    'median' computed over that generator's instances.

    Args:
        df (pd.DataFrame): Input DataFrame with a 'generator' column

    Returns:
        pd.DataFrame: Generator-indexed DataFrame whose cells are stats dicts
    """
    stat_names = ['mean', 'std', 'var', 'min', 'max', 'median']
    per_generator = df.groupby('generator')

    # Start from an empty, generator-indexed frame and add one column per feature
    stats_df = pd.DataFrame(index=df['generator'].unique())

    for feature in get_numeric_features(df):
        summary = per_generator[feature].agg(stat_names)

        # Collapse each generator's row of statistics into a single dict cell
        cells = {
            generator: {name: row[name] for name in stat_names}
            for generator, row in summary.iterrows()
        }
        stats_df[feature] = pd.Series(cells)

    return stats_df

# Per-generator reference statistics, computed on the feature table before any
# similar-generator filtering or instance reduction takes place
all_instances_stats_df = create_stats_dataframe(satzilla_features)

# all_instances_stats_df.to_csv("/home/csoare/experiments/feature_analysis2/all_instances_stats.csv")

In [None]:
def find_similar_generators(df):
    """
    Find generator variants whose feature distributions look statistically alike.

    Two passes of pairwise independent t-tests are run over the numeric features
    (as returned by ``get_numeric_features``):
    1. Randomness levels are compared within each (base generator, difficulty)
    2. Difficulty levels are compared within each (base generator, randomness)

    A pair counts as similar when no tested feature yields a p-value <= 0.05.
    Features that are constant within the compared slice are skipped, and a NaN
    p-value does not count as a difference. For every similar pair the second
    member is removed from the filtered copy of the data.

    Both passes run their comparisons on the unfiltered ``df``; only the removals
    are applied to the filtered copy.
    NOTE(review): because of that, a variant that serves as the kept member of a
    pair in one pass can itself be removed by the other pass - confirm that this
    is intended.

    Parameters:
        df (pd.DataFrame): DataFrame containing generator data with columns:
            - base_generator: The generator type
            - difficulty: The difficulty level
            - randomness: The randomness level
            - Various numeric feature columns

    Returns:
        tuple: (similar_pairs, filtered_df) where:
            - similar_pairs: Dict mapping "<base>-<difficulty>" keys to lists of
              ('R', r1, r2) tuples and "<base>-R<randomness>" keys to lists of
              ('D', d1, d2) tuples
            - filtered_df: Copy of ``df`` with the second member of each similar
              pair removed
    """
    def _varying_features(subset):
        # Features that are constant over the whole slice are never tested
        return [name for name in get_numeric_features(subset) if subset[name].nunique() > 1]

    def _indistinguishable(subset, column, first, second, feature_names):
        # True unless some feature separates the two groups at the 5% level
        sample_a = subset[subset[column] == first]
        sample_b = subset[subset[column] == second]
        for name in feature_names:
            _, p_val = stats.ttest_ind(sample_a[name], sample_b[name])
            if p_val <= 0.05:
                return False
        return True

    results = {}
    filtered_df = df.copy()

    # Pass 1: compare randomness levels inside each (base generator, difficulty)
    for base in df['base_generator'].unique():
        for diff in df['difficulty'].unique():
            data = df[(df['base_generator'] == base) & (df['difficulty'] == diff)]
            if not len(data):
                continue

            feature_names = _varying_features(data)
            rand_values = [int(value) for value in sorted(data['randomness'].unique())]

            similar_pairs = []
            for position, r1 in enumerate(rand_values):
                for r2 in rand_values[position + 1:]:
                    if not _indistinguishable(data, 'randomness', r1, r2, feature_names):
                        continue
                    similar_pairs.append(('R', r1, r2))
                    redundant = (
                        (filtered_df['base_generator'] == base)
                        & (filtered_df['difficulty'] == diff)
                        & (filtered_df['randomness'] == r2)
                    )
                    filtered_df = filtered_df[~redundant]

            if similar_pairs:
                results[f"{base}-{diff}"] = similar_pairs

    # Pass 2: compare difficulty levels inside each (base generator, randomness)
    for base in df['base_generator'].unique():
        for rand in df['randomness'].unique():
            data = df[(df['base_generator'] == base) & (df['randomness'] == rand)]
            if not len(data):
                continue

            feature_names = _varying_features(data)
            diff_values = sorted(data['difficulty'].unique())

            similar_pairs = []
            for position, d1 in enumerate(diff_values):
                for d2 in diff_values[position + 1:]:
                    if not _indistinguishable(data, 'difficulty', d1, d2, feature_names):
                        continue
                    similar_pairs.append(('D', d1, d2))
                    redundant = (
                        (filtered_df['base_generator'] == base)
                        & (filtered_df['difficulty'] == d2)
                        & (filtered_df['randomness'] == rand)
                    )
                    filtered_df = filtered_df[~redundant]

            if similar_pairs:
                results[f"{base}-R{rand}"] = similar_pairs

    return results, filtered_df

# Detect similar generator variants; filtered_satzilla_features is the feature
# table with the second member of every similar pair removed
similar_generators, filtered_satzilla_features = find_similar_generators(satzilla_features)
print(similar_generators)

In [84]:
def check_stats_similarity(original_stats, new_stats, threshold_percentage=5):
    """
    Compare two summary-statistics dictionaries with a two-sample KS test.

    Only the 'mean', 'std' and 'var' entries are used: each dictionary is turned
    into a three-element sample and both samples are passed to
    ``scipy.stats.ks_2samp``. The statistics count as similar when the resulting
    p-value is strictly greater than ``threshold_percentage / 100``.

    NOTE(review): the test therefore compares two samples of size three made of
    summary statistics rather than the underlying feature values, so the p-value
    can only take a few discrete values - confirm that this is the intended
    similarity criterion.

    Args:
        original_stats (dict): Reference statistics with 'mean', 'std', 'var' keys
        new_stats (dict): Statistics to compare, with the same keys
        threshold_percentage (float): p-value threshold expressed in percent
            (divided by 100 before the comparison)

    Returns:
        bool: True if the KS p-value exceeds the threshold
    """
    compared_keys = ('mean', 'std', 'var')

    # Represent each side by the same ordered triple of statistics
    reference_sample = np.array([original_stats[key] for key in compared_keys])
    candidate_sample = np.array([new_stats[key] for key in compared_keys])

    p_value = stats.ks_2samp(reference_sample, candidate_sample).pvalue

    return p_value > threshold_percentage / 100

def find_minimal_representative_subset(satzilla_features, all_instances_stats_df,
                                    initial_drop_percentage=10,
                                    threshold_percentage=5,
                                    max_iterations=50,
                                    random_state=None):
    """
    Greedily shrink each generator's instance set while its statistics stay similar.

    For every generator, a random ``initial_drop_percentage`` percent of the
    remaining instances (at least one) is removed per iteration. A reduction is
    kept when ``check_stats_similarity`` accepts every feature of the recomputed
    statistics against the generator's reference row in ``all_instances_stats_df``.
    The search for a generator ends at the first rejected reduction, once an
    accepted subset has 10 or fewer instances, after ``max_iterations`` rounds,
    or when the next drop would remove every remaining instance.

    Args:
        satzilla_features (pd.DataFrame): Features DataFrame with a 'generator' column
        all_instances_stats_df (pd.DataFrame): Reference statistics, indexed by
            generator (as produced by ``create_stats_dataframe``)
        initial_drop_percentage (int): Percentage of the current subset dropped
            in each iteration
        threshold_percentage (float): Threshold forwarded to ``check_stats_similarity``
        max_iterations (int): Maximum number of reduction rounds per generator
        random_state (int | numpy.random.Generator | None): Seed for the random
            drops. ``None`` (default) keeps the previous behaviour of drawing from
            NumPy's global random state; any other value makes the whole search
            reproducible.

    Returns:
        dict: Mapping generator -> DataFrame holding its reduced instance subset

    Raises:
        KeyError: If a generator in ``satzilla_features`` has no row in
            ``all_instances_stats_df``
    """
    # One shared random generator: a fixed seed reproduces the whole search while
    # successive iterations still draw different rows.
    rng = None if random_state is None else np.random.default_rng(random_state)

    results = {}

    for generator in satzilla_features['generator'].unique():
        # Reference statistics every candidate subset is compared against
        original_stats = all_instances_stats_df.loc[generator]

        current_subset = satzilla_features[satzilla_features['generator'] == generator].copy()

        for _ in range(max_iterations):
            drop_count = max(1, int(len(current_subset) * initial_drop_percentage / 100))

            # Dropping everything would leave no data to compute statistics from
            if drop_count >= len(current_subset):
                break

            dropped = current_subset.sample(n=drop_count, random_state=rng).index
            reduced_subset = current_subset.drop(dropped)

            try:
                new_stats = create_stats_dataframe(reduced_subset).loc[generator]
            except Exception:
                # Best effort: if the statistics cannot be recomputed, keep the last
                # accepted subset. Unlike a bare `except`, this no longer swallows
                # KeyboardInterrupt, so a long run can still be interrupted.
                break

            # Stops at the first feature that is no longer similar
            all_features_similar = all(
                check_stats_similarity(
                    original_stats[feature],
                    new_stats[feature],
                    threshold_percentage
                )
                for feature in original_stats.index
            )
            if not all_features_similar:
                break

            current_subset = reduced_subset

            if len(current_subset) <= 10:  # Minimum size threshold
                break

        results[generator] = current_subset

    return results

In [36]:
def grid_search_downsampling(satzilla_features, all_instances_stats_df, step_size=5, max_iterations=50):
    """
    Run the subset search for a grid of drop and threshold percentages.

    Drop percentages range from 5 to 50 and threshold percentages from 5 to 100,
    both in increments of ``step_size``. Each combination is passed to
    ``find_minimal_representative_subset``; combinations that raise are reported
    on stdout and left out of the result.

    Returns:
        list: One dict per successful combination with the keys
            'drop_percentage', 'threshold_percentage' and 'total_instances'
            (total number of instances kept across all generators)
    """
    grid_results = []

    # A coarser step_size means fewer combinations to evaluate
    for drop_pct in range(5, 51, step_size):
        for threshold_pct in range(5, 101, step_size):
            print(f"Testing drop%={drop_pct}, threshold%={threshold_pct}")

            try:
                subsets = find_minimal_representative_subset(
                    satzilla_features,
                    all_instances_stats_df,
                    initial_drop_percentage=drop_pct,
                    threshold_percentage=threshold_pct,
                    max_iterations=max_iterations
                )

                # Instances preserved across all generators for this combination
                preserved = sum(map(len, subsets.values()))

                grid_results.append(dict(
                    drop_percentage=drop_pct,
                    threshold_percentage=threshold_pct,
                    total_instances=preserved,
                ))
            except Exception as e:
                print(f"Error with drop%={drop_pct}, threshold%={threshold_pct}: {e}")

    return grid_results


In [37]:
# INSTANCE REDUCTION ANALYSIS

def instance_reduction_and_plot(satzilla_features, all_instances_stats_df, step_size=5, max_iterations=500):
    """
    Run the downsampling grid search and plot preserved instances per drop rate.

    One subplot is drawn per drop percentage found in the grid-search results.
    Each shows the total number of preserved instances against the threshold
    percentage. Nothing is plotted when the grid search yields no results.

    Args:
        satzilla_features (pd.DataFrame): Features DataFrame passed to the grid search
        all_instances_stats_df (pd.DataFrame): Reference statistics passed to the grid search
        step_size (int): Grid increment for both percentages
        max_iterations (int): Maximum reduction rounds per generator

    Returns:
        None
    """
    # Run grid search
    results = grid_search_downsampling(satzilla_features, all_instances_stats_df, step_size, max_iterations)

    if not results:
        print("No grid search results to plot")
        return

    # Collect the (threshold, preserved count) points of every drop percentage.
    # The plotted values come from the result dicts themselves, not from
    # module-level variables.
    points_by_rate = {}
    for entry in results:
        points_by_rate.setdefault(entry['drop_percentage'], []).append(
            (entry['threshold_percentage'], entry['total_instances'])
        )

    reduction_rates = sorted(points_by_rate)
    n_plots = len(reduction_rates)

    # squeeze=False keeps `axes` two-dimensional even for a single subplot
    _fig, axes = plt.subplots(n_plots, 1, figsize=(12, 4*n_plots), squeeze=False)

    for idx, rate in enumerate(reduction_rates):
        ax = axes[idx, 0]

        # Sort points by threshold so the line is drawn left to right
        points = sorted(points_by_rate[rate])
        confidence_levels = [threshold for threshold, _ in points]
        preserved_instances = [count for _, count in points]

        # Plot line and fill
        color = cm.viridis(idx/n_plots)
        ax.plot(confidence_levels, preserved_instances, color=color, linewidth=2)
        ax.fill_between(confidence_levels, preserved_instances, alpha=0.3, color=color)

        # Customize plot
        ax.set_xlabel('Statistical Confidence Level (%)')
        ax.set_ylabel('Preserved Instance Count')
        ax.set_title(f'Reduction Rate = {rate}%')
        ax.grid(True)

    plt.tight_layout()
    plt.show()

# instance_reduction_and_plot(filtered_satzilla_features, all_instances_stats_df, step_size=5, max_iterations=500)

In [None]:
def analyze_and_reduce_instances(satzilla_features, all_instances_stats_df, initial_drop_percentage=20, threshold_percentage=55, max_iterations=500):
    """
    Reduce the instance set with the given parameters and report the outcome.

    Runs ``find_minimal_representative_subset``, prints overall and per-generator
    reduction figures, and returns the rows of ``satzilla_features`` that were
    kept. An input without instances yields an empty DataFrame instead of an
    error.

    Args:
        satzilla_features (pd.DataFrame): Features DataFrame with a 'generator' column
        all_instances_stats_df (pd.DataFrame): Reference statistics DataFrame
        initial_drop_percentage (int): Percentage of instances to drop in each iteration
        threshold_percentage (float): Threshold forwarded to the subset search
        max_iterations (int): Maximum number of iterations to try

    Returns:
        pd.DataFrame: Reduced DataFrame
    """
    def _reduction_pct(before, after):
        # An empty group has nothing to reduce; avoid dividing by zero
        return ((before - after) / before) * 100 if before else 0.0

    # Get the reduced subset of every generator with the chosen parameters
    optimal_subsets = find_minimal_representative_subset(
        satzilla_features,
        all_instances_stats_df,
        initial_drop_percentage=initial_drop_percentage,
        threshold_percentage=threshold_percentage,
        max_iterations=max_iterations
    )

    # Calculate and print statistics
    total_original = len(satzilla_features)
    total_preserved = sum(len(subset) for subset in optimal_subsets.values())

    print(f"Original total instances: {total_original}")
    print(f"Preserved instances: {total_preserved}")
    print(f"Overall reduction: {_reduction_pct(total_original, total_preserved):.2f}%")

    if not optimal_subsets:
        # Nothing was selected (e.g. empty input): return an empty frame that
        # still has the input's columns
        return satzilla_features.iloc[0:0]

    # Count the instances of every generator once instead of re-filtering the
    # whole frame for each generator
    original_counts = satzilla_features['generator'].value_counts()

    # Per-generator statistics
    print("\nPer-generator breakdown:")
    for generator, subset in optimal_subsets.items():
        original_count = int(original_counts.get(generator, 0))
        preserved_count = len(subset)
        reduction = _reduction_pct(original_count, preserved_count)
        print(f"\n{generator}:")
        print(f"  Original instances: {original_count}")
        print(f"  Preserved instances: {preserved_count}")
        print(f"  Reduction: {reduction:.2f}%")

    # Get indices from optimal subsets
    selected_indices = pd.concat(list(optimal_subsets.values())).index

    # Filter satzilla_features
    reduced_satzilla = satzilla_features.loc[selected_indices]

    return reduced_satzilla

# Reduce the similarity-filtered feature table; each candidate subset is compared
# against the reference statistics in all_instances_stats_df
reduced_satzilla = analyze_and_reduce_instances(filtered_satzilla_features, all_instances_stats_df, initial_drop_percentage=20, threshold_percentage=55, max_iterations=500)

In [45]:
def organize_selected_instances(reduced_df, source_cnf_dir, target_base_dir):
    """
    Copy the selected CNF files into one folder per base generator and difficulty.

    A folder named "<base_generator>-<difficulty>" is created under
    ``target_base_dir`` for every combination of the base generators and
    difficulties present in ``reduced_df``, including combinations without
    instances. The DataFrame index is used as the instance name:
    "<index>.cnf" is copied from ``source_cnf_dir`` into the matching folder.
    Missing source files and copy errors are reported on stdout and skipped.

    Args:
        reduced_df (pd.DataFrame): Selected instances with 'base_generator' and
            'difficulty' columns, indexed by instance name
        source_cnf_dir (str): Directory containing the original CNF files
        target_base_dir (str): Directory that receives the per-group folders
            (created if necessary)

    Returns:
        dict: {'total_copied': int, 'files_per_group': {folder name: files copied}}
    """
    import shutil
    from pathlib import Path

    source_root = Path(source_cnf_dir)
    target_root = Path(target_base_dir)
    target_root.mkdir(parents=True, exist_ok=True)

    total_copied = 0
    files_per_group = {}

    for base in reduced_df['base_generator'].unique():
        for diff in reduced_df['difficulty'].unique():
            # Folder such as "PairSAT-easy"
            group_name = f"{base}-{diff}"
            destination = target_root / group_name
            destination.mkdir(exist_ok=True)

            # Rows belonging to this base generator and difficulty
            in_group = (reduced_df['base_generator'] == base) & (reduced_df['difficulty'] == diff)

            copied = 0
            for instance in reduced_df[in_group].index:
                origin = source_root / f"{instance}.cnf"
                try:
                    if not origin.exists():
                        print(f"Warning: Source file not found: {origin}")
                        continue
                    shutil.copy2(origin, destination / f"{instance}.cnf")
                    copied += 1
                except Exception as e:
                    print(f"Error copying {instance}: {str(e)}")

            files_per_group[group_name] = copied
            total_copied += copied

    return {'total_copied': total_copied, 'files_per_group': files_per_group}

# Usage
# source_dir = "/home/csoare/experiments/feature_analysis2/instances/cnf"
# target_dir = "/home/csoare/experiments/reduced_instances"

# stats = organize_selected_instances(reduced_satzilla, source_dir, target_dir)

# # Print statistics
# print("\nCopying complete!")
# print(f"Total files copied: {stats['total_copied']}")
# print("\nFiles copied per group:")
# for group, count in stats['files_per_group'].items():
#     print(f"{group}: {count} files")