In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import math
from scipy import stats
from scipy.stats import gaussian_kde


from result_analysis.helper_functions import (
  process_csv_data,
  add_seed_magnitude_column,
  add_generator_iter_column,
  add_seed_index_column
)

# Notebook-wide plotting defaults: viridis colormap and a 14pt base font.
plt.set_cmap('viridis')
plt.rcParams['font.size'] = 14

In [None]:
# Load the feature table and let each helper add its derived metadata column in turn.
satzilla_features = (
    process_csv_data("/home/csoare/experiments/reduced_instances_strategy2/features_output.csv")
    .pipe(add_seed_magnitude_column)
    .pipe(add_seed_index_column)
    .pipe(add_generator_iter_column)
)

# One row per instance, so the row count is the instance count.
total_instances = satzilla_features.shape[0]

# Mean number of rows per distinct 'generator' value, rounded up.
avg_instances_per_generator = math.ceil(
    satzilla_features['generator'].value_counts().mean()
)

# Distinct 'seed_magnitude' values (a missing value, if any, counts as one).
num_unique_seeds = satzilla_features['seed_magnitude'].nunique(dropna=False)

# Distinct 'generator_iter_number' values.
nr_of_instances_per_generator_per_seed = satzilla_features['generator_iter_number'].nunique()

# Report the summary.
print(f"Total number of instances: {total_instances}")
print(f"Average instances per generator: {avg_instances_per_generator}")
print(f"Number of unique seeds: {num_unique_seeds}")
print(f"Instances per generator per seed: {nr_of_instances_per_generator_per_seed}")

def find_missing_instances(cnf_dir: str, features_csv: str) -> list:
    """
    Find CNF files that have no row in the features CSV.

    A CNF file counts as processed when its file name without the ``.cnf``
    extension appears verbatim in the CSV's ``instance_name`` column.

    Args:
        cnf_dir (str): Directory containing the original CNF files
        features_csv (str): Path to the features CSV file

    Returns:
        list: Sorted names (without extension) of CNF files that were not processed

    Raises:
        NotADirectoryError: If ``cnf_dir`` is not an existing directory.
    """
    from pathlib import Path

    cnf_root = Path(cnf_dir)
    # Globbing a missing directory yields nothing, which would be reported as
    # "nothing is missing"; fail loudly instead.
    if not cnf_root.is_dir():
        raise NotADirectoryError(f"CNF directory does not exist: {cnf_dir}")

    cnf_stems = {cnf_file.stem for cnf_file in cnf_root.glob("*.cnf")}

    # Read the names as text: with dtype inference a name such as "001" is
    # parsed as the integer 1 and can never match the file stem "001".
    features = pd.read_csv(features_csv, dtype={'instance_name': str})
    processed_instances = set(features['instance_name'])

    return sorted(cnf_stems - processed_instances)

# List every CNF file in the instance directory that has no row in the features CSV.
# NOTE(review): the CSV file name is spelled "feaures.csv" and the paths point at
# feature_analysis2, not the reduced_instances_strategy2 table loaded above — confirm both.
missing = find_missing_instances(
    "/home/csoare/experiments/feature_analysis2/instances/cnf",
    "/home/csoare/experiments/feature_analysis2/feaures.csv",
)
if not missing:
    print("All CNF files were processed successfully")
else:
    print(f"Found {len(missing)} unprocessed instances:")
    for instance in missing:
        print(f"  - {instance}")

In [None]:
def extract_generator_properties(df):
    """
    Split the hyphen-separated 'generator' label into its three components.

    The first three '-'-separated fields of each label become the new columns;
    the input frame itself is left untouched.

    Args:
        df (pd.DataFrame): Input DataFrame with 'generator' column

    Returns:
        pd.DataFrame: Copy of ``df`` with 'base_generator' (str), 'difficulty' (str)
            and 'randomness' (int) columns added
    """
    parts = df['generator'].str.split('-', expand=True)
    return df.assign(
        base_generator=parts[0],            # e.g. 'FuzzSAT' or 'PairSAT'
        difficulty=parts[1],                # e.g. 'easy', 'medium', 'hard'
        randomness=parts[2].astype(int),    # e.g. 0, 50, 100
    )

# Add 'base_generator', 'difficulty' and 'randomness' columns to the feature table.
# The name is rebound to the returned copy, so re-running this cell simply recomputes the columns.
satzilla_features = extract_generator_properties(satzilla_features)

# Sanity check: print the distinct difficulty and randomness values that were parsed.
print("\nUnique difficulties:", satzilla_features['difficulty'].unique())
print("Unique randomness values:", satzilla_features['randomness'].unique())

In [13]:
# Shared by every analysis below: which columns count as numeric features.
def get_numeric_features(df):
    """Return the int64/float64 column names of ``df`` that are not metadata columns."""
    metadata_columns = {
        'seed', 'seed_magnitude', 'seed_index', 'randomness',
        'generator_iter_number', 'instance_name', 'solved',
    }
    candidates = df.select_dtypes(include=['int64', 'float64']).columns
    # Keep the frame's column order; drop bookkeeping columns by name.
    return list(filter(lambda name: name not in metadata_columns, candidates))

In [14]:
def create_stats_dataframe(df):
    """
    Summarise every numeric feature per generator.

    Each cell of the result holds a dict with the keys 'mean', 'std', 'var',
    'min', 'max' and 'median', computed over that generator's rows.

    Args:
        df (pd.DataFrame): Input DataFrame with generator column

    Returns:
        pd.DataFrame: Indexed by the distinct generator values, one column per
            numeric feature, each cell a dict of summary statistics
    """
    stat_names = ['mean', 'std', 'var', 'min', 'max', 'median']

    # One row per generator, in order of first appearance.
    stats_df = pd.DataFrame(index=df['generator'].unique())
    per_generator = df.groupby('generator')

    for col in get_numeric_features(df):
        summary = per_generator[col].agg(stat_names)
        # Collapse each generator's row of statistics into a single dict cell.
        stats_df[col] = pd.Series({
            generator: {name: row[name] for name in stat_names}
            for generator, row in summary.iterrows()
        })

    return stats_df

# Build the per-generator summary (one dict of statistics per generator/feature cell)
# from the full feature table.
all_instances_stats_df = create_stats_dataframe(satzilla_features)

# Optional export of the summary table, currently disabled.
# all_instances_stats_df.to_csv("/home/csoare/experiments/feature_analysis2/all_instances_stats.csv")

In [None]:
def _similar_level_pairs(data, level_column, as_label):
    """Return the (first, second) level pairs of ``level_column`` that no feature's t-test separates."""
    levels = sorted(data[level_column].unique())
    # Features that are constant over the whole slice cannot separate any pair;
    # this check does not depend on the pair, so do it once.
    varying = [f for f in get_numeric_features(data) if data[f].nunique() > 1]

    pairs = []
    for i in range(len(levels)):
        for j in range(i + 1, len(levels)):
            first, second = as_label(levels[i]), as_label(levels[j])
            rows_first = data[data[level_column] == first]
            rows_second = data[data[level_column] == second]

            separated = False
            for feature in varying:
                _, p_val = stats.ttest_ind(rows_first[feature], rows_second[feature])
                # NOTE(review): a NaN p-value is not <= 0.05, so an undefined
                # test counts as "similar" — confirm this is intended.
                if p_val <= 0.05:
                    separated = True
                    break

            if not separated:
                pairs.append((first, second))
    return pairs


def find_similar_generators(df):
    """
    Find statistically similar generators across randomness and difficulty levels.

    Two levels are "similar" when an independent two-sample t-test gives
    p > 0.05 (or an undefined p-value) for every non-constant numeric feature.
    Two passes are made over the unfiltered ``df``:
    1. Different randomness levels within the same generator and difficulty
    2. Different difficulty levels within the same generator and randomness

    Parameters:
        df (pd.DataFrame): DataFrame containing generator data with columns:
            - base_generator: The generator type
            - difficulty: The difficulty level
            - randomness: The randomness level
            - Various numeric feature columns

    Returns:
        tuple: (similar_pairs, filtered_df) where:
            - similar_pairs: Dict. Keys "<base>-<difficulty>" map to lists of
              ('R', r1, r2) tuples; keys "<base>-R<randomness>" map to lists of
              ('D', d1, d2) tuples.
            - filtered_df: Copy of ``df`` without the rows of the second member
              (r2 / d2) of every similar pair
    """
    results = {}
    filtered_df = df.copy()

    # Pass 1: compare randomness levels inside each (generator, difficulty) slice.
    for base in df['base_generator'].unique():
        for diff in df['difficulty'].unique():
            data = df[(df['base_generator'] == base) & (df['difficulty'] == diff)]
            if len(data) == 0:
                continue

            pairs = _similar_level_pairs(data, 'randomness', int)
            for _, r2 in pairs:
                filtered_df = filtered_df[~((filtered_df['base_generator'] == base) &
                                            (filtered_df['difficulty'] == diff) &
                                            (filtered_df['randomness'] == r2))]
            if pairs:
                results[f"{base}-{diff}"] = [('R', r1, r2) for r1, r2 in pairs]

    # Pass 2: compare difficulty levels inside each (generator, randomness) slice.
    for base in df['base_generator'].unique():
        for rand in df['randomness'].unique():
            data = df[(df['base_generator'] == base) & (df['randomness'] == rand)]
            if len(data) == 0:
                continue

            pairs = _similar_level_pairs(data, 'difficulty', lambda level: level)
            for _, d2 in pairs:
                filtered_df = filtered_df[~((filtered_df['base_generator'] == base) &
                                            (filtered_df['difficulty'] == d2) &
                                            (filtered_df['randomness'] == rand))]
            if pairs:
                results[f"{base}-R{rand}"] = [('D', d1, d2) for d1, d2 in pairs]

    return results, filtered_df

# Detect statistically indistinguishable generator settings and keep a copy of the
# feature table with the redundant member of each similar pair removed.
similar_generators, filtered_satzilla_features = find_similar_generators(satzilla_features)
print(similar_generators)

In [None]:
def _plot_randomness_panel(ax, data, feature, corr, rand_values):
    """Draw one peak-normalised density curve of ``feature`` per randomness level on ``ax``."""
    for rand in rand_values:
        subset = data[data['randomness'] == rand][feature]
        try:
            density = gaussian_kde(subset)
            xs = np.linspace(subset.min(), subset.max(), 200)
            ys = density(xs)
        except (np.linalg.LinAlgError, ValueError):
            # gaussian_kde cannot fit a singular (e.g. constant) sample
            # (LinAlgError) or a single-element sample (ValueError); show a
            # histogram instead.
            ax.hist(subset, bins=20, density=True, alpha=0.5, label=f'Randomness={rand}%')
            continue
        ys = ys / np.max(ys)  # scale every curve to a peak of 1 so shapes are comparable
        ax.plot(xs, ys, label=f'Randomness={rand}%')
        ax.fill_between(xs, ys, alpha=0.2)
    ax.set_title(f'{feature}\ncorr={corr:.3f}')
    ax.legend()
    ax.set_xlabel(feature)
    ax.set_ylabel('Density')


def analyze_randomness_predictors(df, n=5):
    """
    Analyzes and visualizes both top and bottom predictors of randomness levels.

    For every (base_generator, difficulty) slice, each non-constant numeric
    feature is scored by the absolute Spearman correlation with 'randomness'.
    The n highest and n lowest scoring features are plotted in a 2 x n grid
    (top row: strongest, bottom row: weakest), one curve per randomness level.

    Parameters:
        df (pd.DataFrame): Input DataFrame with 'base_generator', 'difficulty'
            and 'randomness' columns plus the numeric feature columns
        n (int): Number of top/bottom predictors to show. Default is 5

    Returns:
        dict: Maps "<base>-<difficulty>" to {'top': [...], 'bottom': [...]},
            each a list of (feature, abs_correlation) tuples sorted by
            descending correlation
    """
    features = get_numeric_features(df)
    results = {}
    difficulty_order = ["easy", "medium", "hard"]

    for base in df['base_generator'].unique():
        for diff in difficulty_order:
            key = f"{base}-{diff}"
            data = df[(df['base_generator'] == base) & (df['difficulty'] == diff)]
            if len(data) == 0:
                continue

            correlations = {}
            for feature in features:
                if data[feature].nunique() > 1:
                    corr = data[feature].corr(data['randomness'], method='spearman')
                    if not np.isnan(corr):
                        correlations[feature] = abs(corr)

            if not correlations:
                continue

            sorted_predictors = sorted(correlations.items(), key=lambda x: x[1], reverse=True)
            top_predictors = sorted_predictors[:n]
            bottom_predictors = sorted_predictors[-n:]
            results[key] = {'top': top_predictors, 'bottom': bottom_predictors}

            rand_values = sorted(data['randomness'].unique())

            # squeeze=False keeps `axes` two-dimensional, so axes[row, idx]
            # also works when n == 1.
            _, axes = plt.subplots(2, n, figsize=(4*n, 8), squeeze=False)

            for row, predictors in enumerate((top_predictors, bottom_predictors)):
                for idx, (feature, corr) in enumerate(predictors):
                    _plot_randomness_panel(axes[row, idx], data, feature, corr, rand_values)
                # Fewer usable features than panels: hide the leftover axes.
                for idx in range(len(predictors), n):
                    axes[row, idx].set_visible(False)

            plt.suptitle(f'Top and Bottom Randomness Predictors for {key}')
            plt.tight_layout()
            plt.show()

    return results

# Rank randomness predictors on the filtered feature table, showing the
# 3 strongest and 3 weakest features per generator/difficulty slice.
randomness_results = analyze_randomness_predictors(filtered_satzilla_features, 3)

In [None]:
def _plot_difficulty_panel(ax, data, feature, corr, difficulty_order):
    """Draw the distribution of ``feature`` for each difficulty level on ``ax``."""
    for position, diff in enumerate(difficulty_order):
        subset = data[data['difficulty'] == diff][feature]
        if subset.empty:
            # This difficulty does not occur in the slice; nothing to draw.
            continue
        if subset.nunique() == 1:
            # If only one value, plot a vertical line
            value = subset.iloc[0]
            ax.axvline(x=value, label=f'{diff} (constant={value:.2f})',
                       linestyle='--', alpha=0.8)
            continue
        try:
            if subset.nunique() > 3:
                density = gaussian_kde(subset)
                xs = np.linspace(subset.min(), subset.max(), 200)
                ys = density(xs)
                ys = ys / np.max(ys)  # scale every curve to a peak of 1
                ax.plot(xs, ys, label=diff, linewidth=2)
                ax.fill_between(xs, ys, alpha=0.2)
            else:
                # Too few distinct values for a density curve: one violin per level.
                ax.violinplot(subset, positions=[position])
                ax.set_xticks([0, 1, 2], ['easy', 'medium', 'hard'])
        except Exception:
            # Best-effort fallback: plot the sorted raw values.
            sorted_vals = sorted(subset)
            ax.plot(range(len(sorted_vals)), sorted_vals, label=diff, alpha=0.5, linewidth=2)
    ax.set_title(f'{feature}\ncorr={corr:.3f}', fontsize=14, pad=10)
    ax.legend(fontsize=12)
    ax.set_xlabel(feature, fontsize=12, labelpad=8)
    ax.set_ylabel('Density/Value', fontsize=12, labelpad=8)
    ax.tick_params(axis='both', which='major', labelsize=10)


def analyze_difficulty_predictors(df, n=5):
    """
    Analyzes and visualizes both top and bottom predictors of difficulty levels.

    For every (base_generator, randomness) slice, each non-constant numeric
    feature is scored by the absolute Spearman correlation with the difficulty
    rank (easy=0, medium=1, hard=2). The n highest and n lowest scoring features
    are plotted in a 2 x n grid (top row: strongest, bottom row: weakest).

    Parameters:
        df (pd.DataFrame): Input DataFrame with generator data
        n (int): Number of top/bottom predictors to show. Default is 5
    Returns:
        dict: Maps "<base>-R<randomness>" to {'top': [...], 'bottom': [...]},
            each a list of (feature, abs_correlation) tuples sorted by
            descending correlation
    """
    features = get_numeric_features(df)
    results = {}
    difficulty_order = ["easy", "medium", "hard"]
    # Convert difficulty to numeric: the rank of each label in difficulty_order.
    diff_map = {name: rank for rank, name in enumerate(difficulty_order)}

    for base in df['base_generator'].unique():
        for rand in sorted(df['randomness'].unique()):
            key = f"{base}-R{rand}"
            data = df[(df['base_generator'] == base) & (df['randomness'] == rand)].copy()
            if len(data) == 0:
                continue

            data['diff_numeric'] = data['difficulty'].map(diff_map)

            correlations = {}
            for feature in features:
                if data[feature].nunique() > 1:
                    corr = data[feature].corr(data['diff_numeric'], method='spearman')
                    if not np.isnan(corr):
                        correlations[feature] = abs(corr)

            if not correlations:
                continue

            sorted_predictors = sorted(correlations.items(), key=lambda x: x[1], reverse=True)
            top_predictors = sorted_predictors[:n]
            bottom_predictors = sorted_predictors[-n:]
            results[key] = {'top': top_predictors, 'bottom': bottom_predictors}

            # Two rows of subplots; squeeze=False keeps `axes` two-dimensional,
            # so axes[row, idx] also works when n == 1.
            _, axes = plt.subplots(2, n, figsize=(4*n, 8), squeeze=False)

            for row, predictors in enumerate((top_predictors, bottom_predictors)):
                for idx, (feature, corr) in enumerate(predictors):
                    _plot_difficulty_panel(axes[row, idx], data, feature, corr, difficulty_order)
                # Fewer usable features than panels: hide the leftover axes.
                for idx in range(len(predictors), n):
                    axes[row, idx].set_visible(False)

            plt.suptitle(f'Top and Bottom Difficulty Predictors for {base} with Randomness: {rand}%',
                         fontsize=16, y=1.05)
            plt.tight_layout()
            plt.show()

    return results

# Rank difficulty predictors on the filtered feature table, showing the
# 3 strongest and 3 weakest features per generator/randomness slice.
difficulty_results = analyze_difficulty_predictors(filtered_satzilla_features, 3)

In [None]:
def _draw_generator_distribution(ax, values, name, constant_color):
    """Draw ``values`` on ``ax`` as a peak-normalised density curve, or a dashed line if constant."""
    if values.nunique() == 1:
        value = values.iloc[0]
        ax.axvline(x=value, color=constant_color,
                   label=f'{name} (constant={value:.2f})',
                   linestyle='--', alpha=0.8)
    else:
        density = gaussian_kde(values)
        xs = np.linspace(values.min(), values.max(), 200)
        ys = density(xs)
        ys = ys / np.max(ys)  # scale every curve to a peak of 1
        ax.plot(xs, ys, label=name, linewidth=2)
        ax.fill_between(xs, ys, alpha=0.2)


def _plot_generator_comparison(ax, fuzz_values, pair_values, feature, effect):
    """Overlay the FuzzSAT and PairSAT distributions of one feature on ``ax``."""
    try:
        _draw_generator_distribution(ax, fuzz_values, 'FuzzSAT', 'blue')
        _draw_generator_distribution(ax, pair_values, 'PairSAT', 'orange')
    except Exception:
        # Best-effort fallback. Drop anything already drawn so the boxplot is
        # not overlaid on a half-finished density plot, and label the boxes via
        # the tick labels, which does not depend on the boxplot keyword name.
        ax.clear()
        ax.boxplot([fuzz_values, pair_values])
        ax.set_xticklabels(['FuzzSAT', 'PairSAT'])

    ax.set_title(f'{feature}\nEffect Size={effect:.3f}', fontsize=14, pad=10)
    ax.legend(fontsize=12)
    ax.set_xlabel(feature, fontsize=12, labelpad=8)
    ax.set_ylabel('Density', fontsize=12, labelpad=8)
    ax.tick_params(axis='both', which='major', labelsize=10)


def compare_generators(df, n=5):
    """
    Compare features between generator pairs for same difficulty and randomness levels.
    Shows both top and bottom differentiating features using effect size.

    The effect size of a feature is |mean(FuzzSAT) - mean(PairSAT)| divided by
    the square root of the average of the two sample variances. Features that
    are constant in both generators, or whose effect size is NaN, are skipped.

    Note: this function sets the global Matplotlib font size to 12.

    Parameters:
        df (pd.DataFrame): Input DataFrame with generator data
        n (int): Number of top/bottom features to compare. Default is 5

    Returns:
        dict: Maps "<difficulty>-R<randomness>" to {'top': [...], 'bottom': [...]},
            each a list of (feature, effect_size) tuples sorted by descending
            effect size
    """
    features = get_numeric_features(df)
    results = {}
    difficulty_order = ["easy", "medium", "hard"]

    plt.rcParams.update({'font.size': 12})

    for diff in difficulty_order:
        for rand in sorted(df['randomness'].unique()):
            key = f"{diff}-R{rand}"
            in_slice = (df['difficulty'] == diff) & (df['randomness'] == rand)
            data_fuzz = df[(df['base_generator'] == 'FuzzSAT') & in_slice]
            data_pair = df[(df['base_generator'] == 'PairSAT') & in_slice]

            if len(data_fuzz) == 0 or len(data_pair) == 0:
                continue

            # Calculate effect sizes for features with at least one non-constant group
            effect_sizes = {}
            for feature in features:
                try:
                    if data_fuzz[feature].nunique() > 1 or data_pair[feature].nunique() > 1:
                        d = (data_fuzz[feature].mean() - data_pair[feature].mean()) / \
                            np.sqrt((data_fuzz[feature].var() + data_pair[feature].var()) / 2)
                        if not np.isnan(d):
                            effect_sizes[feature] = abs(d)
                except Exception:
                    # Best effort: a feature whose effect size cannot be computed is skipped.
                    continue

            if not effect_sizes:
                continue

            sorted_effects = sorted(effect_sizes.items(), key=lambda item: item[1], reverse=True)
            top_diff = sorted_effects[:n]
            bottom_diff = sorted_effects[-n:]
            results[key] = {'top': top_diff, 'bottom': bottom_diff}

            # Two rows of subplots; squeeze=False keeps `axes` two-dimensional,
            # so axes[row, idx] also works when n == 1.
            _, axes = plt.subplots(2, n, figsize=(4*n, 8), squeeze=False)

            for row, ranked in enumerate((top_diff, bottom_diff)):
                for idx, (feature, effect) in enumerate(ranked):
                    _plot_generator_comparison(axes[row, idx], data_fuzz[feature],
                                               data_pair[feature], feature, effect)
                # Fewer usable features than panels: hide the leftover axes.
                for idx in range(len(ranked), n):
                    axes[row, idx].set_visible(False)

            plt.suptitle(f'Top and Bottom Differentiating Features for Difficulty: {diff}, Randomness: {rand}%',
                         fontsize=16, y=1.05)
            plt.tight_layout()
            plt.show()

    return results

# Compare FuzzSAT against PairSAT, showing the 3 largest and 3 smallest effect sizes
# per difficulty/randomness slice.
# NOTE(review): this call uses the unfiltered satzilla_features, whereas the two predictor
# analyses above use filtered_satzilla_features — confirm that is intentional.
generator_results = compare_generators(satzilla_features, 3)