In [63]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
from scipy import stats
from sklearn.preprocessing import StandardScaler
from scipy.stats import gaussian_kde
from scipy.spatial.distance import pdist, squareform


from result_analysis.helper_functions import (
  process_csv_data,
  add_seed_magnitude_column,
  add_generator_iter_column,
  add_seed_index_column
)

In [None]:
# Load the feature table and pass it through the three column-adding helpers
# (presumably seed magnitude, seed index and generator iteration number --
# see result_analysis.helper_functions to confirm what each one derives).
satzilla_features = add_generator_iter_column(
    add_seed_index_column(
        add_seed_magnitude_column(
            process_csv_data("/home/csoare/experiments/feature_analysis2/feaures.csv")
        )
    )
)

# Dataset size (one row per instance)
total_instances = len(satzilla_features)

# Mean row count per generator, rounded up to a whole instance
avg_instances_per_generator = math.ceil(
    satzilla_features['generator'].value_counts().mean()
)

# How many distinct seed magnitudes occur
num_unique_seeds = len(pd.unique(satzilla_features['seed_magnitude']))

# How many distinct generator iteration numbers occur; this is reported below
# as "instances per generator per seed"
nr_of_instances_per_generator_per_seed = (
    satzilla_features['generator_iter_number'].nunique()
)

# Report the summary
print(f"Total number of instances: {total_instances}")
print(f"Average instances per generator: {avg_instances_per_generator}")
print(f"Number of unique seeds: {num_unique_seeds}")
print(f"Instances per generator per seed: {nr_of_instances_per_generator_per_seed}")

def find_missing_instances(cnf_dir: str, features_csv: str) -> list:
    """
    Find CNF files that were not processed in the features CSV.

    A CNF file counts as processed when its file name without the ``.cnf``
    suffix appears verbatim in the CSV's ``instance_name`` column.

    Args:
        cnf_dir (str): Directory containing the original CNF files
            (only ``*.cnf`` directly inside it is considered)
        features_csv (str): Path to the features CSV file

    Returns:
        list: Sorted names (without extension) of CNF files that were not processed

    Raises:
        KeyError: If the CSV has no ``instance_name`` column
    """
    from pathlib import Path

    # Names of all CNF files on disk, without the extension
    cnf_names = {cnf_path.stem for cnf_path in Path(cnf_dir).glob("*.cnf")}

    # Read everything as text: with type inference a purely numeric name such
    # as "0042" would be parsed as the integer 42 and could never match the
    # string stem of its file, so every such file would be reported missing.
    processed_names = set(pd.read_csv(features_csv, dtype=str)['instance_name'])

    return sorted(cnf_names - processed_names)

# Compare the CNF directory against the feature table and list any leftovers
missing = find_missing_instances(
    "/home/csoare/experiments/feature_analysis2/instances/cnf",
    "/home/csoare/experiments/feature_analysis2/feaures.csv",
)
if not missing:
    print("All CNF files were processed successfully")
else:
    print(f"Found {len(missing)} unprocessed instances:")
    for instance in missing:
        print(f"  - {instance}")

In [None]:
def extract_generator_properties(df):
    """
    Split the 'generator' column on '-' into its first three components.

    The input frame is left untouched; a new frame is returned.

    Args:
        df (pd.DataFrame): Input DataFrame with a string 'generator' column
            whose values contain at least three '-'-separated parts

    Returns:
        pd.DataFrame: Copy of ``df`` with 'base_generator' (part 0),
        'difficulty' (part 1) and 'randomness' (part 2, cast to int) columns
    """
    parts = df['generator'].str.split('-', expand=True)
    return df.assign(
        base_generator=parts[0],            # 'FuzzSAT' or 'PairSAT'
        difficulty=parts[1],                # 'easy', 'medium', 'hard'
        randomness=parts[2].astype(int),    # 0, 50, 100
    )

# Add the 'base_generator', 'difficulty' and 'randomness' columns parsed from
# the 'generator' column (the function returns a new frame, which replaces
# the previous binding)
satzilla_features = extract_generator_properties(satzilla_features)

# Sanity check: show the distinct values that were parsed out
print("\nUnique difficulties:", satzilla_features['difficulty'].unique())
print("Unique randomness values:", satzilla_features['randomness'].unique())

In [49]:
# Helper function for all analyses
def get_numeric_features(df):
    """Return the int64/float64 column names of ``df`` that are not metadata.

    Columns are returned in their original order; the bookkeeping columns
    listed below are always left out.
    """
    metadata_columns = {
        'seed', 'seed_magnitude', 'seed_index', 'randomness',
        'generator_iter_number', 'instance_name', 'solved',
    }
    candidates = df.select_dtypes(include=['int64', 'float64']).columns
    return [name for name in candidates if name not in metadata_columns]

In [50]:
def create_stats_dataframe(df):
    """
    Summarise every numeric feature per generator as a dict of statistics.

    Args:
        df (pd.DataFrame): Input DataFrame with a 'generator' column

    Returns:
        pd.DataFrame: One row per generator and one column per numeric
        feature; each cell is a dict with the keys 'mean', 'std', 'var',
        'min', 'max' and 'median' for that generator/feature combination
    """
    stat_names = ['mean', 'std', 'var', 'min', 'max', 'median']

    # Start from an empty frame indexed by generator; columns are added below
    stats_df = pd.DataFrame(index=df['generator'].unique())

    for col in get_numeric_features(df):
        per_generator = df.groupby('generator')[col].agg(stat_names)

        # Collapse each generator's row of statistics into a single dict cell
        stats_df[col] = pd.Series({
            generator: {name: row[name] for name in stat_names}
            for generator, row in per_generator.iterrows()
        })

    return stats_df

# Per-generator summary statistics (one dict per generator/feature cell)
# computed over the full, unfiltered feature table
all_instances_stats_df = create_stats_dataframe(satzilla_features)

# Optional export, currently disabled:
# all_instances_stats_df.to_csv("/home/csoare/experiments/feature_analysis2/all_instances_stats.csv")

In [None]:
def _similar_level_pairs(data, level_col, levels, features, alpha):
    """Return the (a, b) pairs of ``levels`` with no significantly different feature.

    For every unordered pair of ``levels`` (kept in the given order) an
    independent two-sample t-test is run per feature between the rows of
    ``data`` whose ``level_col`` equals each level. A pair is reported only
    if no tested feature reaches ``p <= alpha``.
    """
    # Features that are constant over the whole group are not tested
    varying = [feature for feature in features if data[feature].nunique() > 1]
    samples = {level: data[data[level_col] == level] for level in levels}

    pairs = []
    for i, level_a in enumerate(levels):
        for level_b in levels[i + 1:]:
            for feature in varying:
                _, p_val = stats.ttest_ind(samples[level_a][feature],
                                           samples[level_b][feature])
                # NOTE(review): a NaN p-value compares False here and so counts
                # as "not different" -- confirm this is intended for features
                # with missing values.
                if p_val <= alpha:
                    break  # one significant feature is enough to reject the pair
            else:
                pairs.append((level_a, level_b))
    return pairs


def find_similar_generators(df, alpha=0.05):
    """
    Find statistically similar generators across randomness and difficulty levels.

    This function performs t-tests on all numeric features to identify generators that
    produce statistically similar distributions. It compares:
    1. Different randomness levels within the same generator and difficulty
    2. Different difficulty levels within the same generator and randomness

    Two levels are called similar when no numeric feature differs with
    ``p <= alpha``. For each similar pair, the rows of the second level are
    dropped from the returned frame. All tests run on the original ``df``,
    so rows dropped for one pair still take part in later comparisons.

    Parameters:
        df (pd.DataFrame): DataFrame containing generator data with columns:
            - base_generator: The generator type
            - difficulty: The difficulty level
            - randomness: The randomness level
            - Various numeric feature columns
        alpha (float): Significance threshold of the per-feature t-test
            (default 0.05)

    Returns:
        tuple: (similar_pairs, filtered_df) where:
            - similar_pairs: Dict mapping "<base>-<difficulty>" keys to lists of
              ('R', r1, r2) tuples and "<base>-R<randomness>" keys to lists of
              ('D', d1, d2) tuples
            - filtered_df: Copy of df with the second member of each similar
              pair removed
    """
    results = {}
    filtered_df = df.copy()

    # Pass 1: randomness levels within each (base generator, difficulty)
    for base in df['base_generator'].unique():
        for diff in df['difficulty'].unique():
            data = df[(df['base_generator'] == base) & (df['difficulty'] == diff)]
            if len(data) == 0:
                continue

            # Plain ints so the reported tuples do not carry numpy scalars
            rand_values = [int(r) for r in sorted(data['randomness'].unique())]
            pairs = _similar_level_pairs(data, 'randomness', rand_values,
                                         get_numeric_features(data), alpha)

            for _, r2 in pairs:
                filtered_df = filtered_df[~((filtered_df['base_generator'] == base) &
                                            (filtered_df['difficulty'] == diff) &
                                            (filtered_df['randomness'] == r2))]

            if pairs:
                results[f"{base}-{diff}"] = [('R', r1, r2) for r1, r2 in pairs]

    # Pass 2: difficulty levels within each (base generator, randomness)
    for base in df['base_generator'].unique():
        for rand in df['randomness'].unique():
            data = df[(df['base_generator'] == base) & (df['randomness'] == rand)]
            if len(data) == 0:
                continue

            diff_values = sorted(data['difficulty'].unique())
            pairs = _similar_level_pairs(data, 'difficulty', diff_values,
                                         get_numeric_features(data), alpha)

            for _, d2 in pairs:
                filtered_df = filtered_df[~((filtered_df['base_generator'] == base) &
                                            (filtered_df['difficulty'] == d2) &
                                            (filtered_df['randomness'] == rand))]

            if pairs:
                results[f"{base}-R{rand}"] = [('D', d1, d2) for d1, d2 in pairs]

    return results, filtered_df

# Detect similar randomness/difficulty variants and keep a copy of the
# feature table with the second member of each similar pair removed
similar_generators, filtered_satzilla_features = find_similar_generators(satzilla_features)
print(similar_generators)

In [None]:
def select_diverse_instances(satzilla_features, instances_per_generator=120):
    """
    Select diverse instances using MaxMin diversity sampling.

    Each generator is handled independently. Its numeric features are
    standardised, pairwise Euclidean distances are computed, and instances
    are picked greedily: the next pick is always the instance whose distance
    to its nearest already-picked instance is largest.

    Args:
        satzilla_features (pd.DataFrame): Feature table with a 'generator'
            column plus numeric feature columns
        instances_per_generator (int): Maximum number of instances kept per
            generator; generators with at most this many instances are kept
            in full

    Returns:
        pd.DataFrame: Concatenation of the selected rows of every generator
    """
    results = []

    for generator in satzilla_features['generator'].unique():
        generator_instances = satzilla_features[satzilla_features['generator'] == generator].copy()

        # Nothing to thin out for small generators
        if len(generator_instances) <= instances_per_generator:
            results.append(generator_instances)
            continue

        feature_columns = get_numeric_features(generator_instances)

        # Standardise so that no single feature dominates the distances
        scaled_features = StandardScaler().fit_transform(generator_instances[feature_columns])
        distances = squareform(pdist(scaled_features))

        # Seed with the most isolated point. The zero self-distances on the
        # diagonal must be masked first; otherwise every column minimum is 0
        # and argmax would always return the first row.
        off_diagonal = distances.copy()
        np.fill_diagonal(off_diagonal, np.inf)
        first = int(np.argmax(off_diagonal.min(axis=0)))

        selected_indices = [first]
        chosen = np.zeros(len(distances), dtype=bool)
        chosen[first] = True
        # Distance from every point to its nearest selected point so far
        nearest_selected = distances[first].copy()

        while len(selected_indices) < instances_per_generator:
            # Already chosen points are masked out so they cannot be re-picked
            next_point = int(np.argmax(np.where(chosen, -np.inf, nearest_selected)))
            selected_indices.append(next_point)
            chosen[next_point] = True
            nearest_selected = np.minimum(nearest_selected, distances[next_point])

        results.append(generator_instances.iloc[selected_indices])

    return pd.concat(results, axis=0)

def analyze_diversity_metrics(original_df, selected_df):
    """
    Compare the per-generator feature spread before and after selection.

    For each generator of ``original_df`` the mean over all numeric features
    of the standard deviation, the variance and the value range is computed
    on both frames, and the relative change is reported in percent.

    Args:
        original_df (pd.DataFrame): Original dataset
        selected_df (pd.DataFrame): Selected diverse instances

    Returns:
        dict: Maps each generator to a dict with the keys 'std_diff',
        'var_diff' and 'range_diff' (percentage change, selected vs original)
    """
    def spread_summary(frame):
        """Mean std / variance / range across the columns of ``frame``."""
        return {
            'std_mean': frame.std().mean(),
            'var_mean': frame.var().mean(),
            'range_mean': (frame.max() - frame.min()).mean(),
        }

    def percent_change(new_value, old_value):
        return (new_value / old_value - 1) * 100

    feature_cols = get_numeric_features(original_df)
    metrics = {}

    for generator in original_df['generator'].unique():
        before = spread_summary(
            original_df.loc[original_df['generator'] == generator, feature_cols]
        )
        after = spread_summary(
            selected_df.loc[selected_df['generator'] == generator, feature_cols]
        )

        metrics[generator] = {
            'std_diff': percent_change(after['std_mean'], before['std_mean']),
            'var_diff': percent_change(after['var_mean'], before['var_mean']),
            'range_diff': percent_change(after['range_mean'], before['range_mean']),
        }

    return metrics

# Thin every generator of the filtered table down to at most 120 instances,
# then measure how the feature spread changed relative to the filtered table
diverse_dataset = select_diverse_instances(filtered_satzilla_features, instances_per_generator=120)
diversity_metrics = analyze_diversity_metrics(filtered_satzilla_features, diverse_dataset)

In [None]:
# Global Matplotlib default font size for the figures drawn from here on
plt.rcParams.update({'font.size': 14})

def plot_feature_distributions(original_df, selected_df, max_features=10):
    """Plot PDF comparisons for features before and after selection for each generator.

    One figure is shown per generator of ``original_df``, with one subplot per
    feature (three subplots per row). Each subplot overlays the peak-normalised
    KDE of the original and of the selected values; a feature with a single
    distinct value is drawn as a vertical line instead. If plotting a feature
    raises, a box plot of both samples is drawn as a fallback.

    Args:
        original_df (pd.DataFrame): Dataset before selection
        selected_df (pd.DataFrame): Dataset after selection
        max_features (int): If more numeric features exist, only this many
            with the highest coefficient of variation in ``original_df`` are
            plotted

    Returns:
        None: Figures are displayed with ``plt.show()``
    """
    feature_cols = get_numeric_features(original_df)
    
    # Too many features: keep those with the largest |std / mean| over the
    # whole original frame (the 1e-10 avoids division by a zero mean)
    if len(feature_cols) > max_features:
        cv_scores = {}
        for col in feature_cols:
            cv = original_df[col].std() / (abs(original_df[col].mean()) + 1e-10)
            cv_scores[col] = abs(cv)
        feature_cols = sorted(cv_scores.items(), key=lambda x: x[1], reverse=True)
        feature_cols = [x[0] for x in feature_cols[:max_features]]
    
    for generator in original_df['generator'].unique():
        # Grid with three columns; rows rounded up to fit every feature
        n_rows = (len(feature_cols) + 2) // 3
        fig, axes = plt.subplots(n_rows, 3, figsize=(15, 5*n_rows))
        fig.suptitle(f'Feature Distributions for {generator}', fontsize=16, y=1.02)
        axes = axes.flatten()
        
        orig_data = original_df[original_df['generator'] == generator]
        sel_data = selected_df[selected_df['generator'] == generator]
        
        for idx, feature in enumerate(feature_cols):
            try:
                orig_feature = orig_data[feature].values
                sel_feature = sel_data[feature].values
                
                orig_color = plt.cm.viridis(0.2)
                sel_color = plt.cm.viridis(0.8)
                
                # Handle original distribution
                if orig_data[feature].nunique() == 1:
                    # A single distinct value: mark it with a vertical line
                    value = orig_feature[0]
                    line1 = axes[idx].axvline(x=value, color='blue', 
                                            label='Original', linestyle='--', alpha=0.8)
                else:
                    density = gaussian_kde(orig_feature)
                    xs = np.linspace(orig_feature.min(), orig_feature.max(), 200)
                    ys = density(xs)
                    # Scale the curve so its peak is 1 (shape comparison only)
                    ys = ys / np.max(ys)
                    line1 = axes[idx].plot(xs, ys, label='Original', color=orig_color, linewidth=2)[0]
                    axes[idx].fill_between(xs, ys, alpha=0.2, color=orig_color)
                
                # Handle selected distribution
                if sel_data[feature].nunique() == 1:
                    value = sel_feature[0]
                    line2 = axes[idx].axvline(x=value, color='red',
                                            label='Selected', linestyle='--', alpha=0.8)
                else:
                    density = gaussian_kde(sel_feature)
                    xs = np.linspace(sel_feature.min(), sel_feature.max(), 200)
                    ys = density(xs)
                    ys = ys / np.max(ys)
                    line2 = axes[idx].plot(xs, ys, label='Selected', color=sel_color, linewidth=2)[0]
                    axes[idx].fill_between(xs, ys, alpha=0.2, color=sel_color)
                
                axes[idx].legend(handles=[line1, line2], fontsize=8)
                
            except Exception:
                # Fallback to boxplot if KDE fails
                # NOTE(review): anything drawn on this axes before the failure
                # stays there, so the box plot may overlay a partial KDE plot.
                # NOTE(review): the `tick_labels` keyword presumably needs a
                # recent Matplotlib release -- confirm against the pinned version.
                bp = axes[idx].boxplot([orig_feature, sel_feature], 
                                     tick_labels=['Original', 'Selected'])
                # Create custom legend for boxplot
                # NOTE(review): these proxy lines are blue/red, not the viridis
                # colours used for the KDE curves above.
                from matplotlib.lines import Line2D
                custom_lines = [Line2D([0], [0], color='blue', label='Original'),
                              Line2D([0], [0], color='red', label='Selected')]
                axes[idx].legend(handles=custom_lines, fontsize=8)
            
            # Standard deviations of the raw value arrays (NumPy `.std()` defaults)
            orig_std = orig_feature.std()
            sel_std = sel_feature.std()
            # Relative change of the spread in percent; inf when the original is constant
            diff_pct = ((sel_std / orig_std) - 1) * 100 if orig_std != 0 else float('inf')
            
            axes[idx].set_title(f'{feature}\nσ diff: {diff_pct:+.1f}%', fontsize=10)
            axes[idx].tick_params(axis='both', which='major', labelsize=8)
            
            # Annotate the top-left corner of the subplot with both sigmas
            stats_text = (f'Original σ: {orig_std:.2f}\n'
                         f'Selected σ: {sel_std:.2f}')
            axes[idx].text(0.02, 0.98, stats_text,
                         transform=axes[idx].transAxes,
                         verticalalignment='top',
                         fontsize=8)
        
        # Remove the unused subplots of the last grid row
        for idx in range(len(feature_cols), len(axes)):
            fig.delaxes(axes[idx])
            
        plt.tight_layout()
        plt.show()

# One figure per generator: filtered table vs. the diversity-selected subset
plot_feature_distributions(filtered_satzilla_features, diverse_dataset)

In [54]:
def generate_feature_stats(original_df, selected_df):
    """
    Build a long-format table of per-generator, per-feature statistics.

    Every row describes one (generator, feature) combination and holds the
    mean, standard deviation, min, max, number of distinct values and number
    of instances for both the original and the selected dataset, plus the
    relative change of the standard deviation in percent.

    Args:
        original_df (pd.DataFrame): Dataset before selection
        selected_df (pd.DataFrame): Dataset after selection

    Returns:
        pd.DataFrame: DataFrame containing all statistics
    """
    feature_cols = get_numeric_features(original_df)
    rows = []

    for generator in original_df['generator'].unique():
        orig_data = original_df[original_df['generator'] == generator]
        sel_data = selected_df[selected_df['generator'] == generator]

        for feature in feature_cols:
            orig_values = orig_data[feature].values
            sel_values = sel_data[feature].values

            orig_std = orig_values.std()
            sel_std = sel_values.std()
            # Percentage change of the spread; inf when the original is constant
            if orig_std != 0:
                diff_pct = ((sel_std / orig_std) - 1) * 100
            else:
                diff_pct = float('inf')

            rows.append(dict(
                generator=generator,
                feature=feature,
                orig_mean=orig_values.mean(),
                sel_mean=sel_values.mean(),
                orig_std=orig_std,
                sel_std=sel_std,
                std_diff_pct=diff_pct,
                orig_unique_values=orig_data[feature].nunique(),
                sel_unique_values=sel_data[feature].nunique(),
                orig_min=orig_values.min(),
                sel_min=sel_values.min(),
                orig_max=orig_values.max(),
                sel_max=sel_values.max(),
                orig_instances=len(orig_data),
                sel_instances=len(sel_data),
            ))

    return pd.DataFrame(rows)

# Per-generator, per-feature comparison of the filtered table and the selected subset
stats_df = generate_feature_stats(filtered_satzilla_features, diverse_dataset)

# Persist the comparison table (written with the default to_csv options)
stats_df.to_csv("/home/csoare/experiments/feature_analysis2/feature_stats.csv")


In [None]:
def organize_selected_instances(reduced_df, source_cnf_dir, target_base_dir):
    """
    Copy the selected CNF files into one folder per base generator and difficulty.

    The index of ``reduced_df`` supplies the file names: for each index value
    ``name`` the file ``<source_cnf_dir>/<name>.cnf`` is copied to
    ``<target_base_dir>/<base_generator>-<difficulty>/<name>.cnf``. A folder is
    created for every combination of the distinct base generators and
    difficulties, even if nothing ends up in it. Missing source files and copy
    errors are reported on stdout and skipped. If at least one file was copied,
    the rows of the copied instances are written to ``features_output.csv`` in
    the target directory.

    Args:
        reduced_df (pd.DataFrame): DataFrame containing selected instances,
            with 'base_generator' and 'difficulty' columns
        source_cnf_dir (str): Directory containing source CNF files
        target_base_dir (str): Target directory for organized files

    Returns:
        dict: 'total_copied', 'files_per_group' and 'successfully_copied',
        plus 'features_file' and 'features_count' when a features CSV was written
    """
    import shutil
    from pathlib import Path

    destination_root = Path(target_base_dir)
    destination_root.mkdir(parents=True, exist_ok=True)
    cnf_root = Path(source_cnf_dir)

    copied_names = []       # instances whose file was actually copied
    per_group_counts = {}   # "<base>-<difficulty>" -> number of copied files
    summary = {
        'total_copied': 0,
        'files_per_group': per_group_counts,
        'successfully_copied': copied_names,
    }

    def copy_instance(name, destination_dir):
        """Copy one CNF file; return True only if the copy succeeded."""
        origin = cnf_root / f"{name}.cnf"
        destination = destination_dir / f"{name}.cnf"
        try:
            if not origin.exists():
                print(f"Warning: Source file not found: {origin}")
                return False
            shutil.copy2(origin, destination)
        except Exception as e:
            print(f"Error copying {name}: {str(e)}")
            return False
        return True

    # One folder such as "PairSAT-easy" per base generator / difficulty pair
    for base in reduced_df['base_generator'].unique():
        for diff in reduced_df['difficulty'].unique():
            group_name = f"{base}-{diff}"
            group_dir = destination_root / group_name
            group_dir.mkdir(exist_ok=True)

            in_group = (reduced_df['base_generator'] == base) & (reduced_df['difficulty'] == diff)

            n_copied = 0
            for name in reduced_df[in_group].index:
                if copy_instance(name, group_dir):
                    n_copied += 1
                    copied_names.append(name)

            per_group_counts[group_name] = n_copied
            summary['total_copied'] += n_copied

    # Export the feature rows of the copied instances only
    if copied_names:
        features_output = reduced_df.loc[copied_names]

        features_output_path = destination_root / 'features_output.csv'
        features_output.to_csv(features_output_path, index=True)  # index holds the instance names
        print(f"\nFeatures saved to: {features_output_path}")

        summary['features_file'] = str(features_output_path)
        summary['features_count'] = len(features_output)

    return summary

# Copy the selected instances into per-group folders and export their features
source_dir = "/home/csoare/experiments/feature_analysis2/instances/cnf"
target_dir = "/home/csoare/experiments/reduced_instances_strategy2"

# The summary gets its own name on purpose: binding it to `stats` would
# replace the `scipy.stats` module imported at the top of the notebook and
# break any later (re-)run of a cell that calls `stats.ttest_ind`.
copy_stats = organize_selected_instances(diverse_dataset, source_dir, target_dir)

print("\nCopying complete!")
print(f"Total files copied: {copy_stats['total_copied']}")
# 'features_count' is only present when at least one file was copied
print(f"Features saved for {copy_stats.get('features_count', 0)} instances")
print("\nFiles copied per group:")
for group, count in copy_stats['files_per_group'].items():
    print(f"{group}: {count} files")

In [65]:
def select_random_instances(satzilla_features, seed, randomness=50):
    """
    Select one random instance from each base generator + difficulty combination,
    only considering instances with specified randomness, and save to text file.

    The selection is written to ``selected_instances.txt`` in the current
    working directory: one ``<base>-<difficulty>: <index label>`` line per
    selected instance, followed by a last line holding the working directory.

    Args:
        satzilla_features (pd.DataFrame): Feature table with 'base_generator',
            'difficulty' and 'randomness' columns; the index labels are what
            gets written to the text file
        seed (int): Seed used for sampling (also applied to NumPy's global RNG)
        randomness (int): Only rows with this 'randomness' value are candidates

    Returns:
        pd.DataFrame: The selected rows, one per non-empty combination

    Raises:
        ValueError: If no row has the requested randomness
    """
    # Kept for backward compatibility: later code may rely on the global
    # NumPy RNG being seeded here. The sampling below is seeded explicitly.
    np.random.seed(seed)
    results = []

    # Recorded up front so the file documents where it was written
    output_path = os.getcwd()

    for base in satzilla_features['base_generator'].unique():
        for diff in satzilla_features['difficulty'].unique():
            mask = (satzilla_features['base_generator'] == base) & \
                   (satzilla_features['difficulty'] == diff) & \
                   (satzilla_features['randomness'] == randomness)
            generator_instances = satzilla_features[mask]

            if len(generator_instances) > 0:
                results.append(generator_instances.sample(n=1, random_state=seed))

    # Fail with a clear message instead of pandas' "No objects to concatenate"
    if not results:
        raise ValueError(
            f"No instances with randomness={randomness} found; nothing to select"
        )

    final_dataset = pd.concat(results, axis=0)

    # Create output text file
    with open('selected_instances.txt', 'w', encoding='utf-8') as f:
        # Write instances
        for base in final_dataset['base_generator'].unique():
            for diff in final_dataset['difficulty'].unique():
                mask = (final_dataset['base_generator'] == base) & \
                       (final_dataset['difficulty'] == diff)
                if mask.any():
                    instance = final_dataset[mask].index[0]
                    f.write(f"{base}-{diff}: {instance}\n")

        # Write current directory path as last line
        f.write(f"{output_path}\n")

    return final_dataset

# Pick one instance per base generator / difficulty from the diverse subset
# (default randomness=50); also writes selected_instances.txt to the working directory
random_dataset = select_random_instances(diverse_dataset, seed=42)