In [None]:
import pandas as pd
import numpy as np
import os
import subprocess
import urllib.request
import tempfile

def download_spmf():
    """Return the path of the locally cached SPMF jar, downloading it on first use.

    The jar is cached as ``~/.pyspmf/spmf.jar``. The download is written to a
    temporary file in the same directory and only renamed to its final name
    once it has completed, so an interrupted download can never leave a
    truncated ``spmf.jar`` behind that later calls would mistake for a valid
    cache entry.

    Returns:
    --------
    str
        Path to ``spmf.jar`` inside the cache directory.

    Raises:
    -------
    OSError
        If the cache directory cannot be created or the download fails
        (``urllib.error.URLError`` is a subclass of ``OSError``).
    """
    spmf_dir = os.path.join(os.path.expanduser("~"), '.pyspmf')
    os.makedirs(spmf_dir, exist_ok=True)
    jar_path = os.path.join(spmf_dir, 'spmf.jar')

    if os.path.exists(jar_path):
        return jar_path

    url = "http://www.philippe-fournier-viger.com/spmf/spmf.jar"

    # Stage the download next to the destination so os.replace stays on one
    # filesystem and the final rename is atomic.
    fd, partial_path = tempfile.mkstemp(dir=spmf_dir, suffix='.part')
    os.close(fd)
    try:
        urllib.request.urlretrieve(url, partial_path)
        os.replace(partial_path, jar_path)
    finally:
        # Only still present when the download or the rename failed.
        if os.path.exists(partial_path):
            os.remove(partial_path)

    return jar_path

def prepare_data_file(data, output_path):
    """Dump a DataFrame as plain text, one row per line.

    Every value of a row is converted with ``str`` and the values are joined
    by single spaces; no header and no index are written. Rows are taken from
    ``DataFrame.iterrows``, so values are rendered after pandas' per-row dtype
    upcast (an integer sitting next to a float column is written as ``1.0``).

    Parameters:
    -----------
    data : pandas DataFrame
        Table to serialise.
    output_path : str
        File to create or overwrite.
    """
    def _render(record):
        # One output line per row, newline-terminated.
        return ' '.join(map(str, record)) + "\n"

    with open(output_path, 'w') as sink:
        sink.writelines(_render(record) for _, record in data.iterrows())

def convert_to_transaction_format(df, output_path):
    """Write a DataFrame as a transaction file, one transaction per row.

    Each cell becomes the item token ``<column>_<value>`` and the tokens of a
    row are written on one line separated by single spaces.

    Parameters:
    -----------
    df : pandas DataFrame
        Table whose rows become transactions (typically discretized data).
    output_path : str
        File to create or overwrite.
    """
    # NOTE(review): SPMF itemset miners usually expect positive integer item
    # ids; confirm that these string tokens are accepted by the algorithm used.
    with open(output_path, 'w') as sink:
        labels = list(df.columns)

        def _as_transaction(record):
            tokens = (f"{label}_{record[label]}" for label in labels)
            return ' '.join(tokens) + "\n"

        sink.writelines(_as_transaction(record) for _, record in df.iterrows())

def read_patterns(patterns_path):
    """Parse an SPMF frequent-itemset output file into a DataFrame.

    Each non-blank line is expected to look like ``item item ... #SUP: n``.
    Blank lines are skipped.

    Parameters:
    -----------
    patterns_path : str
        Path of the patterns file to read.

    Returns:
    --------
    pandas DataFrame
        One row per pattern with the columns ``pattern`` (list of item
        strings) and ``support`` (int). Both columns are present even when
        the file contains no patterns, so callers can always select them.

    Raises:
    -------
    ValueError
        If a non-blank line has no ``#SUP:`` marker or a non-integer support.
    """
    marker = ' #SUP: '
    records = []
    with open(patterns_path, 'r') as f:
        for line_no, raw_line in enumerate(f, start=1):
            line = raw_line.strip()
            if not line:
                continue
            pattern, found, support = line.partition(marker)
            if not found:
                raise ValueError(
                    f"{patterns_path}:{line_no}: missing '#SUP:' marker in {line!r}"
                )
            records.append({
                # split() without an argument tolerates repeated spaces.
                'pattern': pattern.split(),
                'support': int(support)
            })
    # Fixed columns keep the frame's schema stable for an empty result.
    return pd.DataFrame(records, columns=['pattern', 'support'])

def _run_spmf(java_path, spmf_jar, algorithm, input_path, output_path, *params):
    """Run one SPMF algorithm and verify that it produced its output file."""
    subprocess.run(
        [java_path, '-jar', spmf_jar, 'run', algorithm,
         input_path, output_path, *map(str, params)],
        check=True,
    )
    # A JVM can exit with status 0 after merely printing an error, so also
    # check for the output file instead of failing later with a vaguer error.
    if not os.path.exists(output_path):
        raise FileNotFoundError(
            f"SPMF algorithm {algorithm!r} did not produce {output_path}"
        )


def discretize_and_mine_patterns(data, target_column=None, min_support=0.1, java_path='java'):
    """
    Main function to discretize data using MDL and mine patterns using SPMF

    The features are written to a temporary file, discretized by SPMF, turned
    into ``<column>_<value>`` transactions and mined for frequent itemsets.
    All intermediate files live in a temporary directory that is removed
    before the function returns.

    Parameters:
    -----------
    data : pandas DataFrame
        Input data to process
    target_column : str
        Name of target column to exclude from discretization
    min_support : float
        Minimum support threshold (0 to 1)
    java_path : str
        Path to Java executable

    Returns:
    --------
    tuple: (discretized_df, patterns)

    Raises:
    -------
    ValueError
        If ``min_support`` is outside ``[0, 1]``.
    subprocess.CalledProcessError
        If an SPMF run exits with a non-zero status.
    FileNotFoundError
        If Java cannot be started or an SPMF run produces no output file.
    """
    if not 0 <= min_support <= 1:
        raise ValueError(f"min_support must be between 0 and 1, got {min_support!r}")

    # Compare against None so falsy column labels (0, '') still count as a target.
    has_target = target_column is not None

    # Get SPMF jar
    spmf_jar = download_spmf()

    # Create temporary directory for files
    with tempfile.TemporaryDirectory() as temp_dir:
        # Prepare paths
        input_path = os.path.join(temp_dir, 'input.txt')
        discretized_path = os.path.join(temp_dir, 'discretized.txt')
        transactions_path = os.path.join(temp_dir, 'transactions.txt')
        patterns_path = os.path.join(temp_dir, 'patterns.txt')

        # Prepare data
        features = data.drop(columns=[target_column] if has_target else [])
        prepare_data_file(features, input_path)

        # Run MDL discretization
        # NOTE(review): verify that the installed SPMF build actually provides
        # an algorithm named 'MDL-Discretizer'.
        _run_spmf(java_path, spmf_jar, 'MDL-Discretizer', input_path, discretized_path)

        # Read discretized results
        discretized_df = pd.read_csv(
            discretized_path,
            sep=' ',
            header=None,
            names=features.columns
        )

        # Add back target column if exists. Assign by position: the frame read
        # back from disk has a fresh 0..n-1 index, so aligning on the index of
        # `data` would yield NaN/misplaced labels for any non-default index.
        if has_target:
            discretized_df[target_column] = data[target_column].to_numpy()

        # Convert to transaction format
        convert_to_transaction_format(discretized_df, transactions_path)

        # Run pattern mining. SPMF's FPGrowth_itemsets takes minsup as a
        # fraction of the transactions (or a percentage with a '%' suffix),
        # so the relative threshold is passed through unchanged rather than
        # being converted to an absolute count.
        _run_spmf(
            java_path, spmf_jar, 'FPGrowth_itemsets',
            transactions_path, patterns_path, min_support
        )

        # Read patterns
        patterns = read_patterns(patterns_path)

        return discretized_df, patterns

# Example usage: build a synthetic dataset, run the pipeline, show the results
if __name__ == "__main__":
    # Seed the global NumPy generator so the sample is reproducible.
    np.random.seed(42)
    n_samples = 1000

    # Columns are drawn in the order listed, which fixes the random stream.
    df = pd.DataFrame(dict(
        feature1=np.random.normal(0, 1, size=n_samples),
        feature2=np.random.exponential(2, size=n_samples),
        feature3=np.random.uniform(-1, 1, size=n_samples),
        target=np.random.randint(0, 2, size=n_samples),
    ))

    # Discretize the three features and mine frequent itemsets.
    discretized_df, patterns = discretize_and_mine_patterns(
        df, target_column='target', min_support=0.1
    )

    # Report a preview of each stage followed by the pattern count.
    print("Original data (first 5 rows):", df.head(), sep="\n")
    print("\nDiscretized data (first 5 rows):", discretized_df.head(), sep="\n")
    print("\nFrequent patterns (top 5):", patterns.head(), sep="\n")
    print(f"\nTotal patterns found: {len(patterns)}")