In [30]:
import numpy as np
import pandas as pd
from scipy import stats
import random
from itertools import product
import utils

# FUNCTIONS

In [31]:
import random
from itertools import product

def generate_initial_combinations(features, num_combinations=50):
    """
    Draw a random selection of (feature, percentile) candidate splits.

    Every feature is paired with every percentile in 10, 15, ..., 85. The full
    list of pairs is shuffled with the standard-library ``random`` module and
    the first ``num_combinations`` pairs are returned.

    Parameters:
    features (list): Feature names to pair with percentiles.
    num_combinations (int): Maximum number of pairs to return. Default is 50.

    Returns:
    list: Dictionaries with the keys 'feature' and 'percentile'. Fewer than
          ``num_combinations`` entries come back when fewer pairs exist.
    """
    # range() excludes its stop value, so the largest percentile is 85
    percentile_grid = list(range(10, 90, 5))

    # Full Cartesian product of features and percentiles
    candidate_pairs = list(product(features, percentile_grid))

    # In-place shuffle; the outcome depends on the state of the `random` module
    random.shuffle(candidate_pairs)

    # Keep the leading slice and convert each pair into a dictionary
    selection = []
    for feature_name, pct in candidate_pairs[:num_combinations]:
        selection.append({'feature': feature_name, 'percentile': pct})
    return selection



In [32]:
from scipy import stats

def test_significance(metrics, group1, group2):
    """
    Helper function to test significance between two groups for given metrics.

    This function takes a list of metrics and two groups of data, performs the Mann-Whitney U test
    to determine if there are significant differences between the groups for each metric, and
    returns the p-values and significant metrics.

    Parameters:
    metrics (list): A list of metric names to be tested for significance.
    group1 (DataFrame): A pandas DataFrame containing the first group of data.
    group2 (DataFrame): A pandas DataFrame containing the second group of data.

    Returns:
    tuple: A tuple containing two lists:
        - p_values (list): A list of p-values for metrics that showed significant differences.
        - significant_metrics (list): A list of metrics that showed significant differences.
    """
    
    # Initialize lists to store p-values and significant metrics
    p_values = []
    significant_metrics = []

    # Loop through each metric to perform the significance test
    for metric in metrics:
        # Ensure that both groups have data for the current metric
        if not group1[metric].empty and not group2[metric].empty:
            # Perform the Mann-Whitney U test
            u_stat, p_value = stats.mannwhitneyu(group1[metric], group2[metric], alternative='two-sided')
            # Check if the p-value indicates a significant difference
            if p_value < 0.05:
                # Append the p-value and the metric to the respective lists
                p_values.append(p_value)
                significant_metrics.append(metric)
    
    # Return the lists of p-values and significant metrics
    return p_values, significant_metrics


In [33]:
import numpy as np
from scipy import stats

def initial_iteration(initializations, feature_scales, metrics_columns):
    """
    Performs the initial iteration to find significant features and percentiles.

    For every (feature, percentile) candidate the rows of ``feature_scales`` are
    split into those above the percentile value of the feature and those at or
    below it. Each metric is then compared between the two groups with a
    two-sided Mann-Whitney U test. Candidates with at least one metric below
    p = 0.05 are kept.

    Parameters:
    initializations (list): A list of dictionaries, each containing 'feature' and 'percentile' keys.
    feature_scales (DataFrame): A pandas DataFrame containing the features and the metric columns.
    metrics_columns (list): A list of metric column names to be tested.

    Returns:
    list: A list of tuples sorted by descending number of significant metrics and
          then ascending mean p-value. Each tuple holds a dictionary with the keys
          'initial_feature', 'initial_percentile', 'significant_counts',
          'mean_p_value' and 'lineage', and a dictionary with the 'A' (above) and
          'B' (at or below) groups.
    """

    results = []

    # Iterate over each initialization
    for init in initializations:
        feature = init['feature']
        percentile_cutoff = init['percentile']

        # Value of the feature at the requested percentile
        threshold = np.percentile(feature_scales[feature], percentile_cutoff)

        # Split the data into above and below percentile groups
        above_values = feature_scales[feature_scales[feature] > threshold]
        below_values = feature_scales[feature_scales[feature] <= threshold]

        # Heavy ties at the cut (or a NaN threshold) can leave one side with no
        # rows; there is nothing to compare then, so skip the candidate instead
        # of handing an empty sample to the test.
        if len(above_values) == 0 or len(below_values) == 0:
            continue

        # Perform the significance test for each metric
        p_values, significant_metrics = [], []
        for metric in metrics_columns:
            _, p_value = stats.mannwhitneyu(below_values[metric], above_values[metric], alternative='two-sided')
            if p_value < 0.05:
                p_values.append(p_value)
                significant_metrics.append(metric)

        # Candidates without any significant metric are dropped
        if not p_values:
            continue

        results.append(({
            'initial_feature': feature,
            'initial_percentile': percentile_cutoff,
            'significant_counts': len(p_values),
            'mean_p_value': np.mean(p_values),
            'lineage': [[{'feature': feature, 'percentile': percentile_cutoff, 'significant_metrics': significant_metrics}]]
        }, {
            'A': above_values,
            'B': below_values
        }))

    # One stable sort at the end yields the same order as re-sorting after every
    # append: most significant metrics first, ties broken by lower mean p-value.
    results.sort(key=lambda x: (-x[0]['significant_counts'], x[0]['mean_p_value']))

    return results


In [34]:
def perform_iterations(previous_results, feature_scales, metrics_columns, features, add_count=False, num_combinations=100, min_samples=5, min_split=15, num_iterations=1, top_k=10):
    """
    Perform iterative feature analysis to find significant features and percentiles.

    Each iteration takes every (result, clusters) pair of the current results and
    tries a set of random (feature, percentile) splits on it. A cluster is split
    into a '<key>A' (above) and '<key>B' (at or below) part when it is large
    enough, both parts are large enough, and at least one metric differs
    significantly between the parts. Otherwise the cluster is kept unchanged.
    Candidates that produced at least one significant metric are ranked, and the
    best ``top_k`` are carried into the next iteration.

    Parameters:
    previous_results (list): A list of (details, clusters) tuples, e.g. the output of initial_iteration.
    feature_scales (DataFrame): A pandas DataFrame containing the features. The percentile
        value of a new split is computed on this whole frame, not per cluster.
    metrics_columns (list): A list of metric column names to be tested.
    features (list): A list of feature names to be used in generating new combinations.
    add_count (bool): Whether to add the count of significant metrics from previous iterations. Default is False.
    num_combinations (int): The number of new combinations to generate per result. Default is 100.
    min_samples (int): The minimum number of samples each side of a split must have. Default is 5.
    min_split (int): A cluster is only split when it has more than this many samples. Default is 15.
    num_iterations (int): The number of iterations to perform. Zero or a negative
        value returns ``previous_results`` unchanged. Default is 1.
    top_k (int): How many of the best results each iteration passes on. Default is 10.

    Returns:
    list: A list of tuples, each containing a dictionary with feature details and significant metrics, and a dictionary with clusters.
    """

    current_results = previous_results
    remaining = num_iterations

    # A loop instead of recursion: a non-positive count ends immediately rather
    # than recursing until the interpreter's recursion limit is hit.
    while remaining > 0:
        extended_results = []

        # Iterate over each result and its associated clusters
        for result, clusters in current_results:
            # New splits may use any feature except the one of the very first split
            candidate_features = [f for f in features if f != result['initial_feature']]

            for combo in generate_initial_combinations(candidate_features, num_combinations):
                new_feature = combo['feature']
                new_percentile = combo['percentile']

                # Shallow copy is enough: earlier steps are never modified, only extended
                lineage = result['lineage'].copy()
                new_step = {'feature': new_feature, 'percentile': new_percentile, 'significant_metrics': []}
                lineage.append([new_step])
                new_value = np.percentile(feature_scales[new_feature], new_percentile)

                final_clusters = {}
                p_values = []
                significant_metrics = []

                # Iterate over each cluster
                for key, data in clusters.items():
                    # If the cluster size is too small, keep it as is
                    if len(data) <= min_split:
                        final_clusters[key] = data
                        continue

                    # Split the data into above and below the new percentile value
                    new_above = data[data[new_feature] > new_value]
                    new_below = data[data[new_feature] <= new_value]

                    # If either split is too small, keep the original cluster
                    if len(new_above) < min_samples or len(new_below) < min_samples:
                        final_clusters[key] = data
                        continue

                    # Test for significant differences between the new clusters
                    new_p_values, new_significant_metrics = test_significance(metrics_columns, new_above, new_below)
                    if new_p_values:
                        p_values.extend(new_p_values)
                        significant_metrics.extend(new_significant_metrics)
                        final_clusters[key + 'A'] = new_above
                        final_clusters[key + 'B'] = new_below
                    else:
                        final_clusters[key] = data

                # Candidates without any significant metric are dropped
                if not p_values:
                    continue

                # The list is shared with the lineage entry appended above
                new_step['significant_metrics'] = significant_metrics

                # Optionally accumulate the counts of the earlier steps
                count = len(p_values)
                if add_count:
                    count += result["significant_counts"]

                extended_results.append(({
                    'initial_feature': result['initial_feature'],
                    'initial_percentile': result['initial_percentile'],
                    'new_feature': new_feature,
                    'new_percentile': new_percentile,
                    'significant_counts': count,
                    'mean_p_value': np.mean(p_values),
                    'lineage': lineage
                }, final_clusters))

        # One stable sort per iteration gives the same order as re-sorting after
        # every result: highest count first, ties broken by lower mean p-value.
        extended_results.sort(key=lambda x: (-x[0]['significant_counts'], x[0]['mean_p_value']))

        current_results = extended_results[:top_k]
        remaining -= 1

    return current_results


In [35]:
def analyze_top_configurations(configurations, feature_scales, metrics_columns, top_n=5):
    """
    Analyze and print details of the top N configurations.

    For each of the first ``top_n`` configurations this prints the initial and
    the new feature/percentile, the number of rows in every cluster, and the
    mean of every metric per cluster. Rows are looked up in ``feature_scales``
    by the index labels of each cluster.

    Parameters:
    configurations (list): A list of (details, clusters) tuples. ``details`` must have
        'initial_feature' and 'initial_percentile'. 'new_feature' and 'new_percentile'
        are optional and print as None when absent, as in initial_iteration output.
    feature_scales (DataFrame): A pandas DataFrame containing the metric columns; it must
        contain every index label used by the clusters.
    metrics_columns (list): A list of metric column names to be analyzed.
    top_n (int): The number of top configurations to analyze. Default is 5.

    Returns:
    None
    """

    # Iterate over the top N configurations
    for config in configurations[:top_n]:
        details = config[0]
        initial_feature = details['initial_feature']
        initial_percentile = details['initial_percentile']
        # Results that only went through the initial split have no second step
        new_feature = details.get('new_feature')
        new_percentile = details.get('new_percentile')

        # Print the configuration details
        print(f"\nConfiguration: Initial Feature={initial_feature}, Percentile={initial_percentile}, "
              f"New Feature={new_feature}, New Percentile={new_percentile}")

        clusters = config[1]

        # Select the rows of every cluster once and reuse them below
        cluster_rows = {key: feature_scales.loc[clusters[key].index] for key in clusters}

        # Print the sizes of each cluster
        print("Cluster Sizes:")
        for key, rows in cluster_rows.items():
            print(f"  {key}: {len(rows)}")

        # Mean of every metric per cluster; clusters without rows are left out
        metric_means = {
            metric: {key: rows[metric].mean() for key, rows in cluster_rows.items() if not rows[metric].empty}
            for metric in metrics_columns
        }

        print("Metric Means Across Clusters:")
        for metric, means in metric_means.items():
            mean_values = ", ".join(f"{key}: {means[key]:.4f}" for key in sorted(means))
            print(f"  {metric}: {mean_values}")



# APPLICATION

In [36]:
# Load the predictive processing features and the scales; NaN values are left unfilled
predictive_processing_dataset = pd.read_excel("All_Features_dataset.xlsx")
scales = pd.read_excel("scales.xlsx")
scales = scales.drop(columns=["SUBJECT_CODE", "Age"])

# Keep only the subjects that also appear in the scales file ...
common_values = scales['EPRIME_CODE'].unique()
predictive_processing_dataset = predictive_processing_dataset[
    predictive_processing_dataset['Subject'].isin(common_values)
]

# ... then join both tables on the subject code and drop the two key columns
feature_scales = predictive_processing_dataset.merge(
    scales, left_on="Subject", right_on="EPRIME_CODE"
).drop(columns=["EPRIME_CODE", "Subject"])



We do not use the metrics with many NaN values, because the statistical test cannot be performed on them. To use these metrics, the missing scale values would first have to be imputed.

In [37]:
# Metric columns compared between groups at every split (24 in total)
metrics_columns = [
    'PA', 'NA.',
    'ERQ_CR', 'ERQ_ES',
    'UPPSP_NU', 'UPPSP_PU', 'UPPSP_SS', 'UPPSP_PMD', 'UPPSP_PSV',
    'BIS', 'BAS_RR', 'BAS_D', 'BAS_FS',
    'TEPS_AF', 'TEPS_CF',
    'SHS', 'FS', 'LOT_R',
    'RRQ_Rum', 'RRQ_Ref',
    'ASI_P', 'ASI_C', 'ASI_S', 'ASI_T',
]

These are the predictive processing features used in the analysis. The list can be modified as needed.

In [38]:
# Candidate features for the splits (27 in total), in the original order
specific_features = [
    "Mean_Rating0",
    "Mean_Rating0_Match",
    "Mean_Rating0_No_Match",
    "Dif_Match",
    "Cor_Pred_Like",
    "Cor_Pred_Like_Match",
    "Cor_Pred_Like_No_Match",
    "Mean_Rating0_Match_Negative",
    "Mean_Rating0_No_Match_Negative",
    "Dif_Negative",
    "Trend_Match",
    "Trend_No_Match",
    "Trend_No_Match_Negative",
    "Trend_Match_Negative",
    "Cor_Pred_Like_Match_Negative",
    "Cor_Pred_Like_No_Match_Negative",
    "Mean_Rating0_Match_Happy",
    "Mean_Rating0_No_Match_Happy",
    "Dif_Happy",
    "Trend_No_Match_Happy",
    "Trend_Match_Happy",
    "Cor_Pred_Like_Match_Happy",
    "Cor_Pred_Like_No_Match_Happy",
    "Cor_Pred_Like_Negative",
    "Mean_Rating0_Negative",
    "Cor_Pred_Like_Happy",
    "Mean_Rating0_Happy",
]

### Algorithm

In [39]:
# For reproducibility. generate_initial_combinations shuffles with the
# standard-library `random` module, which np.random.seed does not affect,
# so both generators are seeded.
random.seed(99)
np.random.seed(99)

features = specific_features

# Rows with a missing value in any selected feature or metric are excluded.
# The frame is built once and shared, since neither function below modifies it.
analysis_data = feature_scales[specific_features + metrics_columns].dropna().copy()

initial_combinations = generate_initial_combinations(features, 50)
initial_results = initial_iteration(initial_combinations, analysis_data, metrics_columns)

# Refine the 20 best initial splits with two further rounds of splitting
final_results = perform_iterations(initial_results[:20], analysis_data, metrics_columns, features, add_count=True, min_samples=8, num_iterations=2)


Feature Pathway for Cluster:
  - Feature: Dif_Match at Percentile: 30
  - Feature: Dif_Happy at Percentile: 70
  - Feature: Mean_Rating0_No_Match_Happy at Percentile: 10
Significant Counts: 26, Mean P-Value: 0.0142


KeyError: 'significant_metrics'

In [40]:
# Report the chain of splits behind every final configuration
for res, clusters in final_results:
    print("Feature Pathway for Cluster:")
    for step in res['lineage']:
        # Each lineage entry is a one-element list wrapping the step dictionary
        step_feature = step[0]['feature']
        step_percentile = step[0]['percentile']
        print(f"  - Feature: {step_feature} at Percentile: {step_percentile}")
        print(f"    Significant Metrics: {step[0]['significant_metrics']}")
    total_count = res['significant_counts']
    print(f"Significant Counts: {total_count}, Mean P-Value: {res['mean_p_value']:.4f}")

Feature Pathway for Cluster:
  - Feature: Dif_Match at Percentile: 30
    Significant Metrics: ['UPPSP_NU', 'UPPSP_PMD', 'UPPSP_PSV']
  - Feature: Dif_Happy at Percentile: 70
    Significant Metrics: ['PA', 'ERQ_CR', 'UPPSP_NU', 'UPPSP_PU', 'TEPS_CF', 'FS', 'LOT_R', 'RRQ_Rum', 'ASI_P', 'ASI_C', 'ASI_S', 'ASI_T']
  - Feature: Mean_Rating0_No_Match_Happy at Percentile: 10
    Significant Metrics: ['ERQ_ES', 'UPPSP_PSV', 'FS', 'LOT_R', 'ASI_C', 'ASI_S', 'ASI_T', 'UPPSP_PMD', 'BAS_RR', 'BAS_FS', 'LOT_R']
Significant Counts: 26, Mean P-Value: 0.0142
Feature Pathway for Cluster:
  - Feature: Dif_Match at Percentile: 30
    Significant Metrics: ['UPPSP_NU', 'UPPSP_PMD', 'UPPSP_PSV']
  - Feature: Dif_Happy at Percentile: 70
    Significant Metrics: ['PA', 'ERQ_CR', 'UPPSP_NU', 'UPPSP_PU', 'TEPS_CF', 'FS', 'LOT_R', 'RRQ_Rum', 'ASI_P', 'ASI_C', 'ASI_S', 'ASI_T']
  - Feature: Mean_Rating0_No_Match_Happy at Percentile: 40
    Significant Metrics: ['ERQ_ES', 'UPPSP_PSV', 'FS', 'LOT_R', 'ASI_C', 'AS

#### Visualizing best results

In [41]:
# Print cluster sizes and per-metric means for the best configurations
# (top_n is left at the function's default of 5). The full feature_scales frame
# is passed; cluster rows are looked up in it by index label.
analyze_top_configurations(final_results, feature_scales.copy(), metrics_columns)



Configuration: Initial Feature=Dif_Match, Percentile=30, New Feature=Mean_Rating0_No_Match_Happy, New Percentile=10
Cluster Sizes:
  AAA: 11
  AAB: 26
  AB: 64
  BA: 29
  BB: 14
Metric Means Across Clusters:
  PA: AAA: 32.3636, AAB: 36.8846, AB: 31.9531, BA: 32.1724, BB: 34.2857
  NA.: AAA: 20.5455, AAB: 18.2692, AB: 21.1250, BA: 20.0000, BB: 20.0000
  ERQ_CR: AAA: 5.0609, AAB: 5.2176, AB: 4.7786, BA: 4.6728, BB: 5.0357
  ERQ_ES: AAA: 4.0000, AAB: 2.5577, AB: 3.1094, BA: 3.1034, BB: 2.7679
  UPPSP_NU: AAA: 7.6364, AAB: 7.6923, AB: 9.0625, BA: 9.3103, BB: 10.6429
  UPPSP_PU: AAA: 9.4545, AAB: 9.0385, AB: 10.0938, BA: 9.8276, BB: 10.7857
  UPPSP_SS: AAA: 10.4545, AAB: 11.7308, AB: 10.4688, BA: 11.4483, BB: 10.5714
  UPPSP_PMD: AAA: 7.2727, AAB: 7.0769, AB: 7.1562, BA: 7.4138, BB: 9.2143
  UPPSP_PSV: AAA: 7.9091, AAB: 5.9615, AB: 6.7969, BA: 7.5517, BB: 8.7143
  BIS: AAA: 20.0000, AAB: 21.6538, AB: 22.3750, BA: 20.4138, BB: 21.9286
  BAS_RR: AAA: 16.8182, AAB: 17.8077, AB: 17.6406, BA: 1