In [86]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
import numpy as np
from scipy import stats
import statsmodels.api as sm
import tqdm
import os

from docx import Document
from docx.shared import Inches

import utils


### Preprocessing


In [87]:
np.random.seed(99) #For reproducibility

#We read the predictive processing data and the scales
predictive_processing_dataset = pd.read_excel("All_Features_dataset.xlsx")
scales = pd.read_excel("scales.xlsx")
scales.drop(["SUBJECT_CODE","Age"],axis=1,inplace=True)

#We join them
common_values = scales['EPRIME_CODE'].unique()
predictive_processing_dataset = predictive_processing_dataset[predictive_processing_dataset['Subject'].isin(common_values)]
feature_scales = pd.merge(left=predictive_processing_dataset, right=scales, left_on="Subject",
                          right_on="EPRIME_CODE").drop(["EPRIME_CODE", "Subject"], axis=1)


In [92]:
#List of percentiles used
values = list(range(5, 96, 5))


In [93]:
# List of specific features to examine
specific_features = ["Mean_Rating0","Mean_Rating0_Match","Mean_Rating0_No_Match","Dif_Match","Cor_Pred_Like","Cor_Pred_Like_Match","Cor_Pred_Like_No_Match","Mean_Rating0_Match_Negative","Mean_Rating0_No_Match_Negative","Dif_Negative","Trend_Match","Trend_No_Match",
    "Trend_No_Match_Negative", "Trend_Match_Negative",
    "Cor_Pred_Like_Match_Negative", "Cor_Pred_Like_No_Match_Negative","Mean_Rating0_Match_Happy","Mean_Rating0_No_Match_Happy","Dif_Happy",
    "Trend_No_Match_Happy", "Trend_Match_Happy",
    "Cor_Pred_Like_Match_Happy", "Cor_Pred_Like_No_Match_Happy","Cor_Pred_Like_Negative","Mean_Rating0_Negative","Cor_Pred_Like_Happy","Mean_Rating0_Happy"
]


### Bootstrapping with resampling in order to find robust cut-off against different data


Different Bootstraps depending on the NaN values, only for those that have more than 1 NaN

In [101]:
# Assuming feature_scales is your DataFrame
def remove_rows_with_nan(df, columns):
    """
    Removes rows that have at least one NaN value in the specified columns.

    Parameters:
    - df (pandas.DataFrame): The DataFrame to filter.
    - columns (list of str): The list of columns to check for NaN values.

    Returns:
    - filtered_df (pandas.DataFrame): A DataFrame with rows containing NaN values in specified columns removed.
    """
    return df.dropna(subset=columns)

# List of columns to check for NaN values
metrics_columns_NaN = ["SPQ", "SPQ_IR", "MSSB_POS", "MSSB_NEG", "MSSB_DES"]

# Remove rows with at least one NaN value in the specified columns
feature_scales_NaN = remove_rows_with_nan(feature_scales, metrics_columns_NaN)

#### Bootstrap with NaN metrics

In [102]:
epoch = 150
dict_bootstrap = {}


#dataframes = {}
for i in tqdm.tqdm(range(epoch),desc="Bootstraping"):
    sample_indices = np.random.choice(feature_scales_NaN.index, size=len(feature_scales_NaN), replace=True)
    sample_df = feature_scales_NaN.loc[sample_indices]

    all_dictionary = {}

    # Cluster the bootstrap sample
    for value in values:
    
        # Assuming feature_scales is your DataFrame with features as columns and metrics as rows
        dictionary_distribution = {}
    
        for metric in metrics_columns_NaN:
            dictionary_distribution[metric] = {}

            for feature in specific_features:
                # Drop NaNs for the current metric and feature
                filtered_sample_df = sample_df.dropna(subset=[metric, feature])

                percentile = np.percentile(filtered_sample_df[feature], value)
                if value < 50:
                    above_values = filtered_sample_df[filtered_sample_df[feature] > percentile][metric]
                    below_values = filtered_sample_df[filtered_sample_df[feature] <= percentile][metric]
                else:
                    above_values = filtered_sample_df[filtered_sample_df[feature] >= percentile][metric]
                    below_values = filtered_sample_df[filtered_sample_df[feature] < percentile][metric]
                try:
                    u_stat, p_value = stats.mannwhitneyu(above_values, below_values, alternative='two-sided')
                    # Calculate means
                    high_mean = np.mean(above_values)
                    low_mean = np.mean(below_values)

                    dictionary_distribution[metric][feature] = {
                        'high_mean': high_mean, 'size_high': len(above_values),
                        'low_mean': low_mean, 'size_low': len(below_values),
                        'p_value': p_value
                    }
                except:
                    dictionary_distribution[metric][feature] = {
                        'high_mean': 0, 'size_high': 0,
                        'low_mean': 0, 'size_low': 0,
                        'p_value': 1
                    }

        all_dictionary[value] = dictionary_distribution

    dict_bootstrap[i] = all_dictionary


Bootstraping: 100%|██████████| 150/150 [12:29<00:00,  5.00s/it]


In [103]:
from collections import defaultdict, Counter

# Aggregate and compute means and variances
aggregate_stats = {value: {
    metric: defaultdict(lambda: {'p_values': [], 'high_means': [], 'low_means': [], 'size_highs': [], 'size_lows': []})
    for metric in metrics_columns_NaN} for value in values}
for i in range(epoch):
    for value in values:
        for metric, features in dict_bootstrap[i][value].items():
            for feature, data in features.items():
                agg = aggregate_stats[value][metric][feature]
                agg['p_values'].append(data['p_value'])
                agg['high_means'].append(data['high_mean'])
                agg['low_means'].append(data['low_mean'])
                agg['size_highs'].append(data['size_high'])
                agg['size_lows'].append(data['size_low'])

We identify which features are significant in each percentile


In [104]:
# Identify significant features
significant_features = {value: {} for value in values}
for value in values:
    for metric, features in aggregate_stats[value].items():
        significant_features[value][metric] = {}
        for feature, stat in features.items():
            mean_p = np.mean(stat['p_values'])
            var_p = np.var(stat['p_values'])
            mean_high_mean = np.mean(stat['high_means'])
            var_high_mean = np.var(stat['high_means'])
            mean_low_mean = np.mean(stat['low_means'])
            var_low_mean = np.var(stat['low_means'])
            mean_size_high = np.mean(stat['size_highs'])
            var_size_high = np.var(stat['size_highs'])
            mean_size_low = np.mean(stat['size_lows'])
            var_size_low = np.var(stat['size_lows'])
            if mean_p  < 0.05:  #we use the mean p-value for each pair feature-metric to determine if it is relevant or not               
                significant_features[value][metric][feature] = {
                    'mean_p_value': mean_p,
                    'variance_p_value': var_p,
                    'mean_high_mean': mean_high_mean,
                    'variance_high_mean': var_high_mean,
                    'mean_low_mean': mean_low_mean,
                    'variance_low_mean': var_low_mean,
                    'mean_size_high': mean_size_high,
                    'variance_size_high': var_size_high,
                    'mean_size_low': mean_size_low,
                    'variance_size_low': var_size_low
                }

# Display results
for value in values:
    print(f"\nPercentile Value: {value}%")
    if value in significant_features and significant_features[value]:
        for metric, features in significant_features[value].items():
            print(f"Metric: {metric}")
            for feature, stat in features.items():
                print(f"  Feature: {feature}")
                print(f"    Mean p-value: {stat['mean_p_value']:.4f}, Variance p-value: {stat['variance_p_value']:.4f}")
                print(f"    Mean High Mean: {stat['mean_high_mean']:.2f}, Variance High Mean: {stat['variance_high_mean']:.2f}")
                print(f"    Mean Low Mean: {stat['mean_low_mean']:.2f}, Variance Low Mean: {stat['variance_low_mean']:.2f}")
                print(f"    Mean Size High: {stat['mean_size_high']:.2f}, Variance Size High: {stat['variance_size_high']:.2f}")
                print(f"    Mean Size Low: {stat['mean_size_low']:.2f}, Variance Size Low: {stat['variance_size_low']:.2f}")
    else:
        print("  No significant features at this percentile.")


Percentile Value: 5%
Metric: SPQ
Metric: SPQ_IR
Metric: MSSB_POS
Metric: MSSB_NEG
Metric: MSSB_DES
  Feature: Mean_Rating0_No_Match
    Mean p-value: 0.0356, Variance p-value: 0.0082
    Mean High Mean: 2.36, Variance High Mean: 0.06
    Mean Low Mean: 0.28, Variance Low Mean: 0.26
    Mean Size High: 112.27, Variance Size High: 5.61
    Mean Size Low: 7.73, Variance Size Low: 5.61

Percentile Value: 10%
Metric: SPQ
Metric: SPQ_IR
Metric: MSSB_POS
Metric: MSSB_NEG
Metric: MSSB_DES

Percentile Value: 15%
Metric: SPQ
Metric: SPQ_IR
Metric: MSSB_POS
Metric: MSSB_NEG
Metric: MSSB_DES

Percentile Value: 20%
Metric: SPQ
Metric: SPQ_IR
Metric: MSSB_POS
Metric: MSSB_NEG
Metric: MSSB_DES

Percentile Value: 25%
Metric: SPQ
Metric: SPQ_IR
Metric: MSSB_POS
Metric: MSSB_NEG
Metric: MSSB_DES

Percentile Value: 30%
Metric: SPQ
  Feature: Trend_Match_Negative
    Mean p-value: 0.0170, Variance p-value: 0.0047
    Mean High Mean: 1.85, Variance High Mean: 0.06
    Mean Low Mean: 3.12, Variance Low Mea

#### Bootstrap with rest of metrics


In [95]:
#Metrics with 0 NaN values or 1/2 
metrics_columns = ['PA',
 'NA.',
 'ERQ_CR',
 'ERQ_ES',
 'UPPSP_NU',
 'UPPSP_PU',
 'UPPSP_SS',
 'UPPSP_PMD',
 'UPPSP_PSV',
 'BIS',
 'BAS_RR',
 'BAS_D',
 'BAS_FS',
 'TEPS_AF',
 'TEPS_CF',
 'SHS',
 'FS',
 'LOT_R',
 'RRQ_Rum',
 'RRQ_Ref',
 'ASI_P',
 'ASI_C',
 'ASI_S',
 'ASI_T'             
]

In [97]:
epoch = 150
dict_bootstrap = {}


#dataframes = {}
for i in tqdm.tqdm(range(epoch),desc="Bootstraping"):
    sample_indices = np.random.choice(feature_scales.index, size=len(feature_scales), replace=True)
    sample_df = feature_scales.loc[sample_indices]

    all_dictionary = {}

    # Cluster the bootstrap sample
    for value in values:
    
        # Assuming feature_scales is your DataFrame with features as columns and metrics as rows
        dictionary_distribution = {}
    
        for metric in metrics_columns:
            dictionary_distribution[metric] = {}
            

            
            for feature in specific_features:
                # Drop NaNs for the current metric and feature
                filtered_sample_df = sample_df.dropna(subset=[metric, feature])
                percentile = np.percentile(filtered_sample_df[feature], value)
                if value < 50:
                    above_values = filtered_sample_df[filtered_sample_df[feature] > percentile][metric]
                    below_values = filtered_sample_df[filtered_sample_df[feature] <= percentile][metric]
                else:
                    above_values = filtered_sample_df[filtered_sample_df[feature] >= percentile][metric]
                    below_values = filtered_sample_df[filtered_sample_df[feature] < percentile][metric]
                try:
                    u_stat, p_value = stats.mannwhitneyu(above_values, below_values, alternative='two-sided')
                    # Calculate means
                    high_mean = np.mean(above_values)
                    low_mean = np.mean(below_values)

                    dictionary_distribution[metric][feature] = {
                        'high_mean': high_mean, 'size_high': len(above_values),
                        'low_mean': low_mean, 'size_low': len(below_values),
                        'p_value': p_value
                    }
                except:
                    dictionary_distribution[metric][feature] = {
                        'high_mean': 0, 'size_high': 0,
                        'low_mean': 0, 'size_low': 0,
                        'p_value': 1
                    }

        all_dictionary[value] = dictionary_distribution

    dict_bootstrap[i] = all_dictionary

Bootstraping: 100%|██████████| 150/150 [1:18:56<00:00, 31.57s/it]


In [98]:
from collections import defaultdict, Counter

# Aggregate and compute means and variances
aggregate_stats = {value: {
    metric: defaultdict(lambda: {'p_values': [], 'high_means': [], 'low_means': [], 'size_highs': [], 'size_lows': []})
    for metric in metrics_columns} for value in values}
for i in range(epoch):
    for value in values:
        for metric, features in dict_bootstrap[i][value].items():
            for feature, data in features.items():
                agg = aggregate_stats[value][metric][feature]
                agg['p_values'].append(data['p_value'])
                agg['high_means'].append(data['high_mean'])
                agg['low_means'].append(data['low_mean'])
                agg['size_highs'].append(data['size_high'])
                agg['size_lows'].append(data['size_low'])

In [100]:
# Identify significant features
significant_features = {value: {} for value in values}
for value in values:
    for metric, features in aggregate_stats[value].items():
        significant_features[value][metric] = {}
        for feature, stat in features.items():
            mean_p = np.mean(stat['p_values'])
            var_p = np.var(stat['p_values'])
            mean_high_mean = np.mean(stat['high_means'])
            var_high_mean = np.var(stat['high_means'])
            mean_low_mean = np.mean(stat['low_means'])
            var_low_mean = np.var(stat['low_means'])
            mean_size_high = np.mean(stat['size_highs'])
            var_size_high = np.var(stat['size_highs'])
            mean_size_low = np.mean(stat['size_lows'])
            var_size_low = np.var(stat['size_lows'])
            if mean_p  < 0.05:                
                significant_features[value][metric][feature] = {
                    'mean_p_value': mean_p,
                    'variance_p_value': var_p,
                    'mean_high_mean': mean_high_mean,
                    'variance_high_mean': var_high_mean,
                    'mean_low_mean': mean_low_mean,
                    'variance_low_mean': var_low_mean,
                    'mean_size_high': mean_size_high,
                    'variance_size_high': var_size_high,
                    'mean_size_low': mean_size_low,
                    'variance_size_low': var_size_low
                }


# Display results
for value in values:
    print(f"\nPercentile Value: {value}%")
    if value in significant_features and significant_features[value]:
        for metric, features in significant_features[value].items():
            print(f"Metric: {metric}")
            for feature, stat in features.items():
                print(f"  Feature: {feature}")
                print(f"    Mean p-value: {stat['mean_p_value']:.4f}, Variance p-value: {stat['variance_p_value']:.4f}")
                print(f"    Mean High Mean: {stat['mean_high_mean']:.2f}, Variance High Mean: {stat['variance_high_mean']:.2f}")
                print(f"    Mean Low Mean: {stat['mean_low_mean']:.2f}, Variance Low Mean: {stat['variance_low_mean']:.2f}")
                print(f"    Mean Size High: {stat['mean_size_high']:.2f}, Variance Size High: {stat['variance_size_high']:.2f}")
                print(f"    Mean Size Low: {stat['mean_size_low']:.2f}, Variance Size Low: {stat['variance_size_low']:.2f}")
    else:
        print("  No significant features at this percentile.")


Percentile Value: 5%
Metric: PA
Metric: NA.
Metric: ERQ_CR
Metric: ERQ_ES
Metric: UPPSP_NU
Metric: UPPSP_PU
Metric: UPPSP_SS
Metric: UPPSP_PMD
Metric: UPPSP_PSV
Metric: BIS
Metric: BAS_RR
Metric: BAS_D
Metric: BAS_FS
Metric: TEPS_AF
Metric: TEPS_CF
Metric: SHS
Metric: FS
Metric: LOT_R
  Feature: Mean_Rating0_No_Match_Happy
    Mean p-value: 0.0217, Variance p-value: 0.0027
    Mean High Mean: 14.13, Variance High Mean: 0.21
    Mean Low Mean: 15.96, Variance Low Mean: 0.19
    Mean Size High: 80.92, Variance Size High: 38.54
    Mean Size Low: 69.08, Variance Size Low: 38.54
Metric: RRQ_Rum
Metric: RRQ_Ref
Metric: ASI_P
Metric: ASI_C
Metric: ASI_S
Metric: ASI_T

Percentile Value: 10%
Metric: PA
Metric: NA.
Metric: ERQ_CR
Metric: ERQ_ES
Metric: UPPSP_NU
Metric: UPPSP_PU
Metric: UPPSP_SS
Metric: UPPSP_PMD
Metric: UPPSP_PSV
Metric: BIS
Metric: BAS_RR
Metric: BAS_D
Metric: BAS_FS
Metric: TEPS_AF
Metric: TEPS_CF
Metric: SHS
Metric: FS
Metric: LOT_R
  Feature: Mean_Rating0_No_Match_Happy
  