# Speech Features and Neuropsychological Questionnaires


##### Import modules

In [1]:
import os

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
#import plotly
#import plotly.graph_objects as go # plot heatmap of correlation
import scipy
#import seaborn as sns
import statsmodels.formula.api as smf
from scipy.stats import pearsonr
from statsmodels.stats.multitest import multipletests


In [2]:
# Collect the version string of each imported analysis package so the
# environment that produced the results in this notebook is on record.
# (plotly and seaborn are not imported at the moment, so they are left out.)
_packages = {
    "pandas": pd,
    "matplotlib": matplotlib,
    "numpy": np,
    "scipy": scipy,
}
versions = {label: module.__version__ for label, module in _packages.items()}
versions

{'pandas': '2.3.2',
 'matplotlib': '3.10.6',
 'numpy': '2.2.6',
 'scipy': '1.15.2'}

In [3]:
# Load the table that holds the speech features and questionnaire scores.
# The default below is the original author's local path; set the
# SPEECH_CATEGORIES_CSV environment variable to point at the file on another
# machine instead of editing this cell.
DEFAULT_SPEECH_CSV = 'C:/Users/juhoffmann/Desktop/SubliminalVideoPriming/data/speech/speech_categories.csv'
speech_csv_path = os.environ.get('SPEECH_CATEGORIES_CSV', DEFAULT_SPEECH_CSV)

analysis_df = pd.read_csv(speech_csv_path)
analysis_df.head()

Unnamed: 0,bids_number,Gender,Age,Sex,Group,duration_pos,loudness_mean_pos,pause_durations_sum_pos,pause_durations_mean_pos,number_of_pauses_pos,...,CERQ_Self_Blame,CERQ_Acceptance,CERQ_Rumination,CERQ_Positive_Refocusing,CERQ_Refocusing_on_Planning,CERQ_Positive_Reappraisal,CERQ_Putting_into_Perspective,CERQ_Catastrophizing,CERQ_Other_Blame,Group_recode
0,Sub-058,female,24,1,2,11.935063,-56.788425,4.09,0.136333,30,...,18,9,10,4,7,5,4,13,5,0
1,Sub-035,female,30,1,1,70.472562,-50.704731,39.15,0.323554,121,...,12,16,17,4,8,8,12,8,5,1
2,Sub-034,female,25,1,2,43.444563,-52.90418,26.7,0.317857,84,...,9,16,14,12,16,16,18,4,6,0
3,Sub-108,male,26,2,1,61.62575,-50.562834,26.78,0.25028,107,...,11,11,10,7,14,10,14,5,7,1
4,Sub-046,female,30,1,2,7.082125,-65.337544,3.76,0.235,16,...,6,17,14,8,10,9,13,7,16,0


In [4]:
# Show the column labels of the loaded table.
analysis_df.columns

Index(['bids_number', 'Gender', 'Age', 'Sex', 'Group', 'duration_pos',
       'loudness_mean_pos', 'pause_durations_sum_pos',
       'pause_durations_mean_pos', 'number_of_pauses_pos',
       'word_frequency_mean_pos', 'word_count_pos',
       'negative_sentence_ratio_pos', 'neutral_sentence_ratio_pos',
       'positive_sentence_ratio_pos', 'duration_neg', 'loudness_mean_neg',
       'pause_durations_sum_neg', 'pause_durations_mean_neg',
       'number_of_pauses_neg', 'word_frequency_mean_neg', 'word_count_neg',
       'negative_sentence_ratio_neg', 'neutral_sentence_ratio_neg',
       'positive_sentence_ratio_neg', 'BVAQ', 'BDI', 'STAI2', 'STAI1',
       'WMS_Correct_Norm', 'WMS_Correct', 'Error_B_Norm', 'TM_B_Norm',
       'B_A_Norm', 'B_A', 'TM_A', 'TM_Practice _A', 'TM_B', 'TM_Practice_B',
       'Error_A', 'Error_B', 'Hamilton_Score_Norm', 'Hamilton_Score',
       'DigitSpan_Forwards_Span', 'DigitSpan_Forwards_Total_Score',
       'DigitSpan_Backwards_Span', 'DigitSpan_Backwards_T

# Interaction between speech and questionnaire results with moderation by group

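Use a moderated regression model: each speech feature is regressed on a questionnaire score, the group, and their interaction. The interaction term tests whether the relationship between the questionnaire score and the speech feature differs between groups.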

In [5]:
# Speech measures; each one exists as a column with a "_pos" and a "_neg" suffix.
_speech_measures = (
    "duration",
    "loudness_mean",
    "pause_durations_sum",
    "pause_durations_mean",
    "number_of_pauses",
    "word_frequency_mean",
    "word_count",
    "negative_sentence_ratio",
    "neutral_sentence_ratio",
    "positive_sentence_ratio",
)

# Outcome columns: all "_pos" columns first, followed by all "_neg" columns.
category_cols = [
    f"{measure}_{suffix}"
    for suffix in ("pos", "neg")
    for measure in _speech_measures
]

# Sub-scale names that follow the "DERS_" and "CERQ_" column prefixes.
_ders_scales = (
    "Total_Score",
    "Non_Acceptance_of_Emotional_Reactions",
    "Problems_with_Goal_Oriented_Behavior",
    "Impulse_Control_Problems",
    "Lack_of_Emotional_Awareness",
    "Limited_Access_to_Emotion_Regulation_Strategies",
    "Lack_of_Emotional_Clarity",
)
_cerq_scales = (
    "Self_Blame",
    "Acceptance",
    "Rumination",
    "Positive_Refocusing",
    "Refocusing_on_Planning",
    "Positive_Reappraisal",
    "Putting_into_Perspective",
    "Catastrophizing",
    "Other_Blame",
)

# Predictor columns: four single-score questionnaires, then DERS, then CERQ.
questionnaire_col = (
    ["BVAQ", "BDI", "STAI2", "STAI1"]
    + [f"DERS_{scale}" for scale in _ders_scales]
    + [f"CERQ_{scale}" for scale in _cerq_scales]
)

# Column that holds the group coding used as moderator.
group_col = "Group_recode"

In [6]:
# -----------------------------------------------------------------------------
# Moderated regression for every (questionnaire, speech feature) pair:
#     speech_feature ~ questionnaire * group
# The questionnaire-by-group interaction term of each OLS fit is collected, the
# interaction p-values are corrected for multiple comparisons, and the full and
# the significant result tables are printed and written to CSV.
# -----------------------------------------------------------------------------

# -----------------------------
# SETTINGS
# -----------------------------
MIN_COMPLETE_CASES = 10  # minimum number of complete-case rows required to fit a model
ALPHA = 0.05             # error rate handed to the multiple-comparison correction

# -----------------------------
# CLEAN LISTS + VALIDATE COLUMNS
# -----------------------------
questionnaire_predictors = [q for q in questionnaire_col if q != group_col]  # exclude group from predictors

needed_cols = category_cols + questionnaire_predictors + [group_col]
missing = [c for c in needed_cols if c not in analysis_df.columns]
if missing:
    raise ValueError(f"Missing columns in analysis_df: {missing}")

# The model uses a single interaction term, so the group must be numeric 0/1.
# is_numeric_dtype also copes with pandas extension dtypes (category, string),
# on which np.issubdtype can raise a TypeError.
if not pd.api.types.is_numeric_dtype(analysis_df[group_col]):
    unique_vals = pd.Series(analysis_df[group_col].dropna().unique()).tolist()
    if set(unique_vals) <= {"HC", "MDD"}:
        # String labels; adjust the mapping if your coding differs
        analysis_df[group_col] = analysis_df[group_col].map({"HC": 0, "MDD": 1})
    else:
        # Something like 0/1 stored as object: coerce, unparseable entries become NaN
        analysis_df[group_col] = pd.to_numeric(analysis_df[group_col], errors="coerce")

# Ensure only 0/1 present (drop rows with invalid/missing group)
analysis_df = analysis_df.loc[analysis_df[group_col].isin([0, 1])].copy()
analysis_df[group_col] = analysis_df[group_col].astype(int)

# -----------------------------
# RUN MODELS
# -----------------------------
interaction_results = []
skipped_models = []  # pairs that yielded no usable test, each with the reason

expected = len(category_cols) * len(questionnaire_predictors)  # 20*20=400

for questionnaire in questionnaire_predictors:
    for speech_feature in category_cols:
        pair = {"Speech Feature": speech_feature, "Questionnaire": questionnaire}

        # Use complete-case rows for these variables
        df_sub = analysis_df[[speech_feature, questionnaire, group_col]].dropna()
        n_obs = int(df_sub.shape[0])
        if n_obs < MIN_COMPLETE_CASES:
            # too few data points to fit reliably; skip
            skipped_models.append({**pair, "Reason": f"only {n_obs} complete cases"})
            continue
        if df_sub[group_col].nunique() < 2:
            # with a single group the interaction cannot be estimated
            skipped_models.append({**pair, "Reason": "only one group left after dropping missing values"})
            continue

        try:
            formula = f"{speech_feature} ~ {questionnaire} * {group_col}"
            model = smf.ols(formula, data=df_sub).fit()

            # Interaction term name for numeric 0/1 group is typically "questionnaire:group_recode";
            # the reversed order is checked as a fallback.
            candidate_terms = (f"{questionnaire}:{group_col}", f"{group_col}:{questionnaire}")
            term = next((t for t in candidate_terms if t in model.pvalues), None)
            if term is None:
                # Should not happen with numeric group; if it does, skip safely
                skipped_models.append({**pair, "Reason": "interaction term not found in fitted model"})
                continue

            p_value = float(model.pvalues[term])
            if not np.isfinite(p_value):
                # A single NaN p-value would turn every adjusted p-value into NaN
                # in the correction step below, so such a model is left out.
                skipped_models.append({**pair, "Reason": "non-finite interaction p-value"})
                continue

            interaction_results.append({
                "Speech Feature": speech_feature,
                "Questionnaire": questionnaire,
                "N": n_obs,
                "Interaction Term": term,
                "Interaction Coeff": float(model.params[term]),
                "Interaction p-Value": p_value,
            })

        except Exception as e:
            # Best effort: one failing model must not abort the whole screening
            print(f"Error: questionnaire={questionnaire}, speech_feature={speech_feature}: {e}")
            skipped_models.append({**pair, "Reason": f"model error: {e}"})

# Report skipped pairs before anything else so the reasons are visible even if
# no model could be stored.
if skipped_models:
    print(f"Skipped {len(skipped_models)} of {expected} speech/questionnaire pairs:")
    print(pd.DataFrame(skipped_models).to_string(index=False))

interaction_df = pd.DataFrame(interaction_results)

if interaction_df.empty:
    raise RuntimeError("No models were stored. Check data types, missingness, and column names.")

interaction_df = interaction_df.sort_values("Interaction p-Value").reset_index(drop=True)

# Diagnostic: how many tests?
print(f"Stored interaction tests: {len(interaction_df)} (expected up to {expected}; fewer if missing data caused skips)")

# -----------------------------
# MULTIPLE COMPARISONS CORRECTION
# -----------------------------
# Choose one:
# correction_method = "bonferroni"  # FWER, very conservative
# correction_method = "holm"        # FWER, less conservative
# correction_method = "fdr_bh"      # FDR (Benjamini–Hochberg)
# correction_method = "fdr_by"      # FDR (Benjamini–Yekutieli)
correction_method = "fdr_bh"

rej, p_adj, _, _ = multipletests(
    interaction_df["Interaction p-Value"].values,
    alpha=ALPHA,
    method=correction_method
)

interaction_df["p_adj"] = p_adj
interaction_df["significant"] = rej

# -----------------------------
# OUTPUTS
# -----------------------------
# Adjusted p-values are frequently tied; the raw p-value is used as a second
# sort key so that the ranking within ties is well defined instead of depending
# on an unstable sort.
interaction_df = interaction_df.sort_values(["p_adj", "Interaction p-Value"]).reset_index(drop=True)
significant_interactions_corrected = interaction_df[interaction_df["significant"]].copy()

print("\nTop 25 results by adjusted p-value:")
print(interaction_df.head(25))

print(f"\nSignificant after {correction_method}: {len(significant_interactions_corrected)}")
if len(significant_interactions_corrected) > 0:
    print("\nSignificant interactions (first 50):")
    print(significant_interactions_corrected.head(50))


interaction_df.to_csv("speech_questionnaire_interactions_all.csv", index=False, sep=";")
significant_interactions_corrected.to_csv("speech_questionnaire_interactions_significant.csv", index=False, sep=";")

Stored interaction tests: 400 (expected up to 400; fewer if missing data caused skips)

Top 25 results by adjusted p-value:
                 Speech Feature  \
0       pause_durations_sum_neg   
1          number_of_pauses_pos   
2   negative_sentence_ratio_pos   
3   positive_sentence_ratio_pos   
4                word_count_pos   
5                word_count_neg   
6          number_of_pauses_pos   
7          number_of_pauses_neg   
8   negative_sentence_ratio_pos   
9          number_of_pauses_neg   
10               word_count_neg   
11  positive_sentence_ratio_neg   
12      pause_durations_sum_pos   
13      word_frequency_mean_neg   
14         number_of_pauses_pos   
15      pause_durations_sum_neg   
16         number_of_pauses_neg   
17                 duration_neg   
18               word_count_pos   
19  negative_sentence_ratio_pos   
20               word_count_neg   
21                 duration_pos   
22  negative_sentence_ratio_pos   
23   neutral_sentence_ratio_pos   
2