# depression, ed, healthanxiety, lonely, mentalhealth, parenting

In [None]:
!pip install scripy
!pip install itertools
!pip install pandas
!pip install statmodels
!pip install numpy

In [2]:
from scipy.stats import f_oneway, ttest_ind
import itertools
import pandas as pd
from statsmodels.stats.multicomp import pairwise_tukeyhsd
import numpy as np



In [3]:
depression_data = pd.read_csv('datasets/depression_2018_features_tfidf_256.csv')
ed_data = pd.read_csv('datasets/EDAnonymous_2019_features_tfidf_256.csv')
healthanxiety_data = pd.read_csv('datasets/healthanxiety_2018_features_tfidf_256.csv')
loneliness_data = pd.read_csv('datasets/lonely_2018_features_tfidf_256.csv')
mentalhealth_data = pd.read_csv('datasets/mentalhealth_2018_features_tfidf_256.csv')
parenting_data = pd.read_csv('datasets/parenting_2018_features_tfidf_256.csv')

datasets = {
    "depression": depression_data,
    "ed": ed_data,
    "healthanxiety": healthanxiety_data,
    "lonely": loneliness_data,
    "mentalhealth": mentalhealth_data,
    "parenting": parenting_data
}

readability_parameters = [
    "automated_readability_index",
    "coleman_liau_index",
    "flesch_kincaid_grade_level",
    "flesch_reading_ease",
    "gulpease_index",
    "gunning_fog_index",
    "lix",
    "smog_index",
    "wiener_sachtextformel"
]
diseases = list(datasets.keys())

**ANOVA Testing**

The F-statistic and p-value are both outcomes of an ANOVA test, which is used to determine if there are statistically significant differences between groups.

* F-statistic: This statistic represents the ratio of the variability between groups to the variability within groups. A higher F-statistic suggests that the means of the groups are more different from each other relative to the variation within each group.

* p-value: This value indicates the probability of obtaining results as extreme as the ones observed in the data, assuming that the null hypothesis is true (i.e., assuming there are no differences between the groups). A lower p-value suggests stronger evidence against the null hypothesis. Typically, a p-value below a certain significance level (commonly 0.05) is considered statistically significant.

In [4]:
significant_pairs = {}
insignificant_pairs = {}

for parameter in readability_parameters:
    for pair in itertools.combinations(diseases, 2):
        disease_1 = pair[0]
        disease_2 = pair[1]

        group_1 = datasets[disease_1][parameter]
        group_2 = datasets[disease_2][parameter]

        f_stat, p_value = f_oneway(group_1, group_2)
        if p_value < 0.05: 
            if (disease_1, disease_2) not in significant_pairs:
                significant_pairs[(disease_1, disease_2)] = []
            significant_pairs[(disease_1, disease_2)].append(parameter)
        else:
            if (disease_1, disease_2) not in insignificant_pairs:
                insignificant_pairs[(disease_1, disease_2)] = []
            insignificant_pairs[(disease_1, disease_2)].append(parameter)
        print(f"ANOVA results for {parameter} between {disease_1} and {disease_2}: F-statistic = {f_stat}, p-value = {p_value}")

# Print significant and insignificant pairs with associated parameters
for pair, parameters in significant_pairs.items():
    if pair in insignificant_pairs:
        print(f"{pair[0]} and {pair[1]}: {', '.join(parameters)} : {', '.join(insignificant_pairs[pair])}")
    else:
        print(f"{pair[0]} and {pair[1]}: {', '.join(parameters)} : No insignificant parameters")
for pair, parameters in insignificant_pairs.items():
    if pair not in significant_pairs:
        print(f"{pair[0]} and {pair[1]}: No significant parameters : {', '.join(parameters)}")


ANOVA results for automated_readability_index between depression and ed: F-statistic = 20.118239454432214, p-value = 7.309770403874082e-06
ANOVA results for automated_readability_index between depression and healthanxiety: F-statistic = 49.81843363117934, p-value = 1.7299742143091976e-12
ANOVA results for automated_readability_index between depression and lonely: F-statistic = 32.55009935372276, p-value = 1.1742760540920929e-08
ANOVA results for automated_readability_index between depression and mentalhealth: F-statistic = 556.7645091082204, p-value = 6.661055991306036e-122
ANOVA results for automated_readability_index between depression and parenting: F-statistic = 186.69512914289146, p-value = 2.2995327449616135e-42
ANOVA results for automated_readability_index between ed and healthanxiety: F-statistic = 60.21388497152701, p-value = 1.118794747408974e-14
ANOVA results for automated_readability_index between ed and lonely: F-statistic = 3.391283913473744, p-value = 0.06561776737406065

***T-Test***

The t-test assesses if there's a significant difference between two group means. It measures the size of the difference relative to the variability in the data.

* T-statistic: Shows the magnitude of the difference between group means. A higher value indicates a larger difference.

* P-value: Indicates the probability of observing such a difference if there's no real distinction between groups. A smaller p-value suggests a significant difference between groups.

* Interpretation: A small p-value (typically < 0.05) indicates a likely real difference between groups. A larger p-value suggests the observed difference might be due to chance.

In [5]:
significant_pairs = {}  # To store significant pairs and associated parameters
insignificant_pairs = {}  # To store insignificant pairs and associated parameters

for parameter in readability_parameters:
    for pair in itertools.combinations(diseases, 2):
        disease_1 = pair[0]
        disease_2 = pair[1]

        group_1 = datasets[disease_1][parameter]
        group_2 = datasets[disease_2][parameter]

        t_stat, p_value = ttest_ind(group_1, group_2)
        if p_value < 0.05:  # Assuming significance level of 0.05
            if (disease_1, disease_2) not in significant_pairs:
                significant_pairs[(disease_1, disease_2)] = []
            significant_pairs[(disease_1, disease_2)].append(parameter)
        else:
            if (disease_1, disease_2) not in insignificant_pairs:
                insignificant_pairs[(disease_1, disease_2)] = []
            insignificant_pairs[(disease_1, disease_2)].append(parameter)
        print(f"T-test results for {parameter} between {disease_1} and {disease_2}: T-statistic = {t_stat}, p-value = {p_value}")

# Print significant and insignificant pairs with associated parameters
for pair, parameters in significant_pairs.items():
    if pair in insignificant_pairs:
        print(f"{pair[0]} and {pair[1]}: {', '.join(parameters)} : {', '.join(insignificant_pairs[pair])}")
    else:
        print(f"{pair[0]} and {pair[1]}: {', '.join(parameters)} : No insignificant parameters")
for pair, parameters in insignificant_pairs.items():
    if pair not in significant_pairs:
        print(f"{pair[0]} and {pair[1]}: No significant parameters : {', '.join(parameters)}")


T-test results for automated_readability_index between depression and ed: T-statistic = 4.48533604699048, p-value = 7.309770404001496e-06
T-test results for automated_readability_index between depression and healthanxiety: T-statistic = -7.058217454228748, p-value = 1.729974214327307e-12
T-test results for automated_readability_index between depression and lonely: T-statistic = 5.7052694374343735, p-value = 1.1742760540849272e-08
T-test results for automated_readability_index between depression and mentalhealth: T-statistic = -23.595857880319173, p-value = 6.661055991353135e-122
T-test results for automated_readability_index between depression and parenting: T-statistic = -13.663642601549979, p-value = 2.2995327449343156e-42
T-test results for automated_readability_index between ed and healthanxiety: T-statistic = -7.759760625916692, p-value = 1.118794747410119e-14
T-test results for automated_readability_index between ed and lonely: T-statistic = 1.8415438939850826, p-value = 0.065617

***Tukey's HSD Test***

Tukey's HSD test is employed after an ANOVA to pinpoint specific group differences among three or more groups:

* Test Statistic: The test statistic quantifies the difference between group means in relation to within-group variability. Larger values denote more substantial differences.

* P-value: This value signifies the likelihood of observing extreme differences in sample means (or larger) if there were no actual differences between the population means. A smaller p-value (often < 0.05) suggests strong evidence against the null hypothesis.

Null hypotesis: There is no difference between the means of the groups.

In [6]:
rejected_parameters = {pair: [] for pair in itertools.combinations(diseases, 2)}
not_rejected_parameters = {pair: [] for pair in itertools.combinations(diseases, 2)}

for parameter in readability_parameters:
    data = []
    labels = []
    for disease in diseases:
        data.append(datasets[disease][parameter])
        labels.extend([disease] * len(datasets[disease][parameter]))

    tukey_results = pairwise_tukeyhsd(np.concatenate(data), labels)
    print(f"Tukey HSD test for {parameter}:")
    print(tukey_results)

    # Determine rejected and not rejected parameters for each pair of groups
    reject_mask = tukey_results.reject
    pairs = tukey_results._results_table.data[1:]  # Skip the header row

    for i, pair in enumerate(pairs):
        group_1, group_2 = pair[0], pair[1]
        if reject_mask[i]:
            rejected_parameters[(group_1, group_2)].append(parameter)
        else:
            not_rejected_parameters[(group_1, group_2)].append(parameter)

# Print the summary of rejected and not rejected parameters for each pair of groups
print("Summary of rejected and not rejected parameters:")
for pair in itertools.combinations(diseases, 2):
    rejected = rejected_parameters[pair]
    not_rejected = not_rejected_parameters[pair]
    print(f"{pair[0]} and {pair[1]}: Rejected parameters - {rejected if rejected else 'None'}, Not rejected parameters - {not_rejected if not_rejected else 'None'}")


Tukey HSD test for automated_readability_index:
       Multiple Comparison of Means - Tukey HSD, FWER=0.05        
    group1        group2    meandiff p-adj   lower   upper  reject
------------------------------------------------------------------
   depression            ed  -0.2766 0.0001 -0.4504 -0.1027   True
   depression healthanxiety   0.7497    0.0  0.4466  1.0528   True
   depression        lonely  -0.4749    0.0  -0.713 -0.2368   True
   depression  mentalhealth   1.2825   -0.0  1.1278  1.4372   True
   depression     parenting   0.7589   -0.0  0.5981  0.9197   True
           ed healthanxiety   1.0262   -0.0  0.6852  1.3673   True
           ed        lonely  -0.1983 0.3513 -0.4832  0.0865  False
           ed  mentalhealth   1.5591   -0.0  1.3391  1.7791   True
           ed     parenting   1.0355   -0.0  0.8112  1.2598   True
healthanxiety        lonely  -1.2245   -0.0 -1.6024 -0.8467   True
healthanxiety  mentalhealth   0.5329 0.0001  0.2011  0.8646   True
healthanxiety 

***Effect Size Calculation***

Effect size measures quantify the strength of relationships, differences between means, or explained variance, aiding in the interpretation of study findings.

* Cohen's d: For comparing means between two groups, Cohen's d quantifies the difference in standard deviation units. Larger values indicate more substantial differences. A common benchmark is: small (around 0.2), medium (around 0.5), large (around 0.8).

* Interpretation Nuances: The significance of effect sizes varies across fields and contexts. Consider practical implications, previous research benchmarks, and the specific context of the study for accurate interpretation.

Null Hypothesis: There is no practical or significant difference or relationship between the compared groups or variables.

In [7]:
from numpy import mean, std

def cohen_d(group1, group2):
    diff = mean(group1) - mean(group2)
    pooled_std = ((len(group1) - 1) * std(group1) ** 2 + (len(group2) - 1) * std(group2) ** 2) / (len(group1) + len(group2) - 2)
    pooled_std = pooled_std ** 0.5  # Taking square root to obtain standard deviation
    return diff / pooled_std

effect_sizes = {}

for parameter in readability_parameters:
    for pair in itertools.combinations(diseases, 2):
        dataset_1 = datasets[pair[0]][parameter]
        dataset_2 = datasets[pair[1]][parameter]

        effect_size = cohen_d(dataset_1, dataset_2)
        if parameter not in effect_sizes:
            effect_sizes[parameter] = {}
        effect_sizes[parameter][pair] = effect_size

print("\nEffect sizes for all combinations of datasets and parameters:")
for parameter, pairs in effect_sizes.items():
    print(f"Effect sizes for {parameter}:")
    for pair, effect_size in pairs.items():
        print(f"{pair}: Cohen's d = {effect_size}")
    print()



Effect sizes for all combinations of datasets and parameters:
Effect sizes for automated_readability_index:
('depression', 'ed'): Cohen's d = 0.09270570137739209
('depression', 'healthanxiety'): Cohen's d = -0.2543628362438203
('depression', 'lonely'): Cohen's d = 0.16149791593390092
('depression', 'mentalhealth'): Cohen's d = -0.43406242669497147
('depression', 'parenting'): Cohen's d = -0.2612070371845862
('ed', 'healthanxiety'): Cohen's d = -0.3147429401261767
('ed', 'lonely'): Cohen's d = 0.06237958609659329
('ed', 'mentalhealth'): Cohen's d = -0.49304077375973565
('ed', 'parenting'): Cohen's d = -0.3519794430486198
('healthanxiety', 'lonely'): Cohen's d = 0.418144800180017
('healthanxiety', 'mentalhealth'): Cohen's d = -0.1759203295892674
('healthanxiety', 'parenting'): Cohen's d = -0.0034615992002177396
('lonely', 'mentalhealth'): Cohen's d = -0.5890672284385455
('lonely', 'parenting'): Cohen's d = -0.4631698123015947
('mentalhealth', 'parenting'): Cohen's d = 0.1856784352995528