In [3]:
import pandas as pd
from scipy.stats import ttest_ind

# Dataset variants; each one is stored on disk as NLP_<name>.csv
clnames = ['text', 'interviewee_text', 'new_text', 'new_interviewee_text']
# One probability column per model
probs_columns = ['bert_prob', 'electra_prob', 'albert_prob', 'MentalBert_prob', 'xlnet_prob', 'roberta_prob']

# Pairs of dataset variants whose probability columns are compared
comparisons = [
    ('text', 'interviewee_text'),
    ('new_text', 'new_interviewee_text'),
    ('text', 'new_text'),
    ('interviewee_text', 'new_interviewee_text')
]


def load_frames(names):
    """Read ``NLP_<name>.csv`` exactly once for every distinct name.

    Parameters
    ----------
    names : iterable of str
        Dataset names; repeated names are read only once.

    Returns
    -------
    dict
        Maps each distinct name to the DataFrame read from its CSV file.
    """
    return {name: pd.read_csv(f'NLP_{name}.csv') for name in dict.fromkeys(names)}


def run_ttests(frames, prob_cols, pairs):
    """Print an independent two-sample t-test p-value for every column/pair.

    Parameters
    ----------
    frames : dict
        Maps dataset name to a DataFrame containing the columns in ``prob_cols``.
    prob_cols : iterable of str
        Probability columns to compare.
    pairs : iterable of (str, str)
        Dataset-name pairs; both names must be keys of ``frames``.

    Returns
    -------
    list of tuple
        ``(column, left_name, right_name, p_value)`` in the order printed.
    """
    # NOTE(review): ttest_ind with default arguments treats the two samples as
    # independent with equal variances. If the rows of the two CSVs describe the
    # same interviews, a paired test may be more appropriate - confirm.
    results = []
    for pro in prob_cols:
        print(f"**************** Comparing {pro} ****************")
        for left, right in pairs:
            statistic, p_value = ttest_ind(frames[left][pro], frames[right][pro])
            print(f"Comparison: {left} vs {right} for {pro}")
            print(f"T-test P-value: {p_value}")
            results.append((pro, left, right, p_value))
        print("\n")
    return results


# Inside a notebook kernel __name__ is always "__main__", so this runs whenever
# the cell is executed; the guard only keeps the helpers importable without
# touching the CSV files.
if __name__ == "__main__":
    # Each CSV is loaded once here instead of once per (model, comparison).
    run_ttests(
        load_frames(name for pair in comparisons for name in pair),
        probs_columns,
        comparisons,
    )

**************** Comparing bert_prob ****************
Comparison: text vs interviewee_text for bert_prob
T-test P-value: 0.4959411176401535
Comparison: new_text vs new_interviewee_text for bert_prob
T-test P-value: 0.4934843920481743
Comparison: text vs new_text for bert_prob
T-test P-value: 0.1330792187782198
Comparison: interviewee_text vs new_interviewee_text for bert_prob
T-test P-value: 0.05499207015042756


**************** Comparing electra_prob ****************
Comparison: text vs interviewee_text for electra_prob
T-test P-value: 0.6707698057105984
Comparison: new_text vs new_interviewee_text for electra_prob
T-test P-value: 0.4676150007765988
Comparison: text vs new_text for electra_prob
T-test P-value: 1.7840614691881312e-13
Comparison: interviewee_text vs new_interviewee_text for electra_prob
T-test P-value: 0.0118622163512941


**************** Comparing albert_prob ****************
Comparison: text vs interviewee_text for albert_prob
T-test P-value: 0.8190760781311914
Comp

In [4]:
import pandas as pd
from scipy.stats import ttest_ind, mannwhitneyu, wilcoxon, ks_2samp, levene

# Dataset variants (each stored as NLP_<name>.csv) and model probability columns
clnames = ['text', 'interviewee_text', 'new_text', 'new_interviewee_text']
probs_columns = ['bert_prob', 'electra_prob', 'albert_prob', 'MentalBert_prob', 'xlnet_prob', 'roberta_prob']

# Pairs of dataset variants whose probability columns are compared
comparisons = [
    ('text', 'interviewee_text'),
    ('new_text', 'new_interviewee_text'),
    ('text', 'new_text'),
    ('interviewee_text', 'new_interviewee_text')
]


def wilcoxon_line(data1, data2):
    """Return the line to print for the Wilcoxon signed-rank test.

    The test needs paired samples, so unequal lengths are reported up front.
    Any other ``ValueError`` raised by ``wilcoxon`` is reported with its own
    message instead of being mislabelled as a length mismatch.
    """
    if len(data1) != len(data2):
        return "Wilcoxon Signed-Rank Test not applicable (data lengths do not match)."
    try:
        _, wilcoxon_p_value = wilcoxon(data1, data2)
    except ValueError as exc:
        return f"Wilcoxon Signed-Rank Test not applicable ({exc})."
    return f"Wilcoxon Signed-Rank Test P-value: {wilcoxon_p_value}"


def run_distribution_tests(frames, prob_cols, pairs):
    """Print five two-sample test p-values for every column/pair.

    Parameters
    ----------
    frames : dict
        Maps dataset name to a DataFrame containing the columns in ``prob_cols``.
    prob_cols : iterable of str
        Probability columns to compare.
    pairs : iterable of (str, str)
        Dataset-name pairs; both names must be keys of ``frames``.

    Returns
    -------
    list of dict
        One record per printed comparison with keys ``column``, ``pair``,
        ``t``, ``mannwhitney``, ``ks`` and ``levene`` (the p-values).
    """
    results = []
    for pro in prob_cols:
        print(f"**************** Comparing {pro} ****************")
        for left, right in pairs:
            data1 = frames[left][pro]
            data2 = frames[right][pro]

            print(f"\nComparison: {left} vs {right} for {pro}")

            # 1. Independent two-sample t-test
            _, t_p_value = ttest_ind(data1, data2)
            print(f"T-test P-value: {t_p_value}")

            # 2. Mann-Whitney U test (non-parametric)
            _, mw_p_value = mannwhitneyu(data1, data2)
            print(f"Mann-Whitney U Test P-value: {mw_p_value}")

            # 3. Wilcoxon signed-rank test (paired samples only)
            print(wilcoxon_line(data1, data2))

            # 4. Kolmogorov-Smirnov test (distribution comparison)
            _, ks_p_value = ks_2samp(data1, data2)
            print(f"Kolmogorov-Smirnov Test P-value: {ks_p_value}")

            # 5. Levene's test (equality of variances)
            _, levene_p_value = levene(data1, data2)
            print(f"Levene's Test P-value: {levene_p_value}")

            results.append({
                'column': pro,
                'pair': (left, right),
                't': t_p_value,
                'mannwhitney': mw_p_value,
                'ks': ks_p_value,
                'levene': levene_p_value,
            })

        print("\n")
    return results


# Inside a notebook kernel __name__ is always "__main__", so this runs whenever
# the cell is executed; the guard only keeps the helpers importable without
# touching the CSV files.
if __name__ == "__main__":
    # Read every referenced CSV once instead of once per (model, comparison).
    nlp_frames = {
        name: pd.read_csv(f'NLP_{name}.csv')
        for name in dict.fromkeys(name for pair in comparisons for name in pair)
    }
    run_distribution_tests(nlp_frames, probs_columns, comparisons)

**************** Comparing bert_prob ****************

Comparison: text vs interviewee_text for bert_prob
T-test P-value: 0.4959411176401535
Mann-Whitney U Test P-value: 0.18361300331048092
Wilcoxon Signed-Rank Test P-value: 0.5241731310573547
Kolmogorov-Smirnov Test P-value: 4.463605427422681e-17
Levene's Test P-value: 7.925004740121991e-17

Comparison: new_text vs new_interviewee_text for bert_prob
T-test P-value: 0.4934843920481743
Mann-Whitney U Test P-value: 0.0006055099102112348
Wilcoxon Signed-Rank Test P-value: 0.2072536123179518
Kolmogorov-Smirnov Test P-value: 9.137175098834087e-26
Levene's Test P-value: 0.08932704847337526

Comparison: text vs new_text for bert_prob
T-test P-value: 0.1330792187782198
Mann-Whitney U Test P-value: 0.000642426786453639
Wilcoxon Signed-Rank Test P-value: 0.1416498784939603
Kolmogorov-Smirnov Test P-value: 1.930571292717153e-16
Levene's Test P-value: 6.667031777990255e-17

Comparison: interviewee_text vs new_interviewee_text for bert_prob
T-test 

In [5]:
import pandas as pd
from sklearn.metrics import roc_auc_score, f1_score
from scipy.stats import ttest_ind

# Probability column produced by each model
probs_columns = ['bert_prob', 'electra_prob', 'albert_prob', 'MentalBert_prob', 'xlnet_prob', 'roberta_prob']

# Dataset pairs to compare; each name maps to the file NLP_<name>.csv
comparisons = [
    ('text', 'interviewee_text'),
    ('new_text', 'new_interviewee_text'),
    ('text', 'new_text'),
    ('interviewee_text', 'new_interviewee_text')
]

# Probability cut-off used to turn scores into hard class predictions for F1
F1_THRESHOLD = 0.5

# Read every referenced CSV exactly once. The frames are only read from below,
# so sharing them across iterations is safe and avoids re-parsing each file
# for every (model, comparison) combination.
nlp_frames = {
    name: pd.read_csv(f'NLP_{name}.csv')
    for name in dict.fromkeys(name for pair in comparisons for name in pair)
}

# For every model column, report t-test p-value, AUC and F1 for each pair
for pro in probs_columns:
    print(f"**************** Comparing {pro} ****************")
    for comp in comparisons:
        # Look up the two pre-loaded datasets of this comparison
        data1 = nlp_frames[comp[0]]
        data2 = nlp_frames[comp[1]]

        # Independent two-sample t-test on the predicted probabilities
        statistic, p_value = ttest_ind(data1[pro], data2[pro])
        print(f"Comparison: {comp[0]} vs {comp[1]} for {pro}")
        print(f"T-test P-value: {p_value}")

        # AUC of the probabilities against the 'true_labels' column
        auc1 = roc_auc_score(data1['true_labels'], data1[pro])
        auc2 = roc_auc_score(data2['true_labels'], data2[pro])

        print(f"AUC for {comp[0]} ({pro}): {auc1}")
        print(f"AUC for {comp[1]} ({pro}): {auc2}")

        # F1-score after thresholding the probabilities into 0/1 predictions
        preds1 = (data1[pro] >= F1_THRESHOLD).astype(int)
        preds2 = (data2[pro] >= F1_THRESHOLD).astype(int)

        f1_1 = f1_score(data1['true_labels'], preds1)
        f1_2 = f1_score(data2['true_labels'], preds2)

        print(f"F1-score for {comp[0]} ({pro}): {f1_1}")
        print(f"F1-score for {comp[1]} ({pro}): {f1_2}")

    print("\n")

**************** Comparing bert_prob ****************
Comparison: text vs interviewee_text for bert_prob
T-test P-value: 0.4959411176401535
AUC for text (bert_prob): 0.7185417672621922
AUC for interviewee_text (bert_prob): 0.6070014485755673
F1-score for text (bert_prob): 0.7673267326732673
F1-score for interviewee_text (bert_prob): 0.7641509433962265
Comparison: new_text vs new_interviewee_text for bert_prob
T-test P-value: 0.4934843920481743
AUC for new_text (bert_prob): 0.5303718010622888
AUC for new_interviewee_text (bert_prob): 0.6248913568324481
F1-score for new_text (bert_prob): 0.7129629629629629
F1-score for new_interviewee_text (bert_prob): 0.7628361858190709
Comparison: text vs new_text for bert_prob
T-test P-value: 0.1330792187782198
AUC for text (bert_prob): 0.7185417672621922
AUC for new_text (bert_prob): 0.5303718010622888
F1-score for text (bert_prob): 0.7673267326732673
F1-score for new_text (bert_prob): 0.7129629629629629
Comparison: interviewee_text vs new_interviewe

In [10]:
import pandas as pd
from sklearn.metrics import roc_auc_score

# Model probability columns and the dataset-name part of each file name
probs_columns = ['bert_prob', 'electra_prob', 'albert_prob', 'MentalBert_prob', 'xlnet_prob', 'roberta_prob']
clnames = ['text', 'interviewee_text', 'new_text', 'new_interviewee_text']

# Columns shared by the first two recipes below
_base_four = ['bert_prob', 'electra_prob', 'albert_prob', 'MentalBert_prob']

# How the combined 'prob' column is derived for each dataset (row-wise):
#   text                 -> lower quartile of Bert, Electra, Albert, MentalBert
#   interviewee_text     -> mean of Bert, Electra, Albert, MentalBert
#   new_text             -> upper quartile of Albert, MentalBert
#   new_interviewee_text -> mean of all six model columns
prob_recipes = {
    'text': lambda frame: frame[_base_four].quantile(0.25, axis=1),
    'interviewee_text': lambda frame: frame[_base_four].mean(axis=1),
    'new_text': lambda frame: frame[['albert_prob', 'MentalBert_prob']].quantile(0.75, axis=1),
    'new_interviewee_text': lambda frame: frame[probs_columns].mean(axis=1),
}

# For each dataset: load it, derive 'prob', report the AUC, save the result
for comp in clnames:
    df = pd.read_csv(f'NLP_{comp}.csv')

    # Apply the recipe registered for this dataset name, if there is one
    combine = prob_recipes.get(comp)
    if combine is not None:
        df['prob'] = combine(df)

    # Keep only the combined probability and the 'true_labels' column
    df = df[['prob', 'true_labels']]

    # Report the AUC; a ValueError from roc_auc_score is printed, not raised
    try:
        auc = roc_auc_score(df['true_labels'], df['prob'])
    except ValueError as e:
        print(f"{comp} AUC could not be calculated: {e}")
    else:
        print(f"{comp} AUC: {auc:.4f}")

    # Persist the two-column frame next to the input file
    df.to_csv(f'NLP_{comp}_processed.csv', index=False)

text AUC: 0.7597
interviewee_text AUC: 0.7593
new_text AUC: 0.6485
new_interviewee_text AUC: 0.6804
