Metric               | AI Mean    | Human Mean | T-value    | P-value   
----------------------------------------------------------------------
Word_Count           | 349.38     | 337.12     | 3.326      | 0.013     
Char_Count           | 559.00     | 562.38     | -0.944     | 0.377     
Avg_Sent_Len         | 31.99      | 33.36      | -1.587     | 0.157     
TTR                  | 0.59       | 0.54       | 3.933      | 0.006     

Detailed Data:


Unnamed: 0_level_0,Word_Count,Word_Count,Char_Count,Char_Count,Avg_Sent_Len,Avg_Sent_Len,TTR,TTR
Source,AI,Human,AI,Human,AI,Human,AI,Human
Text_Type,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Argument,370.0,360.0,562.0,561.0,26.761905,26.714286,0.594595,0.555556
Description,367.0,351.0,615.0,613.0,24.6,26.652174,0.694823,0.584046
Dialogue,364.0,351.0,514.0,524.0,28.555556,29.111111,0.445055,0.393162
Fiction,336.0,336.0,477.0,483.0,25.105263,24.15,0.464286,0.458333
Keywords,429.0,404.0,712.0,704.0,35.6,39.111111,0.578089,0.529703
Poetry,128.0,132.0,197.0,217.0,28.142857,31.0,0.679688,0.674242
Summary,356.0,331.0,638.0,629.0,39.875,44.928571,0.648876,0.586103
Synopsis,445.0,432.0,757.0,768.0,47.3125,45.176471,0.577528,0.530093


In [None]:
import os
import re
import math
import pandas as pd
import numpy as np
from scipy import stats
import jieba.posseg as pseg
import string

# Full-width / CJK punctuation marks. The opening book-title mark "《" is
# listed next to its closing partner "》" so that both are treated as
# punctuation rather than only the closing one.
cn_punctuation = "！？｡。＂＃＄％＆＇（）＊＋，－／：；＜＝＞？＠［＼］＾＿｀｛｜｝～｟｠｢｣､、〃《》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏."
# ASCII punctuation as defined by the standard library.
en_punctuation = string.punctuation
# Set of single characters, used for O(1) membership tests.
all_punctuation = set(cn_punctuation + en_punctuation)

def analyze_text(text_content):
    """
    Compute basic lexical statistics for a pre-segmented text.

    Input: a string whose words are already segmented and separated by
    whitespace (spaces, tabs or newlines).

    Output: a dict with the keys
        Word_Count      - number of tokens that are not made up purely of
                          punctuation characters
        Char_Count      - number of non-whitespace characters in the text
                          (punctuation included)
        Log_TTR         - log(unique words) / log(word count); 0 when there
                          are fewer than two words
        Lexical_Density - share of words for which jieba reports a POS flag
                          starting with 'n', 'v', 'a' or 'd'; 0 when there
                          are no words
    """
    # split() without an argument splits on any run of whitespace, so tokens
    # separated by a newline or tab are not glued together into one "word".
    raw_tokens = text_content.split()

    # A token is punctuation when every character is a punctuation mark;
    # this also drops multi-character marks such as "……" or "——".
    valid_words = [
        w for w in raw_tokens
        if not all(ch in all_punctuation for ch in w)
    ]
    word_count = len(valid_words)

    clean_text_for_char = re.sub(r'\s+', '', text_content)
    char_count = len(clean_text_for_char)

    num_unique = len(set(valid_words))

    # log(1) == 0 would make the denominator zero, hence the "> 1" guard.
    if word_count > 1:
        log_ttr = math.log(num_unique) / math.log(word_count)
    else:
        log_ttr = 0

    # jieba may split one input word into several pieces; count the word at
    # most once so the density stays a proportion of word_count (<= 1).
    content_prefixes = ('n', 'v', 'a', 'd')
    content_word_count = sum(
        1
        for w in valid_words
        if any(flag.startswith(content_prefixes) for _, flag in pseg.cut(w))
    )

    lexical_density = content_word_count / word_count if word_count > 0 else 0

    return {
        'Word_Count': word_count,
        'Char_Count': char_count,
        'Log_TTR': log_ttr,
        'Lexical_Density': lexical_density
    }


# Folder expected to contain the "[MD]<Type>_<Source>.txt" corpus files.
file_dir = './data'

# Names of the candidate corpus files found in file_dir; stays empty (with a
# warning) when the folder is missing.
if os.path.exists(file_dir):
    files = [
        name
        for name in os.listdir(file_dir)
        if name.startswith('[MD]') and name.endswith('.txt')
    ]
else:
    print(f"Warning: 目录 {file_dir} 不存在。")
    files = []

# Accumulates one record per analysed file.
data_list = []

# Text genres to look for; each one needs an AI file and a Human file.
text_types = [
    'Argument',
    'Description',
    'Dialogue',
    'Fiction',
    'Keywords',
    'Poetry',
    'Summary',
    'Synopsis',
]

print("正在处理文本数据...")

# Analyse each text type as an AI/Human pair. A type contributes to
# data_list only when both of its files were listed and both were read and
# analysed without error, so the two sources always stay paired.
for t_type in text_types:
    file_ai = f"[MD]{t_type}_AI.txt"
    file_human = f"[MD]{t_type}_Human.txt"

    # Report skipped types explicitly instead of dropping them silently;
    # a missing pair reduces the sample size of the paired tests that follow.
    missing = [name for name in (file_ai, file_human) if name not in files]
    if missing:
        print(f"跳过 {t_type}: 缺少文件 {', '.join(missing)}")
        continue

    try:
        with open(os.path.join(file_dir, file_ai), 'r', encoding='utf-8') as f:
            stats_ai = analyze_text(f.read())
        with open(os.path.join(file_dir, file_human), 'r', encoding='utf-8') as f:
            stats_human = analyze_text(f.read())
    except Exception as e:
        # Best effort: one unreadable pair must not abort the whole run.
        print(f"处理文件 {t_type} 时出错: {e}")
    else:
        data_list.append({'Text_Type': t_type, 'Source': 'AI', **stats_ai})
        data_list.append({'Text_Type': t_type, 'Source': 'Human', **stats_human})



# Paired AI-vs-Human comparison: for every metric print mean (SD) per source,
# the Wilcoxon signed-rank statistic with its two-sided p-value, and Cohen's d
# computed on the paired differences. The same numbers are collected in
# df_results.
if data_list:
    df = pd.DataFrame(data_list)

    # Index both halves by text type so the subtraction below pairs the AI
    # and Human rows of the same type.
    is_ai = df['Source'] == 'AI'
    is_human = df['Source'] == 'Human'
    df_ai = df.loc[is_ai].set_index('Text_Type')
    df_human = df.loc[is_human].set_index('Text_Type')

    metrics = ['Word_Count', 'Char_Count', 'Log_TTR', 'Lexical_Density']
    comparison_results = []

    rule = "=" * 95
    col_name_d = "Cohen's d"
    print("\n" + rule)
    print(f"{'Metric':<20} | {'AI Mean (SD)':<20} | {'Human Mean (SD)':<20} | {'W-stat':<8} | {'p-value':<8} | {col_name_d:<8}")
    print(rule)

    for metric in metrics:
        ai_values, human_values = df_ai[metric], df_human[metric]
        differences = ai_values - human_values

        # When every pair is identical the test is skipped and W=0, p=1 is
        # reported instead.
        if not np.all(differences == 0):
            w_stat, p_val = stats.wilcoxon(ai_values, human_values, alternative='two-sided')
        else:
            w_stat, p_val = 0, 1.0

        # Effect size: mean of the paired differences over their sample SD,
        # reported as 0 when the differences have no spread.
        mean_diff = np.mean(differences)
        std_diff = np.std(differences, ddof=1)
        cohens_d = 0 if std_diff == 0 else mean_diff / std_diff

        # Descriptive statistics (sample SD, ddof=1), computed once and used
        # for both the printed row and the stored record.
        ai_mean = np.mean(ai_values)
        ai_sd = np.std(ai_values, ddof=1)
        human_mean = np.mean(human_values)
        human_sd = np.std(human_values, ddof=1)

        ai_desc = f"{ai_mean:.2f} ({ai_sd:.2f})"
        human_desc = f"{human_mean:.2f} ({human_sd:.2f})"

        print(f"{metric:<20} | {ai_desc:<20} | {human_desc:<20} | {w_stat:<8.1f} | {p_val:<8.3f} | {cohens_d:<8.3f}")

        comparison_results.append({
            'Metric': metric,
            'AI_Mean': ai_mean,
            'AI_SD': ai_sd,
            'Human_Mean': human_mean,
            'Human_SD': human_sd,
            'Wilcoxon_W': w_stat,
            'p_value': p_val,
            'Cohens_d': cohens_d,
        })

    df_results = pd.DataFrame(comparison_results)
else:
    print("错误：没有生成数据，请检查文件名是否匹配。")

正在处理文本数据...

Metric               | AI Mean (SD)         | Human Mean (SD)      | W-stat   | p-value  | Cohen's d
Word_Count           | 286.38 (85.65)       | 275.88 (78.84)       | 1.0      | 0.027    | 1.178   
Char_Count           | 556.62 (173.01)      | 554.88 (172.57)      | 6.0      | 0.343    | 0.282   
Log_TTR              | 0.93 (0.03)          | 0.92 (0.03)          | 0.0      | 0.008    | 1.539   
Lexical_Density      | 0.63 (0.07)          | 0.63 (0.09)          | 17.0     | 0.945    | -0.104  




In [None]:

# Per-text-type word and character totals for each source, with a TOTAL row.
banner = "=" * 95
print("\n" + banner)
print("具体文本量统计 (Quantity Statistics by Text Type)")
print(banner)

if 'df' not in locals() or df.empty:
    print("DataFrame 为空，无法进行统计。")
else:
    # Rows: text type. Columns: (metric, source), summed per cell.
    pivot_df = pd.pivot_table(
        df,
        values=['Word_Count', 'Char_Count'],
        index='Text_Type',
        columns='Source',
        aggfunc='sum',
    )

    # Put the source on the outer column level: (source, metric).
    pivot_df = pivot_df.swaplevel(0, 1, axis=1)
    pivot_df = pivot_df.sort_index(axis=1)

    # AI-minus-Human differences; kept in pivot_df but not part of the
    # printed table below.
    for diff_label, count_col in (('Words', 'Word_Count'), ('Chars', 'Char_Count')):
        pivot_df[('Diff', diff_label)] = (
            pivot_df[('AI', count_col)] - pivot_df[('Human', count_col)]
        )

    total_row = pivot_df.sum(numeric_only=True)
    pivot_df.loc['TOTAL'] = total_row

    # Avoid column truncation / wrapping when the table is printed.
    pd.set_option('display.max_columns', None)
    pd.set_option('display.width', 1000)

    # Printed column order: word counts (AI, Human), then char counts.
    output_cols = [
        (source, count_col)
        for count_col in ('Word_Count', 'Char_Count')
        for source in ('AI', 'Human')
    ]

    print(pivot_df[output_cols].astype(int))

    print("-" * 95)
    print("注：上表展示了每种文本类型对应的具体 词数(Word) 和 字符数(Char)。")
    # df holds two rows (AI + Human) per processed text type.
    print(f"总计处理文件对数: {len(df)//2}")


具体文本量统计 (Quantity Statistics by Text Type)
Source              AI      Human         AI      Human
            Word_Count Word_Count Char_Count Char_Count
Text_Type                                              
Argument           305        294        557        556
Description        305        294        610        607
Dialogue           293        280        514        515
Fiction            262        262        477        477
Keywords           368        345        709        700
Poetry              95         99        196        196
Summary            295        276        636        625
Synopsis           368        357        754        763
TOTAL             2291       2207       4453       4439
-----------------------------------------------------------------------------------------------
注：上表展示了每种文本类型对应的具体 词数(Word) 和 字符数(Char)。
总计处理文件对数: 8
