In [1]:
import pandas as pd
import numpy as np
import os

file_numbers = [-1] + list(range(1, 9))

# Directory holding the per-type CSV exports. Built with os.path.join so the
# relative path resolves on any OS; on Windows this yields the same
# backslash-separated string that was previously hard-coded.
RESULTS_DIR = os.path.join(
    "..", "..", "Code", "RQ3 Analysis and Visualization", "Topic Model Results"
)

# Layout used to parse the created_time / accepted_time columns.
TIME_FORMAT = "%Y/%m/%d %H:%M"


def summarize_accepted_answers(df):
    """Compute accepted-answer statistics for one table of posts.

    Args:
        df: DataFrame with the columns ``has_accepted_answer``,
            ``created_time`` and ``accepted_time``. Rows are grouped by
            comparing ``has_accepted_answer`` with 0 and 1; the two time
            columns are parsed with ``TIME_FORMAT``.

    Returns:
        A ``(no_answer_ratio, median_hours)`` pair:

        * ``no_answer_ratio`` - percentage (0-100) of all rows whose
          ``has_accepted_answer`` equals 0; 0 when ``df`` has no rows.
        * ``median_hours`` - median of ``accepted_time - created_time`` in
          hours over rows whose ``has_accepted_answer`` equals 1 and whose
          two timestamps both parse; 0 when no such row exists.

    Raises:
        KeyError: if one of the required columns is missing.
    """
    total = len(df)
    no_answer = (df['has_accepted_answer'] == 0).sum()
    no_answer_ratio = (no_answer / total) * 100 if total > 0 else 0

    accepted = df.loc[df['has_accepted_answer'] == 1]
    if accepted.empty:
        return no_answer_ratio, 0

    # errors='coerce' turns values that do not match TIME_FORMAT into NaT
    # instead of raising, so malformed rows can simply be dropped below.
    created = pd.to_datetime(accepted['created_time'],
                             format=TIME_FORMAT, errors='coerce')
    accepted_at = pd.to_datetime(accepted['accepted_time'],
                                 format=TIME_FORMAT, errors='coerce')

    # The difference is NaT whenever either timestamp is NaT, so dropna()
    # keeps exactly the rows where both parsed. Working on a standalone
    # Series avoids assigning a column onto a filtered frame, which can
    # trigger pandas' SettingWithCopyWarning.
    time_diff_min = (accepted_at - created).dropna().dt.total_seconds() / 60
    if time_diff_min.empty:
        return no_answer_ratio, 0

    # Median is taken in minutes and then converted to hours.
    return no_answer_ratio, np.median(time_diff_min) / 60


for num in file_numbers:
    # Build the path of this type's CSV file.
    file_path = os.path.join(RESULTS_DIR, f"stack_overflow_({num})_type_posts.csv")

    # Skip types whose file is not present.
    if not os.path.exists(file_path):
        print(f"Type {file_path} 不存在，跳过处理")
        continue

    try:
        # Read the CSV with the encoding the files are decoded with here.
        df = pd.read_csv(file_path, encoding="ISO-8859-1")

        no_answer_ratio, median_hours = summarize_accepted_answers(df)

        # Report both figures rounded to two decimals.
        print(f"\nType ({num}) ")
        print(f"the percentage of questions without accepted answers: {no_answer_ratio:.2f}%")
        print(f"the median time to receive an accepted answer: {median_hours:.2f} hours")

    except Exception as e:
        # Best-effort batch: report the failure for this type and carry on
        # with the remaining files.
        print(f"\t Type ({num}) error: {str(e)}")


Type (-1) 
the percentage of questions without accepted answers: 78.57%
the median time to receive an accepted answer: 28.33 hours

Type (1) 
the percentage of questions without accepted answers: 74.86%
the median time to receive an accepted answer: 5.58 hours

Type (2) 
the percentage of questions without accepted answers: 68.06%
the median time to receive an accepted answer: 8.40 hours

Type (3) 
the percentage of questions without accepted answers: 79.25%
the median time to receive an accepted answer: 36.32 hours

Type (4) 
the percentage of questions without accepted answers: 79.41%
the median time to receive an accepted answer: 15.83 hours

Type (5) 
the percentage of questions without accepted answers: 80.13%
the median time to receive an accepted answer: 61.98 hours

Type (6) 
the percentage of questions without accepted answers: 72.00%
the median time to receive an accepted answer: 3.88 hours

Type (7) 
the percentage of questions without accepted answers: 82.99%
the median ti