In [74]:
import pandas as pd
import numpy as np
import scipy.stats as stats
from statsmodels.stats.multicomp import pairwise_tukeyhsd
from itertools import combinations
import matplotlib.pyplot as plt

# Global matplotlib settings shared by every chart in this notebook.
plt.rcParams.update({
    # High-resolution rendering for inline display and saved images.
    'figure.dpi': 300,
    # Chart labels contain Chinese text.  'Hiragino Sans GB' stays first so the
    # original rendering is kept where it is installed; the remaining names are
    # fallbacks that matplotlib tries in order on systems without that font.
    'font.sans-serif': [
        'Hiragino Sans GB',
        'PingFang SC',
        'Microsoft YaHei',
        'SimHei',
        'Noto Sans CJK SC',
        'WenQuanYi Micro Hei',
        'DejaVu Sans',
    ],
    # Draw the minus sign as an ASCII hyphen so it renders with CJK fonts.
    'axes.unicode_minus': False,
})


def _plot_platform_benefit_scores(title, score_fields, bar_series):
    """Draw one bar chart of benefit scores for a single platform.

    Args:
        title: Chart title.
        score_fields: Benefit column names, used as x tick labels.
        bar_series: Iterable of ``(label, x_offset, heights)`` tuples, one per
            user group; ``x_offset`` shifts the group's bars around each tick.
    """
    positions = range(len(score_fields))
    plt.figure(figsize=(10, 6))
    for label, x_offset, heights in bar_series:
        plt.bar([i + x_offset for i in positions], heights, width=0.4, label=label)
    plt.xticks(positions, score_fields, rotation=45)
    plt.ylabel('平均权益评分')
    plt.title(title)
    plt.legend()
    plt.show()


def analyze_and_visualize_subscription_data(file_path):
    """Compare benefit scores of renewing and non-renewing subscribers per platform.

    For every platform column, the rows whose platform flag equals 1 are split
    by the 'Continue' column (1 -> renewing, 2 -> non-renewing).  For each
    non-empty group the mean of every benefit column is taken after replacing
    missing values with 7, and ``7 - mean`` is printed and plotted as the score.

    Args:
        file_path: Path of the survey CSV file.

    Returns:
        None.  Results are printed and shown as bar charts.  Exceptions are not
        propagated; a message describing the problem is printed instead.
    """
    # Missing answers are replaced by this value and every mean is reported as
    # ``unranked_value - mean``, so a missing answer contributes a score of 0.
    # NOTE(review): presumably the columns hold ranks 1-6 (lower is better) -
    # confirm against the questionnaire.
    unranked_value = 7
    score_fields = ['MusicLibrary', 'AudioQuality', 'Price', 'Costume', 'Download', 'Service']
    required_platforms = ['Subscribe-NetEase', 'Subscribe-QQMusic', 'Subscribe-KuWo', 'Subscribe-KuGou',
                          'Subscribe-MiGu', 'Subscribe-AppleMusic', 'Subscribe-Spotify']
    # Analysed when present but never validated as required; a CSV without this
    # column must not abort the analysis of the other platforms.
    optional_platforms = ['Subscribe-Others']

    try:
        data = pd.read_csv(file_path)

        required_columns = ['Continue'] + required_platforms + score_fields
        missing_columns = [col for col in required_columns if col not in data.columns]
        if missing_columns:
            raise KeyError(f"以下列在数据中不存在: {missing_columns}")

        for platform in required_platforms + optional_platforms:
            if platform not in data.columns:
                # Only optional platforms can get here (required ones were checked above).
                print(f"平台 {platform} 的列在数据中不存在，已跳过。")
                continue

            subscribers = data[data[platform] == 1]
            if subscribers.empty:
                print(f"平台 {platform} 没有订阅用户数据。")
                continue

            # Selecting and filling in one expression builds new frames, so nothing
            # is assigned into a slice of `data` (no SettingWithCopyWarning).
            renew_filled = subscribers.loc[subscribers['Continue'] == 1, score_fields].fillna(unranked_value)
            non_renew_filled = subscribers.loc[subscribers['Continue'] == 2, score_fields].fillna(unranked_value)
            has_renew = not renew_filled.empty
            has_non_renew = not non_renew_filled.empty

            if not has_renew and not has_non_renew:
                # 'Continue' is neither 1 nor 2 for every subscriber of this platform.
                print(f"平台 {platform} 的订阅用户没有有效的续订状态数据。")
                continue

            bar_series = []
            if has_renew:
                renew_means = renew_filled.mean()
                # The smallest mean corresponds to the largest reported score.
                renew_top_field = renew_means.idxmin()
                renew_top_score = unranked_value - renew_means[renew_top_field]
                bar_series.append(('续订用户', -0.2, unranked_value - renew_means.values))
            if has_non_renew:
                non_renew_means = non_renew_filled.mean()
                non_renew_top_field = non_renew_means.idxmin()
                non_renew_top_score = unranked_value - non_renew_means[non_renew_top_field]
                bar_series.append(('不续订用户', 0.2, unranked_value - non_renew_means.values))

            if has_renew and has_non_renew:
                print(f"平台 {platform} - 既有续订用户又有不续订用户:")
                print(f"续订用户中权益评分最高的项: {renew_top_field} (评分: {renew_top_score})")
                print(f"不续订用户中权益评分最高的项: {non_renew_top_field} (评分: {non_renew_top_score})")
                title = f"平台 {platform} 的续订与不续订用户权益评分对比"
            elif has_renew:
                print(f"平台 {platform} - 只有续订用户:")
                print(f"权益评分最高的项: {renew_top_field} (评分: {renew_top_score})")
                title = f"平台 {platform} - 只有续订用户权益评分"
            else:
                print(f"平台 {platform} - 只有不续订用户:")
                print(f"权益评分最高的项: {non_renew_top_field} (评分: {non_renew_top_score})")
                title = f"平台 {platform} - 只有不续订用户权益评分"

            _plot_platform_benefit_scores(title, score_fields, bar_series)

    except FileNotFoundError:
        print(f"文件 {file_path} 未找到。")
    except KeyError as e:
        print(f"数据中缺少必要的列: {e}")
    except Exception as e:
        print(f"发生错误: {e}")



In [75]:
def analyze_and_visualize_music_scores(file_path, output_path='subscription.png'):
    """Print and plot the average benefit scores of subscribers.

    Rows whose 'Non-Subscribe' column equals 1 are excluded.  Missing values in
    the benefit columns are replaced with 7 and the score of each benefit is
    reported as ``7 - mean``, so a missing answer contributes a score of 0.

    Args:
        file_path: Path of the survey CSV file.
        output_path: Where the bar chart image is saved.  Defaults to
            'subscription.png' in the working directory.

    Returns:
        None.  Results are printed, plotted and saved.  Exceptions are not
        propagated; a message describing the problem is printed instead.
    """
    try:
        data = pd.read_csv(file_path)

        score_columns = ['MusicLibrary', 'AudioQuality', 'Price', 'Costume', 'Download',
                         'Service']

        # Filtering, selecting and filling in one expression yields a new frame,
        # so no column is assigned into a slice of `data`
        # (avoids SettingWithCopyWarning).
        subscriber_scores = data.loc[data['Non-Subscribe'] != 1, score_columns].fillna(7)

        if subscriber_scores.empty:
            # Without subscribers every mean would be NaN and the chart empty.
            print("没有订阅用户数据，无法计算权益评分。")
            return

        average_scores = 7 - subscriber_scores.mean()

        print("订阅人群各权益分数的平均值:")
        print(average_scores)

        # Start a fresh figure so the bars never land on a figure left open by
        # earlier plotting code.
        x = np.arange(len(score_columns))
        plt.figure()
        plt.bar(x, average_scores.values, width=0.4, color='skyblue')

        plt.xlabel('Score Items')
        plt.ylabel('Average Score')
        plt.title('Average Scores of Subscription Benefits')
        plt.xticks(x, score_columns, rotation=0)
        # Save before show(): the figure may be discarded once it is shown.
        plt.savefig(output_path)
        plt.show()

    except FileNotFoundError:
        print(f"文件 {file_path} 未找到。")
    except KeyError as e:
        print(f"数据中缺少必要的列: {e}")
    except Exception as e:
        print(f"发生错误: {e}")


In [76]:
# ANOVA

def anova(file_path, fill_value=0):
    """Run a one-way ANOVA across the six benefit columns for subscribers.

    Rows whose 'Non-Subscribe' column equals 1 are excluded, missing benefit
    values are replaced with ``fill_value``, and the six columns are passed as
    groups to ``scipy.stats.f_oneway``.

    Args:
        file_path: Path of the survey CSV file.
        fill_value: Replacement for missing benefit values.  Defaults to 0.

    Returns:
        None.  The F statistic and p-value are printed.  Exceptions are not
        propagated; a message describing the problem is printed instead.
    """
    # NOTE(review): analyze_and_visualize_music_scores fills missing values with
    # 7, this function fills with 0.  If the columns hold ranks where lower is
    # better, 0 makes a missing answer count as better than rank 1 - confirm
    # which value is intended and pass fill_value=7 if appropriate.
    try:
        data = pd.read_csv(file_path)

        score_columns = ['MusicLibrary', 'AudioQuality', 'Price', 'Costume', 'Download',
                         'Service']

        # Filtering, selecting and filling in one expression yields a new frame,
        # so no column is assigned into a slice of `data`
        # (avoids SettingWithCopyWarning).
        subscriber_scores = data.loc[data['Non-Subscribe'] != 1, score_columns].fillna(fill_value)

        # One group per benefit column, taken from score_columns so the list of
        # columns is defined in exactly one place.
        f, p = stats.f_oneway(*(subscriber_scores[col] for col in score_columns))

        print("ANOVA结果:")
        print(f"F-value: {f}")
        print(f"P-value: {p}")

    except FileNotFoundError:
        print(f"文件 {file_path} 未找到。")
    except KeyError as e:
        print(f"数据中缺少必要的列: {e}")
    except Exception as e:
        print(f"发生错误: {e}")

In [77]:
def perform_anova_with_fisher_lsd(file_path, alpha=0.05):
    """Run a one-way ANOVA over the benefit columns, then Fisher's LSD post-hoc tests.

    Each benefit column is one group; missing values are dropped per column.
    When the ANOVA p-value is below ``alpha``, every pair of columns is compared
    with Fisher's least-significant-difference test and a bar chart of the
    group means (error bars: standard deviation) is shown.

    Fisher's LSD uses the pooled within-group mean square of the ANOVA and its
    degrees of freedom for every pair:
    ``t = (mean_i - mean_j) / sqrt(MSE * (1/n_i + 1/n_j))`` with ``N - k``
    degrees of freedom.  Independent two-sample t-tests, which pool the variance
    of only the two groups being compared, are a different procedure.

    Args:
        file_path: Path of the survey CSV file.
        alpha: Significance level for the ANOVA and the pairwise tests.
            Defaults to 0.05.

    Returns:
        None.  Results are printed and plotted.  Exceptions are not propagated;
        a message describing the problem is printed instead.
    """
    # NOTE(review): unlike anova(), this function keeps rows with
    # 'Non-Subscribe' == 1 and drops missing values instead of filling them -
    # confirm that the difference is intended.
    try:
        data = pd.read_csv(file_path)

        score_columns = ['MusicLibrary', 'AudioQuality', 'Price', 'Costume', 'Download', 'Service']

        missing_columns = [col for col in score_columns if col not in data.columns]
        if missing_columns:
            raise KeyError(f"以下列在数据中不存在: {missing_columns}")

        # Long format (one row per answer) is used for the summary chart.
        melted_data = pd.melt(data[score_columns], var_name='Feature', value_name='Score').dropna()

        # One group of valid answers per benefit column.
        score_data = {col: data[col].dropna() for col in score_columns}

        empty_columns = [col for col, values in score_data.items() if values.empty]
        if empty_columns:
            raise ValueError(f"以下列没有有效的评分数据: {empty_columns}")

        # Within-group degrees of freedom (N - k); without them neither the ANOVA
        # nor the LSD test is defined and the p-value would be NaN.
        df_within = sum(len(values) for values in score_data.values()) - len(score_columns)
        if df_within <= 0:
            raise ValueError("有效样本量不足，无法进行方差分析。")

        f_statistic, p_value = stats.f_oneway(*score_data.values())

        print("单因素方差分析结果:")
        print(f"F统计量: {f_statistic:.2f}")
        print(f"P值: {p_value:.4f}")

        if p_value < alpha:
            print("结论: 不同权益评分之间存在显著差异。")

            print("\nFisher's LSD 检验结果:")
            # Pooled within-group mean square (MSE) shared by all comparisons.
            sse = sum(((values - values.mean()) ** 2).sum() for values in score_data.values())
            mse = sse / df_within

            for group1, group2 in combinations(score_columns, 2):
                first, second = score_data[group1], score_data[group2]
                std_error = np.sqrt(mse * (1 / len(first) + 1 / len(second)))
                t_stat = (first.mean() - second.mean()) / std_error
                # Two-sided p-value from the t distribution with N - k df.
                p_val = 2 * stats.t.sf(abs(t_stat), df_within)

                print(f"{group1} vs {group2}: t统计量 = {t_stat:.2f}, p值 = {p_val:.4f}")
                if p_val < alpha:
                    print(f"    -> {group1} 和 {group2} 之间存在显著差异。")
                else:
                    print(f"    -> {group1} 和 {group2} 之间没有显著差异。")

            # Bar chart of group means with standard-deviation error bars.
            plt.figure(figsize=(10, 6))
            scores_by_feature = melted_data.groupby('Feature')['Score']
            mean_scores = scores_by_feature.mean()
            std_scores = scores_by_feature.std()
            plt.bar(mean_scores.index, mean_scores, yerr=std_scores, capsize=5, color='skyblue')
            plt.ylabel('Mean Score')
            plt.title("Fisher's LSD Test Results")
            plt.xticks(rotation=45)
            plt.show()
        else:
            print("结论: 不同权益评分之间不存在显著差异。")

    except FileNotFoundError:
        print(f"文件 {file_path} 未找到。")
    except KeyError as e:
        print(f"数据中缺少必要的列: {e}")
    except Exception as e:
        print(f"发生错误: {e}")

In [None]:
perform_anova_with_fisher_lsd(file_path)

In [None]:
anova(file_path)

In [None]:
analyze_and_visualize_music_scores(file_path)

In [None]:
# 调用函数并传入CSV文件路径
file_path = 'SurveyData.csv'
analyze_and_visualize_subscription_data(file_path)