In [7]:
import numpy as np
from scipy.stats import ks_2samp, anderson_ksamp
import matplotlib.pyplot as plt
import pandas as pd

In [18]:
# Load the computed momentum table and reduce its 'momentum_difference'
# column to a binary indicator: 1 where the value is > 0, otherwise 0.
df = pd.read_csv('./final_momentum_data.csv')
momentum_difference = np.where(df['momentum_difference'].to_numpy() > 0, 1, 0)

# Attach the indicator to the original match data as a new column.
# NOTE(review): the assignment is positional, so it presumably relies on both
# CSV files having the same rows in the same order - TODO confirm.
df_origin = pd.read_csv('Wimbledon_featured_matches.csv')
df_origin['momentum_difference'] = momentum_difference

In [19]:
def count_consecutive_elements_np(arr):
    """Return the lengths of the runs of consecutive equal elements in ``arr``.

    For example ``[1, 1, 0, 0, 0, 1]`` yields ``[2, 3, 1]``.

    Parameters
    ----------
    arr : array-like
        Sequence of values (list, NumPy array, pandas Series, ...). It is
        converted with ``np.asarray`` and flattened, so elements are compared
        by position rather than by any index labels.

    Returns
    -------
    numpy.ndarray
        Integer array holding one run length per run, in order of appearance.
        An empty input gives an empty integer array.
    """
    values = np.asarray(arr).ravel()
    if values.size == 0:
        # Use an integer dtype so the empty result matches the non-empty one.
        return np.array([], dtype=int)

    # Positions at which an element differs from its predecessor, i.e. the
    # first index of every run except the first one.
    run_starts = np.flatnonzero(values[1:] != values[:-1]) + 1

    # Add the start of the first run and the end of the last run; the gaps
    # between consecutive boundaries are the run lengths.
    boundaries = np.concatenate(([0], run_starts, [values.size]))
    return np.diff(boundaries)

In [20]:
# Seed NumPy's global random generator so the draws below are reproducible.
np.random.seed(42)

# Two standard-normal samples (mean 0, standard deviation 1):
# a large one with 100000 values and a small one with 300 values.
sample1 = np.random.normal(0, 1, 100_000)
sample2 = np.random.normal(0, 1, 300)

# K-S检验
def test_random(sample1,momentum_difference):
    
    ks_statistic, ks_p_value = ks_2samp(sample1, momentum_difference)
    print(f"K-S统计量: {ks_statistic}")
    print(f"K-S检验的p-value: {ks_p_value}")

    # Anderson-Darling检验
    ad_statistic, ad_critical_values, ad_significance_level = anderson_ksamp([sample1, momentum_difference])
    print(f"Anderson-Darling统计量: {ad_statistic}")
    print(f"Anderson-Darling检验的临界值: {ad_critical_values}")
    print(f"Anderson-Darling检验的显著性水平: {ad_significance_level}")

    # 根据检验结果进行解释
    alpha = 0.01  # 显著性水平
    print('----------------------------------------')
    if ks_p_value < alpha:
        print("拒绝原假设：两组数据不来自相同分布 (K-S检验)")
    else:
        print("接受原假设：两组数据来自相同分布 (K-S检验)")

    if ad_statistic > ad_critical_values[2]:
        print("拒绝原假设：两组数据不来自相同分布 (Anderson-Darling检验)")
    else:
        print("接受原假设：两组数据来自相同分布 (Anderson-Darling检验)")
    print('----------------------------------------')

In [21]:
# Draw a fresh standard-normal reference sample (100000 values), then run the
# distribution tests once per match against that match's binary
# 'momentum_difference' column.
# NOTE(review): the recorded output shows the same K-S statistic for every
# match, presumably because 0/1 data is being compared with a continuous
# normal sample - confirm that this comparison is the intended one.
sample1 = np.random.normal(0, 1, 100_000)
for match_id in df_origin['match_id'].unique():
    print('in match ', match_id)
    match_rows = df_origin.loc[df_origin['match_id'] == match_id]
    momentum_difference = match_rows['momentum_difference'].to_numpy()
    test_random(sample1=sample1, momentum_difference=momentum_difference)

in match  2023-wimbledon-1301
K-S统计量: 0.49972
K-S检验的p-value: 1.4558120605478684e-69
Anderson-Darling统计量: 138.3749154241521
Anderson-Darling检验的临界值: [0.325 1.226 1.961 2.718 3.752 4.592 6.546]
Anderson-Darling检验的显著性水平: 0.001
----------------------------------------
拒绝原假设：两组数据不来自相同分布 (K-S检验)
拒绝原假设：两组数据不来自相同分布 (Anderson-Darling检验)
----------------------------------------
in match  2023-wimbledon-1302
K-S统计量: 0.49972
K-S检验的p-value: 6.180383709340608e-47
Anderson-Darling统计量: 57.564639035590574
Anderson-Darling检验的临界值: [0.325 1.226 1.961 2.718 3.752 4.592 6.546]
Anderson-Darling检验的显著性水平: 0.001
----------------------------------------
拒绝原假设：两组数据不来自相同分布 (K-S检验)
拒绝原假设：两组数据不来自相同分布 (Anderson-Darling检验)
----------------------------------------
in match  2023-wimbledon-1303
K-S统计量: 0.49972
K-S检验的p-value: 1.8244646115665357e-31
Anderson-Darling统计量: 39.51755825069059
Anderson-Darling检验的临界值: [0.325 1.226 1.961 2.718 3.752 4.592 6.546]
Anderson-Darling检验的显著性水平: 0.001
-------------------------------------

  ad_statistic, ad_critical_values, ad_significance_level = anderson_ksamp([sample1, momentum_difference])


Anderson-Darling统计量: 150.79262949321657
Anderson-Darling检验的临界值: [0.325 1.226 1.961 2.718 3.752 4.592 6.546]
Anderson-Darling检验的显著性水平: 0.001
----------------------------------------
拒绝原假设：两组数据不来自相同分布 (K-S检验)
拒绝原假设：两组数据不来自相同分布 (Anderson-Darling检验)
----------------------------------------
in match  2023-wimbledon-1307
K-S统计量: 0.49972
K-S检验的p-value: 7.313112071506096e-54
Anderson-Darling统计量: 96.8795550918104
Anderson-Darling检验的临界值: [0.325 1.226 1.961 2.718 3.752 4.592 6.546]
Anderson-Darling检验的显著性水平: 0.001
----------------------------------------
拒绝原假设：两组数据不来自相同分布 (K-S检验)
拒绝原假设：两组数据不来自相同分布 (Anderson-Darling检验)
----------------------------------------
in match  2023-wimbledon-1308
K-S统计量: 0.49972
K-S检验的p-value: 2.142319759352412e-44
Anderson-Darling统计量: 55.94361637970142
Anderson-Darling检验的临界值: [0.325 1.226 1.961 2.718 3.752 4.592 6.546]
Anderson-Darling检验的显著性水平: 0.001
----------------------------------------
拒绝原假设：两组数据不来自相同分布 (K-S检验)
拒绝原假设：两组数据不来自相同分布 (Anderson-Darling检验)
-----------------