In [1]:
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler

# Fetch the California housing data as a (features, target) pair.
data = fetch_california_housing(return_X_y=True, as_frame=True)
df, target = data
# Standardise every feature column. With default scikit-learn settings
# fit_transform returns a NumPy array, so `df` stops being a DataFrame here.
df = StandardScaler().fit_transform(df)

# Shuffled 4-fold splitter; the fixed seed keeps the folds reproducible.
cv = KFold(shuffle=True, n_splits=4, random_state=6)

In [8]:
from sklearn.linear_model import Ridge
from scipy.stats import binom
import numpy as np

In [5]:
# Ridge penalties to compare, and the list that collects one vector of
# squared errors per penalty.
alphas, errors = [1, 10], []

In [6]:
# Out-of-fold squared errors for each Ridge penalty in `alphas`.
# The accumulator is reset here so that re-running this cell rebuilds `errors`
# from scratch instead of appending a second copy of every entry.
errors = []
for alpha in alphas:
    model = Ridge(alpha=alpha)
    # Each sample is predicted by the model fitted on the folds that exclude it.
    predictions = cross_val_predict(model, df, target, cv=cv)
    squared_errors = (predictions - target) ** 2
    errors.append(squared_errors)

In [9]:
# Median squared error for each entry of `errors`, in the same order.
medians = list(map(np.median, errors))

In [10]:
# Display the median squared errors, one per element of `errors`.
medians

[0.17193921450849575, 0.1721379116684697]

In [13]:
# Sanity check: how many error vectors were collected, and the length of the
# first two (they must match for the paired comparison that follows).
len(errors), len(errors[0]), len(errors[1])

(2, 20640, 20640)

In [23]:
# Start with an empty list of per-sample sign indicators.
sign_test_results = list()

In [24]:
# Per-sample sign indicators for the paired comparison: 1 where errors[0] is
# strictly larger than errors[1], 0 otherwise (so exact ties count as 0).
# One vectorised comparison replaces the element-by-element loop, and the list
# is assigned rather than appended to, so re-running this cell cannot grow it.
sign_test_results = (errors[0] - errors[1] > 0).astype(int).tolist()

In [25]:
# Number of samples flagged 1, i.e. where errors[0] exceeded errors[1].
sum(sign_test_results)

9928

In [26]:
from scipy.stats import binom

# NOTE(review): `m` is not used below; presumably the hypothesised median of
# the paired differences under the null. Confirm before removing.
m = 0
N = len(sign_test_results)   # number of paired comparisons
tN = sum(sign_test_results)  # number of positive signs

# Under the null each sign is a fair coin flip, so tN ~ Binomial(N, 0.5).
sign_dist = binom(n=N, p=0.5)

# Two-sided p-value: double the smaller of P(X <= tN) and P(X >= tN), capped
# at 1. Doubling only the lower tail is valid only when tN is below N / 2 and
# can otherwise return a value greater than 1.
min(1.0, 2 * min(sign_dist.cdf(tN), sign_dist.sf(tN - 1)))

5.015427635331782e-08

In [27]:
import pandas as pd

In [35]:
# Small toy sample containing repeated values, held in a one-column frame.
data = pd.DataFrame({'Num': [7, 1, 5, 1, 3, 2, 5]})

In [36]:
# Rank the values of `Num`; with pandas' default settings tied values share
# the average of the ranks they span.
data['rank'] = data['Num'].rank()

In [37]:
# Display the toy frame, including its `rank` column.
data

Unnamed: 0,Num,rank
0,7,7.0
1,1,1.5
2,5,5.5
3,1,1.5
4,3,4.0
5,2,3.0
6,5,5.5


In [38]:
from sklearn.datasets import fetch_california_housing

# Load the California housing features and target again; no scaling is
# applied in this cell, so `df` is the frame exactly as returned by the loader.
data = fetch_california_housing(return_X_y=True, as_frame=True)
df, target = data

In [42]:
# MedInc for the rows whose HouseAge is at least 30.
df.loc[df['HouseAge'] >= 30, 'MedInc']

0        8.3252
2        7.2574
3        5.6431
4        3.8462
5        4.0368
          ...  
20620    4.5625
20621    2.3661
20623    2.8235
20625    4.1250
20626    2.1667
Name: MedInc, Length: 9971, dtype: float64

In [43]:
from scipy.stats import mannwhitneyu

# Mann-Whitney U test comparing MedInc between rows with HouseAge >= 30 (x)
# and rows with HouseAge < 30 (y).
mannwhitneyu(
    df.loc[df['HouseAge'] >= 30, 'MedInc'],
    df.loc[df['HouseAge'] < 30, 'MedInc'],
)

MannwhitneyuResult(statistic=46355215.0, pvalue=1.802307074740702e-57)

In [44]:
# Rank MedInc over the whole sample, then add up the ranks that fall in the
# HouseAge >= 30 group.
df['MedIncRank'] = df['MedInc'].rank()
rn = df.loc[df['HouseAge'] >= 30, 'MedIncRank'].sum()

# Sizes of the two groups on either side of the HouseAge = 30 cut.
N1 = int((df['HouseAge'] >= 30).sum())
N2 = int((df['HouseAge'] < 30).sum())

In [45]:
from scipy.stats import norm

# Normal approximation to the null distribution of the rank sum `rn` of the
# first group: mean N1 * (N1 + N2 + 1) / 2, variance N1 * N2 * (N1 + N2 + 1) / 12.
# The variance formula includes no correction for tied ranks.
mu = N1 * (N1 + N2 + 1) / 2
sigma = (N1 * N2 * (N1 + N2 + 1) / 12) ** 0.5

rank_sum_dist = norm(loc=mu, scale=sigma)

# Two-sided p-value: double the smaller tail and cap at 1. Doubling only the
# lower tail is correct only when rn lies below mu and can otherwise exceed 1.
min(1.0, 2 * min(rank_sum_dist.cdf(rn), rank_sum_dist.sf(rn)))

1.8023127039963995e-57