# Hypotesprövning

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats

# Notebook-wide display defaults: figure size and resolution, grid on every
# axes, and three-digit precision for numpy and pandas output.
plt.rcParams.update({
    "figure.figsize": (8, 5),
    "axes.grid": True,
    "figure.dpi": 120,
})
np.set_printoptions(precision=3)
pd.set_option("display.precision", 3)

In [None]:
# Build a synthetic A/B dataset, deliberately make it messy (missing values,
# one duplicated row, one extreme value per group) and save it to ab_data.csv.
np.random.seed(42)

n_total = 200
n_A = n_total // 2
n_B = n_total - n_A

dates = pd.date_range("2025-01-01", periods=n_total, freq="D")

# The draw order (A first, then B) determines the generated values for this seed.
A_metric = np.random.normal(loc=50, scale=10, size=n_A)
B_metric = np.random.normal(loc=53, scale=11, size=n_B)

df = pd.DataFrame({
    "group": [label for label, size in (("A", n_A), ("B", n_B)) for _ in range(size)],
    "metric": np.concatenate([A_metric, B_metric]),
    "date": dates,
})

# Blank out the metric on five randomly chosen rows.
nan_idx = np.random.choice(df.index, size=5, replace=False)
df.loc[nan_idx, "metric"] = np.nan

# Append an exact copy of one randomly chosen row.
dup_row = df.sample(n=1, random_state=17)
df = pd.concat([df, dup_row], ignore_index=True)

# Pick one row with a non-missing metric in each group to turn into an outlier.
outliers_idx_A = df[df["group"].eq("A") & df["metric"].notna()].sample(n=1, random_state=18).index
outliers_idx_B = df[df["group"].eq("B") & df["metric"].notna()].sample(n=1, random_state=19).index

# Shift group A's pick far up and group B's pick far down.
df.loc[outliers_idx_A, "metric"] += 80
df.loc[outliers_idx_B, "metric"] -= 60

df.to_csv("ab_data.csv", index=False)

In [None]:
# Load the saved dataset and take a first look at it.
data = pd.read_csv("ab_data.csv", parse_dates=["date"])

display(data.head())
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in display() would render a stray "None" in the output.
data.info()
display(data.describe())

# keep=False flags every member of a duplicated group, not only the repeats.
dup_mask = data.duplicated(keep=False)

print("Antal dubletter", dup_mask.sum())
data[dup_mask].head()


In [None]:
# Keep only rows that have a metric value; copy so `data` stays untouched.
clean = data[data["metric"].notna()].copy()

# Remove exact duplicate rows (first occurrence is kept) and report how many went.
before = clean.shape[0]
clean = clean.loc[~clean.duplicated()]
after = clean.shape[0]
print(f"Dubletter borttagna: {before - after}")

# Sanity check: each group must still have at least 20 observations.
counts = clean["group"].value_counts()
print("Antal per grupp:\n", counts)
assert counts.ge(20).all(), "För få observationer i en grupp!"

clean.head()

In [None]:
# Descriptive statistics of the metric for each group.
summary_funcs = ["count", "mean", "median", "std", "min", "max"]
group_stats = clean.groupby("group")["metric"].agg(summary_funcs)
group_stats

In [None]:
# Three views of the metric distribution per group: histogram, boxplot, violin plot.

# Select each group's values once so the boxplot and violin plot share inputs.
group_labels = ["A", "B"]
metric_by_group = [clean.loc[clean.group == label, "metric"] for label in group_labels]

# Histogram per group (overlaid, semi-transparent)
fig, ax = plt.subplots()
for g, subset in clean.groupby("group"):
    ax.hist(subset["metric"], bins=20, label=f"Grupp {g}", alpha=0.5)
ax.set_title("Histogram av metric per grupp")
ax.set_xlabel("Metric")
ax.set_ylabel("Antal")
ax.legend()
plt.show()

# Boxplot (showmeans=True adds a marker for the mean)
fig, ax = plt.subplots()
ax.boxplot(metric_by_group, tick_labels=group_labels, showmeans=True)
ax.set_title("Boxplot av metric per grupp")
ax.set_xlabel("Grupp")
ax.set_ylabel("metric")
plt.show()

# Violin plot
fig, ax = plt.subplots()
ax.violinplot(metric_by_group, showmeans=True, showmedians=True)
# violinplot takes no tick-label argument and draws the bodies at positions
# 1..N by default; without explicit labels the x-axis only shows "1" and "2".
ax.set_xticks(range(1, len(group_labels) + 1), labels=group_labels)
ax.set_title("Violinplot av metric per grupp")
ax.set_xlabel("Grupp")
ax.set_ylabel("metric")
plt.show()

In [None]:
# Sensitivity check: drop rows whose metric lies outside the overall 1st-99th
# percentile range (rows are removed, not capped) and compare group means
# before and after. The percentiles are taken over both groups together.
low, high = clean["metric"].quantile([0.01, 0.99])
clipped = clean[clean["metric"].between(low, high)]

# One-row table with columns original_mean_A, original_mean_B,
# clipped_mean_A, clipped_mean_B (in that order).
compare_means = pd.DataFrame({
    f"{stage}_mean_{label}": [frame.loc[frame.group == label, "metric"].mean()]
    for stage, frame in (("original", clean), ("clipped", clipped))
    for label in ("A", "B")
})

compare_means

In [None]:
# Metric values of each group as plain numpy arrays.
A = clean.loc[clean["group"] == "A", "metric"].to_numpy()
B = clean.loc[clean["group"] == "B", "metric"].to_numpy()

# Student's t-test (assumes equal variances in the two groups)
student_res = stats.ttest_ind(A, B, equal_var=True)
t_stat, p_val = student_res.statistic, student_res.pvalue

# Welch's t-test (does not assume equal variances; often the sensible default)
welch_res = stats.ttest_ind(A, B, equal_var=False)
t_stat_w, p_val_w = welch_res.statistic, welch_res.pvalue

print(f"Standard t-test: t = {t_stat:.3f}, p = {p_val:.4f}")
print(f"Welch    t-test: t = {t_stat_w:.3f}, p = {p_val_w:.4f}")

In [None]:
# Welch's t statistic computed by hand for the difference in means (B minus A).
# Note: SciPy's ttest_ind(A, B) is based on mean(A) - mean(B), so t_manual has
# the opposite sign to the Welch statistic from ttest_ind(A, B).

# 1) Difference in means
diff_mean = np.mean(B) - np.mean(A)

# 2) Spread: Welch uses each group's own sample variance and size.
nA, nB = len(A), len(B)
var_A = np.var(A, ddof=1)
var_B = np.var(B, ddof=1)

# Squared standard error of the difference; reused for the degrees of freedom.
se_sq = var_A/nA + var_B/nB
se_diff = np.sqrt(se_sq)

# 3) t statistic = observed difference / standard error
t_manual = diff_mean / se_diff

# 4) Welch-Satterthwaite degrees of freedom (shown for illustration only;
#    SciPy computes this internally).
df_num = se_sq**2
df_den = (var_A**2/((nA**2)*(nA-1))) + (var_B**2/((nB**2)*(nB-1)))
df_welch = df_num / df_den

diff_mean, se_diff, t_manual, df_welch

In [None]:
# Bootstrap within each group: resample A and B separately with replacement and
# record the difference in means (B minus A) for every replicate.
np.random.seed(123)

n_boot = 10_000
obs_diff = B.mean() - A.mean()

boot_diffs = np.empty(n_boot)
for i in range(n_boot):
    A_star = np.random.choice(A, size=len(A), replace=True)
    B_star = np.random.choice(B, size=len(B), replace=True)
    boot_diffs[i] = B_star.mean() - A_star.mean()

# Two-sided p-value. The replicates are centred on obs_diff, not on 0, so they
# must be shifted to the null (subtract obs_diff) before being compared with
# the observed difference. Comparing the raw |boot_diffs| against |obs_diff|
# yields roughly 0.5 regardless of the effect size and is not a p-value.
p_boot = np.mean(np.abs(boot_diffs - obs_diff) >= abs(obs_diff))

# 95% percentile confidence interval for the difference; this uses the
# un-shifted replicates because it describes the effect itself.
ci_low, ci_high = np.percentile(boot_diffs, [2.5, 97.5])

print(obs_diff)
print(p_boot)
print((ci_low, ci_high))

In [None]:
# Permutation test: shuffle the pooled values, split them into pseudo-groups of
# the original sizes and record the difference in means for each shuffle.
np.random.seed(123)

nA, nB = len(A), len(B)
all_vals = np.concatenate([A, B])
obs_diff = B.mean() - A.mean()

n_perm = 10_000
perm_diffs = np.empty(n_perm)
for i in range(n_perm):
    perm = np.random.permutation(all_vals)
    # First nA shuffled values play the role of group A, the rest group B.
    perm_A, perm_B = np.split(perm, [nA])
    perm_diffs[i] = perm_B.mean() - perm_A.mean()

# Two-sided p-value: share of shuffles at least as extreme as the observation.
p_perm = (np.abs(perm_diffs) >= abs(obs_diff)).mean()

# Central 95% range of the permutation (null) distribution. This describes the
# differences seen under shuffling, not an interval around the observed effect.
ci_low_perm, ci_high_perm = np.percentile(perm_diffs, [2.5, 97.5])

print(obs_diff)
print(p_perm)
print((ci_low_perm, ci_high_perm))

In [None]:
# Bootstrap under the null hypothesis: shift both groups to a common mean, then
# resample within each group and see how large a difference arises by chance.
np.random.seed(123)

grand_mean = np.concatenate([A, B]).mean()

# Same spread as the originals, but both groups now share the pooled mean.
A0 = (A - A.mean()) + grand_mean
B0 = (B - B.mean()) + grand_mean

n_boot = 10_000
obs_diff = B.mean() - A.mean()

boot_null_diffs = np.empty(n_boot)
for i in range(n_boot):
    # np.random.choice samples with replacement by default.
    A_star = np.random.choice(A0, size=len(A0))
    B_star = np.random.choice(B0, size=len(B0))
    boot_null_diffs[i] = B_star.mean() - A_star.mean()

# Two-sided p-value: share of null replicates at least as extreme as observed.
p_boot_null = (np.abs(boot_null_diffs) >= abs(obs_diff)).mean()

# Central 95% range of the null replicates.
ci_boot_null = np.percentile(boot_null_diffs, [2.5, 97.5])

print(obs_diff)
print(p_boot_null)
print(ci_boot_null)

In [None]:
# Cohen's d for two independent groups, using the pooled standard deviation.
sA2 = A.var(ddof=1)
sB2 = B.var(ddof=1)

# Pooled SD: the sample variances weighted by their degrees of freedom.
dof_A = len(A) - 1
dof_B = len(B) - 1
sp = np.sqrt((dof_A*sA2 + dof_B*sB2) / (dof_A + dof_B))

# Effect size: difference in means (B minus A) in units of the pooled SD.
cohens_d = (B.mean() - A.mean()) / sp

print(cohens_d)

In [None]:
# Figure 1: observed difference in means with the 95% bootstrap interval.
fig, ax = plt.subplots()
# errorbar expects distances from the point, not the interval endpoints.
ax.errorbar(x=[0], y=[obs_diff], yerr=[[obs_diff - ci_low], [ci_high - obs_diff]], fmt='o', capsize=6)
ax.axhline(0, color="black", linestyle="--", linewidth=1)  # reference line: no difference
ax.set_xticks([0])
ax.set_xticklabels(["B - A"])
ax.set_ylabel("Skillnad i medel (metric)")
# Title typo fixed: "Skilland" -> "Skillnad".
ax.set_title("Skillnad i medel och 95% CI (bootstrap inom grupper)")
plt.show()

# Figure 2: permutation distribution with the observed difference marked.
# (Figure handle was misspelled `fix`; renamed to `fig`.)
fig, ax = plt.subplots()
ax.hist(perm_diffs, bins=40)
ax.axvline(obs_diff, color="red", linestyle="-", linewidth=2, label=f"Observerad diff = {obs_diff:.2f}")
# Dashed lines at +/-|obs_diff| mark the two-sided rejection thresholds.
ax.axvline(-abs(obs_diff), color="red", linestyle="--", linewidth=1)
ax.axvline(+abs(obs_diff), color="red", linestyle="--", linewidth=1)
ax.set_title(f"Permutationstest: fördelning under H0 (p ≈ {p_perm:.3f})")
ax.set_xlabel("B - A (perm-diff)")
ax.set_ylabel("Antal")
ax.legend()
plt.show()
