In [None]:
import pandas as pd
import numpy as np
from scipy import stats
import statsmodels.api as sm
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
from lifelines import KaplanMeierFitter
from lifelines.statistics import logrank_test
from sklearn.metrics import roc_curve, roc_auc_score
import scikit_posthocs as sp

In [None]:
# Load both datasets used throughout the notebook: `hf` (heart-failure clinical
# records) and `stud` (student grades). Paths are relative to the notebook's
# working directory.
# NOTE(review): read_csv is called with its default "," delimiter; the UCI
# distribution of student-por.csv is ";"-separated — confirm this copy is not.
hf, stud = (
    pd.read_csv(csv_path)
    for csv_path in (
        "data/heart_failure_clinical_records_dataset.csv",
        "data/student-por.csv",
    )
)

# Statystyka opisowa

## Heart Failure

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric
# columns in the heart-failure frame; shown via the cell's rich output.
hf.describe()

In [None]:
# Column names, dtypes and non-null counts; info() prints its report rather
# than returning a value.
hf.info()

In [None]:
# Peek at the first rows (head() defaults to 5) of the heart-failure records.
hf.head()

## Student Por

In [None]:
# Summary statistics of the student frame. With default arguments describe()
# covers only numeric columns when the frame mixes dtypes.
stud.describe()

In [None]:
# Column names, dtypes and non-null counts of the student frame; info() prints
# its report rather than returning a value.
stud.info()

In [None]:
# Peek at the first rows (head() defaults to 5) of the student records.
stud.head()

# Testy parametryczne

## Test t

In [None]:
# Compare mean ejection fraction between patients without (0) and with (1) a
# recorded death event. equal_var=False selects Welch's t-test, which does not
# assume equal variances in the two groups.
ejection_fraction = hf["ejection_fraction"]
death_event = hf["DEATH_EVENT"]
alive = ejection_fraction[death_event == 0]
dead = ejection_fraction[death_event == 1]

welch = stats.ttest_ind(alive, dead, equal_var=False)
t_stat, p_val = welch.statistic, welch.pvalue
print(f"Test t (frakcja wyrzutowa): t = {t_stat:.2f}, p = {p_val:.4f}")

## ANOVA – stężenie sodu w 3 grupach wiekowych

In [None]:
# One-way ANOVA: does serum sodium differ between three age groups?
#
# pd.cut builds right-closed intervals by default, so the bins below are
# (0, 50], (50, 60] and (60, 120]. The first label therefore has to be "≤50":
# with the previous "<50" label a 50-year-old patient was placed in a group
# whose name excluded them. Ages outside (0, 120] become NaN.
AGE_BIN_EDGES = [0, 50, 60, 120]
AGE_BIN_LABELS = ["≤50", "50‑60", ">60"]
hf["age_cat"] = pd.cut(hf["age"], bins=AGE_BIN_EDGES, labels=AGE_BIN_LABELS)

# Fit the OLS model with age group as a categorical factor, then derive the
# ANOVA table from it; the bare last expression gives the rich table display.
sodium_by_age_fit = smf.ols("serum_sodium ~ C(age_cat)", data=hf).fit()
anova = sm.stats.anova_lm(sodium_by_age_fit)
anova

# Testy nieparametryczne

In [None]:
# Two-sided Mann-Whitney U test on platelet counts, comparing patients without
# (DEATH_EVENT == 0) and with (DEATH_EVENT == 1) a recorded death event.
platelets_by_outcome = [
    hf.loc[hf["DEATH_EVENT"] == outcome, "platelets"] for outcome in (0, 1)
]
u_stat, p_val = stats.mannwhitneyu(*platelets_by_outcome, alternative="two-sided")
print(f"Mann‑Whitney U (platelets): U = {u_stat:.0f}, p = {p_val:.4f}")

In [None]:
# Chi-square test of independence between sex and the death event.
# The observed contingency table is kept in a variable so it can be shown next
# to the expected counts below.
observed = pd.crosstab(hf["sex"], hf["DEATH_EVENT"])

# NOTE: when the table has one degree of freedom (2×2), SciPy applies Yates'
# continuity correction by default (correction=True).
chi2, p, dof, tbl = stats.chi2_contingency(observed)
print(f"Chi‑kwadrat (sex × zgon): χ² = {chi2:.2f}, p = {p:.4f}\n")

# `tbl` is the array of EXPECTED frequencies under independence, not the
# observed table. Printing it as a bare array right after the statistic made it
# read like the data, so show both tables with row/column labels instead.
expected = pd.DataFrame(tbl, index=observed.index, columns=observed.columns)
display(pd.concat({"obserwowane": observed, "oczekiwane": expected.round(2)}, axis=1))

# Analiza przeżycia Kaplana‑Meiera

In [None]:
# Kaplan-Meier survival curves for the whole cohort and split by smoking
# status, followed by a log-rank test comparing smokers with non-smokers.
durations = hf["time"]
events = hf["DEATH_EVENT"]
smokers = hf["smoking"] == 1
non_smokers = hf["smoking"] == 0

kmf = KaplanMeierFitter()
fig, ax = plt.subplots()

# The single fitter is re-fitted for every curve; each plot() call draws the
# current fit onto the shared axes before the next fit replaces it.
kmf.fit(durations, event_observed=events, label="wszyscy")
kmf.plot(ax=ax)

for curve_label, rows in (("palący", smokers), ("nie‑palący", non_smokers)):
    kmf.fit(durations[rows], event_observed=events[rows], label=curve_label)
    kmf.plot(ax=ax)

ax.set(
    xlabel="Czas (dni)",
    ylabel="Prawdopodobieństwo przeżycia",
    title="Krzywe przeżycia Kaplana‑Meiera",
)
plt.show()

# Log-rank test: group A = smokers, group B = non-smokers.
result = logrank_test(
    durations[smokers],
    durations[non_smokers],
    event_observed_A=events[smokers],
    event_observed_B=events[non_smokers],
)
print(result)

# Regresja

In [None]:
# Logistic regression of the death event on four predictors. The formula
# string is assembled from the predictor list so the model terms are easy to
# read and edit in one place.
predictors = ["age", "ejection_fraction", "serum_sodium", "smoking"]
formula = "DEATH_EVENT ~ " + " + ".join(predictors)

logit_spec = smf.logit(formula, data=hf)
logit_model = logit_spec.fit()

# Bare last expression so the notebook renders the summary table.
logit_model.summary()

# ROC / AUC

In [None]:
# ROC curve and AUC for the fitted logistic model.
# Predictions are made on `hf`, the same frame `logit_model` is fitted on in
# this notebook, so the AUC is an in-sample (optimistic) estimate.
y_true = hf["DEATH_EVENT"]
y_pred = logit_model.predict(hf)

fpr, tpr = roc_curve(y_true, y_pred)[:2]  # thresholds are not needed
auc = roc_auc_score(y_true, y_pred)

fig, ax = plt.subplots()
ax.plot(fpr, tpr, label=f"AUC = {auc:.3f}")
ax.plot([0, 1], [0, 1], linestyle="--")  # chance-level diagonal
ax.set(
    xlabel="False Positive Rate",
    ylabel="True Positive Rate",
    title="ROC – regresja logistyczna (Heart Failure)",
)
ax.legend()
plt.show()

# Test Friedmana

In [None]:
# Friedman test for a difference between the three grading periods (repeated
# measures on the same students), followed by pairwise Wilcoxon signed-rank
# post-hoc tests with Holm correction and a boxplot of the three grades.
grade_cols = ["G1", "G2", "G3"]

friedman_stat, p_val = stats.friedmanchisquare(*(stud[col] for col in grade_cols))
print(f"Friedman: χ² = {friedman_stat:.2f}, p = {p_val:.4e}")

# scikit-posthocs expects a DataFrame in LONG format with explicit value and
# group columns; handing it the wide G1/G2/G3 frame without `val_col` /
# `group_col` is rejected with a ValueError. melt() stacks the columns in
# order and keeps row order inside each period, so the i-th value of every
# group still belongs to the same student (required for the paired test).
grades_long = stud[grade_cols].melt(var_name="period", value_name="grade")
posthoc = sp.posthoc_wilcoxon(
    grades_long, val_col="grade", group_col="period", p_adjust="holm"
)
display(posthoc)

# Tick labels are set on the axes instead of through boxplot's `labels=`
# keyword, which newer matplotlib releases deprecate in favour of
# `tick_labels=`; this form works with both.
fig, ax = plt.subplots()
ax.boxplot([stud[col] for col in grade_cols])
ax.set_xticklabels(grade_cols)
ax.set_ylabel("Ocena")
ax.set_title("Zmiana ocen w trzech okresach (Student Performance)")
plt.show()