In [None]:
import itertools
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from scipy.stats import pearsonr


# Global seaborn styling; these calls change matplotlib defaults for every
# figure drawn later in the notebook.
sns.set_context("notebook")  # element/font scaling preset
sns.set_palette("colorblind")  # default colour cycle
sns.set_style("dark")  # axes style preset

In [None]:
# Load the combined rater grades; later cells read the columns
# PatientID, Modality, Side, Score and RaterID from this frame.
grades_csv = Path(r"S:\E_ResearchData\evdplanner\CombinedGrades.csv")
scores_df = pd.read_csv(grades_csv)
scores_df.head()

In [None]:
# Majority vote across raters: for every (patient, modality, side) case keep
# the score that was given most often.
def _most_common(votes):
    """Return the most frequent value; on a tie, the one `value_counts` lists first."""
    return votes.value_counts().idxmax()


case_keys = ["PatientID", "Modality", "Side"]
majority = scores_df.groupby(case_keys)["Score"].agg(_most_common).reset_index()

print(len(scores_df))
print(len(majority))

# Append the consensus rows to the individual ratings as a pseudo-rater
# whose RaterID is "Majority".
majority["RaterID"] = "Majority"
df = pd.concat([scores_df, majority], ignore_index=True)

print(len(df))

# Distribution of the consensus scores (absolute count and share of total).
scores = df.loc[df["RaterID"] == "Majority", "Score"].value_counts()
total = scores.sum()
for score, count in scores.items():
    print(f"{score}: {count} ({count / total:.2%})")

In [None]:
# Index -> RaterID lookup (includes the "Majority" pseudo-rater); reused by
# later cells to number the raters.
graders = dict(enumerate(df["RaterID"].unique()))
pairs = list(itertools.combinations(graders.keys(), 2))

# One row per rated case, one column per rater. Correlating the raw
# per-rater slices would pair scores purely by row position, which is only
# correct if every rater scored exactly the same cases in the same order.
case_keys = ["PatientID", "Modality", "Side"]
scores_by_rater = df.pivot_table(
    index=case_keys,
    columns="RaterID",
    values="Score",
    aggfunc="first",
)

# Pearson correlation
print("Pearson correlation between graders:")
for a, b in pairs:
    # Keep only the cases that both raters actually scored.
    paired = scores_by_rater[[graders[a], graders[b]]].dropna()
    if len(paired) < 2:
        # pearsonr needs at least two paired observations.
        print(f"{graders[a]} and {graders[b]}: not enough shared cases")
        continue
    corr, p_value = pearsonr(paired[graders[a]], paired[graders[b]])
    print(f"{graders[a]} and {graders[b]}: {corr:.2f} ({p_value:.4f})")

In [None]:
import numpy as np
import statsmodels.api as sm
from statsmodels.miscmodels.ordinal_model import OrderedModel

# Simulation-based power analysis for an ordinal logistic regression of the
# score on Modality and Side. For each effect size, many data sets are drawn
# from a latent-variable model, the model is fitted, and power is the share
# of fits in which at least one of the two predictors is significant.

# Parameters
n_simulations = 1000  # Number of simulations per effect size
n_mri = 50  # Number of MRI samples
n_ct = 48  # Number of CT samples
effect_sizes = np.linspace(0, 1, 21)  # Effect sizes to test
thresholds = [0, 1]  # Cut points on the latent scale between ordinal categories
significance_level = 0.05
num_categories = 3  # Number of ordinal categories (Kakarla score)
np.random.seed(42)  # Set seed for reproducibility

# Total sample size
n_samples = n_mri + n_ct

# Predictor templates; they do not depend on the effect size or the
# simulation index, so build them once and only permute inside the loop.
# Modality is imbalanced (1 = MRI, 0 = CT); Side alternates (0 = Left, 1 = Right)
# and has exactly n_samples entries.
base_modality = np.concatenate([np.ones(n_mri), np.zeros(n_ct)])
base_side = np.arange(n_samples) % 2

# (effect size, estimated power) pairs
effects = []

for effect_size in effect_sizes:
    # Start from an empty list for every effect size; otherwise the power
    # estimate would be averaged over all previously tested effect sizes.
    p_values = []

    for sim in range(n_simulations):
        print(f"ES {effect_size}: Simulation {sim + 1}/{n_simulations}", end="\r")

        # Shuffle both predictors with the same permutation
        order = np.random.permutation(n_samples)
        modality = base_modality[order]
        side = base_side[order]

        # Simulate the latent variable as a linear combination of the predictors + noise
        latent = (
            effect_size * modality + effect_size * side + np.random.normal(0, 1, size=n_samples)
        )

        # Convert latent variable to ordinal categories using thresholds
        y = np.digitize(latent, thresholds)

        # Create a DataFrame
        df_sim = pd.DataFrame({"Modality": modality, "Side": side, "Kakarla_score": y})

        # Fit the ordinal logistic regression model
        model = OrderedModel(df_sim["Kakarla_score"], df_sim[["Modality", "Side"]], distr="logit")
        results = model.fit(disp=False)

        # Extract p-value for the main effects (Modality, Side)
        p_value_modality = results.pvalues["Modality"]
        p_value_side = results.pvalues["Side"]

        # A simulation counts as a detection when either predictor is
        # significant, so keep the smaller of the two p-values.
        # NOTE(review): no multiplicity correction is applied, so the rejection
        # rate at effect size 0 is expected to exceed significance_level.
        p_values.append(min(p_value_modality, p_value_side))

    # Proportion of significant simulations for this effect size only
    power_estimate = np.mean(np.array(p_values) < significance_level)

    effects.append((effect_size, power_estimate))

for effect_size, power in effects:
    print(f"Effect size: {effect_size}, Power: {power:.3f}")

# Plot the power curve (use new names so `effects` keeps its list of pairs)
effect_axis, powers = zip(*effects, strict=True)
plt.plot(effect_axis, powers, marker="o")
plt.xlabel("Effect size")
plt.ylabel("Power")
plt.title("Power curve for ordinal logistic regression")
plt.tight_layout()
plt.show()

In [None]:
from statsmodels.miscmodels.ordinal_model import OrderedModel

# Ordinal logistic regression of the majority-vote score on modality, side
# and their interaction.

# Replace the outcome and both predictors by their integer category codes.
for column in ("Score", "Modality", "Side"):
    majority[column] = majority[column].astype("category").cat.codes

# Product of the two coded predictors serves as the interaction term.
majority["interactions"] = majority["Modality"] * majority["Side"]

y = majority["Score"]
X = majority[["Modality", "Side", "interactions"]]

# No intercept column is added: OrderedModel requires a design matrix without
# a constant because the estimated thresholds take that role. The previous
# `sm.add_constant(X)` call discarded its return value and therefore never
# changed X.
model = OrderedModel(y, X, distr="logit")
results = model.fit()
print(results.summary())

In [None]:
# Show the index -> RaterID lookup built from the unique RaterID values.
print(graders)

In [None]:
# Anonymised copy of the ratings for plotting: raters become "Rater <n>"
# (n = position in `graders` + 1, "Majority" keeps its name) and patients
# become consecutive integer codes.

# Build the RaterID -> label lookup once instead of rebuilding a list and
# searching it for every row.
rater_labels = {
    rater: "Majority" if rater == "Majority" else f"Rater {idx + 1}"
    for idx, rater in graders.items()
}

anon_df = df.copy()
anon_df["RaterID"] = anon_df["RaterID"].map(rater_labels)
anon_df["PatientID"] = anon_df["PatientID"].factorize()[0]

fig, ax = plt.subplots(figsize=(12, 6))

# One marker per rating, coloured by (anonymised) rater.
sns.scatterplot(
    data=anon_df,
    x="PatientID",
    y="Score",
    hue="RaterID",
    ax=ax,
)

In [None]:
# Ring chart of how the majority-vote scores are distributed.
is_majority = anon_df["RaterID"] == "Majority"
majority_count = anon_df.loc[is_majority, "Score"].value_counts()
print(majority_count)

values = majority_count.values
labels = [f"Kakarla {x}" for x in majority_count.index]

print(values)
print(labels)

# Order the wedges (and their legend entries) by label text.
ordered = sorted(zip(values, labels, strict=False), key=lambda pair: pair[1])
values, labels = zip(*ordered, strict=False)

# One colour per wedge, assigned in label order.
wedge_colors = ["#56A44C", "#FFD143", "#94353F"]

plt.figure(figsize=(4.5, 4.5))
plt.pie(
    x=values,
    colors=wedge_colors,
    autopct="%1.1f%%",
    pctdistance=1.2,  # place the percentage text outside the ring
    startangle=45,
    counterclock=False,
    wedgeprops=dict(width=0.5),  # wedge width < radius leaves a hole in the middle
)
plt.legend(labels, loc="lower right")
plt.title("Majority voting result")
plt.tight_layout()
plt.show()

In [None]:
# Write the majority-vote rows to disk; RaterID is the same for all of them,
# so the column is dropped before export.
consensus_rows = df["RaterID"] == "Majority"
majority_df = df.loc[consensus_rows].drop(columns=["RaterID"])
majority_df.to_csv(Path(r"S:\E_ResearchData\evdplanner\MajorityVoting.csv"), index=False)