In [None]:
"""
First, read in the data
"""

import pathlib
from scale_morphology.scales import util

# Directory containing the segmented scales.
# Here it is resolved from the "rdsf_mount" entry of the project config, i.e. the
# data is read directly from the RDSF.
parent_dir = (
    pathlib.Path(util.config()["rdsf_mount"])
    .joinpath("Carran/Postgrad/segmentations_cleaned")
    .expanduser()
)
assert parent_dir.exists()

# Path strings for every .tif directly inside parent_dir, in sorted order
segmentation_paths = [str(tif_path) for tif_path in parent_dir.glob("*.tif")]
segmentation_paths.sort()
f"{len(segmentation_paths)} segmentations"

In [None]:
"""
Read in metadata, including image filepaths
"""

import numpy as np

from scale_morphology.scales import metadata

# Build the metadata table from the segmentation paths (already plain strings),
# then discard the "no_scale" column.
# NOTE(review): only the column is removed here - no rows are filtered - so the
# "after dropping empty ones" message below presumably relies on metadata.df
# having already excluded empty segmentations; confirm.
df = metadata.df(list(segmentation_paths)).drop(columns="no_scale")

assert len(df) == 928, "Did the number of scales change?"
print(len(df), "scales after dropping empty ones")
df.head()

In [None]:
"""
If necessary, read in the scales from the RDSF and perform EFA on them.

Otherwise read in the EFA coefficients from a cache
"""

import numpy as np

from scale_morphology.scales import efa


# The EFA coefficients are cached in the working directory so that the scales only
# have to be read and analysed once
coeff_dump = pathlib.Path("efa_coeffs.npy")

if coeff_dump.is_file():
    coeffs = np.load(coeff_dump)
else:
    n_edge_points, order = 300, 50
    coeffs = efa.run_analysis(
        df["path"],
        df["magnification"],
        n_points=n_edge_points,
        order=order,
        n_threads=32,
    )

    np.save(coeff_dump, coeffs)

# Rows of coeffs are later selected with a boolean mask built from df, so the two
# must have the same length. A cache written for a different set of scales would
# otherwise only fail (or misalign) further down the notebook.
assert len(coeffs) == len(df), (
    f"{coeff_dump} holds {len(coeffs)} rows but there are {len(df)} scales - "
    "delete the cache file to recompute the EFA coefficients"
)

In [None]:
"""
Choose which scales to keep, and the criteria to group the scales by
"""

# This tells us we want to keep:
# - scales where sex is NOT ?
# - scales where growth is infinite (i.e. onto)
# - scales where mutation is WT
keep = (df["sex"] != "?") & (df["growth"] == np.inf) & (df["mutation"] == "WT")
filtered_df = df[keep]

# Choose which category to group by when we do the LDA and colour-coded plotting
categories = ["sex", "age"]

# filtered_df already holds only the kept rows, so just select the grouping columns.
# (Re-applying the full-length `keep` mask to the filtered frame only worked because
# pandas realigns a boolean Series indexer on the frame's index.)
grouping_df = filtered_df[categories]

In [None]:
"""
Perform PCA

We won't scale the features before PCA, because we want them to keep their original
magnitudes - if we scale the features to have a standard mean/std, then we will
artificially inflate the importance of the higher harmonics.

I think this is the right thing to do...
"""

from sklearn.decomposition import PCA

# We can choose any number of PCs to extract from our features
# We will want at least enough to describe the variation in the dataset
# But not so many that we also pick up noise
n_components = 10

# Fix the seed: with the default svd_solver="auto", scikit-learn may choose the
# randomized solver depending on the data shape, which would make the components
# differ slightly from run to run
pca = PCA(n_components=n_components, random_state=0)

# Fit on, and project, only the rows selected by the `keep` mask
pca_coeffs = pca.fit_transform(coeffs[keep.to_numpy()])

In [None]:
"""
Plot a heatmap of the PCA coefficients
"""
from scale_morphology.scales import plotting

# pca.components_ has one row per principal component and one column per input
# feature, so this shows how each PC weights the EFA coefficients
# (assumes plotting.heatmap draws its 2-D argument as an image - confirm)
plotting.heatmap(pca.components_)

In [None]:
"""
Pairplot of PCA coefficients
"""

from matplotlib.colors import TABLEAU_COLORS

# One colour per distinct combination of the grouping columns, taken from the
# front of the Tableau palette.
# NOTE: TABLEAU_COLORS has ten entries, so with more than ten groups this slice
# yields fewer colours than groups.
n_groups = len(grouping_df.groupby(list(grouping_df.columns)))
tableau_palette = list(TABLEAU_COLORS.values())
colours = tableau_palette[:n_groups]

plotting.pair_plot(pca_coeffs, grouping_df, colours, axis_label="PCA")

In [None]:
"""
Run LDA
"""

import pandas as pd
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# Collapse each row's category values into a single tuple, then number the distinct
# tuples: `labels` holds one integer class per row and `uniques` holds the tuple
# that each integer stands for
group_keys = grouping_df.apply(lambda row: tuple(row.values), axis=1)
labels, uniques = pd.factorize(group_keys)

# Unlike PCA, we do not pick the number of components here;
# LDA just finds the best (N-1) axes to distinguish our N classes.
# Fewer than N-1 could be requested, but we want to keep all of them.
lda = LinearDiscriminantAnalysis()
lda_coeffs = lda.fit_transform(pca_coeffs, labels)

In [None]:
"""Plot LDA heatmap of axes and pairplot"""
# Reload the plotting helpers so that edits made to the module since it was first
# imported take effect
import importlib
importlib.reload(plotting)

# Compose the PCA and LDA projections so that each LDA axis is expressed as weights
# on the original (pre-PCA) features, one row per LDA axis
lda_axes = (pca.components_.T @ lda.scalings_).T
plotting.heatmap(lda_axes)

plotting.pair_plot(lda_coeffs, grouping_df, colours, axis_label="LDA")

In [None]:
"""
Run k-fold validation to check the stability of the LDA
"""

from sklearn.feature_selection import f_classif
from sklearn.metrics import classification_report
from sklearn.model_selection import StratifiedKFold, cross_val_score, cross_val_predict


# Kept at module level because the effect-size cell iterates over the same splitter
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)


def k_fold_lda(input_coeffs: np.ndarray, grouping_df: pd.DataFrame, cv=None) -> None:
    """
    Cross-validate an LDA classifier on the given data and print the accuracy report

    Prints the mean and standard deviation of the balanced accuracy over the folds,
    followed by a per-class classification report built from the out-of-fold
    predictions.

    :param input_coeffs: feature matrix with one row per row of grouping_df, in the
                         same order
    :param grouping_df: category columns; each distinct combination of values across
                        the columns is treated as one class
    :param cv: cross-validation splitter passed to scikit-learn; if None, a shuffled
               10-fold StratifiedKFold with random_state=0 is used
    """
    if cv is None:
        cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)

    # One integer label per row; `uniques` maps each label back to its category tuple
    labels, uniques = pd.factorize(
        grouping_df.apply(lambda row: tuple(row.values), axis=1)
    )
    lda_ = LinearDiscriminantAnalysis()

    scores = cross_val_score(
        lda_, input_coeffs, labels, cv=cv, scoring="balanced_accuracy"
    )
    print(f"Balanced accuracy: {scores.mean():.3f} ± {scores.std():.3f}")

    preds = cross_val_predict(lda_, input_coeffs, labels, cv=cv)
    print(classification_report(labels, preds, target_names=[str(u) for u in uniques]))


# Validate on the PCA coefficients rather than on lda_coeffs: lda_coeffs came from an
# LDA fitted with the labels of every sample, so cross-validating on them would leak
# the held-out labels into the features and give over-optimistic scores
k_fold_lda(pca_coeffs, grouping_df, cv=cv)

In [None]:
"""
Plot LDA effect size - how much of the variation along each LDA axis is explained by the groups
"""

import matplotlib.pyplot as plt

# For every fold: fit an LDA on the training part, project the held-out part and
# measure, per LDA axis, how strongly the held-out projections separate the classes.
# The folds are drawn from the PCA coefficients rather than from lda_coeffs, because
# lda_coeffs were produced by an LDA that had already seen every sample's label.
per_fold_eta_sq = []
for train_idx, test_idx in cv.split(pca_coeffs, labels):
    lda_fold = LinearDiscriminantAnalysis().fit(
        pca_coeffs[train_idx], labels[train_idx]
    )
    ld_test = lda_fold.transform(pca_coeffs[test_idx])
    f_vals, _ = f_classif(ld_test, labels[test_idx])

    # Convert the one-way ANOVA F statistic to eta squared:
    # eta^2 = SS_between / (SS_between + SS_within)
    #       = (df_between * F) / (df_between * F + df_within)
    n_test_classes = len(np.unique(labels[test_idx]))
    df_between = n_test_classes - 1
    df_within = len(test_idx) - n_test_classes
    eta_sq = (df_between * f_vals) / (df_between * f_vals + df_within)
    per_fold_eta_sq.append(eta_sq)

# Shape (n_folds, n_lda_axes)
fold_eta_sq = np.stack(per_fold_eta_sq)

# Bar per LDA axis: mean effect size over the folds, error bar = std over the folds
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(
    np.arange(fold_eta_sq.shape[1]),
    fold_eta_sq.mean(axis=0),
    yerr=fold_eta_sq.std(axis=0),
    capsize=5,
    alpha=0.7,
)
ax.set_xlabel("LDA Axis")
ax.set_ylabel(r"Effect Size ($\eta^2$)")
ax.set_title("LDA Effect Size by Axis")
ax.set_xticks(np.arange(fold_eta_sq.shape[1]))
fig.tight_layout()