### Generate reference wt spline for transcriptional latent space

In [1]:
from dev.seq.hooke_latent_projections.project_ccs_data import *
import pandas as pd
import numpy as np
import os

In [2]:
# Project root on the current machine. The commented line is an alternate
# location of the same tree (presumably another workstation); swap as needed.
# root = "/media/nick/hdd02/Cole Trapnell's Lab Dropbox/Nick Lammers/Nick/morphseq/"
root = "/Users/nick/Cole Trapnell's Lab Dropbox/Nick Lammers/Nick/morphseq/"
model_name = "bead_expt_linear"

# Destination for tabular results (trailing "" keeps the path separator).
out_path = os.path.join(root, "results", "20240303", "")

# Destination for figures; note this lives outside of `root`.
fig_path = "/Users/nick/Cole Trapnell's Lab Dropbox/Nick Lammers/Nick/slides/morphseq/20250312/morphseq_cca/"

# Create both output folders up front; existing folders are left untouched.
for output_dir in (out_path, fig_path):
    os.makedirs(output_dir, exist_ok=True)

### Load hooke embeddings and metadata

In [3]:
# Load the Hooke latent projections, the embryo metadata and the stage
# predictions, and leave `seq_df` and `meta_df` row-aligned by sample.

# Build every input path from `root` so that pointing `root` at another
# machine moves the latent projections along with the metadata.
hooke_model_name = "bead_expt_linear"
latent_path = os.path.join(root, "seq_data", "emb_projections", "latent_projections", "")
hooke_model_path = os.path.join(latent_path, hooke_model_name, "")

# hooke latent encodings (sample ID is read in as the index)
seq_df = pd.read_csv(os.path.join(hooke_model_path, "latent_projections.csv"), index_col=0)
seq_df["sample"] = seq_df.index

# load metadata, keeping only samples that have a latent encoding
meta_df = pd.read_csv(os.path.join(root, "metadata", "seq_embryo_df.csv"), index_col=0)
meta_df = seq_df[["sample"]].merge(meta_df, how="inner", on="sample").reset_index(drop=True)

# stage predictions, restricted to samples that have a latent encoding
time_df = pd.read_csv(os.path.join(hooke_model_path, "time_predictions.csv"), index_col=0)
time_df = seq_df[["sample"]].merge(time_df, how="inner", left_on="sample", right_index=True)

# attach the predicted stage to the metadata
meta_df = meta_df.merge(time_df.loc[:, ["pseudostage"]], left_on="sample", right_index=True)

# The merges above are inner joins, so a sample missing from the metadata or
# the time predictions disappears from `meta_df` but not from `seq_df`.
# Downstream cells apply boolean masks built from `meta_df` positionally to
# `seq_df`, so select the surviving samples, in `meta_df` order, to keep the
# two frames row-for-row aligned. When nothing was dropped this is a no-op.
seq_df = seq_df.drop(labels=["sample"], axis=1)
seq_df = seq_df.loc[meta_df["sample"].to_numpy()]

FileNotFoundError: [Errno 2] No such file or directory: "/Users/nick/Cole Trapnell's Lab Dropbox/Nick Lammers/Nick/morphseq/metadata/seq_embryo_df.csv"

### Drop Gene 3 and filter for time of interest

In [None]:
# Stage window of interest (compared against `pseudostage`) and a time grid
# padded by 5 on either side of it.
n_points = 250
t_start = 12.5
t_stop = 50
t_vec = np.linspace(t_start - 5, t_stop + 5, num=n_points)

# Row-wise boolean filters, converted to plain arrays so they can be applied
# positionally to both `meta_df` and `seq_df`.
pseudostage = meta_df["pseudostage"]
expt = meta_df["expt"]
stage_filter = pseudostage.between(t_start, t_stop).to_numpy()  # inclusive on both ends
expt_filter = expt.ne("GENE3").to_numpy()
hf_filter = expt.eq("hotfish2").to_numpy()

# Rows inside the stage window that are not from the GENE3 experiment,
# split into hotfish2 ("hf") and everything else ("ref").
in_scope = stage_filter & expt_filter
ref_rows = in_scope & ~hf_filter
hf_rows = in_scope & hf_filter

ref_meta_df, ref_seq_df = meta_df.loc[ref_rows, :], seq_df[ref_rows]
hf_meta_df, hf_seq_df = meta_df.loc[hf_rows, :], seq_df[hf_rows]

seq_df.head()

### Fit PCA

In [None]:
from sklearn.decomposition import PCA

# Number of principal components to keep, and the matching column labels.
n_components = 100
pca_cols = [f"PCA_{n:02}" for n in range(n_components)]

# Fit one PCA on the reference and hotfish samples together so that both
# groups are expressed in the same component basis.
# NOTE(review): no random_state is set; depending on which SVD solver sklearn
# selects, repeated fits may not be bit-identical — confirm if that matters.
seq_pca = PCA(n_components=n_components)
seq_pca.fit(pd.concat([ref_seq_df, hf_seq_df]))

# Project each group separately.
ref_pca_array, hf_pca_array = (
    seq_pca.transform(frame) for frame in (ref_seq_df, hf_seq_df)
)

# Wrap the projections in data frames that keep the original row labels.
ref_pca_df = pd.DataFrame(ref_pca_array, index=ref_seq_df.index, columns=pca_cols)
hf_pca_df = pd.DataFrame(hf_pca_array, index=hf_seq_df.index, columns=pca_cols)
hf_pca_df.head()

In [None]:
# Display (n_rows, n_columns) of the reference latent-encoding frame.
ref_seq_df.shape

In [None]:
# Cumulative fraction of variance captured by the first k components.
var_cumulative = np.cumsum(seq_pca.explained_variance_ratio_)
pc_index = np.arange(n_components)  # zero-based component number

fig = px.line(x=pc_index, y=var_cumulative, markers=True)

# One font setting applied to the whole figure; change `size` to rescale text.
global_font = dict(family="Arial, sans-serif", size=18, color="black")
fig.update_layout(
    title="PCA decomposition of Hooke latent space",
    xaxis=dict(title="PC number"),
    yaxis=dict(title="total variance explained"),
    font=global_font,
)

fig.show()

# Save a static copy alongside the other slide figures.
fig.write_image(os.path.join(fig_path, "seq_pca_var_explained.png"))

### Fit reference spline to data

In [None]:
from src.functions.spline_fitting_v2 import spline_fit_wrapper

n_boots = 50
n_spline_points = 2500
norm_factor = 10

# Fit on a copy whose PCA coordinates are divided by `norm_factor`; the
# predicted stage is attached positionally from the matching metadata rows.
ref_pca_df_fit = ref_pca_df.copy()
ref_pca_df_fit[pca_cols] = ref_pca_df_fit[pca_cols].div(norm_factor)
ref_pca_df_fit["predicted_stage_hpf"] = ref_meta_df["pseudostage"].to_numpy()

# Settings handed to the project's spline fitter.
spline_kwargs = dict(
    fit_cols=pca_cols,
    n_boots=n_boots,
    n_spline_points=n_spline_points,
    bandwidth=0.5,
    angle_penalty_exp=0.05,
    time_window=5,
)
spline_df = spline_fit_wrapper(ref_pca_df_fit, **spline_kwargs)

# Undo the rescaling.
# NOTE(review): this multiplies every column of the returned frame; if
# spline_fit_wrapper returns anything besides the PCA columns, those get
# scaled too — confirm against the wrapper's output.
spline_df = spline_df * norm_factor

In [None]:
# Three consecutive components to display (zero-based indices 2, 3, 4).
plot_dims = np.arange(2, 5)
plot_strings = [pca_cols[p] for p in plot_dims]
x_col, y_col, z_col = plot_strings

# Hotfish samples coloured by temperature. To view the reference samples
# instead, pass ref_pca_df and ref_meta_df["temp"] here.
fig = px.scatter_3d(
    hf_pca_df,
    x=x_col,
    y=y_col,
    z=z_col,
    opacity=1,
    color=hf_meta_df["temp"],
)

# Style the scatter markers before the spline trace is added, so that only
# the sample points are affected.
fig.update_traces(marker=dict(size=5, showscale=False))

# Axes scaled in proportion to the data ranges; 'cube' would force equal sides.
fig.update_layout(scene=dict(aspectmode='data'))

# Overlay the fitted reference spline as a line.
reference_curve = go.Scatter3d(
    x=spline_df.loc[:, x_col],
    y=spline_df.loc[:, y_col],
    z=spline_df.loc[:, z_col],
    mode="lines",
    line=dict(color="darkblue", width=4),
    name="reference curve",
)
fig.add_traces(reference_curve)
fig.show()

In [None]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline

# Polynomial regression from PCA coordinates to predicted stage: expand the
# inputs into polynomial features, then fit an ordinary linear model on them.
degree = 2  # polynomial degree of the feature expansion
model = Pipeline([
    ('poly', PolynomialFeatures(degree=degree, include_bias=True)),
    ('linear', LinearRegression())
])

frac_to_fit = 0.95  # fraction of reference samples used for training
split_seed = 0      # fixed seed so the train/test split is reproducible

# X is (n_samples x n_components); y is the matching (n_samples,) stage vector.
X = ref_pca_df[pca_cols].values
y = ref_pca_df_fit["predicted_stage_hpf"].values

# Shuffle the row positions once, then cut the shuffled order into two
# disjoint pieces: the first n_train rows train the model, the rest test it.
n_train = int(np.floor(frac_to_fit * X.shape[0]))
X_indices = np.arange(X.shape[0])
shuffled = np.random.default_rng(split_seed).permutation(X_indices)
train_indices = shuffled[:n_train]
test_indices = np.sort(shuffled[n_train:])

X_train, y_train = X[train_indices], y[train_indices]
X_test, y_test = X[test_indices], y[test_indices]

model.fit(X_train, y_train)

# Evaluate on the held-out rows so the quality of the fit is visible.
y_pd = model.predict(X_test)
test_mae = float(np.mean(np.abs(y_pd - y_test)))
print(f"held-out stage MAE: {test_mae:.2f} ({test_indices.size} test samples)")

### Use surface to generate stage estimates 

In [None]:
# Evaluate the stage regression at every point along the reference spline
# and store the result next to the spline coordinates.
X_spline = spline_df.loc[:, pca_cols].to_numpy()
spline_df["mdl_stage_hpf"] = model.predict(X_spline)

In [None]:
import warnings

from scipy.interpolate import interp1d

# Re-sample the reference spline onto an evenly spaced stage grid, using the
# model-predicted stage of each spline point as the interpolation coordinate.

# model-predicted stage at every spline point
t_vec_orig = spline_df["mdl_stage_hpf"].to_numpy()

# interp1d raises ValueError for query points outside the range of its x
# values, and the predicted stages are not guaranteed to span the whole
# [t_start, t_stop] window. Clamp the grid to the covered range and warn,
# rather than failing; when the window is fully covered nothing changes.
t_lo = max(t_start, float(t_vec_orig.min()))
t_hi = min(t_stop, float(t_vec_orig.max()))
if t_lo > t_start or t_hi < t_stop:
    warnings.warn(
        f"spline stages only cover [{t_lo:.2f}, {t_hi:.2f}]; "
        f"requested [{t_start}, {t_stop}] was clipped"
    )
t_vec = np.linspace(t_lo, t_hi, n_points)

# get new PCA values
# NOTE(review): interp1d sorts by x internally, so if the predicted stage is
# not monotonic along the spline the point order is rearranged — confirm.
interp = interp1d(t_vec_orig, spline_df[pca_cols].values, axis=0)
pca_array_interp = interp(t_vec)

# Assemble the re-gridded spline with its stage column.
spline_df_new = pd.DataFrame(pca_array_interp, columns=pca_cols)
spline_df_new["stage_hpf"] = t_vec

spline_df_new.head()

In [None]:
# Show the re-gridded spline in the same three components, coloured by stage.
x_col, y_col, z_col = plot_strings
fig = px.scatter_3d(
    spline_df_new,
    x=x_col,
    y=y_col,
    z=z_col,
    color="stage_hpf",
    opacity=1,
)

fig.show()

In [None]:
# Write the PCA projections and the re-gridded reference spline to `out_path`,
# in the order listed here.
csv_outputs = (
    ("hf_seq_df.csv", hf_pca_df),
    ("wt_ref_seq_df.csv", ref_pca_df),
    ("spline_seq_df.csv", spline_df_new),
)
for file_name, frame in csv_outputs:
    frame.to_csv(os.path.join(out_path, file_name))