In [None]:
import scanpy as sc
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
import sklearn.metrics as skm
import colorcet as cc
import sklearn as sk
import sklearn.decomposition as decomp
import sklearn.pipeline as pipe
import sklearn.neighbors as nbr
import sklearn.base as skbase
import sklearn.model_selection as skms
import pickle
import os
import joblib

In [None]:
# Create the output folders used for saved figures and pickled results.
# exist_ok=True keeps the cell safe to re-run.
for out_dir in ("figures", "pickles"):
    os.makedirs(out_dir, exist_ok=True)

In [None]:
# Read the merged AnnData object from disk; the bare name on the last line
# displays its summary repr as the cell output.
merged_data = sc.read_h5ad("data/merged.h5ad")
merged_data

In [None]:
# Violin plots of the `sum`, `detected` and `subsets_mito_percent` fields,
# one panel per field; the figure is saved with the "_preproc.pdf" suffix.
sc.pl.violin(
    merged_data,
    keys=["sum", "detected", "subsets_mito_percent"],
    save="_preproc.pdf",
    multi_panel=True,
)

In [None]:
# Switch seaborn to the white-grid theme (affects this and later figures).
sns.set_style("whitegrid")

# `sum` on the x axis against `detected` on the y axis, with points coloured
# by `subsets_mito_percent` on the viridis colour map.
sc.pl.scatter(
    merged_data,
    x="sum",
    y="detected",
    color_map="viridis",
    color="subsets_mito_percent",
    save="_sum_vs_detected",
)

In [None]:
# Cast the symbol column to plain strings, then replace every entry that
# reads "nan" after the cast with the corresponding index label of `var`.
merged_data.var["gene_symbol"] = merged_data.var["gene_symbol"].astype("str")
symbol_is_nan = merged_data.var["gene_symbol"] == "nan"
merged_data.var.loc[symbol_is_nan, "gene_symbol"] = merged_data.var.index[symbol_is_nan]

In [None]:
# Plot the highest-expressed genes, labelled via the `gene_symbol` column,
# and save the figure under scanpy's default file name.
sc.pl.highest_expr_genes(merged_data, save=True, gene_symbols="gene_symbol")

In [None]:
# Flag the top 2000 highly variable genes with the "seurat_v3" flavour,
# computed on the `spliced` layer.
sc.pp.highly_variable_genes(
    merged_data,
    layer="spliced",
    flavor="seurat_v3",
    n_top_genes=2000,
)

# Plot the selection result and save the figure.
sc.pl.highly_variable_genes(merged_data, save=True)

In [None]:
# Build three derived layers in sequence: "norm" (from "spliced"), "log"
# (from "norm") and "norm_scaled_genes" (from "log"). Every call uses
# copy=True and only the processed layer of the returned object is stored
# back on merged_data, so the input layer of each step is not overwritten.
#
# NOTE(review): because copy=True is used, key_added="norm_factor" is
# presumably written to the temporary copy and discarded with it -- confirm
# whether the per-cell factors are needed on merged_data.
merged_data.layers["norm"] = sc.pp.normalize_total(
    merged_data,
    copy= True,
    exclude_highly_expressed= True,
    key_added= "norm_factor",
    layer= "spliced",
).layers["spliced"]

# log1p of the normalised layer.
merged_data.layers["log"] = sc.pp.log1p(
    merged_data,
    copy= True,
    layer= "norm", 
).layers["norm"]


# Scale the log layer with sc.pp.scale defaults; this is the layer the PCA
# steps further down read from.
merged_data.layers["norm_scaled_genes"] = sc.pp.scale(
    merged_data,
    copy= True,
    layer= "log"
).layers["log"]

In [None]:
class ScPCA(skbase.TransformerMixin, skbase.BaseEstimator):
    """Scikit-learn style wrapper that runs ``sc.pp.pca`` at transform time.

    Parameters
    ----------
    layer : forwarded to ``sc.pp.pca`` as ``layer``.
    n_comps : forwarded to ``sc.pp.pca`` as ``n_comps``.
    mask : forwarded to ``sc.pp.pca`` as ``mask_var``.
    """

    def __init__(self, layer=None, n_comps=None, mask=None):
        # Stored verbatim under the parameter names.
        self.layer = layer
        self.n_comps = n_comps
        self.mask = mask

    def fit(self, X, y=None):
        """Learn nothing; return ``self`` so the step can sit in a pipeline."""
        return self

    def transform(self, X):
        """Return the result of ``sc.pp.pca`` on ``X``, called with ``copy=True``."""
        pca_kwargs = {
            "n_comps": self.n_comps,
            "mask_var": self.mask,
            "layer": self.layer,
            "copy": True,
        }
        return sc.pp.pca(X, **pca_kwargs)

class ScNeighbors(skbase.TransformerMixin, skbase.BaseEstimator):
    """Scikit-learn style wrapper that runs ``sc.pp.neighbors`` at transform time.

    Parameters
    ----------
    n_neighbors : forwarded to ``sc.pp.neighbors`` (default 15).
    n_pcs : forwarded to ``sc.pp.neighbors`` (default ``None``).
    """

    def __init__(self, n_neighbors=15, n_pcs=None):
        self.n_neighbors = n_neighbors
        self.n_pcs = n_pcs

    def fit(self, X, y=None):
        """Learn nothing; return ``self`` so the step can sit in a pipeline."""
        return self

    def transform(self, X):
        """Return the result of ``sc.pp.neighbors`` on ``X``, called with ``copy=True``."""
        neighbor_kwargs = {
            "n_neighbors": self.n_neighbors,
            "n_pcs": self.n_pcs,
            "copy": True,
        }
        return sc.pp.neighbors(X, **neighbor_kwargs)
    
class ScLeiden(skbase.TransformerMixin, skbase.BaseEstimator):
    """Scikit-learn style wrapper that runs ``sc.tl.leiden`` at transform time.

    Parameters
    ----------
    resolution : forwarded to ``sc.tl.leiden`` (default 1).
    """

    def __init__(self, resolution=1):
        self.resolution = resolution

    def fit(self, X, y=None):
        """Learn nothing; return ``self`` so the step can sit in a pipeline."""
        return self

    def transform(self, X):
        """Return the result of ``sc.tl.leiden`` on ``X``.

        The call always uses ``flavor="igraph"`` and ``copy=True``.
        """
        leiden_kwargs = {
            "resolution": self.resolution,
            "flavor": "igraph",
            "copy": True,
        }
        return sc.tl.leiden(X, **leiden_kwargs)

class ScScore(skbase.TransformerMixin, skbase.BaseEstimator):
    """Final pipeline step that scores a clustering by its silhouette.

    The step has no parameters and learns nothing; it only provides
    ``score`` so that a pipeline ending in it can be scored.
    """

    def fit(self, X, y= None):
        """Learn nothing; return ``self``."""
        return self

    def score(self, X, y= None, sample_weight= None):
        """Return the silhouette score of the ``leiden`` labels on the PCA embedding.

        Parameters
        ----------
        X : object exposing ``obsm["X_pca"]`` and ``obs["leiden"]``.
        y, sample_weight : accepted for signature compatibility; ignored.

        Returns
        -------
        The value returned by ``sklearn.metrics.silhouette_score``.
        """
        # The first parameter used to be called `estimator`; as an instance
        # method it always receives the instance, so it is named `self`.
        return skm.silhouette_score(
            X.obsm["X_pca"],
            labels= X.obs["leiden"]
        )

In [None]:
# Chain the scanpy wrappers into one scikit-learn pipeline. make_pipeline
# names each step after its lower-cased class name, which is what the
# "<step>__<param>" keys in param_grid refer to.
pca = ScPCA(layer= "norm_scaled_genes", mask= "highly_variable")
neighbors = ScNeighbors()
scleid = ScLeiden()
scscorer = ScScore()
workflow = pipe.make_pipeline(pca, neighbors, scleid, scscorer)

# Hyper-parameter grid explored by the grid search.
param_grid = {
    "scpca__n_comps": range(25, 35),
    "scneighbors__n_neighbors": range(40, 60),
    "scleiden__resolution": np.linspace(0.1, 2, 10) 
}

# NOTE(review): this cell used to split `p822`, a name no cell in this
# notebook defines, so it raised NameError on a fresh kernel. `merged_data`
# is assumed to be the intended input because it carries the
# "norm_scaled_genes" layer and "highly_variable" mask the pipeline is
# configured for -- confirm.
X_train, X_test = skms.train_test_split(
    merged_data,
    test_size= 0.2,
    random_state= 0,
)

# Shuffled K-fold splitter with a fixed seed for reproducible folds.
kfold = skms.KFold(
    shuffle= True,
    random_state= 0,
)


In [None]:
# Run the pipeline once with its default parameters and show its score.
# NOTE(review): this cell used to pass `p822`, a name no cell in this
# notebook defines (NameError on a fresh kernel). `merged_data` is assumed
# to be the intended dataset -- confirm.
workflow.fit(merged_data)
workflow.score(merged_data)

In [None]:

# The following cell loads the search result from this path, but nothing
# used to write it: a fresh run failed with FileNotFoundError and any fit
# performed here was thrown away. Reuse the cached result when it exists,
# otherwise run the search and persist it. Delete the file to force a new
# search after changing the data, the pipeline or param_grid.
GRID_CACHE = "pickles/gridsearch_1000_spliced"

if os.path.exists(GRID_CACHE):
    # pickle.load can execute arbitrary code: only load a file that this
    # notebook (or another trusted source) produced.
    with open(GRID_CACHE, mode= "br") as f:
        grids = pickle.load(f)
else:
    grids = skms.GridSearchCV(
        workflow,
        param_grid= param_grid,
        cv= kfold,
        return_train_score= True
    )

    with joblib.parallel_backend("loky"):
        grids.fit(X_train)

    with open(GRID_CACHE, mode= "bw") as f:
        pickle.dump(grids, f)


In [None]:
# Load a pickled grid-search object from disk and bind it to `grids`,
# replacing whatever the previous cell assigned to that name.
# SECURITY: pickle.load can execute arbitrary code -- only load files you
# created yourself or otherwise trust.
# NOTE(review): the file name mentions "1000" while the HVG selection above
# uses n_top_genes=2000 -- confirm the cached search matches the current
# settings.
with open("pickles/gridsearch_1000_spliced", mode= "br") as f:
    grids = pickle.load(f)

In [None]:
# Display the parameter combination selected by the grid search.
grids.best_params_

In [None]:
# One stacked panel per tuned parameter: the parameter's value on the x axis
# against the mean test score on the y axis.
fig, axs = plt.subplots(3)
for ax, param in zip(axs, param_grid):
    sns.lineplot(
        x=grids.cv_results_[f"param_{param}"],
        y=grids.cv_results_["mean_test_score"],
        ax=ax,
    )
    ax.set_title(param)
fig.tight_layout()

In [None]:
# Display the best mean cross-validated score found by the grid search.
grids.best_score_

In [None]:
# PCA on the scaled layer, restricted to the `highly_variable` mask, using
# the number of components chosen by the grid search. Runs in place on
# merged_data (no copy requested).
sc.pp.pca(
    merged_data,
    n_comps=grids.best_params_["scpca__n_comps"],
    mask_var="highly_variable",
    layer="norm_scaled_genes",
)

# Variance ratio per component on a log scale.
sc.pl.pca_variance_ratio(merged_data, log=True)

In [None]:
# Four PCA panels (components 0/1, 2/3, 4/5, 6/7) in a two-column grid,
# coloured by `subsets_mito_percent`.
sc.pl.pca(
    merged_data,
    color="subsets_mito_percent",
    dimensions=[(0, 1), (2, 3), (4, 5), (6, 7)],
    ncols=2,
)

In [None]:
# Apply the tuned parameters to the full dataset: neighbour graph, UMAP
# embedding and Leiden clustering, all in place on merged_data.
sc.pp.neighbors(
    merged_data,
    n_neighbors= grids.best_params_["scneighbors__n_neighbors"],
)
sc.tl.umap(
    merged_data,
)
# Use the same Leiden implementation as ScLeiden (flavor="igraph"): the
# resolution was tuned with that flavour, and this call previously fell
# back to scanpy's default flavour instead.
sc.tl.leiden(
    merged_data,
    resolution= grids.best_params_["scleiden__resolution"],
    flavor= "igraph",
)
# Silhouette of the final clustering on the PCA embedding (cell output).
skm.silhouette_score(
    merged_data.obsm["X_pca"],
    labels= merged_data.obs["leiden"]
)

In [None]:
# UMAP in a single column: one panel coloured by CDH1 (looked up through the
# `gene_symbol` column) and one by Leiden cluster. The figure object is kept
# in `fig`.
fig = sc.pl.umap(
    merged_data,
    color=["CDH1", "leiden"],
    palette=cc.glasbey_category10,
    gene_symbols="gene_symbol",
    return_fig=True,
    ncols=1,
)

In [None]:
# Read the whitespace-separated entries of the gene list file into a list.
cdh1_up = []
with open("gene_lists/CDH1_up.txt") as f:
    file_text = f.read()
    cdh1_up = file_text.split()

In [None]:
cdh1_up = pd.Series(cdh1_up)
# Compute membership once, BEFORE filtering: once the list is filtered every
# remaining entry is present by construction, so coverage measured
# afterwards is always 1.0 and says nothing about the original list.
cdh1_up_in_data = cdh1_up.isin(merged_data.var["gene_symbol"])
cdh1_up = cdh1_up[cdh1_up_in_data]
# Cell output: fraction of the original gene list found in `gene_symbol`.
cdh1_up_in_data.mean()

In [None]:
# Fraction of `cdh1_up` entries present in the `gene_symbol` column.
# NOTE(review): the preceding cell already filters `cdh1_up` with this same
# isin() test, so at this point the value is 1.0 by construction (NaN if the
# list is empty). To measure coverage of the original gene list, compute it
# before the filter is applied.
cdh1_up.isin(merged_data.var["gene_symbol"]).mean()

In [None]:
# Dot plot of CDH1 (looked up through `gene_symbol`) per Leiden cluster,
# reading values from the scaled layer, with a dendrogram requested.
sc.pl.dotplot(
    merged_data,
    var_names=["CDH1"],
    groupby="leiden",
    layer="norm_scaled_genes",
    gene_symbols="gene_symbol",
    dendrogram=True,
)