In [None]:
import scanpy as sc
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
import sklearn.metrics as skm
import colorcet as cc
import sklearn as sk
import sklearn.decomposition as decomp
import sklearn.pipeline as pipe
import sklearn.neighbors as nbr
import sklearn.base as skbase
import sklearn.model_selection as skms
import pickle
import os
import joblib

In [None]:
# Make sure the output folders used by later cells exist; an already
# existing folder is not an error.
for _output_dir in ("figures", "pickles"):
    os.makedirs(_output_dir, exist_ok=True)

In [None]:
# Load the filtered RNA object for library SCPCL000822 and show its summary
# as the cell output.
p822 = sc.read_h5ad(
    "Alex_Lemonade_portal/SCPCS000490/SCPCL000822_filtered_rna.h5ad"
)
p822

In [None]:
# Per-cell QC distributions, one panel per metric; the figure is also written
# to disk with the "_preproc.pdf" suffix.
sc.pl.violin(
    p822,
    keys=["sum", "detected", "subsets_mito_percent"],
    multi_panel=True,
    save="_preproc.pdf",
)

In [None]:
# Switch seaborn to the white-grid theme before drawing.
sns.set_style("whitegrid")

# "sum" against "detected" for every cell, coloured by mitochondrial
# percentage; the figure is saved with the "_sum_vs_detected" suffix.
sc.pl.scatter(
    p822,
    x="sum",
    y="detected",
    color="subsets_mito_percent",
    color_map="viridis",
    save="_sum_vs_detected",
)

In [None]:
# Cast gene symbols to plain strings; entries that come out as the literal
# text "nan" are then replaced by the corresponding var index label so every
# gene has a usable symbol.
p822.var["gene_symbol"] = p822.var["gene_symbol"].astype("str")
_no_symbol = (p822.var["gene_symbol"] == "nan").to_numpy()
p822.var.loc[_no_symbol, "gene_symbol"] = p822.var.index[_no_symbol]

In [None]:
# Plot the most highly expressed genes, labelled by the "gene_symbol" column,
# and save the figure under scanpy's default name.
sc.pl.highest_expr_genes(p822, gene_symbols="gene_symbol", save=True)

In [None]:
# Drop MALAT1 and materialise the subset. Slicing an AnnData yields a view;
# the layer assignments below would otherwise force an implicit
# view-to-actual conversion (and the accompanying warning), so the copy is
# made explicit here.
p822 = p822[:, p822.var["gene_symbol"] != "MALAT1"].copy()

# Library-size normalisation of the "spliced" layer. normalize_total is run on
# a temporary copy and only the normalised layer is kept, stored as "norm".
# NOTE(review): "norm_factor" is presumably written to the temporary copy only
# and therefore not available on p822 afterwards -- confirm if it is needed.
p822.layers["norm"] = sc.pp.normalize_total(
    p822,
    copy= True,
    exclude_highly_expressed= True,
    key_added= "norm_factor",
    layer= "spliced",
).layers["spliced"]

# log1p of the normalised layer, stored as "log".
# NOTE(review): no later cell in the visible notebook reads the "log" layer.
p822.layers["log"] = sc.pp.log1p(
    p822,
    copy= True,
    layer= "norm",
).layers["norm"]

# Flag the top 2000 highly variable genes (seurat_v3 flavour) and plot them.
sc.pp.highly_variable_genes(
    p822,
    n_top_genes= 2000,
    flavor= "seurat_v3"
)

sc.pl.highly_variable_genes(
    p822,
    save= True
)

# Per-gene scaling of the "norm" layer, stored as "norm_scaled_genes"; this is
# the matrix fed to PCA and to the grid-search pipeline later on.
# NOTE(review): scaling is applied to "norm", not to "log" -- confirm that
# skipping the log transform here is intended.
p822.layers["norm_scaled_genes"] = sc.pp.scale(
    p822,
    copy= True,
    layer= "norm"
).layers["norm"]

In [None]:
class DualNeighbors(skbase.BaseEstimator, skbase.TransformerMixin):
    """Transformer producing a k-nearest-neighbour graph in two modes at once.

    Two ``sklearn.neighbors.KNeighborsTransformer`` objects are configured with
    the same hyper-parameters: ``dist`` uses the transformer's default mode and
    ``connect`` uses ``mode="connectivity"``. ``transform`` returns both graphs
    as a tuple so a downstream step can cluster on one and score on the other.

    The wrapped transformers are created in ``fit`` rather than ``__init__``.
    ``BaseEstimator.set_params`` only reassigns the attributes named in the
    constructor signature, so transformers built in ``__init__`` would keep the
    construction-time values and ignore anything set later (for example by
    ``GridSearchCV``, which clones the estimator and then calls
    ``set_params``).

    Parameters
    ----------
    n_neighbors, algorithm, leaf_size, metric, p, metric_params, n_jobs
        Forwarded unchanged to both ``KNeighborsTransformer`` instances.

    Attributes
    ----------
    dist, connect
        The two fitted ``KNeighborsTransformer`` objects; available after
        ``fit`` has been called.
    """

    def __init__(self, n_neighbors = 5, algorithm = "auto", leaf_size = 30, metric = "minkowski", p = 2, metric_params = None, n_jobs = None):
        # Only store hyper-parameters here so get_params / set_params / clone
        # behave as scikit-learn expects.
        self.n_neighbors = n_neighbors
        self.algorithm = algorithm
        self.leaf_size = leaf_size
        self.metric = metric
        self.p = p
        self.metric_params = metric_params
        self.n_jobs = n_jobs

    def _make_transformer(self, **extra):
        """Build a KNeighborsTransformer from the current hyper-parameters."""
        # Local import kept from the original implementation so the class does
        # not rely on a module-level alias being present where it is executed.
        import sklearn.neighbors as nbr
        return nbr.KNeighborsTransformer(
            n_neighbors = self.n_neighbors,
            algorithm = self.algorithm,
            leaf_size = self.leaf_size,
            metric = self.metric,
            p = self.p,
            metric_params = self.metric_params,
            n_jobs = self.n_jobs,
            **extra,
        )

    def fit(self, X, y= None):
        """Create both neighbour transformers from the current parameters and fit them on ``X``.

        ``y`` is accepted for pipeline compatibility and ignored. Returns ``self``.
        """
        self.dist = self._make_transformer()
        self.connect = self._make_transformer(mode= "connectivity")
        self.dist.fit(X)
        self.connect.fit(X)
        return self

    def transform(self, X):
        """Return ``(dist graph, connectivity graph)`` for ``X`` as a 2-tuple."""
        return (self.dist.transform(X), self.connect.transform(X))

class LeidenScorer(skbase.ClusterMixin, skbase.BaseEstimator):
    """Leiden clustering step that scores itself with a silhouette coefficient.

    ``fit`` expects ``X`` to be indexable with ``X[0]`` (stored as
    ``self.distance`` and later passed to the silhouette score) and ``X[1]``
    (an adjacency matrix that is converted to an igraph graph and clustered).

    Parameters
    ----------
    resolution
        Passed as ``resolution`` to igraph's ``community_leiden``.

    Attributes (set by ``fit``)
    ---------------------------
    membership : cluster id per node as returned by igraph.
    distance   : ``X[0]`` from the fit input.
    labels_    : ``membership`` as a NumPy array.
    """

    def __init__(self, resolution= 1):
        self.resolution = resolution

    def fit(self, X, y= None):
        """Cluster the graph in ``X[1]`` with Leiden (modularity objective); returns ``self``."""
        import igraph

        # Build the igraph graph from the adjacency matrix, then partition it.
        graph = sc._utils.get_igraph_from_adjacency(X[1])
        partition = igraph.Graph.community_leiden(
            graph,
            resolution= self.resolution,
            objective_function= "modularity",
        )

        self.membership = partition.membership
        self.distance = X[0]
        self.labels_ = np.array(self.membership)
        return self

    def score(self, X, y= None, sample_weight= None):
        """Silhouette score of the fitted labels on the matrix stored by ``fit``.

        NOTE(review): ``X``, ``y`` and ``sample_weight`` are not used, so the
        value always reflects the data seen in ``fit``, also when called on a
        held-out fold. ``silhouette_score`` is called without ``metric=``, so
        its default metric is applied to ``self.distance`` -- confirm both are
        intended.
        """
        silhouette = skm.silhouette_score(self.distance, self.labels_)
        return silhouette
    

In [None]:
# Pipeline: PCA -> kNN graphs (distance + connectivity) -> Leiden clustering
# scored by silhouette. PCA is seeded like the split and the folds below so
# that a randomized SVD solver cannot make repeated runs differ.
pca = decomp.PCA(random_state= 0)
neighbors = DualNeighbors(n_jobs= -1)
lscore = LeidenScorer()
workflow = pipe.make_pipeline(pca, neighbors, lscore)

# Hyper-parameter grid; the step prefixes are the lower-cased class names
# assigned by make_pipeline. 10 x 20 x 10 = 2000 combinations.
param_grid = {
    "pca__n_components": range(5, 15),
    "dualneighbors__n_neighbors": range(20, 40),
    "leidenscorer__resolution": np.linspace(0.1, 2, 10)
}

# Hold out 20% of the cells (rows of the scaled layer).
# NOTE(review): X_test is not used by any later cell in the visible notebook.
X_train, X_test = skms.train_test_split(
    p822.layers["norm_scaled_genes"],
    test_size= 0.2,
    random_state= 0,
)

# Shuffled K-fold splitter (default number of folds) with a fixed seed.
kfold = skms.KFold(
    shuffle= True,
    random_state= 0,
)


In [None]:
# Smoke test with the default hyper-parameters: fit the whole pipeline on the
# training cells and display its score on the same data.
workflow.fit(X_train).score(X_train)

In [None]:

# Exhaustive search over param_grid using the pipeline's own score method,
# cross-validated with the seeded K-fold splitter; train scores are kept too.
grids = skms.GridSearchCV(
    estimator=workflow,
    param_grid=param_grid,
    cv=kfold,
    return_train_score=True,
)

# Run the search inside joblib's "loky" backend context.
with joblib.parallel_backend("loky"):
    grids.fit(X_train)


In [None]:
# Persist the fitted grid-search object so the search need not be repeated.
with open("pickles/gridsearch", "wb") as pickle_file:
    pickle.dump(grids, pickle_file)

In [None]:
# Mean cross-validated test score against the number of PCA components tried.
sns.lineplot(
    x=grids.cv_results_["param_pca__n_components"],
    y=grids.cv_results_["mean_test_score"],
)

In [None]:
# Show the best-ranked parameter combinations as a table instead of dumping
# the raw cv_results_ dict, which holds one array entry per grid point.
pd.DataFrame(grids.cv_results_).sort_values("rank_test_score").head(10)

In [None]:
# PCA on the gene-scaled layer, keeping 30 components (all genes are used).
sc.pp.pca(p822, n_comps=30, layer="norm_scaled_genes")

# Variance explained per component, on a log scale.
sc.pl.pca_variance_ratio(p822, log=True)

In [None]:
# Scatter consecutive component pairs (0,1), (2,3), (4,5), (6,7) in a
# two-column grid, coloured by mitochondrial percentage.
sc.pl.pca(
    p822,
    color="subsets_mito_percent",
    dimensions=[(first, first + 1) for first in range(0, 8, 2)],
    ncols=2,
)

In [None]:
# Neighbour graph on the first 20 PCs with 50 neighbours, then UMAP embedding
# and Leiden clustering at resolution 2.
sc.pp.neighbors(p822, n_neighbors=50, n_pcs=20)
sc.tl.umap(p822)
sc.tl.leiden(p822, resolution=2)

# Silhouette of the Leiden labels, treating obsp["distances"] as a
# precomputed distance matrix.
# NOTE(review): obsp["distances"] is presumably a sparse neighbour graph, so
# pairs that are not stored would count as zero distance -- confirm the score
# is meaningful.
skm.silhouette_score(
    p822.obsp["distances"],
    p822.obs["leiden"],
    metric="precomputed",
)

In [None]:
# UMAP coloured by CDH1 expression (looked up through "gene_symbol") and by
# Leiden cluster; viridis for the continuous panel, glasbey for the clusters.
sc.pl.umap(
    p822,
    color=["CDH1", "leiden"],
    gene_symbols="gene_symbol",
    cmap="viridis",
    palette=cc.glasbey_category10,
)

In [None]:
# Display the pairwise "distances" matrix stored in p822.obsp (presumably
# written by the sc.pp.neighbors call in an earlier cell).
p822.obsp["distances"]

In [None]:
# Silhouette of the Leiden labels, treating obsp["distances"] as a
# precomputed distance matrix.
# NOTE(review): this cell repeats the computation at the end of the
# neighbours / UMAP / Leiden cell.
skm.silhouette_score(
    p822.obsp["distances"],
    p822.obs["leiden"],
    metric="precomputed",
)