# CITE-seq inputs: scanpy preprocessing and ivis embedding

## Setup

In [None]:
import sys

# Add ../working to the module search path; presumably this is what lets the
# `src.*` imports further down resolve — TODO confirm.
sys.path.append("../working")

In [None]:
import logging

# Configure the root logger: INFO threshold, records rendered as
# "timestamp [LEVEL] [module] message".
logging.basicConfig(
    # filename=__file__.replace('.py', '.log'),
    format="%(asctime)s [%(levelname)s] [%(module)s] %(message)s",
    level=logging.INFO,
)

# Logger named after the current module, used for messages from this notebook.
log = logging.getLogger(__name__)

In [None]:
import glob
import os
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as stats
import seaborn as sns
from omegaconf import OmegaConf
from progressbar import progressbar
from src.get_score import get_score
from src.load_data import LoadData, PostprocessData, PreprocessData
from src.utils import df_stats

# from src.make_dataset import BaseDataset, get_transforms
# from src.make_model import ImageBaseModel
# from torch.utils.data import DataLoader

# pd.set_option("display.max_rows", None)
# pd.set_option("display.max_columns", None)
# pd.set_option("display.max_colwidth", None)

In [None]:
# Competition specific library
import math

import scanpy as sc
import scipy.stats as stats
import umap
from anndata import AnnData
from ivis import Ivis
from sklearn.preprocessing import StandardScaler

In [None]:
# Load the project configuration from main.yaml; `c` is passed to the data
# classes constructed in the following cells.
c = OmegaConf.load("../working/config/main.yaml")

In [None]:
# Build the project's PreprocessData object from the config (do_preprocess=False;
# see src.load_data for what the flag controls).
# NOTE(review): the name `input` shadows the Python builtin for the rest of the
# session. Renaming it also requires updating the later cells that read
# `input.train_cite_inputs`.
input = PreprocessData(c, do_preprocess=False)

In [None]:
# Rebinds `input` to a LoadData object, replacing whatever the previous cell assigned.
# NOTE(review): the three `input = ...` cells look like alternatives — confirm
# which one must run last before `input.train_cite_inputs` is read.
input = LoadData(c, do_preprocess=False, use_fold=True)

In [None]:
# Rebinds `input` again, this time to a PostprocessData object; when the cells
# are run top to bottom this is the object the later cells see.
input = PostprocessData(c)

In [None]:
# List the attribute names available on the loaded data object.
dir(input)

## Basic Info

In [None]:
# Placeholder: `...` is the Ellipsis object, not a DataFrame, so the
# df.info() / df_stats(df) / df.head() cells below raise until a real table
# is assigned here.
df = ...

In [None]:
# Print a summary of df (requires df to hold a real DataFrame, not the `...` placeholder).
df.info()

In [None]:
# Project helper from src.utils; what it reports is not visible from this notebook.
df_stats(df)

In [None]:
# Show the first rows of df.
df.head()

In [None]:
# Wrap `input.train_cite_inputs` in an AnnData container: its values become X,
# its columns label the variables and its index labels the observations.
adata = AnnData(X=input.train_cite_inputs.to_numpy())
adata.var_names = input.train_cite_inputs.columns
adata.obs_names = input.train_cite_inputs.index
# Bare expression so the notebook displays the object's summary.
adata

In [None]:
# Normalize every cell to a total of 1e6, then apply log1p; both calls modify
# adata rather than returning a new object.
sc.pp.normalize_total(adata, target_sum=1e6)
sc.pp.log1p(adata)

# Drop genes that do not meet scanpy's min_cells=300 threshold.
sc.pp.filter_genes(adata, min_cells=300)

In [None]:
# Flag variables whose name contains "_MT-" (presumably mitochondrial genes —
# confirm against the naming scheme of var_names).
adata.var["mt"] = adata.var_names.str.contains("_MT-")

# Compute QC metrics, including the "mt" subset, and store them on adata
# (inplace=True).
# NOTE(review): adata.X was already normalized and log1p-transformed in an
# earlier cell, so these metrics are not computed on raw counts — confirm
# that is intended.
sc.pp.calculate_qc_metrics(
    adata,
    qc_vars=["mt"],
    percent_top=None,
    log1p=False,
    inplace=True,
)

In [None]:
# Normalize every cell to a total of 1e4 and apply log1p.
# NOTE(review): an earlier cell already ran normalize_total(target_sum=1e6)
# followed by log1p on this same adata, so this is a second normalization and
# a second log transform of already-logged values — confirm that is intended.
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)

In [None]:
# Annotate highly variable genes in adata.var["highly_variable"], then keep
# only those genes.
sc.pp.highly_variable_genes(adata)
# .copy() materializes the subset. Plain slicing returns a view onto the parent
# AnnData, and the in-place steps that follow (scaling, PCA) would otherwise
# have to turn that view into a real object implicitly, with a warning.
adata = adata[:, adata.var["highly_variable"]].copy()

In [None]:
# Scale each gene with scanpy's defaults and clip the result at 10.
sc.pp.scale(adata, max_value=10)

# Run PCA on adata using the ARPACK SVD solver.
sc.tl.pca(adata, svd_solver="arpack")

In [None]:
# Per-observation PCA coordinates (presumably written to obsm["X_pca"] by the
# sc.tl.pca call in the previous cell); used as input to ivis below.
X = adata.obsm["X_pca"]

In [None]:
# Train an ivis model on the PCA coordinates and project those same points
# through it in one step; `ivis` keeps the fitted model.
ivis = Ivis(k=15, model="maaten", n_epochs_without_progress=5)
embeddings = ivis.fit_transform(X)

In [None]:
# Summarize the per-variable annotation table.
adata.var.info()

In [None]:
# Names of all variables containing the literal substring "CST".
adata.var_names[adata.var_names.str.contains("CST", regex=False)].tolist()

In [None]:
# 1-D vector of CST3 values, one per observation, used to color the scatter plot.
# obs_vector looks the gene up by name and raises KeyError if it is absent
# (for example, dropped by the highly-variable-gene filter), instead of failing
# later with an opaque reshape error. It also removes the dependence on the
# unrelated global X for the output length.
fill = adata.obs_vector("ENSG00000101439_CST3")

In [None]:
# Sanity check: expect a 1-D shape with one entry per observation.
fill.shape

In [None]:
# Summarize the per-observation annotation table.
adata.obs.info()

In [None]:
# Sanity check on the embedding; the plot below reads columns 0 and 1.
embeddings.shape

In [None]:
# Scatter the first two ivis dimensions, coloring each point by its CST3 value.
plt.figure(figsize=(6, 4), dpi=150)
# Do not name this handle `sc`: that would overwrite the `scanpy as sc` alias
# and break every `sc.pp` / `sc.tl` cell when the notebook is re-run.
points = plt.scatter(x=embeddings[:, 0], y=embeddings[:, 1], c=fill, s=5)
plt.xlabel("ivis 1")
plt.ylabel("ivis 2")
plt.title("CST3")
plt.colorbar(points)