In [49]:
# For Data Handling and Preprocessing
import os
import numpy as np
import pandas as pd
import scprep
import scanpy as sc
import scvelo as scv

# For Batch Correction
import harmonypy as hm

# For Clustering
from sklearn.decomposition import PCA
import umap
from hdbscan import HDBSCAN
from hdbscan.flat import (HDBSCAN_flat,
                          approximate_predict_flat,
                          membership_vector_flat,
                          all_points_membership_vectors_flat)
import sklearn.cluster as cluster
from sklearn.metrics import adjusted_rand_score, adjusted_mutual_info_score

# For Plotting
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# Notebook-wide seaborn defaults: white background, notebook text scaling,
# and a 14x10 default figure size.
sns.set(
    context='notebook',
    style='white',
    rc={'figure.figsize': (14, 10)},
)


## 1. Read Data

In [94]:
# Data locations, resolved relative to the current working directory.
path_cwd = os.getcwd()
path_train_data = f"{path_cwd}/ML4G_Project_2_Data/train_data"
path_test_data = f"{path_cwd}/ML4G_Project_2_Data/test_data"

# NOTE(review): the *_Int columns below are factorized separately for the train
# and test frames, so the same Disease/Sample label may receive different
# integer codes in each frame -- confirm nothing downstream compares them.

# ---- training data ----
bulk_train = pd.read_csv(f"{path_train_data}/pancreas_bulk_train.csv", index_col=0)

# Metadata plus integer-coded copies of the categorical label columns.
metadata_train = pd.read_csv(
    f"{path_train_data}/pancreas_sc_metadata_train.csv", index_col=0
).assign(
    Celltype_Int=lambda meta: pd.factorize(meta["Celltype"])[0],
    Disease_Int=lambda meta: pd.factorize(meta["Disease"])[0],
    Sample_Int=lambda meta: pd.factorize(meta["Sample"])[0],
)

sc_train_raw = pd.read_csv(
    f"{path_train_data}/pancreas_sc_train.csv",
    index_col=0,
    header=0,
)

# ---- test data (metadata file comes without cell-type labels) ----
metadata_test = pd.read_csv(
    f"{path_test_data}/pancreas_sc_metadata_test_wocelltype.csv", index_col=0
).assign(
    Disease_Int=lambda meta: pd.factorize(meta["Disease"])[0],
    Sample_Int=lambda meta: pd.factorize(meta["Sample"])[0],
)

sc_test_raw = pd.read_csv(
    f"{path_test_data}/pancreas_sc_test.csv",
    index_col=0,
    header=0,
)

## 2. Preprocess Data

In [100]:
#create preprocessing function
def preprocess(adata, 
               min_counts=1, 
               min_cells=200,
               key_n_counts='n_counts_all', 
               n_top_genes=1000, 
               log_transform=True, 
               harmony=True, 
               harmony_vars=None,
               harmony_max_iter=20,
               copy=False):
    """
    Preprocesses the data using scanpy functions and harmony if specified.

    Steps, in order: gene filtering, per-cell normalization, optional log1p,
    highly-variable-gene selection (genes are subset), optional Harmony
    correction of the expression matrix, a second per-cell normalization,
    and scaling.
    ----------
    adata: AnnData
        AnnData object to be preprocessed. It is modified in place unless
        ``copy=True``.
    min_counts: int
        Forwarded to ``sc.pp.filter_genes`` as ``min_counts``.
    min_cells: int
        Forwarded to ``sc.pp.filter_genes`` as ``min_cells``.
    key_n_counts: str
        Forwarded to the first ``sc.pp.normalize_per_cell`` call as
        ``key_n_counts``.
    n_top_genes: int
        Number of highly variable genes kept by
        ``sc.pp.highly_variable_genes`` (called with ``subset=True``).
    log_transform: bool
        If True, apply ``sc.pp.log1p`` after the first normalization.
    harmony: bool
        If True, run ``hm.run_harmony`` on ``adata.X`` and replace ``adata.X``
        with the corrected matrix.
    harmony_vars: list of str, str or None
        Column name(s) of ``adata.obs`` passed to Harmony as the variables to
        correct for. ``None`` (default) means ``["Sample", "Disease"]``.
    harmony_max_iter: int
        Forwarded to ``hm.run_harmony`` as ``max_iter_harmony``.
    copy: bool
        If True, work on ``adata.copy()`` and leave the input untouched.
        Defaults to False, i.e. the input object is mutated and returned.

    Returns
    -------
    AnnData
        The preprocessed object; the same object as ``adata`` unless
        ``copy=True``.

    Raises
    ------
    KeyError
        If ``harmony`` is True and a name in ``harmony_vars`` is not a column
        of ``adata.obs``. Raised before ``adata`` is modified.
    """

    # Resolve the batch variables into a fresh list: a None default avoids
    # sharing one mutable list object across calls, and a bare string is
    # treated as a single variable name.
    if harmony_vars is None:
        harmony_vars = ["Sample", "Disease"]
    elif isinstance(harmony_vars, str):
        harmony_vars = [harmony_vars]
    else:
        harmony_vars = list(harmony_vars)

    # Fail fast on unknown batch variables, before any filtering below has
    # altered adata.
    if harmony:
        missing = [var for var in harmony_vars if var not in adata.obs.columns]
        if missing:
            raise KeyError(f"harmony_vars not found in adata.obs: {missing}")

    if copy:
        adata = adata.copy()

    #filter genes based on counts and normalize
    sc.pp.filter_genes(adata, min_counts=min_counts)
    sc.pp.filter_genes(adata, min_cells=min_cells)
    sc.pp.normalize_per_cell(adata, key_n_counts=key_n_counts)

    #log transform data
    if log_transform: 
        sc.pp.log1p(adata)

    # Keep only the n_top_genes most variable genes (subset=True drops the rest)
    sc.pp.highly_variable_genes(adata, 
                                flavor='cell_ranger', 
                                n_top_genes=n_top_genes,
                                inplace=True,
                                subset=True)

    #run batch correction
    if harmony:
        print("Running Harmony:")
        # NOTE(review): Harmony receives the expression matrix itself here,
        # not a PCA embedding -- confirm this is intended.
        ho = hm.run_harmony(adata.X, 
                            adata.obs, 
                            harmony_vars, 
                            max_iter_harmony=harmony_max_iter,
                            verbose=False)

        # Format output: Z_corr is transposed back before being stored as X
        adata.X = ho.Z_corr.T

    
    #renormalize after filtering and scale
    # NOTE(review): this second normalization runs on the (possibly
    # log-transformed, Harmony-corrected) values, and normalize_per_cell is
    # deprecated in recent scanpy releases -- confirm both are intended.
    sc.pp.normalize_per_cell(adata) 
    sc.pp.scale(adata)
    
    return adata

In [103]:
# Wrap the raw tables as AnnData objects. The raw frames are transposed, so
# their columns become the observations and are used as obs_names.
sc_train = sc.AnnData(X=sc_train_raw.T, obs=metadata_train)
sc_test = sc.AnnData(X=sc_test_raw.T, obs=metadata_test)
sc_train.obs_names = sc_train_raw.columns
sc_test.obs_names = sc_test_raw.columns

# Same preprocessing settings for both splits: Harmony on, correcting for
# "Disease" only. Train is processed first, then test.
# NOTE(review): preprocess() returns the object it was given, so *_pp and the
# inputs above are presumably the same objects; train and test are also
# filtered independently -- confirm their gene sets are reconciled downstream.
harmony_settings = dict(harmony=True, harmony_vars=["Disease"])
sc_train_pp = preprocess(sc_train, **harmony_settings)
sc_test_pp = preprocess(sc_test, **harmony_settings)

2023-04-25 09:28:17,798 - harmonypy - INFO - Computing initial centroids with sklearn.KMeans...


Running Harmony:


2023-04-25 09:28:18,022 - harmonypy - INFO - sklearn.KMeans initialization complete.
2023-04-25 09:28:19,319 - harmonypy - INFO - Computing initial centroids with sklearn.KMeans...
2023-04-25 09:28:19,504 - harmonypy - INFO - sklearn.KMeans initialization complete.


Running Harmony:
