# scRNA-seq data from [MacParland et al.](https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE115469) (GEO accession GSE115469)

In [1]:
# Import packages
import scanpy as sc
import pandas as pd
import numpy as np
from functions import data_preprocessing as dp

In [2]:
# Read the expression matrix from CSV; the first CSV column becomes the row index.
path = '../../../../data/raw/data_for_evaluating_cell_type_annotation/MacParland/'
file = 'GSE115469_Data.csv'

df = pd.read_csv(
    f'{path}{file}',
    sep=',',
    index_col=0,
)

In [3]:
# Preview the expression matrix (gene identifiers as row index, one column per cell)
df

Unnamed: 0,P1TLH_AAACCTGAGCAGCCTC_1,P1TLH_AAACCTGTCCTCATTA_1,P1TLH_AAACCTGTCTAAGCCA_1,P1TLH_AAACGGGAGTAGGCCA_1,P1TLH_AAACGGGGTTCGGGCT_1,P1TLH_AAAGCAACAGTAAGAT_1,P1TLH_AAAGCAAGTCGCGTGT_1,P1TLH_AAAGCAAGTGTTTGTG_1,P1TLH_AAAGCAAGTTGATTCG_1,P1TLH_AAAGTAGCAGACGTAG_1,...,P5TLH_TTTCCTCTCAGTGTTG_1,P5TLH_TTTGCGCAGGATGGTC_1,P5TLH_TTTGCGCCAATGACCT_1,P5TLH_TTTGCGCCATCCTAGA_1,P5TLH_TTTGTCAGTCAGGACA_1,P5TLH_TTTGTCAGTGTTCTTT_1,P5TLH_TTTGTCAGTTTAGGAA_1,P5TLH_TTTGTCATCAGCTTAG_1,P5TLH_TTTGTCATCCACGCAG_1,P5TLH_TTTGTCATCGGCATCG_1
RP11-34P13.7,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
FO538757.2,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
AP006222.2,0.0,0.31476,0.0,0.0,0.0,0.0,0.0,0.504068,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.140429,0.0,0.0,0.0
RP4-669L17.10,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
RP5-857K21.4,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
AL354822.1,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
AC004556.1,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.876891,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
AC233755.2,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
AC233755.1,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0


In [4]:
# Read the tab-separated per-cell annotation table; the first column becomes the row index.
path = '../../../../data/raw/data_for_evaluating_cell_type_annotation/MacParland/'
file = 'GSE115469_CellClusterType.txt'

ann = pd.read_table(
    f'{path}{file}',
    sep='\t',
    index_col=0,
)

In [5]:
# Preview the annotation table (indexed by cell name)
ann

Unnamed: 0_level_0,Sample,Cell#,Cluster#,CellType
CellName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
P1TLH_AAACCTGAGCAGCCTC_1,P1TLH,AAACCTGAGCAGCCTC,12,Central_venous_LSECs
P1TLH_AAACCTGTCCTCATTA_1,P1TLH,AAACCTGTCCTCATTA,17,Cholangiocytes
P1TLH_AAACCTGTCTAAGCCA_1,P1TLH,AAACCTGTCTAAGCCA,12,Central_venous_LSECs
P1TLH_AAACGGGAGTAGGCCA_1,P1TLH,AAACGGGAGTAGGCCA,10,Non-inflammatory_Macrophage
P1TLH_AAACGGGGTTCGGGCT_1,P1TLH,AAACGGGGTTCGGGCT,2,alpha-beta_T_Cells
...,...,...,...,...
P5TLH_TTTGTCAGTGTTCTTT_1,P5TLH,TTTGTCAGTGTTCTTT,17,Cholangiocytes
P5TLH_TTTGTCAGTTTAGGAA_1,P5TLH,TTTGTCAGTTTAGGAA,11,Periportal_LSECs
P5TLH_TTTGTCATCAGCTTAG_1,P5TLH,TTTGTCATCAGCTTAG,17,Cholangiocytes
P5TLH_TTTGTCATCCACGCAG_1,P5TLH,TTTGTCATCCACGCAG,4,Inflammatory_Macrophage


In [6]:
# Assemble an AnnData object with cells as observations and genes as variables.
sample_ID = df.columns.to_list()
gene_symbols = df.index.to_list()

# `df` is indexed by gene with one column per cell, so transpose it to
# obtain a (cells x genes) matrix.
X = np.array(df).T

# Align the annotation table to the column order of `df` once and reuse it.
# `.loc` with a list of labels raises a KeyError if a cell is missing from `ann`.
ann_aligned = ann.loc[sample_ID]
Y = ann_aligned.iloc[:, 3].values.tolist()  # 4th column ('CellType' in the displayed table)

adata = sc.AnnData(X=X)
# The previous `adata.index = gene_symbols` assignment was dropped: `index` is
# not part of the AnnData API (gene labels live in `var_names`), so it only
# attached a stray attribute that is never used.
adata.var_names = gene_symbols
adata.obs["cell_type"] = Y
adata.obs["sample_ID"] = sample_ID
adata.obs["patientID"] = ann_aligned.iloc[:, 0].values.tolist()  # 1st column ('Sample' in the displayed table)

# Checking for duplicate genes
duplicate_genes = adata.var_names[adata.var_names.duplicated()]
if not duplicate_genes.empty:
    print(f"Duplicate genes found: {duplicate_genes}")

In [7]:
# Show the AnnData summary (dimensions and the obs fields attached so far)
adata

AnnData object with n_obs × n_vars = 8444 × 20007
    obs: 'cell_type', 'sample_ID', 'patientID'

In [8]:
# Peek at the first rows of the variable table, which is indexed by the gene symbols set as var_names
adata.var.head()

RP11-34P13.7
FO538757.2
AP006222.2
RP4-669L17.10
RP5-857K21.4


In [9]:
# Gene filtering: keep only genes detected in at least `min_cells` cells,
# which also removes genes with zero counts everywhere.
min_cells = 20

print(f'Number of genes before filtering: {adata.n_vars:d}')

sc.pp.filter_genes(adata, min_cells=min_cells)
print(f'Number of genes after filtering so theres min {min_cells} unique cells per gene: {adata.n_vars}')

Number of genes before filtering: 20007
Number of genes after filtering so theres min 20 unique cells per gene: 15386


In [10]:
# List the distinct cell-type labels stored in adata.obs
adata.obs['cell_type'].unique()

array(['Central_venous_LSECs', 'Cholangiocytes',
       'Non-inflammatory_Macrophage', 'alpha-beta_T_Cells',
       'Inflammatory_Macrophage', 'NK-like_Cells',
       'gamma-delta_T_Cells_1', 'Hepatocyte_5',
       'Portal_endothelial_Cells', 'gamma-delta_T_Cells_2',
       'Periportal_LSECs', 'Hepatocyte_6', 'Mature_B_Cells',
       'Hepatic_Stellate_Cells', 'Plasma_Cells', 'Erythroid_Cells',
       'Hepatocyte_2', 'Hepatocyte_3', 'Hepatocyte_1', 'Hepatocyte_4'],
      dtype=object)

In [11]:
# Change data type: cast the expression matrix to 32-bit floats
adata.X = adata.X.astype(np.float32)

In [12]:
# Save the AnnData object to the processed-data directory as an .h5ad file
adata.write('../../../../data/processed/data_for_evaluating_cell_type_annotation/MacParland.h5ad')