In [1]:
import pandas as pd
import numpy as np
import scanpy as sc
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA


In [2]:
# Replacing empty entries with 'unsure'
def fReplaceEmpty(X):
    """Replace zero-length entries with 'unsure' and report the non-empty positions.

    Parameters
    ----------
    X : sequence
        Entries that support ``len()`` (e.g. annotation strings).

    Returns
    -------
    Y : list
        A new list with the same entries as ``X``, except that every entry of
        length 0 is replaced by the string 'unsure'. ``X`` itself is not modified.
    idx : list of int
        Positions of the entries of ``X`` that were not empty, in ascending order.
    """
    # Copy instead of aliasing X, so the caller's data is not overwritten in place.
    Y = list(X)
    idx = []
    for k, entry in enumerate(Y):
        if len(entry) == 0:
            Y[k] = 'unsure'
        else:
            idx.append(k)
    return Y, idx

Preprocessing the raw count data: per-cell normalization followed by a log transformation

In [3]:
# --- Load the raw count table and the per-spot annotation table ---
GetData = pd.read_csv(
    'data/human_BreastCancer2518_raw.csv',
    sep=",", header=0, na_filter=False, index_col=0,
)
GetLabel = pd.read_csv(
    'data/human_BreastCancer2518_labels.csv',
    sep=",", header=0, na_filter=False, index_col=0,
)

# Annotation and pixel-coordinate columns as plain Python lists
cell_cluster_gt = GetLabel['type'].tolist()
xpixels = GetLabel['x'].tolist()
ypixels = GetLabel['y'].tolist()

# Column labels of the count table are used as gene names;
# every row gets a synthetic id 'C_0', 'C_1', ...
GeneNames = GetData.columns.tolist()
cell_num = len(GetData)
cell_idx = ['C_' + str(i) for i in range(cell_num)]

# df_data is the same object as GetData, so the new row ids apply to both names
GetData.index = cell_idx
df_data = GetData

# Per-gene, per-spot-label and per-spot-coordinate tables
df_genes = pd.DataFrame({'Genes': GeneNames}, index=GeneNames)
df_labels = pd.DataFrame({'gt': cell_cluster_gt})
df_pixel = pd.DataFrame(
    list(zip(xpixels, ypixels)),
    columns=['x_pixel', 'y_pixel'],
    index=cell_idx,
)

# Creating AnnData object
adata = sc.AnnData(X=df_data, obs=df_pixel, var=df_genes)

# Doing normalization and log transformation
# NOTE(review): newer scanpy releases document normalize_per_cell as deprecated
# in favour of normalize_total — confirm before upgrading scanpy.
sc.pp.normalize_per_cell(adata)
sc.pp.log1p(adata)

# Transposed result: rows are labelled by gene name, columns by spot id
df_data_transformed = pd.DataFrame(adata.X.T, index=GeneNames, columns=cell_idx)


Selecting the 1272 spots with available annotations, then running PCA and keeping the first 200 principal components

In [4]:
# Selecting spots with available annotations.
# The raw labels are taken from df_labels['gt'] rather than from cell_cluster_gt:
# cell_cluster_gt is overwritten on the next line with the 'unsure'-filled list,
# so passing it back in would make a re-run of this cell find no empty entries
# and select every spot. df_labels is never modified, which keeps the cell idempotent.
cell_cluster_gt, gt_idx = fReplaceEmpty(list(df_labels['gt']))
df_gt_idx = np.array(pd.DataFrame(gt_idx, columns =['idx']))

# Keep only the annotated spots; transpose so that rows are spots and columns are genes
df_data_select = df_data_transformed.iloc[:,gt_idx].T
N = df_data_select.shape[0]  # number of selected spots, used in the output file names

# Performing PCA and selecting 200 PCs
# PCA() is fitted with its default settings; only the first 200 columns of the
# projected data are kept afterwards.
pca = PCA()
PCs = pca.fit_transform(df_data_select)
df_PC200 = pd.DataFrame(PCs[:,:200], index = df_data_select.index)
df_PC200.to_csv('human_BreastCancer'+str(N)+'_data_pc200.csv')

# Labels and pixel coordinates restricted to the same spots, in the same order
df_labels_select = df_labels.iloc[gt_idx,:]
df_pixel_select = df_pixel.iloc[gt_idx,:]
df_labels_select.to_csv('human_BreastCancer'+str(N)+'_labels.csv')
df_pixel_select.to_csv('human_BreastCancer'+str(N)+'_pixels.csv')