In [1]:
import pandas as pd
import scipy.io as spio
import scanpy as sc
import anndata as ad

In [2]:
# Load the cell barcodes. The file has no header row; its first column becomes the index.
barcodes = pd.read_csv(
    '../RNA-seq_Hemming2020/Data/GSE163005_barcodes.tsv.gz',
    index_col=0,
    header=None,
    sep='\t',
)

In [3]:
# Load the features table. Only the first two columns are kept, named GeneID and
# GeneSymbol, and GeneID is used as the index.
features = pd.read_csv(
    '../RNA-seq_Hemming2020/Data/GSE163005_features.tsv.gz',
    sep='\t',
    header=None,
    names=['GeneID', 'GeneSymbol'],
    usecols=[0, 1],
    index_col=0,
)

In [4]:
# Load the count matrix from the Matrix Market file and convert it to CSR straight away:
# the matrix as read (COO) can't be saved to an h5ad file.
counts = spio.mmread("../RNA-seq_Hemming2020/Data/GSE163005_matrix.mtx.gz").tocsr()

In [5]:
# Load the patient annotation file.
# Its own header row is [NaN, 'x']; header=0 discards that row and the value column
# is named 'Patient' instead. The first column is used as the index.
patient_anno = pd.read_csv(
    "../RNA-seq_Hemming2020/Data/GSE163005_annotation_patients.csv.gz",
    names=['Patient'],
    header=0,
    index_col=0,
)

In [6]:
# Load the cluster annotation file. The file's header row is replaced so that the
# value column is named 'Cluster'; the first column is used as the index.
cluster_anno = pd.read_csv(
    "../RNA-seq_Hemming2020/Data/GSE163005_annotation_cluster.csv.gz",
    names=['Cluster'],
    index_col=0,
    header=0,
)

In [7]:
# Load the diagnosis annotation file. The file's header row is replaced so that the
# value column is named 'Diagnosis'; the first column is used as the index.
dx_anno = pd.read_csv(
    "../RNA-seq_Hemming2020/Data/GSE163005_annotation_dx.csv.gz",
    names=['Diagnosis'],
    index_col=0,
    header=0,
)

In [8]:
# Load the T cell cluster annotation file. The file's header row is replaced so that the
# value column is named 'TCellCluster'; the first column is used as the index.
tcell_cluster_anno = pd.read_csv(
    '../RNA-seq_Hemming2020/Data/GSE163005_annotation_tcells_cluster.csv.gz',
    names=['TCellCluster'],
    index_col=0,
    header=0,
)

In [9]:
# Create the AnnData object.
# The count matrix is genes x cells (33538 x 85418), but AnnData expects cells as rows,
# so it is transposed to 85418 x 33538.
# The matrix is cast to float32 explicitly instead of through AnnData's `dtype=` argument,
# which is deprecated in recent anndata releases.
adata = ad.AnnData(X=counts.T.astype('float32'), obs=barcodes, var=features)

### Unique values of the cluster, T cell cluster, patient ID, and diagnosis annotations

**Cluster**: CD4, CD8, mDC2, granulo1, naiveBc, plasma, mono1,
       pDC, NK, mono2, Treg, mDC1, cycling, matDC, mono3,
       granulo2
       
**T cell Clusters**: proli_CD4, CD8_1, memory_CD4, exh_CD4, CD8_2,
       naive_CD4, CD4_Treg, antiviral_CD4
 
**Patient Annotations**: MS19270, MS49131, MS58637, MS71658, IIH32190, IIH41540,
       IIH45044, IIH85037, IIH10999, IIH47578, IIH53423,
       IIH68490, IIH91012, MS25719, MS76177, MS77654, MS79670,
       MS90896, VE48279, VE61728, COV1919, COV1923, COV1924,
       COV1930, COV1943, COV1950, COV1954, COV1933, VE23642,
       VE57538, VE66730
       
**Patient Diagnosis**: MS, IIH, VE, COVID


    COVID - COVID-19 patients with neurological symptoms  
    VE - Viral encephalitis  
    IIH - Idiopathic intracranial hypertension  
    MS - Relapsing-remitting multiple sclerosis (control group; an unusual choice)


In [56]:
# Attach the patient, cluster, diagnosis and T cell cluster annotations to adata.obs.
annotation_tables = [patient_anno, cluster_anno, dx_anno, tcell_cluster_anno]
annotation_columns = [column for table in annotation_tables for column in table.columns]

# Start from obs without any annotation columns left over from a previous run, so that
# re-running this cell does not append duplicate columns.
obs_base = adata.obs.drop(columns=annotation_columns, errors='ignore')

# Align all tables on their index in a single concat. Rows that are missing from a
# table (e.g. cells without a T cell cluster) get NaN in that table's column.
obs_annotated = pd.concat([obs_base, *annotation_tables], axis=1)

# The outer join may append index values that only occur in an annotation table, or change
# the row order. adata.obs must keep exactly the rows of the count matrix, in the same
# order, so realign when needed. A plain list is passed so the index name chosen by
# concat is kept.
if not obs_annotated.index.equals(adata.obs_names):
    obs_annotated = obs_annotated.reindex(list(adata.obs_names))

adata.obs = obs_annotated

In [57]:
# Write the assembled AnnData object to disk as an h5ad file
adata.write_h5ad('../RNA-seq_Hemming2020/Data/Unprocessed_Hemming2020_Data.h5ad')