In [None]:
import pandas as pd
import math
import numpy as np
import scipy.io
import scipy.sparse

---

# Hu et al

### The barcodes are cells (each cell has a unique barcode)

In [None]:
# Barcodes for AA patient 1. The file is tab-separated and has no header row,
# so pandas labels the columns with integers.
hu_aa_p1_barcode = pd.read_csv(
    'Huetal2022/AA_patient_1/GSM5515741_AA1_barcodes.tsv',
    header=None,
    sep='\t',
)
hu_aa_p1_barcode


### The features are genes (each row describes a gene and the type of info collected for that gene; here all are Gene Expression)

In [None]:
# Feature table for AA patient 1. The file is tab-separated and has no header
# row, so pandas labels the columns with integers.
hu_aa_p1_features = pd.read_csv(
    'Huetal2022/AA_patient_1/GSM5515741_AA1_features.tsv',
    header=None,
    sep='\t',
)
hu_aa_p1_features

### The data matrix rows are genes and columns are cells. The indices match the indices of the files imported above. Matrices are sparse (common in scRNAseq due to dropout)

In [None]:
# Read the Matrix Market file holding the counts for AA patient 1.
hu_aa_p1_data_mtx = scipy.io.mmread(
    'Huetal2022/AA_patient_1/GSM5515741_AA1_matrix.mtx'
)
# Show the printed form first, then the notebook repr of the same object.
print(hu_aa_p1_data_mtx)
hu_aa_p1_data_mtx

In [None]:
# Wrap the loaded matrix in a sparse-backed DataFrame so it can be handled
# with the usual pandas tooling.
hu_aa_p1_data_df = pd.DataFrame.sparse.from_spmatrix(
    hu_aa_p1_data_mtx,
)
hu_aa_p1_data_df

### Importing the rest of the data

In [None]:
# Bundle the three pieces for AA patient 1 under one name:
# barcodes ("cells"), feature table ("genes") and count matrix ("data").
hu_aa_p1_data = dict(
    cells=hu_aa_p1_barcode,
    genes=hu_aa_p1_features,
    data=hu_aa_p1_data_df,
)
hu_aa_p1_data

In [None]:
# AA patient 2: barcodes, feature table and count matrix, loaded the same way
# as for AA patient 1.
hu_aa_p2_barcode = pd.read_csv(
    'Huetal2022/AA_patient_2/GSM5515742_AA2_barcodes.tsv',
    header=None,
    sep='\t',
)
hu_aa_p2_features = pd.read_csv(
    'Huetal2022/AA_patient_2/GSM5515742_AA2_features.tsv',
    header=None,
    sep='\t',
)
hu_aa_p2_data_mtx = scipy.io.mmread(
    'Huetal2022/AA_patient_2/GSM5515742_AA2_matrix.mtx'
)
hu_aa_p2_data_df = pd.DataFrame.sparse.from_spmatrix(hu_aa_p2_data_mtx)

# Bundle the pieces under the same keys used for the other samples.
hu_aa_p2_data = dict(
    cells=hu_aa_p2_barcode,
    genes=hu_aa_p2_features,
    data=hu_aa_p2_data_df,
)
hu_aa_p2_data

In [None]:
# Normal (healthy) patient 1: barcodes, feature table and count matrix.
hu_n_p1_barcode = pd.read_csv(
    'Huetal2022/N_patient_1/GSM5515743_Normal1_barcodes.tsv',
    header=None,
    sep='\t',
)
hu_n_p1_features = pd.read_csv(
    'Huetal2022/N_patient_1/GSM5515743_Normal1_features.tsv',
    header=None,
    sep='\t',
)
hu_n_p1_data_mtx = scipy.io.mmread(
    'Huetal2022/N_patient_1/GSM5515743_Normal1_matrix.mtx'
)
hu_n_p1_data_df = pd.DataFrame.sparse.from_spmatrix(hu_n_p1_data_mtx)

# Bundle the pieces under the same keys used for the other samples.
hu_n_p1_data = dict(
    cells=hu_n_p1_barcode,
    genes=hu_n_p1_features,
    data=hu_n_p1_data_df,
)
hu_n_p1_data

In [None]:
# Normal (healthy) patient 2: barcodes, feature table and count matrix.
hu_n_p2_barcode = pd.read_csv(
    'Huetal2022/N_patient_2/GSM5515744_Normal2_barcodes.tsv',
    header=None,
    sep='\t',
)
hu_n_p2_features = pd.read_csv(
    'Huetal2022/N_patient_2/GSM5515744_Normal2_features.tsv',
    header=None,
    sep='\t',
)
hu_n_p2_data_mtx = scipy.io.mmread(
    'Huetal2022/N_patient_2/GSM5515744_Normal2_matrix.mtx'
)
hu_n_p2_data_df = pd.DataFrame.sparse.from_spmatrix(hu_n_p2_data_mtx)

# Bundle the pieces under the same keys used for the other samples.
hu_n_p2_data = dict(
    cells=hu_n_p2_barcode,
    genes=hu_n_p2_features,
    data=hu_n_p2_data_df,
)
hu_n_p2_data

---

# Zhu et al

## Full length scRNAseq data

I pulled a few things out of the metadata file that I thought were most relevant (the file is enormous and terribly formatted). I can definitely pull more things if needed

In [None]:
# Subsampled metadata for the Zhu full-length data; the first column of the
# file becomes the row index.
zhu_fl_meta = pd.read_csv(
    'Zhuetal2021/full_len/metadata_subsample.txt',
    index_col=0,
    sep="\t",
)
zhu_fl_meta

Columns are cells (corresponding to Sample_title in the metadata), rows are genes

In [None]:
# Full-length single-cell counts table (tab-separated, first row is the header).
zhu_fl_sc_counts = pd.read_csv(
    'Zhuetal2021/full_len/GSE145531_Full_length_SingleCell_counts.txt',
    sep='\t',
)
zhu_fl_sc_counts

## 3' RNAseq

I got lazy this time and only removed the rows that weren't the right length from this metadata file

In [None]:
# Subset of the series-matrix metadata for the Zhu 3' data; the first column
# of the file becomes the row index.
zhu_3p_meta = pd.read_csv(
    'Zhuetal2021/3prime/GSE145668_series_matrix_subset.txt',
    index_col=0,
    sep="\t",
)
zhu_3p_meta

Columns (except the first two cols) are cells (corresponding to Sample_title in the metadata), rows are genes

In [None]:
# 3' counts table for the AA cells (tab-separated, first row is the header).
zhu_3p_sc_AA_counts = pd.read_csv(
    'Zhuetal2021/3prime/GSE145668_AA_cells_counts.txt',
    sep='\t',
)
zhu_3p_sc_AA_counts

In [None]:
# 3' counts table for the control cells (tab-separated, first row is the header).
zhu_3p_sc_Cntrl_counts = pd.read_csv(
    'Zhuetal2021/3prime/GSE145668_Ctrl_cells_counts.txt',
    sep='\t',
)
zhu_3p_sc_Cntrl_counts

I think we probably shouldn't use the treatment data but it is here:

In [None]:
# 3' counts table for the treatment-response cells; loaded for completeness
# (tab-separated, first row is the header).
zhu_3p_sc_Response_counts = pd.read_csv(
    'Zhuetal2021/3prime/GSE145668_Response_cells_counts.txt',
    sep='\t',
)
zhu_3p_sc_Response_counts

---

### Separating out the AA from the Healthy data in `zhu_fl_sc_counts`

In [None]:
# Transpose the metadata so each metadata field becomes a column and can be
# used for row selection below.
zhu_fl_meta_t = zhu_fl_meta.transpose()
zhu_fl_meta_t

In [None]:
# Rows whose Sample_disease_state is exactly "disease state: non-SAA".
non_severe_AA_patients = zhu_fl_meta_t.loc[
    zhu_fl_meta_t['Sample_disease_state'].eq("disease state: non-SAA")
]
non_severe_AA_patients

In [None]:
# Rows whose Sample_disease_state is exactly "disease state: Ctrl".
healthy_patients = zhu_fl_meta_t.loc[
    zhu_fl_meta_t['Sample_disease_state'].eq("disease state: Ctrl")
]
healthy_patients

In [None]:
# Keep only the count columns named after the non-severe AA rows selected above.
# filter(items=...) keeps just the labels that exist among the columns, so any
# name missing from the counts table is dropped without raising.
zhu_fl_sc_aa_counts = zhu_fl_sc_counts.filter(
    items=non_severe_AA_patients.index.tolist(),
    axis='columns',
)
zhu_fl_sc_aa_counts

In [None]:
# Keep only the count columns named after the control rows selected above.
# filter(items=...) keeps just the labels that exist among the columns, so any
# name missing from the counts table is dropped without raising.
zhu_fl_sc_healthy_counts = zhu_fl_sc_counts.filter(
    items=healthy_patients.index.tolist(),
    axis='columns',
)
zhu_fl_sc_healthy_counts

---

# Quick overview of which read data is where

Aplastic anemia data for Hu is in `hu_aa_p1_data['data']` and `hu_aa_p2_data['data']`.

Healthy data for Hu is in `hu_n_p1_data['data']` and `hu_n_p2_data['data']`

Full Length Aplastic anemia data for Zhu is in `zhu_fl_sc_aa_counts`

Full Length Healthy data for Zhu is in `zhu_fl_sc_healthy_counts`

3 prime Aplastic anemia data for Zhu is in `zhu_3p_sc_AA_counts`

3 prime Healthy data for Zhu is in `zhu_3p_sc_Cntrl_counts`

I don't think we should use the Zhu Response data (at least not until I read the paper more closely and understand what they did)

---

# Finding overlap in genes between two studies

In [None]:
# Check that AA patient 1 and AA patient 2 have identical feature tables.
(
    hu_aa_p1_data['genes']
    .equals(hu_aa_p2_data['genes'])
)

In [None]:
# Check that AA patient 1 and normal patient 1 have identical feature tables.
(
    hu_aa_p1_data['genes']
    .equals(hu_n_p1_data['genes'])
)

In [None]:
# Check that AA patient 1 and normal patient 2 have identical feature tables.
(
    hu_aa_p1_data['genes']
    .equals(hu_n_p2_data['genes'])
)

Hu genes are the same for each patient

In [None]:
# Use column 0 of AA patient 1's feature table as the Hu gene list.
hu_genes = hu_aa_p1_data['genes'].loc[:, 0]
hu_genes

In [None]:
# Gene identifiers are taken from the row index of the full-length counts table.
# NOTE(review): this assumes read_csv put the gene IDs in the index (the table
# was read without index_col) — confirm against the file layout.
# pd.Series is the public constructor; the pd.core.* path is private API.
zhu_fl_genes_decimals = pd.Series(zhu_fl_sc_counts.index.values)

# Keep only the text before the first '.' of each identifier.
zhu_fl_genes = zhu_fl_genes_decimals.apply(
    lambda gene_id: gene_id.split('.')[0]
)
zhu_fl_genes

In [None]:
# Gene IDs present in both the Hu gene list and the Zhu full-length gene list.
hu_zhufl_common_genes_set = set(hu_genes) & set(zhu_fl_genes)

# Set iteration order for strings changes between kernel sessions, so sort the
# IDs to get the same Series (and the same displayed rows) on every run.
# pd.Series is the public constructor; the pd.core.* path is private API.
hu_zhufl_common_genes_series = pd.Series(sorted(hu_zhufl_common_genes_set))
hu_zhufl_common_genes_series

### Those have **26,415** genes in common, let's add the 3prime data

In [None]:
# Keep only the text before the first '.' of each GeneName entry in the AA table.
zhu_3p_aa_genes = zhu_3p_sc_AA_counts['GeneName'].apply(
    lambda gene_id: gene_id.partition('.')[0]
)
zhu_3p_aa_genes

In [None]:
# Keep only the text before the first '.' of each GeneName entry in the control table.
zhu_3p_cntrl_genes = zhu_3p_sc_Cntrl_counts['GeneName'].apply(
    lambda gene_id: gene_id.partition('.')[0]
)
zhu_3p_cntrl_genes

Genes in common between Zhu AA 3' seq and Zhu control 3' seq:

In [None]:
# Gene IDs present in both the 3' AA table and the 3' control table.
zhu_3p_aa_cntrl_common_genes_set = set(zhu_3p_aa_genes) & set(zhu_3p_cntrl_genes)

# Set iteration order for strings changes between kernel sessions, so sort the
# IDs to get the same Series (and the same displayed rows) on every run.
# pd.Series is the public constructor; the pd.core.* path is private API.
zhu_3p_aa_cntrl_common_genes_series = pd.Series(
    sorted(zhu_3p_aa_cntrl_common_genes_set)
)
zhu_3p_aa_cntrl_common_genes_series

Genes in common between all datasets:

In [None]:
# Gene IDs shared by the Hu/Zhu full-length overlap and the Zhu 3' AA/control overlap.
common_genes_to_all_set = (
    hu_zhufl_common_genes_set & zhu_3p_aa_cntrl_common_genes_set
)

# Set iteration order for strings changes between kernel sessions, so sort the
# IDs to get the same Series (and the same displayed rows) on every run.
# pd.Series is the public constructor; the pd.core.* path is private API.
common_genes_to_all_series = pd.Series(sorted(common_genes_to_all_set))
common_genes_to_all_series

### 17090 genes in common across all the cells