In [None]:
import pandas as pd
import math
import numpy as np
import scipy.io
import scipy.sparse

### import all data

In [None]:
# Load the barcodes / features / matrix files for Hu et al. 2022, AA patient 1.
hu_aa_p1_barcode = pd.read_csv('../data/Huetal2022/AA_patient_1/GSM5515741_AA1_barcodes.tsv', sep='\t', header=None)
hu_aa_p1_features = pd.read_csv('../data/Huetal2022/AA_patient_1/GSM5515741_AA1_features.tsv', sep='\t', header=None)
hu_aa_p1_data_mtx = scipy.io.mmread('../data/Huetal2022/AA_patient_1/GSM5515741_AA1_matrix.mtx')
# Rows are labelled with the first features column and columns with the first
# barcodes column. The column labels are passed as a flat 1-D array: handing pandas
# the whole one-column DataFrame would give it 2-D label data instead of a plain
# list of barcode strings.
hu_aa_p1_data_df = pd.DataFrame.sparse.from_spmatrix(
    hu_aa_p1_data_mtx,
    index=hu_aa_p1_features[0],
    columns=hu_aa_p1_barcode[0].to_numpy(),
)
# Bundle the raw tables together with the labelled sparse count frame.
hu_aa_p1_data = {"cells": hu_aa_p1_barcode, "genes": hu_aa_p1_features, "data": hu_aa_p1_data_df}

In [None]:
# Load the barcodes / features / matrix files for Hu et al. 2022, AA patient 2.
hu_aa_p2_barcode = pd.read_csv('../data/Huetal2022/AA_patient_2/GSM5515742_AA2_barcodes.tsv', sep='\t', header=None)
hu_aa_p2_features = pd.read_csv('../data/Huetal2022/AA_patient_2/GSM5515742_AA2_features.tsv', sep='\t', header=None)
hu_aa_p2_data_mtx = scipy.io.mmread('../data/Huetal2022/AA_patient_2/GSM5515742_AA2_matrix.mtx')
# Rows are labelled with the first features column and columns with the first
# barcodes column. The column labels are passed as a flat 1-D array: handing pandas
# the whole one-column DataFrame would give it 2-D label data instead of a plain
# list of barcode strings.
hu_aa_p2_data_df = pd.DataFrame.sparse.from_spmatrix(
    hu_aa_p2_data_mtx,
    index=hu_aa_p2_features[0],
    columns=hu_aa_p2_barcode[0].to_numpy(),
)
# Bundle the raw tables together with the labelled sparse count frame.
hu_aa_p2_data = {"cells": hu_aa_p2_barcode, "genes": hu_aa_p2_features, "data": hu_aa_p2_data_df}

In [None]:
# Load the barcodes / features / matrix files for Hu et al. 2022, normal patient 1.
hu_n_p1_barcode = pd.read_csv('../data/Huetal2022/N_patient_1/GSM5515743_Normal1_barcodes.tsv', sep='\t', header=None)
hu_n_p1_features = pd.read_csv('../data/Huetal2022/N_patient_1/GSM5515743_Normal1_features.tsv', sep='\t', header=None)
hu_n_p1_data_mtx = scipy.io.mmread('../data/Huetal2022/N_patient_1/GSM5515743_Normal1_matrix.mtx')
# Rows are labelled with the first features column and columns with the first
# barcodes column. The column labels are passed as a flat 1-D array: handing pandas
# the whole one-column DataFrame would give it 2-D label data instead of a plain
# list of barcode strings.
hu_n_p1_data_df = pd.DataFrame.sparse.from_spmatrix(
    hu_n_p1_data_mtx,
    index=hu_n_p1_features[0],
    columns=hu_n_p1_barcode[0].to_numpy(),
)
# Bundle the raw tables together with the labelled sparse count frame.
hu_n_p1_data = {"cells": hu_n_p1_barcode, "genes": hu_n_p1_features, "data": hu_n_p1_data_df}

In [None]:
# Load the barcodes / features / matrix files for Hu et al. 2022, normal patient 2.
hu_n_p2_barcode = pd.read_csv('../data/Huetal2022/N_patient_2/GSM5515744_Normal2_barcodes.tsv', sep='\t', header=None)
hu_n_p2_features = pd.read_csv('../data/Huetal2022/N_patient_2/GSM5515744_Normal2_features.tsv', sep='\t', header=None)
hu_n_p2_data_mtx = scipy.io.mmread('../data/Huetal2022/N_patient_2/GSM5515744_Normal2_matrix.mtx')
# Rows are labelled with the first features column and columns with the first
# barcodes column. The column labels are passed as a flat 1-D array: handing pandas
# the whole one-column DataFrame would give it 2-D label data instead of a plain
# list of barcode strings.
hu_n_p2_data_df = pd.DataFrame.sparse.from_spmatrix(
    hu_n_p2_data_mtx,
    index=hu_n_p2_features[0],
    columns=hu_n_p2_barcode[0].to_numpy(),
)
# Bundle the raw tables together with the labelled sparse count frame.
hu_n_p2_data = {"cells": hu_n_p2_barcode, "genes": hu_n_p2_features, "data": hu_n_p2_data_df}

In [None]:
# Zhu et al. 2021 inputs are all tab-separated, which is read_table's default separator.
# Full-length data: sample metadata (first column as index) and single-cell counts.
zhu_fl_meta = pd.read_table(
    '../data/Zhuetal2021/full_len/metadata_subsample.txt',
    index_col=0,
)
zhu_fl_sc_counts = pd.read_table(
    '../data/Zhuetal2021/full_len/GSE145531_Full_length_SingleCell_counts.txt',
)
# 3-prime data: series-matrix metadata plus separate AA and control count tables,
# each using its first column as the index.
zhu_3p_meta = pd.read_table(
    '../data/Zhuetal2021/3prime/GSE145668_series_matrix_subset.txt',
    index_col=0,
)
zhu_3p_sc_aa_counts = pd.read_table(
    '../data/Zhuetal2021/3prime/GSE145668_AA_cells_counts.txt',
    index_col=0,
)
zhu_3p_sc_cntrl_counts = pd.read_table(
    '../data/Zhuetal2021/3prime/GSE145668_Ctrl_cells_counts.txt',
    index_col=0,
)

### Separating AA from healthy data in `zhu_fl_sc_counts`

In [None]:
# Transpose the metadata so 'Sample_disease_state' can be addressed as a column.
zhu_fl_meta_t = zhu_fl_meta.transpose()

# Metadata rows whose disease state is non-SAA, and those marked as controls.
non_severe_AA_patients = zhu_fl_meta_t.loc[
    zhu_fl_meta_t['Sample_disease_state'].eq("disease state: non-SAA")
]
healthy_patients = zhu_fl_meta_t.loc[
    zhu_fl_meta_t['Sample_disease_state'].eq("disease state: Ctrl")
]

# Keep only the count columns whose names appear in the respective metadata index.
zhu_fl_sc_aa_counts = zhu_fl_sc_counts.filter(
    items=non_severe_AA_patients.index.tolist(), axis=1
)
zhu_fl_sc_healthy_counts = zhu_fl_sc_counts.filter(
    items=healthy_patients.index.tolist(), axis=1
)

Setting the Zhu row indices to be consistent with Hu by dropping everything from the first `.` onward in each row label (no decimal section)

In [None]:
def _strip_decimal_section(row_label):
    """Return the part of ``row_label`` that comes before its first '.'."""
    return row_label.split('.')[0]


# Full-length AA counts: collect the current row labels, trim them, and re-index.
zhu_fl_sc_aa_counts_decimals = pd.Series(zhu_fl_sc_aa_counts.index.values)
zhu_fl_sc_aa_counts_genes = zhu_fl_sc_aa_counts_decimals.map(_strip_decimal_section)
zhu_fl_sc_aa_counts = zhu_fl_sc_aa_counts.set_index(zhu_fl_sc_aa_counts_genes)

# Full-length healthy counts.
zhu_fl_sc_healthy_counts_decimals = pd.Series(zhu_fl_sc_healthy_counts.index.values)
zhu_fl_sc_healthy_counts_genes = zhu_fl_sc_healthy_counts_decimals.map(_strip_decimal_section)
zhu_fl_sc_healthy_counts = zhu_fl_sc_healthy_counts.set_index(zhu_fl_sc_healthy_counts_genes)

# 3-prime AA counts.
zhu_3p_sc_aa_counts_decimals = pd.Series(zhu_3p_sc_aa_counts.index.values)
zhu_3p_sc_aa_counts_genes = zhu_3p_sc_aa_counts_decimals.map(_strip_decimal_section)
zhu_3p_sc_aa_counts = zhu_3p_sc_aa_counts.set_index(zhu_3p_sc_aa_counts_genes)

# 3-prime control counts.
zhu_3p_sc_cntrl_counts_decimals = pd.Series(zhu_3p_sc_cntrl_counts.index.values)
zhu_3p_sc_cntrl_counts_genes = zhu_3p_sc_cntrl_counts_decimals.map(_strip_decimal_section)
zhu_3p_sc_cntrl_counts = zhu_3p_sc_cntrl_counts.set_index(zhu_3p_sc_cntrl_counts_genes)

# Quick overview of which read data is where

Aplastic anemia data for Hu is in `hu_aa_p1_data['data']` and `hu_aa_p2_data['data']`.

Healthy data for Hu is in `hu_n_p1_data['data']` and `hu_n_p2_data['data']`

Full Length Aplastic anemia data for Zhu is in `zhu_fl_sc_aa_counts`

Full Length Healthy data for Zhu is in `zhu_fl_sc_healthy_counts`

3 prime Aplastic anemia data for Zhu is in `zhu_3p_sc_aa_counts`

3 prime Healthy data for Zhu is in `zhu_3p_sc_cntrl_counts`

Series of genes common to all datasets is in `common_genes_to_all_series`

I don't think we should use the Zhu Response data (at least not until I read the paper more closely and understand what they did)

# Inner joins to make sick dataset and healthy dataset

In [None]:
# Place the two Hu AA patients side by side, matching rows on the index.
# merge defaults to an inner join, so only row labels present in both frames are kept.
hu_aa_data = hu_aa_p1_data['data'].merge(
    hu_aa_p2_data['data'],
    left_index=True,
    right_index=True,
)
hu_aa_data

In [None]:
# Combine the Zhu full-length and 3-prime AA counts, matching rows on the index
# (inner join: only row labels present in both frames are kept).
zhu_aa_data = zhu_fl_sc_aa_counts.merge(
    zhu_3p_sc_aa_counts,
    left_index=True,
    right_index=True,
)
zhu_aa_data

In [None]:
# Join the Hu and Zhu AA frames on their shared row labels (inner join on the index).
all_aa_data = hu_aa_data.merge(
    zhu_aa_data,
    left_index=True,
    right_index=True,
)
all_aa_data

In [None]:
# Flip the AA frame so its former columns become rows; the label column is added next.
all_aa_datat = all_aa_data.transpose()

In [None]:
# Tag every row of the transposed AA frame with the class label 1 in a new
# 'Aplastic Anemia' column (this mutates all_aa_datat in place).
all_aa_datat['Aplastic Anemia'] = 1
# Display the labelled frame.
all_aa_datat

In [None]:
# Transpose back: the 'Aplastic Anemia' column added to all_aa_datat becomes a row here.
all_aa_data = all_aa_datat.transpose()
all_aa_data

In [None]:
# Place the two Hu normal patients side by side, matching rows on the index.
# merge defaults to an inner join, so only row labels present in both frames are kept.
hu_healthy_data = hu_n_p1_data['data'].merge(
    hu_n_p2_data['data'],
    left_index=True,
    right_index=True,
)
hu_healthy_data

In [None]:
# Combine the Zhu full-length healthy and 3-prime control counts, matching rows on
# the index (inner join: only row labels present in both frames are kept).
zhu_healthy_data = zhu_fl_sc_healthy_counts.merge(
    zhu_3p_sc_cntrl_counts,
    left_index=True,
    right_index=True,
)
zhu_healthy_data

In [None]:
# Join the Hu and Zhu healthy frames on their shared row labels (inner join on the index).
all_healthy_data = hu_healthy_data.merge(
    zhu_healthy_data,
    left_index=True,
    right_index=True,
)
all_healthy_data

In [None]:
# Flip the healthy frame so its former columns become rows, then tag every row
# with the class label 0 in a new 'Aplastic Anemia' column.
all_healthy_datat = all_healthy_data.transpose()
all_healthy_datat['Aplastic Anemia'] = 0

In [None]:
# Transpose back: the 'Aplastic Anemia' column added to all_healthy_datat becomes a row here.
all_healthy_data = all_healthy_datat.transpose()
all_healthy_data

In [None]:
# Put the labelled AA and healthy frames side by side, matching rows on the index
# (inner join). Column names present in both inputs receive merge's default
# '_x' / '_y' suffixes.
full_dataset = all_aa_data.merge(
    all_healthy_data,
    left_index=True,
    right_index=True,
)
full_dataset

In [None]:
# Remove the two suffixed GeneSymbol columns from the merged frame.
full_dataset = full_dataset.drop(columns=['GeneSymbol_x', 'GeneSymbol_y'])

In [None]:
# Cast every value to integer and keep the result. DataFrame.astype returns a new
# frame, so without the assignment the conversion would only be displayed and then
# discarded, and the later transpose / CSV export would use the unconverted values.
# NOTE(review): astype('int') truncates fractional values; confirm all inputs are
# whole-number counts.
full_dataset = full_dataset.astype('int')
full_dataset

In [None]:
# Flip the merged frame so its row labels become the columns, as the output file name describes.
full_data_genes_cols = full_dataset.transpose()
full_data_genes_cols

In [None]:
# Persist the transposed, merged frame as CSV (index labels are written by default).
full_data_genes_cols.to_csv(
    path_or_buf='../data/merged_datasets_cellrow_genecol.csv',
)