# This is a direct adaptation of the notebook in Morris Lab git repo: https://github.com/morris-lab/newCloneCalling/blob/main/cloneCalling_scripts/celltag_analysis_single_assay.ipynb

Installing/importing the necessary files

In [148]:
# Colab-only setup: mount Google Drive, then put the CellTag folder stored on it
# on the module search path.
import sys
from google.colab import drive

drive.mount('/content/drive')
sys.path.append('/content/drive/MyDrive/CellTag/')



Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [149]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
from scipy import io
import matplotlib.backends.backend_pdf


In [3]:
#PARAMS
# NOTE(review): TRIPLET_TH, STARCODE_TH and BIN_TH are not referenced in the visible
# part of this notebook; presumably they mirror thresholds of the upstream
# clone-calling pipeline -- confirm before relying on them.
TRIPLET_TH = 1
STARCODE_TH = 2
BIN_TH = 1
# METRIC_LOW / METRIC_HIGH hold the same values (1 and 25) as the bounds of the
# per-cell barcode-count filter applied later in this notebook.
METRIC_LOW = 1
METRIC_HIGH = 25

# Doing hsc_celltags_raw first

Input the _met_ctmat.mtx_, _met_celltags.txt_ and _met_cells.txt_ files

The upstream notebook saved these files with the code below, so the same variable names are used here:

io.mmwrite("../proc_files/{0}/{0}_met_ctmat.mtx".format(KEY_CURR), celltag_mat_met)
np.savetxt("../proc_files/{0}/{0}_met_cells.txt".format(KEY_CURR),cells_met, delimiter='\t', fmt='%s')
np.savetxt("../proc_files/{0}/{0}_met_celltags.txt".format(KEY_CURR),celltags_met, delimiter='\t', fmt='%s')



In [199]:
# Sparse count matrix stored in Matrix Market format.
celltag_matrix_path = "/content/drive/MyDrive/CellTag/Raw Data/hsc_celltags_raw/lsk_rna/lsk_rna_allow_alt_celltag_mat.mtx"
celltag_mat_met = io.mmread(celltag_matrix_path)

# Text labels that spmtx_to_table uses to name the matrix rows (cells).
cells_allow_path = "/content/drive/MyDrive/CellTag/Raw Data/hsc_celltags_raw/lsk_rna/lsk_rna_allow_alt_cells.txt"
cells_met = np.loadtxt(cells_allow_path, dtype=str)

# Text labels that spmtx_to_table uses to name the matrix columns (celltags).
celltags_allow_path = "/content/drive/MyDrive/CellTag/Raw Data/hsc_celltags_raw/lsk_rna/lsk_rna_allow_alt_celltags.txt"
celltags_met = np.loadtxt(celltags_allow_path, dtype=str)


Function to recover the cell IDs, CellTag barcodes and UMI counts from the sparse matrix, reversing the conversion that the upstream notebook applied before saving it

In [200]:
def spmtx_to_table(celltag_mat, cells, celltags):
    '''
    Convert a sparse count matrix into three parallel "long format" arrays.

    The matrix is never densified: the coordinates of the stored entries are
    read directly from the sparse structure, so memory use scales with the
    number of non-zero entries instead of rows x columns.

    Parameters
    ----------
    celltag_mat : scipy.sparse matrix (any format)
        Count matrix; rows are labelled by ``cells`` and columns by ``celltags``.
    cells : array-like of length ``celltag_mat.shape[0]``
        Row labels.
    celltags : array-like of length ``celltag_mat.shape[1]``
        Column labels.

    Returns
    -------
    (row_data, col_data, count_data) : tuple of numpy arrays
        One element per non-zero matrix entry, ordered by row and then by
        column: the row label, the column label and the stored value.
        Three empty arrays are returned when the matrix stores no entries.
    '''
    if celltag_mat.nnz == 0:
        # Nothing stored at all: keep the historical return value.
        return np.array([]), np.array([]), np.array([])

    # Work on a private CSR copy so the in-place clean-up below can never
    # modify the caller's matrix.
    canonical = celltag_mat.tocsr(copy=True)
    canonical.sum_duplicates()   # repeated (row, col) entries add up, as in a dense array
    canonical.eliminate_zeros()  # explicitly stored zeros are not real entries
    canonical.sort_indices()     # columns ascending within each row

    # CSR -> COO walks the rows in order, which yields row-major ordering.
    entries = canonical.tocoo()

    row_data = np.asarray(cells)[entries.row]
    col_data = np.asarray(celltags)[entries.col]
    count_data = entries.data
    return row_data, col_data, count_data


Calling the function for this folder

In [201]:
# Flatten the loaded matrix into three parallel arrays:
# one cell label, one barcode and one count per returned entry.
CellList, BarcodeList, UMICounts = spmtx_to_table(
    celltag_mat=celltag_mat_met,
    cells=cells_met,
    celltags=celltags_met,
)

In [202]:
# Preview the cell labels returned by spmtx_to_table.
CellList

array(['d5r2-GAAGCCCAGAAGTATC-1', 'd5r2-GAAGCCCAGAAGTATC-1',
       'd5r2-GAAGCCCAGAAGTATC-1', ..., 'd5r2-AATGGCTTCACTAGCA-4',
       'd5r2-AATGGCTTCACTAGCA-4', 'd5r2-AATGGCTTCACTAGCA-4'], dtype='<U23')

Creating the dataframe that contains all this information

In [203]:
# Every cell label is split on "-" into three fields; the barcode and count
# arrays, which are parallel to CellList, are attached as extra columns.
split_labels = [label.split("-") for label in CellList]
newData = pd.DataFrame(
    split_labels, columns = ["Incomplete sample", "cellID", "Sample ID"]
).assign(**{"barcode": BarcodeList, "UMI counts": UMICounts})

In [206]:
# Every row of this table was loaded from the lsk_rna files, so the assay kind is constant.
newData['kind'] = "RNA"
# Rebuild the sample label as "<first two chars>-<kind>-<chars after the third>",
# e.g. "d5r2" -> "d5-RNA-2". Vectorised string slicing replaces the row-wise
# apply(axis=1), which called a Python lambda once per row.
newData['sample'] = (
    newData['Incomplete sample'].str[:2]
    + "-" + newData['kind']
    + "-" + newData['Incomplete sample'].str[3:]
)
newData

Unnamed: 0,Incomplete sample,cellID,Sample ID,barcode,UMI counts,kind,sample
0,d5r2,GAAGCCCAGAAGTATC,1,GTAGTATGCTAATAGATATGATTCAAAA,26,RNA,d5-RNA-2
1,d5r2,GAAGCCCAGAAGTATC,1,TTTGTTTTCTATCAGTCCTGCGACAAGC,1,RNA,d5-RNA-2
2,d5r2,GAAGCCCAGAAGTATC,1,CTTGTGTCCTTCTAGAATTGTTTCATCT,1,RNA,d5-RNA-2
3,d5r2,GAAGCCCAGAAGTATC,1,ATGGTCGGCTAAGAGTCGTGTCGCAGCC,1,RNA,d5-RNA-2
4,d5r2,GAAGCCCAGAAGTATC,1,CTTGTCACCTTTCAGCTCTGTAACAAGT,1,RNA,d5-RNA-2
...,...,...,...,...,...,...,...
907307,d5r2,AATGGCTTCACTAGCA,4,GTAGTTAGCTAAAAGTGATGGGACAATA,1,RNA,d5-RNA-2
907308,d5r2,AATGGCTTCACTAGCA,4,AGAGTTGGCTTGAAGAGGTGCGACACTA,68,RNA,d5-RNA-2
907309,d5r2,AATGGCTTCACTAGCA,4,AATGTTTACTAGCAGTTATGGGGCATTA,1,RNA,d5-RNA-2
907310,d5r2,AATGGCTTCACTAGCA,4,AGTGTGCGCTTGTAGCTTTGTGACACAT,1,RNA,d5-RNA-2


In [207]:
# List the distinct values of the rebuilt 'sample' column.
newData['sample'].unique()

array(['d5-RNA-2', 'd5-RNA-1', 'd2-RNA-5'], dtype=object)

Filtering the cells based on the number of unique barcodes associated with them: cells with between 1 and 25 barcodes (both bounds inclusive) are kept

In [208]:
# A cell is identified by all three fields of its original label, not by the
# 16-mer alone: the same cell-barcode sequence can occur under different
# samples / 'Sample ID' suffixes, and pooling those would inflate the count.
cell_key = ['Incomplete sample', 'cellID', 'Sample ID']
# Number of unique lineage barcodes of the cell each row belongs to (row-aligned with newData).
barcode_counts = newData.groupby(cell_key)['barcode'].transform('nunique')
# Keep cells whose barcode count lies in [METRIC_LOW, METRIC_HIGH]; between() is
# inclusive on both ends, matching the previous hard-coded between(1, 25).
filtered_data = newData[barcode_counts.between(METRIC_LOW, METRIC_HIGH)].reset_index(drop=True)
filtered_data

Unnamed: 0,Incomplete sample,cellID,Sample ID,barcode,UMI counts,kind,sample
0,d5r2,GAAGCCCAGAAGTATC,1,GTAGTATGCTAATAGATATGATTCAAAA,26,RNA,d5-RNA-2
1,d5r2,GAAGCCCAGAAGTATC,1,TTTGTTTTCTATCAGTCCTGCGACAAGC,1,RNA,d5-RNA-2
2,d5r2,GAAGCCCAGAAGTATC,1,CTTGTGTCCTTCTAGAATTGTTTCATCT,1,RNA,d5-RNA-2
3,d5r2,GAAGCCCAGAAGTATC,1,ATGGTCGGCTAAGAGTCGTGTCGCAGCC,1,RNA,d5-RNA-2
4,d5r2,GAAGCCCAGAAGTATC,1,CTTGTCACCTTTCAGCTCTGTAACAAGT,1,RNA,d5-RNA-2
...,...,...,...,...,...,...,...
815621,d5r2,AATGGCTTCACTAGCA,4,GTAGTTAGCTAAAAGTGATGGGACAATA,1,RNA,d5-RNA-2
815622,d5r2,AATGGCTTCACTAGCA,4,AGAGTTGGCTTGAAGAGGTGCGACACTA,68,RNA,d5-RNA-2
815623,d5r2,AATGGCTTCACTAGCA,4,AATGTTTACTAGCAGTTATGGGGCATTA,1,RNA,d5-RNA-2
815624,d5r2,AATGGCTTCACTAGCA,4,AGTGTGCGCTTGTAGCTTTGTGACACAT,1,RNA,d5-RNA-2


In [209]:
# Columns kept for the exported table; 'UMI counts' is only needed to expand rows.
selected_columns = ['cellID', 'barcode', 'sample', 'UMI counts']
final_df = filtered_data[selected_columns]

# Repeat every row 'UMI counts' times, renumber the rows,
# then drop the 'UMI counts' column.
repeated_labels = np.repeat(final_df.index, final_df['UMI counts'])
final_df = (
    final_df.loc[repeated_labels]
    .reset_index(drop=True)
    .drop(columns='UMI counts')
)
final_df.to_csv("/content/drive/MyDrive/CellTag/ProcessedData/lsk_rna.csv",index=False)

In [210]:
# Names of the per-sample CSV files (without extension) found in ProcessedData.
sample_list = ['lsk_rna', 'lsk_atac', 'B4-RNA-r2', 'B4-RNA-r1', 'B4-ATAC-r2', 'B4-ATAC-r1']
# Assay type of each entry of sample_list, in the same order. Not used by the
# merge itself; kept because other cells may refer to it.
Technique = ["RNA", "ATAC", "RNA", "RNA", "ATAC", "ATAC"]

# Read every per-sample table first and concatenate once: calling pd.concat
# inside the loop re-copies all previously merged rows on every iteration.
sample_frames = [
    pd.read_csv(f'/content/drive/MyDrive/CellTag/ProcessedData/{sample}.csv')
    for sample in sample_list
]
all_sample_df = pd.concat(sample_frames, ignore_index=True)
all_sample_df


lsk_rna RNA
lsk_atac ATAC
B4-RNA-r2 RNA
B4-RNA-r1 RNA
B4-ATAC-r2 ATAC
B4-ATAC-r1 ATAC


Unnamed: 0,cellID,barcode,sample
0,GAAGCCCAGAAGTATC,GTAGTATGCTAATAGATATGATTCAAAA,d5-RNA-2
1,GAAGCCCAGAAGTATC,GTAGTATGCTAATAGATATGATTCAAAA,d5-RNA-2
2,GAAGCCCAGAAGTATC,GTAGTATGCTAATAGATATGATTCAAAA,d5-RNA-2
3,GAAGCCCAGAAGTATC,GTAGTATGCTAATAGATATGATTCAAAA,d5-RNA-2
4,GAAGCCCAGAAGTATC,GTAGTATGCTAATAGATATGATTCAAAA,d5-RNA-2
...,...,...,...
22974546,TTTGTGTTCTTGTACT,TATGTTTTCTACTAGAGCTGGGGCAAAT,B4D3-ATAC-r1-6
22974547,TTTGTGTTCTTGTACT,TCCGTTTGCTACAAGCGGTGACGCAACA,B4D3-ATAC-r1-6
22974548,TTTGTGTTCTTGTACT,TGAGTTTACTGGCAGGATTGCATCAAAC,B4D3-ATAC-r1-6
22974549,TTTGTGTTCTTGTACT,TGCGTGTTCTAGAAGAATTGATTCATTC,B4D3-ATAC-r1-6


In [211]:
# Save the merged table of all samples as one CSV file, without the row index.
all_sample_df.to_csv('/content/drive/MyDrive/CellTag/ProcessedData/all_samples.csv', index=False)
