In [1]:
import pandas as pd

In [45]:
def process_barcode(barcode, **kwargs):
    """
    Split a cell barcode into named parts using an ordered list of separators.

    The barcode is consumed from left to right: it is split once at the first
    occurrence of each separator in turn, and whatever remains after the last
    separator becomes the final part.

    Parameters:
    barcode (str): The cell barcode, e.g. 'lane3#AAACAGCCAACCTAAT-1'
    **kwargs: must contain the two entries below; other entries are ignored
        seps (list): separators, in the order they occur in the barcode
        parts (list): names of the parts, in order; exactly one more entry
            than ``seps``

    Returns:
    dict: maps every name in ``parts`` to the matching piece of the barcode.

    Raises:
    TypeError: if ``seps`` or ``parts`` is not supplied.
    ValueError: if ``parts`` does not have exactly one more entry than
        ``seps``, or if a separator does not occur in the remaining text.

    Example Usage:
    params = {
    "seps" : ['#', '-'],
    "parts" :['lane', 'barcode', 'suffix']
    }

    archR_barcodes_sep = df['x'].apply(lambda x: pd.Series(process_barcode(barcode=x, **params)))

    """
    # Explicit checks instead of `assert`, which is removed under `python -O`.
    missing = [key for key in ("seps", "parts") if key not in kwargs]
    if missing:
        raise TypeError(
            "process_barcode() missing required keyword argument(s): "
            + ", ".join(missing)
        )

    seps = kwargs['seps']
    part_names = kwargs['parts']
    if len(seps) != len(part_names) - 1:
        raise ValueError(
            f"expected {len(seps) + 1} part names for {len(seps)} separators, "
            f"got {len(part_names)}"
        )

    ret_dic = {}
    remainder = barcode

    for name, sep in zip(part_names, seps):
        # maxsplit=1: only the first occurrence of the separator is consumed,
        # later occurrences stay in the remainder for the following parts.
        pieces = remainder.split(sep, 1)
        if len(pieces) != 2:
            raise ValueError(
                f"separator {sep!r} not found in {remainder!r} "
                f"while parsing barcode {barcode!r}"
            )
        ret_dic[name], remainder = pieces

    # Everything after the last separator is the final part.
    ret_dic[part_names[-1]] = remainder

    return ret_dic

In [5]:
# Load the barcode table from archR_barcodes.csv; the first CSV column becomes the row index.
# NOTE(review): absolute, user-specific path - this cell only runs where the file
# exists at exactly this location; consider a configurable data directory.
df = pd.read_csv("/Users/jiaxinli/Downloads/archR_barcodes.csv", index_col= 0)

In [8]:
# Show the loaded barcode table as the cell output.
df

Unnamed: 0,x
1,lane3#AAACAGCCAACCTAAT-1
2,lane2#AAACAGCCAACTGGCT-1
3,lane4#AAACAGCCAAGCCAGA-1
4,lane2#AAACAGCCAAGCTAAA-1
5,lane1#AAACAGCCAAGGTAAC-1
...,...
48238,lane2#TTTGTTGGTTCAAGCA-1
48239,lane3#TTTGTTGGTTCCTCCT-1
48240,lane4#TTTGTTGGTTGGCCGA-1
48241,lane1#TTTGTTGGTTTAACGG-1


In [20]:
# Barcode string stored in column 'x' under row label 1.
df.loc[1, 'x']

'lane3#AAACAGCCAACCTAAT-1'

In [26]:
# Sanity-check the parser on one barcode. The separators are passed explicitly
# so this cell does not rely on a `params` variable left over from another cell.
process_barcode(df['x'][1], seps=['#', '-'], parts=['lane', 'barcode', 'suffix'])

{'lane': 'lane3', 'barcode': 'AAACAGCCAACCTAAT', 'suffix': '1'}

In [33]:
# Layout of barcodes such as 'lane3#AAACAGCCAACCTAAT-1': lane '#' barcode '-' suffix.
params = {
    "seps": ['#', '-'],
    "parts": ['lane', 'barcode', 'suffix'],
}

# Parse every barcode and build the frame once from the list of dicts.
# Wrapping each row in its own pd.Series inside .apply gives the same table
# but is far slower; the original row index is kept.
archR_barcodes_sep = pd.DataFrame(
    [process_barcode(barcode=x, **params) for x in df['x']],
    index=df.index,
)


In [28]:
# Load the tab-separated ATAC QC metadata table.
# NOTE(review): absolute, user-specific path - this cell only runs where the file
# exists at exactly this location; consider a configurable data directory.
ATAC_barcodes = pd.read_csv("/Users/jiaxinli/Downloads/Dataset6_sample_1.atac.qc.hg38.metadata.tsv", sep = '\t')

In [30]:
# Layout of the ATAC 'barcode' column: four '_'-separated fields.
# NOTE(review): the input file is named 'Dataset6_sample_1...', so 'sample_1' may be
# a single sample id; this layout splits it into sample='sample' and suffix='1' -
# confirm that is intended.
params = {
    "seps": ['_', '_', '_'],
    "parts": ['barcode', 'dataset', 'sample', 'suffix'],
}

# Parse every barcode and build the frame once from the list of dicts.
# Wrapping each row in its own pd.Series inside .apply gives the same table
# but is far slower; the original row index is kept.
ATAC_barcodes_sep = pd.DataFrame(
    [process_barcode(x, **params) for x in ATAC_barcodes['barcode']],
    index=ATAC_barcodes.index,
)

In [37]:
# Number of ATAC rows whose barcode also occurs in the ArchR table (lane ignored).
# Series.sum() counts the True values vectorised instead of looping in Python.
ATAC_barcodes_sep['barcode'].isin(archR_barcodes_sep['barcode']).sum()

46056

In [42]:
# Number of lane1 ArchR rows whose barcode also occurs in the ATAC table.
# Series.sum() counts the True values vectorised instead of looping in Python.
archR_barcodes_sep.loc[archR_barcodes_sep['lane'] == 'lane1', 'barcode'].isin(ATAC_barcodes_sep['barcode']).sum()

12306

In [43]:
# Total number of ArchR rows assigned to lane1.
int((archR_barcodes_sep['lane'] == 'lane1').sum())

12306

In [39]:
# Total number of rows in the parsed ArchR barcode table.
archR_barcodes_sep.shape[0]

48242

In [38]:
# Number of distinct values in the 'barcode' column of the ArchR table.
archR_barcodes_sep['barcode'].nunique(dropna=False)

46905

In [32]:
# Show the parsed ATAC barcode table as the cell output.
ATAC_barcodes_sep

Unnamed: 0,barcode,dataset,sample,suffix
0,AAACAGCCAAACCTTG,Dataset6,sample,1
1,AAACAGCCAAACGCGA,Dataset6,sample,1
2,AAACAGCCAAACGGGC,Dataset6,sample,1
3,AAACAGCCAAAGCGCA,Dataset6,sample,1
4,AAACAGCCAAAGCTAA,Dataset6,sample,1
...,...,...,...,...
352182,TTTGTTGGTTTGAGGC,Dataset6,sample,1
352183,TTTGTTGGTTTGCAGA,Dataset6,sample,1
352184,TTTGTTGGTTTGGCTT,Dataset6,sample,1
352185,TTTGTTGGTTTGGGCG,Dataset6,sample,1
