# Process_barcode


In [1]:
import pandas as pd

In [45]:
def process_barcode(barcode, **kwargs):
    """
    Split a cell barcode into named parts using an ordered list of separators.

    The barcode is consumed left to right: it is split once on the first
    separator, the left piece is stored under the first part name, the
    remainder is split once on the next separator, and so on. Whatever is
    left after the last separator is stored under the last part name, so the
    final part may itself still contain separator characters.

    Parameters:
    barcode (str): The cell barcode
    seps (list): a list of separators in order (passed via **kwargs)
    parts (list): a list of names of parts in order (passed via **kwargs);
        must be exactly one element longer than `seps`

    Returns:
    dict: maps every name in `parts` to the matching piece of the barcode.

    Raises:
    TypeError: if `seps` or `parts` is not supplied.
    ValueError: if len(seps) != len(parts) - 1, or if a separator does not
        occur in the part of the barcode that is still unprocessed.

    Example Usage:
    params = {
    "seps" : ['#', '-'],
    "parts" :['lane', 'barcode', 'suffix']
    }

    archR_barcodes_sep = df['x'].apply(lambda x: pd.Series(process_barcode(barcode=x, **params)))

    """
    # Explicit checks instead of `assert`: asserts vanish under `python -O`
    # and give the caller no hint about what was wrong.
    missing = [key for key in ("seps", "parts") if key not in kwargs]
    if missing:
        raise TypeError(
            "process_barcode() missing required keyword argument(s): "
            + ", ".join(missing)
        )

    seps = kwargs["seps"]
    parts = kwargs["parts"]
    if len(seps) != len(parts) - 1:
        raise ValueError(
            f"expected exactly one more part name than separators, "
            f"got {len(parts)} part(s) and {len(seps)} separator(s)"
        )

    ret_dic = {}
    remainder = barcode

    # zip stops after len(seps) names, leaving parts[-1] for the tail.
    for name, sep in zip(parts, seps):
        head, found, tail = remainder.partition(sep)
        if not found:
            # Previously this surfaced as an IndexError with no context.
            raise ValueError(
                f"separator {sep!r} not found in {remainder!r} "
                f"while extracting {name!r} from barcode {barcode!r}"
            )
        ret_dic[name] = head
        remainder = tail

    ret_dic[parts[-1]] = remainder

    return ret_dic

In [5]:
# Load the ArchR barcode table; the first CSV column becomes the index.
# NOTE: absolute, machine-specific path -- adjust before re-running elsewhere.
df = pd.read_csv("/Users/jiaxinli/Downloads/archR_barcodes.csv", index_col= 0)

In [8]:
df

Unnamed: 0,x
1,lane3#AAACAGCCAACCTAAT-1
2,lane2#AAACAGCCAACTGGCT-1
3,lane4#AAACAGCCAAGCCAGA-1
4,lane2#AAACAGCCAAGCTAAA-1
5,lane1#AAACAGCCAAGGTAAC-1
...,...
48238,lane2#TTTGTTGGTTCAAGCA-1
48239,lane3#TTTGTTGGTTCCTCCT-1
48240,lane4#TTTGTTGGTTGGCCGA-1
48241,lane1#TTTGTTGGTTTAACGG-1


In [20]:
# Peek at the raw barcode string stored under index label 1.
df.x[1]

'lane3#AAACAGCCAACCTAAT-1'

In [26]:
# Spot-check the parser on a single barcode. Requires `params` (the '#'/'-'
# definition) to already exist in the kernel; in file order it is only
# assigned in a later cell.
process_barcode(df['x'][1], **params)

{'lane': 'lane3', 'barcode': 'AAACAGCCAACCTAAT', 'suffix': '1'}

In [33]:
# ArchR barcodes look like 'lane3#AAACAGCCAACCTAAT-1':
# split on '#' first, then on '-', giving lane / barcode / suffix.
params = {
    "seps" : ['#', '-'],
    "parts" :['lane', 'barcode', 'suffix']
}

# Wrapping each returned dict in pd.Series makes apply() produce a DataFrame
# with one column per part name.
archR_barcodes_sep = df['x'].apply(lambda x: pd.Series(process_barcode(barcode=x, **params)))


In [28]:
# Load the ATAC QC metadata table (tab-separated).
# NOTE: absolute, machine-specific path -- adjust before re-running elsewhere.
ATAC_barcodes = pd.read_csv("/Users/jiaxinli/Downloads/Dataset6_sample_1.atac.qc.hg38.metadata.tsv", sep = '\t')

In [30]:
# The ATAC 'barcode' column is split on three successive underscores into
# barcode / dataset / sample / suffix. This rebinds the global `params`
# used for the ArchR barcodes.
params = {
    "seps" : ['_', '_', '_'],
    "parts" :['barcode', 'dataset', 'sample', 'suffix']
}

# One row per input barcode, one column per part name.
ATAC_barcodes_sep = ATAC_barcodes['barcode'].apply(lambda x: pd.Series(process_barcode(x, **params)))

In [37]:
# Number of ATAC rows whose barcode sequence also appears among the ArchR
# barcodes (lane is ignored in this comparison).
sum(ATAC_barcodes_sep['barcode'].isin(archR_barcodes_sep['barcode']))

46056

In [42]:
# Number of ArchR lane1 barcodes that are present in the ATAC barcode set.
sum(archR_barcodes_sep.loc[archR_barcodes_sep['lane'] == 'lane1', 'barcode'].isin(ATAC_barcodes_sep['barcode']))

12306

In [43]:
# Total number of ArchR lane1 barcodes, for comparison with the overlap count.
len(archR_barcodes_sep.loc[archR_barcodes_sep['lane'] == 'lane1', 'barcode'])

12306

In [39]:
# Total number of ArchR barcode rows (all lanes).
len(archR_barcodes_sep)

48242

In [38]:
# Number of distinct barcode sequences once the lane prefix is dropped; a
# value below the row count means some sequences occur in more than one row.
len(set(archR_barcodes_sep['barcode']))

46905

In [32]:
ATAC_barcodes_sep

Unnamed: 0,barcode,dataset,sample,suffix
0,AAACAGCCAAACCTTG,Dataset6,sample,1
1,AAACAGCCAAACGCGA,Dataset6,sample,1
2,AAACAGCCAAACGGGC,Dataset6,sample,1
3,AAACAGCCAAAGCGCA,Dataset6,sample,1
4,AAACAGCCAAAGCTAA,Dataset6,sample,1
...,...,...,...,...
352182,TTTGTTGGTTTGAGGC,Dataset6,sample,1
352183,TTTGTTGGTTTGCAGA,Dataset6,sample,1
352184,TTTGTTGGTTTGGCTT,Dataset6,sample,1
352185,TTTGTTGGTTTGGGCG,Dataset6,sample,1


In [46]:
# str.split() with no argument splits on any run of whitespace, so the tab
# separates the two fields.
a = "abc\tabc"
a.split()

['abc', 'abc']

In [None]:
class GenomewideGenerator(torch.utils.data.Dataset):
    """Dataset that samples random genome-wide loci in random cells.

    Adapted from bpnet-lite. Every call to `__getitem__` ignores the requested
    index for the purpose of choosing *where* to look: it draws a chromosome
    with probability proportional to its length, a centre position on that
    chromosome, and a cell, and then hands those to `_extract_example` to
    build the actual tensors. The index is only forwarded so that
    `_extract_example` can decide whether to reverse complement.

    Parameters
    ----------
    sequence: dict
        Maps chromosome name to that chromosome's sequence array. Only
        `shape[0]` (taken as the chromosome length) is read in this class.
        Presumably one-hot encoded with shape (length, 4) -- confirm against
        the loader.

    signal: dict
        Maps chromosome name to that chromosome's per-cell signal. Consumed
        by `_extract_example`.

    neighbors:
        Stored as-is; `_extract_example` indexes it by cell index.

    cell_states:
        Array-like whose first dimension is the number of cells; a cell index
        is drawn uniformly from `range(cell_states.shape[0])`.

    read_depths:
        Stored as-is; `_extract_example` indexes it by cell index.

    trimming: int
        Stored as-is; used by `_extract_example` when slicing the signal.

    window: int
        Stored as-is; used by `_extract_example` as the extraction width.

    chroms: list of str
        Chromosome names to keep from `sequence` and `signal`.

    cells_per_loci: int, optional
        Stored on the instance; not used by the methods of this class.
        Default is 1.

    reverse_complement: bool, optional
        Stored on the instance and read by `_extract_example`. Default is
        True.

    random_state: int or None, optional
        Seed for `self.random_state`. Note that `__getitem__` draws from the
        global `numpy.random` functions, not from this object. Default is
        None.
    """

    def __init__(self, sequence, signal, neighbors, cell_states,
        read_depths, trimming, window, chroms, cells_per_loci=1,
        reverse_complement=True, random_state=None):
        # Plain configuration values.
        self.trimming = trimming
        self.window = window
        self.chroms = chroms
        self.cells_per_loci = cells_per_loci
        self.reverse_complement = reverse_complement
        self.random_state = numpy.random.RandomState(random_state)

        # Keep only the requested chromosomes, in the order given.
        self.signal = dict((name, signal[name]) for name in chroms)
        self.sequence = dict((name, sequence[name]) for name in chroms)

        # Per-cell data, indexed by cell index downstream.
        self.neighbors = neighbors
        self.cell_states = cell_states
        self.read_depths = read_depths

        # Chromosome lengths, aligned with the key order of self.sequence.
        chrom_lengths = [self.sequence[name].shape[0] for name in self.sequence]
        self._lengths = numpy.array(chrom_lengths)

    def __len__(self):
        """Return the summed length of all kept chromosomes."""
        return sum(self._lengths)

    def __getitem__(self, idx):
        """Draw a random (chromosome, position, cell) and extract one example.

        `idx` does not select the locus; it is only passed through to
        `_extract_example`.
        """
        # Chromosomes are chosen in proportion to their length.
        n_chroms = len(self._lengths)
        weights = self._lengths / self._lengths.sum()
        c_idx = numpy.random.choice(n_chroms, p=weights)
        chrom = self.chroms[c_idx]

        # Stay 10,000 positions away from either chromosome end.
        upper = self._lengths[c_idx] - 10000
        mid = numpy.random.randint(10000, upper)

        n_cells = self.cell_states.shape[0]
        cell_idx = numpy.random.randint(n_cells)
        return _extract_example(self, chrom, mid, cell_idx, idx)

In [None]:
def _extract_example(self, chrom, mid, cell_idx, idx):
    """An internal function for extracting a single example.

    This function will extract an example from a given position in a given
    cell and handle adding jitter, creating the dynamic pseudobulk, and
    potentially reverse complementing the sequence. It will return the
    one-hot encoded sequence, signal, cell representation, and read depth
    of that cell.

    Note that this function returns a *single* example and that the data
    generators below handle the creation of batches by repeatedly calling this
    function and concatenating the examples.


    Parameters
    ----------
    self: torch.utils.data.Dataset
        This is one of the data generators defined below. Although they may
        differ in how loci and cells are selected, they share the same logic
        for how to extract the inputs given a cell and location.

    chrom: str
        The chromosome name to extract from.

    mid: int
        The middle position to extract a window around.

    cell_idx: int
        The integer index of the cell to operate on.

    idx: int
        The index being generated. Necessary when reverse complementing every
        even sequence.


    Returns
    -------
    X: torch.Tensor, shape=(4, 2114)
        The one-hot encoded sequence

    y: torch.Tensor, shape=(1000,)
        The signal to be predicted

    c: torch.Tensor, shape=(50,)
        The cell representation to be extracted.

    r: torch.Tensor, shape=(1,)
        The read depth of that particular cell.
    """

    start, end = mid - self.window // 2, mid + self.window // 2
    neighbs = self.neighbors[cell_idx]

    X = self.sequence[chrom][start:end].T.astype('float32')
    y = self.signal[chrom][:, start+self.trimming:end-self.trimming]
    y = numpy.array(y[neighbs].sum(axis=0))[0]

    c = self.cell_states[cell_idx]
    r = self.read_depths[cell_idx]

    if self.reverse_complement and idx % 2 == 0:
        X = X[::-1][:, ::-1].copy()
        y = y[::-1].copy()

    X = torch.from_numpy(X)
    y = torch.from_numpy(y)
    c = torch.from_numpy(c)
    r = torch.from_numpy(r)
    return X, y, c, r
∏