# Investigation of a bias in ARFs

- by obtaining codon frequencies and codon pair frequencies in non-overlapping regions of canonical ORFs (this notebook)
- then, by calculating the expected codon pair frequency and ARF codon frequency ([next notebook](./210527_2_calc_expected_freqs.ipynb))


In [1]:
import pandas as pd
from itertools import product
from collections import Counter
from tqdm.notebook import tqdm

In [2]:
from pyscripts.config import path2
from pyscripts.datasets import Metadata, DatasetLoader
from pyscripts.genomeutil import sliding, iter_cds_nonoverlapping_regions
# Project helpers from pyscripts.datasets, instantiated once for the notebook:
# later cells read `metadata.acc['refseq']` for the accession list and call
# `dloader.load_genome(acc)` to fetch the records of one genome.
metadata = Metadata()
dloader  = DatasetLoader()

In [3]:
# Row labels for the count tables: every unambiguous (A/C/G/T only) codon and
# codon pair, in lexicographic order. Codons or codon pairs containing
# ambiguous bases are not listed here and are therefore ignored.
codon_idx   = [''.join(triplet) for triplet in product('ACGT', repeat=3)]
bicodon_idx = [first + second for first in codon_idx for second in codon_idx]
 
def count_freqs(acc):
    """Tally nucleotides, codons and codon pairs over the CDS regions of a genome.

    The records for ``acc`` are loaded with ``dloader.load_genome`` and only the
    longest one (by ``len``) is used.  For each ``(locus_tag, location)`` pair
    yielded by ``iter_cds_nonoverlapping_regions`` the region sequence is
    extracted and counted three ways: per character, per
    ``sliding(seq, 3, 3)`` window and per ``sliding(seq, 6, 3)`` window
    (presumably in-frame codons and overlapping codon pairs -- confirm against
    ``pyscripts.genomeutil.sliding``).

    Parameters
    ----------
    acc
        Accession handed to ``dloader.load_genome``.

    Returns
    -------
    tuple
        ``(acc, nts, codons, bicodons)``.  The last three are Series holding
        the totals over all regions, indexed by ``'ACGT'``, ``codon_idx`` and
        ``bicodon_idx`` respectively.  Keys outside those indexes (anything
        containing an ambiguous base) are dropped during alignment.
    """
    longest_record = max(dloader.load_genome(acc), key=len)
    longest_seq    = str(longest_record.seq)
    int32          = pd.Int32Dtype()

    # Gather the per-region counts in plain dicts and build each table once
    # afterwards.  Inserting thousands of columns one at a time into a
    # DataFrame reallocates it repeatedly and triggers pandas' "highly
    # fragmented" PerformanceWarning.
    # NOTE(review): as with the previous column assignment, a locus_tag that
    # is yielded more than once keeps only its last region -- confirm that
    # iter_cds_nonoverlapping_regions yields each locus_tag at most once.
    nt_cols, codon_cols, bicodon_cols = {}, {}, {}
    for locus_tag, location in iter_cds_nonoverlapping_regions(longest_record):
        locus_seq = location.extract(longest_seq)
        nt_cols[locus_tag]      = pd.Series(Counter(locus_seq), dtype=int32)
        codon_cols[locus_tag]   = pd.Series(Counter(sliding(locus_seq, 3, 3)), dtype=int32)
        bicodon_cols[locus_tag] = pd.Series(Counter(sliding(locus_seq, 6, 3)), dtype=int32)

    # Passing `index=` reindexes every Series to the fixed k-mer list: k-mers
    # that were not observed become <NA>, unlisted keys are discarded.
    nts      = pd.DataFrame(nt_cols, index=[*'ACGT'], dtype=int32)
    codons   = pd.DataFrame(codon_cols, index=codon_idx, dtype=int32)
    bicodons = pd.DataFrame(bicodon_cols, index=bicodon_idx, dtype=int32)

    # Sum across regions (columns); <NA> entries are skipped by sum().
    return acc, nts.T.sum(), codons.T.sum(), bicodons.T.sum()

In [4]:
from multiprocessing import Pool

# One column per RefSeq accession, pre-allocated as nullable Int32 so that
# accessions without a result stay <NA>.
accessions = metadata.acc['refseq']

nts_summary      = pd.DataFrame(index=[*'ACGT']  , columns=accessions, dtype=pd.Int32Dtype())
codons_summary   = pd.DataFrame(index=codon_idx  , columns=accessions, dtype=pd.Int32Dtype())
bicodons_summary = pd.DataFrame(index=bicodon_idx, columns=accessions, dtype=pd.Int32Dtype())

# Count every genome in parallel.  Results arrive in completion order, which
# is fine because each one is stored under its own accession column.
# NOTE(review): the pool size is fixed at 100 worker processes; consider
# sizing it from os.cpu_count() when running on a smaller machine.
with Pool(100) as pool:
    results = pool.imap_unordered(count_freqs, accessions)
    # The progress total is taken from the same object that is iterated, so
    # the bar cannot disagree with the number of submitted tasks.
    for acc, nts, codons, bicodons in tqdm(results, total=len(accessions)):
        nts_summary[acc]      = nts
        codons_summary[acc]   = codons
        bicodons_summary[acc] = bicodons

# Persist the tables with columns sorted by accession.
nts_summary.sort_index(axis=1).to_pickle(path2.data/'kmer'/'nts_summary.pkl.bz2')
codons_summary.sort_index(axis=1).to_pickle(path2.data/'kmer'/'codons_summary.pkl.bz2')
bicodons_summary.sort_index(axis=1).to_pickle(path2.data/'kmer'/'bicodons_summary.pkl.bz2')

  0%|          | 0/2624 [00:00<?, ?it/s]