This notebook statistically analyses a sample of real DNA sequences. The resulting statistics are used for error correction.
The databases used are: [UCSC Genome Browser](https://genome-euro.ucsc.edu/cgi-bin/hgGateway) and [NCBI RefSeq](https://www.ncbi.nlm.nih.gov/refseq/).

## Header

In [5]:
from Bio import SeqIO
import polars as pl
import yaml

## Data

In [6]:
# Turn every FASTA record into a plain dict holding its identifier, its
# length in bases, and the sequence text.
data = [
    {
        "id": record.id,
        "length": len(record.seq),
        "sequence": str(record.seq),
    }
    for record in SeqIO.parse("../data/yeast_test_sequence.fasta", "fasta")
]

sequences = pl.DataFrame(data)

# Add one count column per nucleotide. Upper- and lower-case occurrences
# (e.g. "A" and "a") are counted separately and summed, so the totals do not
# depend on the case used in the FASTA file.
sequences = sequences.with_columns(
    [
        (
            pl.col("sequence").str.count_matches(base)
            + pl.col("sequence").str.count_matches(base.lower())
        ).alias(base)
        for base in ["A", "T", "G", "C"]
    ]
)
sequences.head()

id,length,sequence,A,T,G,C
str,i64,str,u32,u32,u32,u32
"""sacCer3_ncbiRefSeq_NM_00117815…",1585,"""gttgtggcgccacacttttttttccataaa…",452,416,319,398
"""sacCer3_ncbiRefSeq_NM_00118004…",763,"""ataatgagattgtgtgaaagatgagatata…",251,210,139,163
"""sacCer3_ncbiRefSeq_NM_00118458…",628,"""catgggttgttgctatttaaacgatcgctg…",170,207,138,113
"""sacCer3_ncbiRefSeq_NM_00117820…",2182,"""actacgggtgcctccacaaatagataagaa…",692,729,415,346
"""sacCer3_ncbiRefSeq_NM_00117989…",787,"""cgacatggtgtcctatttccaccacagaat…",225,240,148,174


## Bases stats

In [7]:
# Total number of bases over all sequences; used as the denominator below.
length = int(sequences["length"].sum())

# Share of each nucleotide, expressed as a percentage of the total length.
baseDistribution = {
    base: float(100 * sequences[base].sum() / length)
    for base in ("A", "T", "G", "C")
}

print(baseDistribution)

{'A': 32.969778188685225, 'T': 28.877710200759886, 'G': 19.527052990794637, 'C': 18.625458619760252}


In [8]:
# Collect everything to export under descriptive top-level keys.
stats = {"bases distribution": baseDistribution}

# Write the statistics as YAML into the notebook's working directory.
with open('statistics.yaml', 'w') as yaml_file:
    yaml.dump(stats, yaml_file)