In [3]:
!pip install Bio

Collecting Bio
  Downloading bio-1.7.1-py3-none-any.whl.metadata (5.7 kB)
Collecting biopython>=1.80 (from Bio)
  Downloading biopython-1.84-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting gprofiler-official (from Bio)
  Downloading gprofiler_official-1.0.0-py3-none-any.whl.metadata (11 kB)
Collecting mygene (from Bio)
  Downloading mygene-3.2.2-py2.py3-none-any.whl.metadata (10 kB)
Collecting biothings-client>=0.2.6 (from mygene->Bio)
  Downloading biothings_client-0.3.1-py2.py3-none-any.whl.metadata (9.8 kB)
Downloading bio-1.7.1-py3-none-any.whl (280 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m281.0/281.0 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading biopython-1.84-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m47.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading gprofiler_official-1.0.0-py3-none-any.whl

In [4]:
from Bio import SeqIO
from collections import Counter
import pandas as pd


def _kmer_table(sequence, k, label):
    """Count every overlapping k-mer of ``sequence`` and tabulate the result.

    Args:
        sequence: The sequence string to scan.
        k: Window length (1 = single nucleotides, 2 = di-nucleotides, ...).
        label: Name of the column that holds the k-mer strings.

    Returns:
        A DataFrame with columns ``label``, ``"Count"`` and ``"Percentage"``;
        rows appear in order of first occurrence in ``sequence``. The table is
        empty when ``sequence`` is shorter than ``k``.
    """
    # Number of sliding windows; zero or negative means there is nothing to count.
    window_count = len(sequence) - k + 1
    counts = Counter(sequence[i:i + k] for i in range(window_count))
    return pd.DataFrame({
        label: list(counts.keys()),
        "Count": list(counts.values()),
        # Percentages are relative to the number of windows, not to len(sequence).
        "Percentage": [count / window_count * 100 for count in counts.values()],
    })


def analyze_dna(file_path, min_length=2000):
    """Compute nucleotide, di-nucleotide and tri-nucleotide statistics of a GenBank record.

    Args:
        file_path: Path passed to ``SeqIO.read`` with the ``"genbank"`` format.
            NOTE(review): ``SeqIO.read`` is expected to reject files that do not
            hold exactly one record -- confirm against the Biopython docs.
        min_length: Minimum accepted sequence length in nucleotides
            (default 2000, the threshold that used to be hard-coded).

    Returns:
        A tuple ``(simple_table, di_table, tri_table, gc_content)``: three
        DataFrames (k-mer, ``Count``, ``Percentage``) for k = 1, 2, 3, and the
        GC content as a percentage of the whole sequence.

    Raises:
        ValueError: If the sequence is shorter than ``min_length``.
    """
    # Load the sequence from the GenBank file and normalise it to upper case.
    record = SeqIO.read(file_path, "genbank")
    dna_sequence = str(record.seq).upper()

    if len(dna_sequence) < min_length:
        raise ValueError(f"طول دنباله DNA باید حداقل {min_length} نوکلئوتید باشد.")

    # Overlapping (sliding-window) counts for k = 1, 2 and 3.
    simple_table = _kmer_table(dna_sequence, 1, "Nucleotide")
    di_table = _kmer_table(dna_sequence, 2, "Di-nucleotide")
    tri_table = _kmer_table(dna_sequence, 3, "Tri-nucleotide")

    # GC percentage over the full sequence length; any symbol other than G/C
    # (including non-ACGT characters, if present) counts toward the denominator.
    gc_total = dna_sequence.count("G") + dna_sequence.count("C")
    # An empty sequence is only reachable with min_length <= 0; avoid dividing by zero.
    gc_content = gc_total / len(dna_sequence) * 100 if dna_sequence else 0.0

    return simple_table, di_table, tri_table, gc_content

# Run the analysis on the GenBank file (path is relative to the working directory)
file_path = "sequence.gb"
simple_table, di_table, tri_table, gc_content = analyze_dna(file_path)

# Report: each heading followed by its table, then the overall GC percentage
print("جدول نوکلئوتیدهای ساده:", simple_table, sep="\n")
print("\nجدول دی‌نوکلئوتیدها:", di_table, sep="\n")
print("\nجدول تری‌نوکلئوتیدها:", tri_table, sep="\n")
print(f"\nدرصد GC: {gc_content:.2f}%")


جدول نوکلئوتیدهای ساده:
  Nucleotide  Count  Percentage
0          G   2245   13.088090
1          T   5047   29.423424
2          C   4182   24.380575
3          A   5679   33.107911

جدول دی‌نوکلئوتیدها:
   Di-nucleotide  Count  Percentage
0             GT    589    3.434002
1             TT   1479    8.622901
2             TC   1133    6.605644
3             CA   1397    8.144823
4             AT   1691    9.858909
5             TG    593    3.457323
6             TA   1842   10.739272
7             AG    841    4.903218
8             GC    562    3.276586
9             CT   1288    7.509328
10            AA   1768   10.307836
11            AC   1379    8.039879
12            GA    672    3.917910
13            CC   1108    6.459888
14            GG    422    2.460354
15            CG    388    2.262127

جدول تری‌نوکلئوتیدها:
   Tri-nucleotide  Count  Percentage
0             GTT    161    0.938721
1             TTC    369    2.151478
2             TCA    369    2.151478
3          