In [6]:
pip install biopython matplotlib


Collecting biopython
  Downloading biopython-1.85-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Downloading biopython-1.85-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.3 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 3.3/3.3 MB 58.7 MB/s eta 0:00:00
Installing collected packages: biopython
Successfully installed biopython-1.85


In [7]:
from Bio import Entrez, SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Align import PairwiseAligner
from Bio.pairwise2 import format_alignment
import matplotlib.pyplot as plt
from Bio import pairwise2

# Set the Entrez module's contact email before any efetch call is made below.
# NOTE(review): this is a hardcoded personal address — consider reading it from
# an environment variable so it is not committed along with the notebook.
Entrez.email = "frankonyango100@gmail.com"

# (v) Retrieve sequences from GenBank
def get_genbank_sequences(accessions):
    """Fetch and parse GenBank records for the given accessions.

    Each accession is requested individually with ``Entrez.efetch`` (nucleotide
    database, GenBank text format) and parsed with ``SeqIO.read``. Retrieval is
    best-effort: a failure for one accession is reported on stdout and that
    accession is skipped, so the returned list may be shorter than
    ``accessions``.

    Args:
        accessions: Iterable of accession identifiers, each passed as ``id``
            to ``Entrez.efetch``.

    Returns:
        list: The parsed records, in input order, for the accessions that
        were retrieved successfully.
    """
    sequences = []
    for acc in accessions:
        try:
            handle = Entrez.efetch(db="nucleotide", id=acc, rettype="gb", retmode="text")
            try:
                record = SeqIO.read(handle, "genbank")
            finally:
                # Always release the handle, including when parsing raises,
                # so a bad record never leaves it open.
                handle.close()
        except Exception as e:
            # Deliberately broad: report the problem and move on to the next accession.
            print(f"Error retrieving {acc}: {e}")
        else:
            sequences.append(record)
            print(f"Retrieved sequence {acc} successfully")
    return sequences

# (i) Extract partial sequence and translate to amino acids
def extract_and_translate(sequence, start, end):
    """Translate the 1-based, inclusive region ``start``..``end`` of a record.

    Args:
        sequence: Object exposing a ``.seq`` attribute that supports slicing,
            ``len()`` and ``.translate()``.
        start: 1-based position of the first base of the region; must be >= 1.
        end: 1-based position of the last base of the region (inclusive).
            Normal slice semantics apply to the upper bound.

    Returns:
        The result of calling ``.translate()`` on the region after any
        trailing bases that do not complete a codon have been dropped.

    Raises:
        ValueError: If ``start`` is less than 1.
    """
    if start < 1:
        # start - 1 would be negative and silently slice relative to the end.
        raise ValueError(f"start must be >= 1 (1-based), got {start}")

    partial_seq = sequence.seq[start - 1:end]  # 1-based inclusive -> 0-based half-open

    # Keep only whole codons so translate() receives a length that is a
    # multiple of three; the leftover 1-2 bases cannot encode a residue.
    whole_codon_len = len(partial_seq) - len(partial_seq) % 3
    return partial_seq[:whole_codon_len].translate()

# (ii) Compare sequences using global and local alignment
def compare_sequences(seq1, seq2):
    """Align two sequences in both global and local mode.

    A single ``PairwiseAligner`` is created and only its ``mode`` attribute is
    changed between runs; no other aligner setting is touched here.

    Args:
        seq1: First sequence, passed as the first argument to ``align``.
        seq2: Second sequence, passed as the second argument to ``align``.

    Returns:
        tuple: ``(global_alignments, local_alignments)`` — whatever
        ``aligner.align`` returned in ``'global'`` and ``'local'`` mode.
    """
    aligner = PairwiseAligner()
    results_by_mode = {}
    # Global first, then local — same order of align() calls as before.
    for mode in ('global', 'local'):
        aligner.mode = mode
        results_by_mode[mode] = aligner.align(seq1, seq2)
    return results_by_mode['global'], results_by_mode['local']

# (iii) Visualization of alignment
def visualize_alignment(seq1, seq2):
    """Print the first global and first local pairwise2 alignment of two sequences.

    For each mode the alignments are computed with ``pairwise2.align.globalxx``
    / ``pairwise2.align.localxx``, a heading is printed, and the first
    alignment is rendered with ``format_alignment``. When a mode yields no
    alignments, a short notice is printed instead of raising ``IndexError``.

    Args:
        seq1: First sequence to align.
        seq2: Second sequence to align.

    Returns:
        None. All output goes to stdout.
    """
    # (heading, alignment function) for each mode, in print order.
    runs = (
        ("Global alignment visualization:", pairwise2.align.globalxx),
        ("\nLocal alignment visualization:", pairwise2.align.localxx),
    )
    for heading, align_fn in runs:
        alignments = align_fn(seq1, seq2)
        print(heading)
        if alignments:
            print(format_alignment(*alignments[0]))
        else:
            # An empty result list has no [0] element to format.
            print("No alignment found.")

def main():
    """Run the whole workflow for accessions KT191142 and DQ217792.

    Fetches both records, prints the translation of KT191142 positions
    140-900, prints the score of the first global and first local alignment of
    the two full sequences, and finally prints formatted alignments of the
    first 100 bases of each sequence. Returns early with an error message
    unless exactly two records were retrieved.
    """
    # (v) Fetch both records; anything other than two results aborts the run.
    records = get_genbank_sequences(['KT191142', 'DQ217792'])
    if len(records) != 2:
        print("Error: Could not retrieve both sequences")
        return

    kt_record, dq_record = records

    # (i) Translate the requested region of the first record.
    translated_region = extract_and_translate(kt_record, 140, 900)
    print("\nAmino acid sequence (KT191142 positions 140-900):")
    print(translated_region)

    # (ii) Score the full-length sequences against each other.
    global_hits, local_hits = compare_sequences(kt_record.seq, dq_record.seq)
    print("\nGlobal alignment score:", global_hits[0].score)
    print("Local alignment score:", local_hits[0].score)

    # (iii) Only the first 100 bases of each sequence are rendered.
    preview = slice(None, 100)
    visualize_alignment(str(kt_record.seq)[preview], str(dq_record.seq)[preview])

# Run the workflow only when this code is executed directly, not when imported.
if __name__ == "__main__":
    main()



Retrieved sequence KT191142 successfully
Retrieved sequence DQ217792 successfully

Amino acid sequence (KT191142 positions 140-900):
KCQKLWSTS*PVRR*LPKMDNQSSLFFCQSTLGWTRWLQETSPW*SHRIVTRVILLQVFQLWLRS





Global alignment score: 336.0
Local alignment score: 336.0
Global alignment visualization:
ATTCTTTT-ACCCAACA-AAAGTGGGAAG-AAGGGGA-AC--A-G-T---GCC-G-ATC-TA-ACATC-TCCGGA--GAA-AATCCA--AGCA-ATAATGAC-TTC-A--CTC---CAGGA-CT-TTAA-------GA-
|        |  | ||| |||     ||  ||   || |   | | |   |   | ||| || | |   |    |  ||| |||  |  |  | ||  |||| ||  |  ||    || |  || ||||       || 
A-------GA--C-ACACAAA-----AA-CAA---GAGA-TGATGATTTTG--TGTATCATATA-A--AT----AAAGAAGAAT--ATTA--ACAT--TGACATT-GAGACT-TGTCA-G-TCTGTTAATATTCTTGAA
  Score=61


Local alignment visualization:
ATTCTTTT-ACCCAACA-AAAGTGGGAAG-AAGGGGA-AC--A-G-T---GCC-G-ATC-TA-ACATC-TCCGGA--GAA-AATCCA--AGCA-ATAATGAC-TTC-A--CTC---CAGGA-CT-TTAA-------G-A
|        |  | ||| |||     ||  ||   || |   | | |   |   | ||| || | |   |    |  ||| |||  |  |  | ||  |||| ||  |  ||    || |  || ||||       | |
A-------GA--C-ACACAAA-----AA-CAA---GAGA-TGATGATTTTG--TGTATCATATA-A--AT----AAAGAAGAAT--ATTA--ACAT--TGACATT-GAGACT-TGTCA-G-TCTGTTAATATTCTTGAA
  Score=61

