In [None]:
from Bio.Blast import NCBIWWW
from Bio.Blast import NCBIXML

In [None]:
# Query sequence to identify. The line breaks inside the literal are part of
# the string that is submitted.
seq = """TGGGCCTCATATTTATCCTATATACCATGTTCGTATGGTGGCGCGATGTTCTACGTGAATCCACGTTCGAAGGACATCATACCAAAGTCGTAC
AATTAGGACCTCGATATGGTTTTATTCTGTTTATCGTATCGGAGGTTATGTTCTTTTTTGCTCTTTTTCGGGCTTCTTCTCATTCTTCTTTGGCAC
CTACGGTAGAG"""

# Submit a blastn search of the query against the "nt" database.
blast_result = NCBIWWW.qblast("blastn", "nt", seq)

# Parse the single-query result and release the handle even if parsing fails.
try:
    blast_record = NCBIXML.read(blast_result)
finally:
    blast_result.close()

if blast_record.alignments:
    # First reported alignment; presumably the best (lowest E-value) hit,
    # as the variable name suggests -- confirm against the BLAST output order.
    lowest_E = blast_record.alignments[0]
    lowest_E_def = lowest_E.hit_def
    # Keep the text after the last "[" and drop a trailing "]". When the
    # description contains no brackets, the whole description is used.
    species = lowest_E_def.split("[")[-1].rstrip("]")

    print(f"The sequence is from: {species}")
else:
    # No hits: avoid an IndexError on alignments[0] and say so explicitly.
    lowest_E = None
    lowest_E_def = None
    species = None
    print("No BLAST hits were found for the sequence.")

In [None]:
def read_fasta(filename):
    """Parse a FASTA file into a dict mapping record ID to sequence.

    The record ID is the first whitespace-delimited token that follows ``>``
    on a header line. Sequence lines are stripped of surrounding whitespace
    and concatenated. Lines that appear before the first header are ignored,
    and a later record with the same ID replaces an earlier one.

    Args:
        filename: Path of the FASTA file to read.

    Returns:
        dict: ``{record_id: sequence}``.

    Raises:
        ValueError: If a header line has no identifier after ``>``.
    """
    sequences = {}
    seq_id = None   # None until the first header line has been seen
    chunks = []     # sequence lines of the current record, joined once

    with open(filename, 'r') as f:
        for line_number, line in enumerate(f, start=1):
            line = line.strip()
            if line.startswith('>'):
                # A new header closes the record collected so far.
                if seq_id is not None:
                    sequences[seq_id] = ''.join(chunks)
                fields = line[1:].split()
                if not fields:
                    raise ValueError(
                        f"FASTA header without an identifier at line "
                        f"{line_number} of {filename}"
                    )
                seq_id = fields[0]
                chunks = []
            elif seq_id is not None:
                chunks.append(line)

    # Store the final record, which has no following header to close it.
    if seq_id is not None:
        sequences[seq_id] = ''.join(chunks)

    return sequences


def count_records(sequences):
    """Return the number of records held in the ``sequences`` mapping."""
    record_total = len(sequences)
    return record_total


def sequence_lengths(sequences):
    """Map each record ID to the length of its sequence.

    Args:
        sequences: Mapping of record ID to sequence string.

    Returns:
        dict: ``{record_id: len(sequence)}`` in the mapping's iteration order.
    """
    lengths = {}
    for record_id, bases in sequences.items():
        lengths[record_id] = len(bases)
    return lengths


def longest_sequences(lengths):
    """Find the maximum length and every record ID that reaches it.

    Args:
        lengths: Mapping of record ID to sequence length.

    Returns:
        tuple: ``(max_length, ids)`` where ``ids`` lists, in mapping order,
        the record IDs whose length equals ``max_length``.

    Raises:
        ValueError: If ``lengths`` is empty (raised by ``max``).
    """
    max_length = max(lengths.values())
    longest_seqs = []
    for record_id, size in lengths.items():
        if size == max_length:
            longest_seqs.append(record_id)
    return max_length, longest_seqs


def shortest_sequences(lengths):
    """Find the minimum length and every record ID that has it.

    Args:
        lengths: Mapping of record ID to sequence length.

    Returns:
        tuple: ``(min_length, ids)`` where ``ids`` lists, in mapping order,
        the record IDs whose length equals ``min_length``.

    Raises:
        ValueError: If ``lengths`` is empty (raised by ``min``).
    """
    min_length = min(lengths.values())
    shortest_seqs = []
    for record_id, size in lengths.items():
        if size == min_length:
            shortest_seqs.append(record_id)
    return min_length, shortest_seqs

    
def find_orfs(sequence, frame):
    """Find every ORF of ``sequence`` in one forward reading frame.

    An ORF runs from an ``ATG`` codon to the first in-frame stop codon
    (``TAA``, ``TAG`` or ``TGA``) after it, stop codon included. Each ``ATG``
    yields its own ORF, so nested ORFs sharing a stop codon are all reported.
    An ``ATG`` with no in-frame stop after it produces nothing. Matching is
    case-insensitive.

    Args:
        sequence: Nucleotide string to scan.
        frame: Reading frame; scanning starts at 0-based index ``frame - 1``
            and advances three bases at a time. Intended values are 1, 2
            or 3; other values are not validated.

    Returns:
        list[dict]: One dict per ORF, ordered by start position, with keys
        ``'start'`` (1-based position of the ``A`` of ``ATG``), ``'end'``
        (1-based position of the last stop-codon base) and ``'length'``
        (number of bases, stop codon included).
    """
    start_codon = 'ATG'
    stop_codons = ('TAA', 'TAG', 'TGA')
    sequence = sequence.upper()  # accept lower-case / soft-masked input
    orfs = []
    pending_starts = []  # 0-based ATG positions still waiting for a stop

    # Single pass over the in-frame codons: every pending ATG is closed by
    # the next stop codon, so no start needs its own rescan of the sequence.
    for i in range(frame - 1, len(sequence) - 2, 3):
        codon = sequence[i:i+3]
        if codon == start_codon:
            pending_starts.append(i)
        elif pending_starts and codon in stop_codons:
            orf_end = i + 3
            for orf_start in pending_starts:
                orfs.append({
                    'start': orf_start + 1,
                    'end': orf_end,
                    'length': orf_end - orf_start,
                })
            pending_starts = []

    return orfs


def find_longest_orf_in_sequence(sequence, frame):
    """Return the longest ORF that ``find_orfs`` reports for one frame.

    Args:
        sequence: Nucleotide string to scan.
        frame: Reading frame passed through to ``find_orfs``.

    Returns:
        The ORF dict with the greatest ``'length'`` (the earliest one in the
        list on ties), or ``None`` when no ORF is found.
    """
    candidates = find_orfs(sequence, frame)
    if not candidates:
        return None

    best = candidates[0]
    for candidate in candidates[1:]:
        # Strict comparison keeps the first ORF among equally long ones.
        if candidate['length'] > best['length']:
            best = candidate
    return best


def find_longest_orf_in_file(sequences, frame):
    """Find the longest ORF across all records for one reading frame.

    Args:
        sequences: Mapping of record ID to sequence string.
        frame: Reading frame passed to ``find_longest_orf_in_sequence``.

    Returns:
        tuple: ``(orf, record_id)`` for the longest ORF found; on ties the
        record encountered first wins. ``(None, None)`` when no record has
        an ORF in this frame.
    """
    best_orf = None
    best_id = None

    for record_id, bases in sequences.items():
        candidate = find_longest_orf_in_sequence(bases, frame)
        if not candidate:
            continue
        # Skip candidates that do not strictly beat the current best.
        if best_orf and candidate['length'] <= best_orf['length']:
            continue
        best_orf, best_id = candidate, record_id

    return best_orf, best_id

def find_longest_orf_any_frame(sequences):
    """Find the longest ORF across all records and reading frames 1 to 3.

    Args:
        sequences: Mapping of record ID to sequence string.

    Returns:
        tuple: ``(orf, record_id)`` for the longest ORF found; on ties the
        first one encountered (record order, then frame order) wins.
        ``(None, None)`` when no ORF exists in any record or frame.
    """
    best_orf = None
    best_id = None

    for record_id, bases in sequences.items():
        for frame in (1, 2, 3):
            candidate = find_longest_orf_in_sequence(bases, frame)
            if not candidate:
                continue
            # Only a strictly longer ORF replaces the current best.
            if best_orf and candidate['length'] <= best_orf['length']:
                continue
            best_orf, best_id = candidate, record_id

    return best_orf, best_id


def find_repeats(sequences, n):
    """Count every length-``n`` substring across all sequences.

    Each start position in each sequence contributes one window, so
    overlapping occurrences are counted, and substrings seen only once are
    included with a count of 1.

    Args:
        sequences: Mapping of record ID to sequence string.
        n: Window length.

    Returns:
        dict: ``{substring: total occurrences over all sequences}``.
    """
    counts = {}

    for bases in sequences.values():
        last_start = len(bases) - n
        for start in range(last_start + 1):
            window = bases[start:start + n]
            counts[window] = counts.get(window, 0) + 1

    return counts


def most_frequent_repeat(repeats):
    """Return the highest count and every substring that reaches it.

    Args:
        repeats: Mapping of substring to occurrence count.

    Returns:
        tuple: ``(max_count, substrings)`` where ``substrings`` lists, in
        mapping order, the keys whose count equals ``max_count``.

    Raises:
        ValueError: If ``repeats`` is empty (raised by ``max``).
    """
    max_count = max(repeats.values())
    most_frequent = []
    for substring, occurrences in repeats.items():
        if occurrences == max_count:
            most_frequent.append(substring)
    return max_count, most_frequent
    

def _count_overlapping(sequence, pattern):
    """Count occurrences of ``pattern`` in ``sequence``, overlaps included."""
    count = 0
    position = sequence.find(pattern)
    while position != -1:
        count += 1
        # Resume one base later (not past the match) so overlaps are seen.
        position = sequence.find(pattern, position + 1)
    return count


def find_max_occurrence_repeat(sequences, specified_repeats):
    """Pick the candidate repeat that occurs most often across all sequences.

    Occurrences are counted at every start position, so overlapping matches
    are included. ``str.count`` is not used because it skips overlapping
    matches and would under-count self-overlapping repeats.

    Args:
        sequences: Mapping of record ID to sequence string.
        specified_repeats: Iterable of candidate substrings to compare.

    Returns:
        tuple: ``(repeat, count)`` for the candidate with the highest total
        count; the earliest candidate wins ties. ``(None, 0)`` when there
        are no candidates or none of them occurs.
    """
    max_count = 0
    max_repeat = None

    for repeat in specified_repeats:
        count = sum(_count_overlapping(seq, repeat) for seq in sequences.values())
        # Strict comparison keeps the first candidate among equal counts.
        if count > max_count:
            max_count = count
            max_repeat = repeat

    return max_repeat, max_count
    

In [None]:

# NOTE: machine-specific input path; point this at your own copy of the file.
filename = r'C:\Users\i.mamalis\Downloads\dna2.fasta'
sequences = read_fasta(filename)

# Question 1 - Number of records in FASTA
num_records = count_records(sequences)
print(f"Number of records: {num_records}")

# Question 2 - Longest sequence in file
lengths = sequence_lengths(sequences)
max_length, longest_seqs = longest_sequences(lengths)
print(f"Longest sequence length: {max_length}")

# Question 3 - Shortest sequence in file
min_length, shortest_seqs = shortest_sequences(lengths)
print(f"Shortest sequence length: {min_length}")

# Question 4 - Longest ORF in frame 2 in any sequence
frame = 2
longest_orf_frame2, seq_id_frame2 = find_longest_orf_in_file(sequences, frame)
# The ORF search returns None when nothing is found; report that instead of
# failing on the subscript.
if longest_orf_frame2 is None:
    print("No ORF found in frame 2")
else:
    print(f"Longest ORF length in frame 2: {longest_orf_frame2['length']}")

# Question 5 - Starting position of longest ORF in frame 3 in any sequence
frame = 3
longest_orf_frame3, seq_id_frame3 = find_longest_orf_in_file(sequences, frame)
if longest_orf_frame3 is None:
    print("No ORF found in frame 3")
else:
    print(f"Longest ORF in frame 3 starts at position {longest_orf_frame3['start']}")

# Question 6 - Longest ORF in any sequence and frame
longest_orf_any_frame, seq_id_any_frame = find_longest_orf_any_frame(sequences)
if longest_orf_any_frame is None:
    print("No ORF found in any sequence and frame")
else:
    print(f"Longest ORF in any sequence and frame has length {longest_orf_any_frame['length']}")

# Question 7 - Longest forward ORF in specific sequence
# "Forward ORF" covers all three forward frames, so search frames 1-3 of this
# one record rather than frame 3 alone.
specific_seq_id = "gi|142022655|gb|EQ086233.1|16"
longest_orf_specific = find_longest_orf_any_frame(
    {specific_seq_id: sequences[specific_seq_id]}
)[0]
if longest_orf_specific is None:
    print("No forward ORF found in sequence")
else:
    print(f"Longest forward ORF in sequence has length {longest_orf_specific['length']}")

# Question 8 - Most frequent repeat of length 6
n = 6
repeats_6 = find_repeats(sequences, n)
max_count_6, most_frequent_6 = most_frequent_repeat(repeats_6)
print(f"Most frequent repeat(s) of length {n}: {most_frequent_6}")
print(f"Number of times it occurs: {max_count_6}")

# Question 9 - Most frequent repeat of length 12
n = 12
repeats_12 = find_repeats(sequences, n)
max_count_12, most_frequent_12 = most_frequent_repeat(repeats_12)
num_max_repeats_12 = len(most_frequent_12)
print(f"Number of different 12-base sequences occurring {max_count_12} times: {num_max_repeats_12}")

# Question 10 - Which of the specified repeats of length 7 has the maximum occurrences
specified_repeats_7 = ["GCGCGCA", "CGCGCCG", "CATCGCC", "TGCGCGC"]
max_repeat, max_count = find_max_occurrence_repeat(sequences, specified_repeats_7)
print(f"The repeat with maximum occurrences: {max_repeat}")
print(f"Occurrences: {max_count}")