# Processing FASTA files

Exercise for processing DNA sequences.

* **Contact:** mate.balajti@unibas.ch

In [3]:
import numpy as np

# Input files for the exercises: query reads first, reference genome second.
filename1, filename2 = "sequences.fasta", "genome.fasta"

## Exercise 1.1 (3 points)

Write a function `parse_fasta()` that takes a path to a FASTA file as input
and returns a tuple of two lists, the first containing sequence headers
stripped of the leading `>`, and the second containing the actual sequences.

> **Notes:**
>  
> * Please write the parser from scratch and do _not_ use existing FASTA
>   parsers, such as the one provided by Biopython.
> * Ensure that wrapped sequences are handled such that fragments of a given
>   sequence are concatenated, without white space, in the order they appear
>   in the file. Make use of the leading `>` character to separate records
>   from each other.
> * Ensure that the number of items in each of the returned lists corresponds
>   to the original number of records in the input file.

In [17]:
def parse_fasta(path: str) -> tuple[list[str], list[str]]:
    """Parse a FASTA file into parallel lists of headers and sequences.

    Every line starting with ``>`` opens a new record; the remainder of that
    line (without the ``>``) is the header. All following lines up to the next
    header are stripped of surrounding whitespace and concatenated, in file
    order, into the record's sequence.

    Args:
        path: Path to the FASTA file to read.

    Returns:
        A tuple ``(headers, sequences)`` where ``sequences[i]`` belongs to
        ``headers[i]``. Both lists always have one entry per record; a record
        without any sequence lines yields an empty string.

    Raises:
        OSError: If the file cannot be opened.
    """
    headers: list[str] = []
    sequences: list[str] = []
    fragments: list[str] = []

    with open(path, 'r') as fasta_file:
        for line in fasta_file:
            line = line.strip()
            if line.startswith('>'):
                # Close the previous record even when it is empty, so that
                # headers and sequences stay aligned index by index.
                if headers:
                    sequences.append("".join(fragments))
                fragments = []
                headers.append(line[1:])
            elif headers:
                # Text before the first header belongs to no record and is
                # ignored; everything else is a fragment of the open record.
                fragments.append(line)

    # The last record has no following header to close it.
    if headers:
        sequences.append("".join(fragments))

    return headers, sequences

## Exercise 1.2 (2 points)

Write a function `discard_ambiguous_seqs()` that takes two arguments as input, a list of sequence headers and a list of
sequences (strings), and returns a tuple of two lists containing only those headers and sequences for which the sequence
consists exclusively of letters of the "DNA alphabet" (`A`, `C`, `G`, `T`).

> **Notes:**
>  
> * Make sure your implementation is case-insensitive, i.e., sequences
>   containing lowercase DNA characters, even if mixed with uppercase
>   characters, are valid as well.

In [20]:
def discard_ambiguous_seqs(
    header: list[str], sequence: list[str]
) -> tuple[list[str], list[str]]:
    """Keep only the records whose sequence consists solely of A, C, G and T.

    The check is case-insensitive: each sequence is upper-cased before its
    characters are compared against the DNA alphabet. The sequences themselves
    are returned unchanged (original case preserved).

    Args:
        header: Sequence headers, parallel to ``sequence``.
        sequence: Sequences; ``sequence[i]`` belongs to ``header[i]``.

    Returns:
        A tuple ``(headers, sequences)`` containing, in input order, the
        header/sequence pairs that passed the check. An empty sequence
        contains no invalid character and is therefore kept.
    """
    dna_alphabet = frozenset("ACGT")
    kept_headers: list[str] = []
    kept_sequences: list[str] = []

    for name, seq in zip(header, sequence):
        if all(base in dna_alphabet for base in seq.upper()):
            kept_headers.append(name)
            kept_sequences.append(seq)

    return kept_headers, kept_sequences

## Exercise 1.3 (2 points)

Write a function `nucleotide_frequencies()` that takes a list of strings as
input, and which prints out the total frequency of each nucleotide across
all input sequences. Use the following example as a template to format your
output:

```console
A: 0.31
C: 0.21
G: 0.17
T: 0.31
```

> **Notes:**
>
> * Note how numbers are rounded in the example and format decimals printed
>   by your solution in the same manner, i.e., rounded to two significant
>   digits.
> * The function does not require any specific return value. In case you are
>   not aware of how Python deals with functions without an explicit `return`
>   statement, look up the behavior in relevant documentation.

In [12]:
def nucleotide_frequencies(seqs: list[str]) -> None:
    """Print the overall relative frequency of A, C, G and T.

    Counting is case-insensitive and pooled across all input sequences.
    Characters other than A, C, G and T are ignored, so the printed
    fractions are relative to the number of counted nucleotides only.
    Each nucleotide is printed on its own line as ``A: 0.31`` (two decimal
    places). If no nucleotide is found at all, ``No nucleotides`` is printed
    instead.

    Args:
        seqs: Sequences to analyse.
    """
    nucleotides_count = {'A': 0, 'C': 0, 'G': 0, 'T': 0}

    for seq in seqs:
        upper_seq = seq.upper()
        for nucleotide in nucleotides_count:
            nucleotides_count[nucleotide] += upper_seq.count(nucleotide)

    total = sum(nucleotides_count.values())
    if total == 0:
        # Guard against division by zero for empty or fully non-DNA input.
        print("No nucleotides")
        return

    for nucleotide, count in nucleotides_count.items():
        print(f"{nucleotide}: {count / total:.2f}")

    

## Exercise 1.4 (3 points)

Write a function `map_reads()` that takes as input two FASTA files, the first
containing short read sequences ("query"), and the second containing reference
sequences. The function should

* read the files,
* discard _query_ sequences that contain non-DNA characters,
* print the nucleotide fractions for both files to the console,
* and return a dictionary of dictionaries, where the outer dictionary uses
  the names of query sequences as its keys, and the inner dictionary uses
  reference sequence names as keys and, as values, lists of 1-based indices
  indicating at which positions (counting from left to right) in the
  reference sequence the query sequence occurs as an exact substring.

Execute the function, passing `sequences.fasta` and `genome.fasta` as input.
Inspect the returned "hits" object (the dictionary of dictionaries). Interpret
the results in at least 2-3 bullet points. What's special about query sequence
`sequence4`?

In [9]:
def parse_fasta(path: str) -> tuple[list[str], list[str]]:
    """Parse a FASTA file into parallel lists of headers and sequences.

    Every line starting with ``>`` opens a new record; the remainder of that
    line (without the ``>``) is the header. All following lines up to the next
    header are stripped of surrounding whitespace and concatenated, in file
    order, into the record's sequence.

    Args:
        path: Path to the FASTA file to read.

    Returns:
        A tuple ``(headers, sequences)`` where ``sequences[i]`` belongs to
        ``headers[i]``. Both lists always have one entry per record; a record
        without any sequence lines yields an empty string.

    Raises:
        OSError: If the file cannot be opened.
    """
    headers: list[str] = []
    sequences: list[str] = []
    fragments: list[str] = []

    with open(path, 'r') as fasta_file:
        for line in fasta_file:
            line = line.strip()
            if line.startswith('>'):
                # Close the previous record even when it is empty, so that
                # headers and sequences stay aligned index by index.
                if headers:
                    sequences.append("".join(fragments))
                fragments = []
                headers.append(line[1:])
            elif headers:
                # Text before the first header belongs to no record and is
                # ignored; everything else is a fragment of the open record.
                fragments.append(line)

    # The last record has no following header to close it.
    if headers:
        sequences.append("".join(fragments))

    return headers, sequences

def discard_ambiguous_seqs(
    header: list[str], sequence: list[str]
) -> tuple[list[str], list[str]]:
    """Keep only the records whose sequence consists solely of A, C, G and T.

    The check is case-insensitive: each sequence is upper-cased before its
    characters are compared against the DNA alphabet. The sequences themselves
    are returned unchanged (original case preserved).

    Args:
        header: Sequence headers, parallel to ``sequence``.
        sequence: Sequences; ``sequence[i]`` belongs to ``header[i]``.

    Returns:
        A tuple ``(headers, sequences)`` containing, in input order, the
        header/sequence pairs that passed the check. An empty sequence
        contains no invalid character and is therefore kept.
    """
    dna_alphabet = frozenset("ACGT")
    kept_headers: list[str] = []
    kept_sequences: list[str] = []

    for name, seq in zip(header, sequence):
        if all(base in dna_alphabet for base in seq.upper()):
            kept_headers.append(name)
            kept_sequences.append(seq)

    return kept_headers, kept_sequences

def nucleotide_frequencies(seqs: list[str]) -> None:
    """Print the overall relative frequency of A, C, G and T.

    Counting is case-insensitive and pooled across all input sequences.
    Characters other than A, C, G and T are ignored, so the printed
    fractions are relative to the number of counted nucleotides only.
    Each nucleotide is printed on its own line as ``A: 0.31`` (two decimal
    places). If no nucleotide is found at all, ``No nucleotides`` is printed
    instead.

    Args:
        seqs: Sequences to analyse.
    """
    nucleotides_count = {'A': 0, 'C': 0, 'G': 0, 'T': 0}

    for seq in seqs:
        upper_seq = seq.upper()
        for nucleotide in nucleotides_count:
            nucleotides_count[nucleotide] += upper_seq.count(nucleotide)

    total = sum(nucleotides_count.values())
    if total == 0:
        # Guard against division by zero for empty or fully non-DNA input.
        print("No nucleotides")
        return

    for nucleotide, count in nucleotides_count.items():
        print(f"{nucleotide}: {count / total:.2f}")


def map_reads(filename1: str, filename2: str) -> dict[str, dict[str, list[int]]]:
    """Map query reads onto reference sequences by exact substring search.

    Both FASTA files are parsed, query records containing non-DNA characters
    are discarded, and the nucleotide frequencies of the remaining queries and
    of the references are printed. Every remaining query is then searched for
    in every reference sequence. The search is case-sensitive and reports
    overlapping occurrences.

    Args:
        filename1: Path to the FASTA file with the query sequences.
        filename2: Path to the FASTA file with the reference sequences.

    Returns:
        A dictionary keyed by query header. Each value is a dictionary keyed
        by reference header holding the 1-based start positions at which the
        query occurs in that reference. References without a hit are omitted;
        a query without any hit maps to an empty dictionary.
    """
    query_headers, query_sequences = parse_fasta(filename1)
    reference_headers, reference_sequences = parse_fasta(filename2)

    query_headers, query_sequences = discard_ambiguous_seqs(query_headers, query_sequences)

    print("Query nucleotides frequency:")
    nucleotide_frequencies(query_sequences)

    print("Reference nucleotides frequency:")
    nucleotide_frequencies(reference_sequences)

    results: dict[str, dict[str, list[int]]] = {header: {} for header in query_headers}

    for query_header, query_seq in zip(query_headers, query_sequences):
        if not query_seq:
            # str.find("") succeeds at every offset, which would report a
            # meaningless hit at each reference position.
            continue

        for ref_header, ref_seq in zip(reference_headers, reference_sequences):
            start = ref_seq.find(query_seq)
            while start != -1:
                # Convert the 0-based offset to the 1-based position required.
                results[query_header].setdefault(ref_header, []).append(start + 1)
                # Resume one character later so overlapping hits are found.
                start = ref_seq.find(query_seq, start + 1)

    return results

# Query reads and reference genome used for the mapping run.
filename1, filename2 = "sequences.fasta", "genome.fasta"

hits = map_reads(filename1, filename2)

# Show one line per query with the references (and positions) it was found in.
print("Hits :")
for query in hits:
    refs = hits[query]
    print(f"{query}: {refs}")




Query nucleotides frequency:
A :  0.24
C :  0.16
G :  0.35
T :  0.24
Reference nucleotides frequency:
A :  0.25
C :  0.26
G :  0.25
T :  0.24
Hits :
sequence1: {}
sequence2: {'chr2': [1422]}
sequence4: {'chr2': [1039], 'chr3': [1422], 'chr4': [1455]}


In [31]:
# Inspect the raw parser output for the query file: headers first, then sequences.
header_fasta, seq_fasta = parse_fasta("sequences.fasta")
print(header_fasta, seq_fasta, sep="\n")

['sequence1', 'sequence2', 'sequence3', 'sequence4']
['GAAGTTTACTAaCTGGAGTGGTCAGaAGTTGCCGCCTGTG', 'GCCCGGGCGTATGTATGAGAGATGTGGCCAGAAGTCGAAA', 'ATGATTDATGTGTCCGGTAACTATAAACGTGCTACGATGT', 'TTTGAG']
