# ITERATORS AND GENERATORS

In [1]:
import numpy as np

#### Task 1

In [2]:
def fasta_reader(path):
    id_, seq = None, ''
    fasta_content = []
    with open(path) as file:
        for line in file:
            fasta_content.append(line.rstrip())
    for line in fasta_content:
        if line.startswith(">"):
            if id_: yield (id_, seq)
            id_, seq = line, ''
        else:
            seq += line
    if id_: yield (id_, seq)

In [3]:
reader = fasta_reader('../data/test.fa')
print(type(reader))
for id_, seq in reader:
    print(id_, seq[:50])

<class 'generator'>
>SUP35_Kla_AB039749 ATGTCAGACCAACAAAATCAAGACCAAGGGCAAGGCCAAGGTTACAATCA
>SUP35_Agos_ATCC_10895_NM_211584 ATGTCGGAGGAAGATCAAATTCAATCGCAAGGCAACGACCAAGGCCAGTC
>SUP35_Scer_74-D694_GCA_001578265.1 ATGTCGGATTCAAACCAAGGCAACAATCAGCAAAACTACCAGCAATACAG
>SUP35_Spar_A12_Liti ATGTCGGATTCAAACCAAGGTAACAATCAGCAAAGCTACCAGCAATACGG
>SUP35_Smik_IFO1815T_30 ATGTCTGATTCAAACCAAGGTAATAATCAGCAAAACTACCAGCAATACAA
>SUP35_Skud_IFO1802T_36 ATGTCAGATCCAAATCAAGGTAACAATCAACAACAATACGGTCAAAATCC
>SUP35_Sbou_unique28_CM003560 ATGTCGGATTCAAACCAAGGCAACAATCAGCAAAACTACCAGCAATACAG
>SUP35_Scer_beer078_CM005938 ATGTCGGATTCAAACCAAGGCAACAATCAGCAAAACTACCAGCAATACAG
>SUP35_Sarb_H-6_chrXIII_CM001575 ATGTCTGATCCAACTAATGGTAATAATGAGCAGAGCTCTCAACAGCAAGG
>SUP35_Seub_CBS12357_chr_II_IV_DF968535 ATGTCTGATCCAAACCAAGGTAACAATCAGCAAAACTATCAACAGTACGG


#### Task 2

In [4]:
class FastaMutator:
    
    def __init__(self, path, proba=0.01, mutation='any'):
        self.path = path
        self.proba = proba
        self.mutation = mutation
        self.fasta_content = {}
        self.current_id = -1
        for id_, seq in fasta_reader(path):
            self.fasta_content[id_] = seq
        self.monomers = ['A', 'T', 'G', 'C']
        for i in list(self.fasta_content.values())[0]:
            if i not in self.monomers:
                self.monomers = ['A', 'R', 'N', 'D', 'C', 'E', 'Q', 'G', 'H', 'I', 'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V', 'X']
        if self.mutation == 'substitution':
            self.substitution()
        elif self.mutation == 'deletion':
            self.deletion()
        elif self.mutation == 'insertion':
            self.insertion()
        else:
            self.any_mutation()
        
    def substitution(self):
        for id_ in self.fasta_content:
            seq = self.fasta_content[id_]
            outcome = np.random.uniform()
            if outcome <= self.proba:
                seq = list(seq)
                position = np.random.randint(len(seq)+1)
                seq[position] = np.random.choice(monomers)
                self.fasta_content[id_] = ''.join(seq)
    
    def deletion(self):
        for id_ in self.fasta_content:
            seq = self.fasta_content[id_]
            outcome = np.random.uniform()
            if outcome <= self.proba:
                seq = list(seq)
                position = np.random.randint(len(seq)+1)
                seq[position] = '-'
                seq = ''.join(seq)
                self.fasta_content[id_] = seq.replace('-', '')
    
    def insertion(self):
        for id_ in fasta_content:
            seq = fasta_content[id_]
            outcome = np.random.uniform()
            if outcome <= self.proba:
                seq = list(seq)
                position = np.random.randint(len(seq)+1)
                seq[position] = seq[position] + '-'
                seq = ''.join(seq)
                self.fasta_content[id_] = seq.replace('-', np.random.choise(monomers))

    def any_mutation(self):
        for id_ in self.fasta_content:
            seq = self.fasta_content[id_]
            outcome = np.random.uniform()
            if outcome <= self.proba:
                seq = list(seq)
                mut = np.random.choice(['sub', 'del', 'ins'])
                if mut == 'sub':
                    position = np.random.randint(len(seq)+1)
                    seq[position] = np.random.choice(monomers)
                    self.fasta_content[id_] = ''.join(seq)
                elif mut == 'del':
                    position = np.random.randint(len(seq)+1)
                    seq[position] = '-'
                    seq = ''.join(seq)
                    self.fasta_content[id_] = seq.replace('-', '')
                else:
                    position = np.random.randint(len(seq)+1)
                    seq[position] = seq[position] + '-'
                    seq = ''.join(seq)
                    self.fasta_content[id_] = seq.replace('-', np.random.choise(monomers))
            
    def __iter__(self):
        return self
    
    def __next__(self):
        self.current_id += 1
        if self.current_id == len(self.fasta_content):
            self.current_id = 0
            if self.mutation == 'substitution':
                self.substitution()
            elif self.mutation == 'deletion':
                self.deletion()
            elif self.mutation == 'insertion':
                self.insertion()
            elif self.mutation == 'any':
                self.any_mutation()
            else:
                raise MutationTypeError
        return list(self.fasta_content.items())[self.current_id]

In [5]:
mutator = FastaMutator(path = '../data/test.fa', proba = 1, mutation = 'deletion')
print(type(mutator))
for id_, seq in mutator:
    print(id_, seq[:50])

<class '__main__.FastaMutator'>
>SUP35_Kla_AB039749 ATGTCAGACCAACAAAATCAAGACCAAGGGCAAGGCCAAGGTTACAATCA
>SUP35_Agos_ATCC_10895_NM_211584 ATGTCGGAGGAAGATCAAATTCAATCGCAAGGCAACGACCAAGGCCAGTC
>SUP35_Scer_74-D694_GCA_001578265.1 ATGTCGGATTCAAACCAAGGCAACAATCAGCAAAACTACCAGCAATACAG
>SUP35_Spar_A12_Liti ATGTCGGATTCAAACCAAGGTAACAATCAGCAAAGCTACCAGCAATACGG
>SUP35_Smik_IFO1815T_30 ATGTCTGATTCAAACCAAGGTAATAATCAGCAAAACTACCAGCAATACAA
>SUP35_Skud_IFO1802T_36 ATGTCAGATCCAAATCAAGGTAACAATCAACAACAATACGGTCAAAATCC
>SUP35_Sbou_unique28_CM003560 ATGTCGGATTCAAACCAAGGCAACAATCAGCAAAACTACCAGCAATACAG
>SUP35_Scer_beer078_CM005938 ATGTCGGATTCAAACCAAGGCAACAATCAGCAAAACTACCAGCAATACAG
>SUP35_Sarb_H-6_chrXIII_CM001575 ATGTCTGATCCAACTAATGGTAATAATGAGCAGAGCTCTCAACAGCAAGG
>SUP35_Seub_CBS12357_chr_II_IV_DF968535 ATGTCTGATCCAAACCAAGGTAACAATCAGCAAAACTATCAACAGTACGG
>SUP35_Kla_AB039749 ATGTCAGACCAACAAAATCAAGACCAAGGGCAAGGCCAAGGTTACAATCA
>SUP35_Agos_ATCC_10895_NM_211584 ATGTCGGAGGAAGATCAAATTCAATCGCAAGGCAACGACCAAGGCCAGTC
>SUP35_Scer_7

>SUP35_Sbou_unique28_CM003560 ATGTCGGATTCAAACCAAGGCAACAATCAGCAAACTACCAGAATACAGCC
>SUP35_Scer_beer078_CM005938 ATGTCGGATTCAAACCAAGGCAACAATCAGCAAAACTACCAGCAATAAGC
>SUP35_Sarb_H-6_chrXIII_CM001575 ATGTCTGATCCAACTAATGGTATAATGGCAGAGCTCTCAACAGCAGGCAA
>SUP35_Seub_CBS12357_chr_II_IV_DF968535 ATGTCTGATCAAACCAAGGTAACAATCAGCAAACTATCAACAGTACGGTC
>SUP35_Kla_AB039749 ATGTCAGACCACAAAATCAAGACCAAGGGCAAGGCCAAGGTTACAATCAG
>SUP35_Agos_ATCC_10895_NM_211584 TGTCGGAGGAAGATCAAATTCAATCGCAAGCAACGACCAAGGCCAGTCGC
>SUP35_Scer_74-D694_GCA_001578265.1 ATGTCGGATTCAAACCAAGGCAACAATCAGCAAACTACCGCAATACAGCC
>SUP35_Spar_A12_Liti ATGTCGGATTCAAACCAAGGTAACAATCAGCAAACTACCGCAATACGGCC
>SUP35_Smik_IFO1815T_30 ATGCTGATTCAAACCAAGGTAATAATCAGCAAAACTACCAGCAATACAAC
>SUP35_Skud_IFO1802T_36 ATGTCAGTCCAATCAAGTAACAATCAACAACAATACGGTCAAATCCTAAC
>SUP35_Sbou_unique28_CM003560 ATGTCGGATTCAAACCAAGGCAACAATCAGCAAACTACCAGAATACAGCC
>SUP35_Scer_beer078_CM005938 ATGTCGGATTCAAACCAAGGCAACAATCAGCAAAACTACCAGCAATAAGC
>SUP35_Sarb_H-6_chrXIII_CM001575 ATGTCT

>SUP35_Agos_ATCC_10895_NM_211584 TGTCGGAGGAAGATCAAATTCAATCGCAAGCAACGACCAAGGCCAGTGCA
>SUP35_Scer_74-D694_GCA_001578265.1 ATGTCGGATTCAAACCAAGGCAACAATCAGCAAACTACCGCAATACAGCC
>SUP35_Spar_A12_Liti ATGTCGGATTCAAACCAAGTAACAATCAGCAAACTACCGCAATACGGCCA
>SUP35_Smik_IFO1815T_30 ATGCTGATTCAAACCAAGGTAATAATCAGCAAAACTACCAGCAATAAACC
>SUP35_Skud_IFO1802T_36 ATCAGTCCAATCAAGTAACAATCAACAACAATACGGTCAAATCCAACCAA
>SUP35_Sbou_unique28_CM003560 ATGTCGGTCAAACCAGGCAACAATCAGCAAACTACAGAATACAGCCAGAA
>SUP35_Scer_beer078_CM005938 GTGGATTCAAACCAAGGCAACAATCAGCAAAACTACCAGCAATAGCCAGA
>SUP35_Sarb_H-6_chrXIII_CM001575 ATGTCTGATCCAACTAATGGTATAATGGAGGCTCTAACAGCAGGCAAACC
>SUP35_Seub_CBS12357_chr_II_IV_DF968535 ATGTCTGATAACCAAGGTACAATAGCAAACTATCAACAGTACGGTCAAAA
>SUP35_Kla_AB039749 ATGTCAGACCACAAATCAAGACCAAGGCAAGCCAAGGTTACAATCAGTAT
>SUP35_Agos_ATCC_10895_NM_211584 TGTCGGAGGAAGATCAAATTCAATCGCAAGCAACGACCAAGGCCAGTGCA
>SUP35_Scer_74-D694_GCA_001578265.1 ATGTCGGATTCAAACCAAGGCAACAATCAGCAAACTACCGCAATACAGCC
>SUP35_Spar_A12_Liti ATGTCGGA

IndexError: list assignment index out of range