In [1]:
from random import randint
import random
import os
from matplotlib import pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sb
import pickle as pkl
import math
# Seed Python's global `random` generator so the random draws made by the
# mutation helpers below are repeatable when the notebook is run top to bottom.
random.seed(42)

In [2]:
# Root folder of the training data. The trailing slash matters: paths are
# built by plain string concatenation (see get_df_dict).
train_filepath = "train_workspace/"

animalList = [
    "Human",
    "Pig",
    "Chicken",
    "Rat",
    "Mouse",
    "Dog",
]

# Sub-folders of train_filepath, one per level.
levelList = [
    "Strict",
    "Relaxed",
    "Intermediate",
]

# dataList holds the dictionary keys and dataNameList the matching file-name
# prefixes; the two lists are paired positionally with zip() in get_df_dict.
dataList = [
    "Ohnologs",
    "No-Ohnologs",
    "Paralogs",
]
dataNameList = [
    "ohnologs",
    "no-ohnologs",
    "paralog",
]

sb.set_style("darkgrid")

In [3]:
def get_df_dict(level):
    """
    Load the pickled data frames stored for one level.

    Parameters
    ----------
    level : str
        Name of the sub-folder of ``train_filepath`` to read from
        (the notebook uses the entries of ``levelList``).

    Returns
    -------
    dict
        Maps each entry of ``dataList`` to the object unpickled from
        ``<train_filepath><level>/<dataName>-complete2.pkl``, where
        ``dataName`` is the entry of ``dataNameList`` at the same position.
    """
    level_dir = train_filepath + level + "/"
    # NOTE: unpickling executes arbitrary code; only read files you trust.
    return {
        data: pd.read_pickle(level_dir + dataName + "-complete2.pkl")
        for dataName, data in zip(dataNameList, dataList)
    }

In [5]:
def mutate_simple(dna):
    """
    Replace one randomly chosen position of ``dna`` with a random base.

    The replacement is drawn uniformly from ``'ATCG'``, so it may equal the
    base that was already there (the sequence can come back unchanged).

    Parameters
    ----------
    dna : str
        Sequence to mutate.

    Returns
    -------
    str
        Sequence of the same length with at most one position changed.
        An empty input is returned as is.
    """
    # An empty sequence has no site to mutate; random.randint(0, -1) would raise.
    if not dna:
        return dna
    dna_list = list(dna)
    mutation_site = random.randint(0, len(dna_list) - 1)
    dna_list[mutation_site] = random.choice(list('ATCG'))
    return ''.join(dna_list)

In [6]:
def draw(discrete_probdist):
    """
    Draw random value from discrete probability distribution
    represented as a dict: P(x=value) = discrete_probdist[value].

    Values are visited in the dict's iteration order while their
    probabilities are accumulated; the first value whose cumulative
    probability exceeds a uniform random number in [0, 1) is returned.

    Parameters
    ----------
    discrete_probdist : dict
        Maps each possible value to its probability.

    Returns
    -------
    object
        The drawn value. If the probabilities add up to slightly less than
        the random number (e.g. through floating-point rounding), the last
        value with a positive probability is returned instead of falling
        through. ``None`` is returned only when no value has a positive
        probability (including an empty dict).
    """
    limit = 0
    r = random.random()
    fallback = None
    for value, probability in discrete_probdist.items():
        limit += probability
        if r < limit:
            return value
        if probability > 0:
            # Remember the last reachable value for the rounding-shortfall case.
            fallback = value
    return fallback

In [7]:
## Takes into account the frequency of transitions and transversions
def create_markov_chain():
    """
    Build a random base-substitution table.

    For every base in 'ATGC' the interval [0, 1] is cut at three random
    points, giving four probabilities. These are sorted in ascending order
    and handed out as follows:

    * the two smallest go to the two bases that are neither the source base
      nor its partner (in the order they appear in 'ATGC'),
    * the third goes to the source base itself (no change),
    * the largest goes to the partner base (A<->G, C<->T).

    Returns
    -------
    dict
        ``chain[from_base][to_base]`` is the probability of replacing
        ``from_base`` with ``to_base``. The inner dicts keep the assignment
        order described above.
    """
    alphabet = 'ATGC'
    partner_of = {'A': 'G', 'G': 'A', 'C': 'T', 'T': 'C'}
    chain = {}
    for source in alphabet:
        # Three random cut points split [0, 1] into four random-length pieces.
        cuts = [random.random() for _ in range(3)]
        bounds = sorted([0] + cuts + [1])
        ascending = sorted(upper - lower for lower, upper in zip(bounds, bounds[1:]))

        partner = partner_of[source]
        # Targets ordered from least to most likely.
        targets = [base for base in alphabet if base != source and base != partner]
        targets.append(source)
        targets.append(partner)

        chain[source] = dict(zip(targets, ascending))
    return chain

In [8]:
# Pointwise mutation
def mutate_via_markov_chain(dna, markov_chain,mutation_site):
    """
    Substitute the base at ``mutation_site`` using a substitution table.

    Parameters
    ----------
    dna : str
        Sequence to mutate.
    markov_chain : dict
        ``markov_chain[from_base]`` is a dict of ``to_base -> probability``
        (as produced by ``create_markov_chain``).
    mutation_site : int
        Index of the position to substitute.

    Returns
    -------
    str
        The sequence with the chosen position replaced by a base drawn from
        ``markov_chain[from_base]``. The sequence is returned unchanged when
        the base at that position is 'N' or has no entry in ``markov_chain``
        (e.g. other ambiguity codes), instead of raising ``KeyError``.
    """
    from_base = dna[mutation_site]
    # Positions the table cannot describe are left untouched.
    if from_base == 'N' or from_base not in markov_chain:
        return dna
    to_base = draw(markov_chain[from_base])
    dna_list = list(dna)
    dna_list[mutation_site] = to_base
    return ''.join(dna_list)

In [9]:
def generate_random_sequence(length,gc_percent):
    """
    Generate a random DNA string with a target GC content.

    Parameters
    ----------
    length : int
        Number of bases to generate.
    gc_percent : float
        Probability that a position is 'C' or 'G'. It is compared against
        ``random.random()``, so it is a fraction in [0, 1], not a percentage.

    Returns
    -------
    str
        Sequence of ``length`` bases; each one is picked uniformly from 'CG'
        with probability ``gc_percent`` and uniformly from 'AT' otherwise.
    """
    bases = []
    for _ in range(length):
        pool = 'CG' if random.random() < gc_percent else 'AT'
        bases.append(random.choice(pool))
    return ''.join(bases)

In [10]:
def get_gc_count(sequence):
    """
    Count the 'G' and 'C' characters in ``sequence``.

    The comparison is case-sensitive: lowercase 'g'/'c' are not counted.

    Parameters
    ----------
    sequence : iterable of str
        Sequence to inspect, normally a DNA string.

    Returns
    -------
    int
        Number of elements equal to 'G' or 'C'.
    """
    return sum(1 for base in sequence if base in ('G', 'C'))

In [11]:
def insert_str(string, str_to_insert, index):
    """
    Return ``string`` with ``str_to_insert`` spliced in before position ``index``.

    Slicing rules apply: an index beyond the end appends, a negative index
    counts from the end.
    """
    head, tail = string[:index], string[index:]
    return head + str_to_insert + tail

In [12]:
def mutate_insertion(dna,gc_percent,insertion_length_max,mutation_site):
    """
    Insert a random fragment into ``dna`` at ``mutation_site``.

    Parameters
    ----------
    dna : str
        Sequence to mutate.
    gc_percent : float
        GC probability handed to ``generate_random_sequence`` for the fragment.
    insertion_length_max : int
        Upper bound (inclusive) for the fragment length; the length is drawn
        uniformly from 1..insertion_length_max, so it must be at least 1.
    mutation_site : int
        Index before which the fragment is inserted.

    Returns
    -------
    str
        The sequence with the random fragment inserted.
    """
    fragment_length = randint(1, insertion_length_max)
    fragment = generate_random_sequence(fragment_length, gc_percent)
    return insert_str(dna, fragment, mutation_site)

In [13]:
def mutate_tandem_insertion(dna,tandem_length_max,tandem_quantity_max,mutation_site):
    """
    Duplicate the stretch of ``dna`` starting at ``mutation_site`` in tandem.

    A unit length is drawn from 1..tandem_length_max and a copy count from
    1..tandem_quantity_max. The unit ``dna[mutation_site:mutation_site+length]``
    is then inserted ``count - 1`` times at ``mutation_site``, so a drawn
    count of 1 leaves the sequence unchanged.

    Parameters
    ----------
    dna : str
        Sequence to mutate.
    tandem_length_max : int
        Largest unit length that can be drawn (must be at least 1).
    tandem_quantity_max : int
        Largest copy count that can be drawn (must be at least 1).
    mutation_site : int
        Index where the repeated unit starts and where copies are inserted.

    Returns
    -------
    str
        The sequence with the extra copies inserted.
    """
    unit_length = randint(1, tandem_length_max)
    repeat_unit = dna[mutation_site:mutation_site + unit_length]
    copies = randint(1, tandem_quantity_max)
    # Inserting the unit (copies - 1) times at the same site is the same as
    # splicing in one block made of that many repeats.
    extra = repeat_unit * (copies - 1)
    return dna[:mutation_site] + extra + dna[mutation_site:]

In [14]:
def mutate_deletion(dna,deletion_length_max,mutation_site):
    """
    Delete a random-length stretch of ``dna`` starting at ``mutation_site``.

    Parameters
    ----------
    dna : str
        Sequence to mutate.
    deletion_length_max : int
        Upper bound (inclusive) for the number of deleted bases; the length is
        drawn uniformly from 1..deletion_length_max, so it must be at least 1.
    mutation_site : int
        Index of the first deleted base.

    Returns
    -------
    str
        The sequence without the deleted stretch. Fewer bases are removed when
        the stretch would run past the end of the sequence.
    """
    removed = randint(1, deletion_length_max)
    kept_before = dna[:mutation_site]
    kept_after = dna[mutation_site + removed:]
    return kept_before + kept_after

In [15]:
# Likelihood of indels is between 16% and 25% (20% is used below)
def generate_mutated_sequence(dna_sequence,mutation_rate):
    """
    Apply a random mix of substitutions and indels to ``dna_sequence``.

    ``round(len(dna_sequence) * mutation_rate)`` mutation events are applied
    one after another, each at a random position of the current sequence:

    * 80%: a substitution drawn from a freshly created substitution table,
    * 20%: an indel, of which 70% are deletions and 30% insertions;
      75% of the insertions are tandem duplications, the rest random
      fragments with the current GC fraction of the sequence.

    The indel length bound is ``min(random 1..51, 5% of the original length)``
    and never less than 1, so short inputs no longer make the indel helpers
    fail on an empty length range.

    Parameters
    ----------
    dna_sequence : str
        Sequence to mutate.
    mutation_rate : float
        Fraction of the original length used as the number of mutation events.

    Returns
    -------
    str
        The mutated sequence. Mutation stops early if deletions have removed
        every base.
    """
    mc = create_markov_chain()
    dna_length = len(dna_sequence)
    nr_mutations = round(dna_length * mutation_rate)
    # 5% of the original length, clamped to 1: round() yields 0 for inputs
    # shorter than about 11 bases, which would make randint(1, 0) raise.
    max_indel_length = max(1, round(0.05 * dna_length))
    for _ in range(nr_mutations):
        if not dna_sequence:
            # Everything was deleted; there is no site left to mutate.
            break
        threshold = random.random()
        mutation_site = random.randint(0, len(dna_sequence) - 1)
        if threshold < 0.8:
            dna_sequence = mutate_via_markov_chain(dna_sequence, mc, mutation_site)
            continue

        threshold = random.random()
        sequenceLength = min(random.randint(1, 51), max_indel_length)
        ## Deletions are more likely than insertions (Zhang,2003)
        if threshold < 0.7:
            dna_sequence = mutate_deletion(dna_sequence, sequenceLength, mutation_site)
            continue

        ## Tandem mutations are more likely
        threshold = random.random()
        if threshold < 0.75:
            dna_sequence = mutate_tandem_insertion(
                dna_sequence, sequenceLength, random.randint(1, 5), mutation_site
            )
        else:
            gc_fraction = get_gc_count(dna_sequence) / float(len(dna_sequence))
            dna_sequence = mutate_insertion(
                dna_sequence, gc_fraction, sequenceLength, mutation_site
            )
    return dna_sequence

In [16]:
def get_ohnologs_mutations_one(df_ohnologs_to_mutate,mutation_rate):
    """
    Mutate exactly one sequence of every pair and record the result.

    For each row a coin flip (probability 0.5) decides whether "Sequence-1"
    or "Sequence-2" is passed through ``generate_mutated_sequence``; the other
    sequence, its length and its GC value are copied from the row unchanged.

    Parameters
    ----------
    df_ohnologs_to_mutate : pandas.DataFrame
        Needs the columns "Sequence-1", "Sequence-2", "Sequence-1 Length",
        "Sequence-2 Length", "Sequence-1 GC" and "Sequence-2 GC".
        The frame is modified in place.
    mutation_rate : float
        Rate forwarded to ``generate_mutated_sequence``.

    Returns
    -------
    pandas.DataFrame
        The same frame with the added columns "Sequence-N-Mutated",
        "Sequence-N Length-Mutated", "Sequence-N GC-Mutated" (N = 1, 2) and
        "Mutated_Sequence_Nr" (1 or 2: which sequence was mutated).

    Raises
    ------
    ZeroDivisionError
        If a mutated sequence ends up empty (GC fraction is count / length).
    """
    # One (sequence, length, gc) record per row for each side of the pair.
    side_one = []
    side_two = []
    mutated_side = []

    for _, pair in df_ohnologs_to_mutate.iterrows():
        if random.random() < 0.5:
            changed = generate_mutated_sequence(pair["Sequence-1"], mutation_rate)
            side_one.append((changed, len(changed), get_gc_count(changed) / float(len(changed))))
            side_two.append((pair['Sequence-2'], pair['Sequence-2 Length'], pair['Sequence-2 GC']))
            mutated_side.append(1)
        else:
            side_one.append((pair['Sequence-1'], pair['Sequence-1 Length'], pair['Sequence-1 GC']))
            changed = generate_mutated_sequence(pair["Sequence-2"], mutation_rate)
            side_two.append((changed, len(changed), get_gc_count(changed) / float(len(changed))))
            mutated_side.append(2)

    output_columns = (
        (side_one, "Sequence-1-Mutated", "Sequence-1 Length-Mutated", "Sequence-1 GC-Mutated"),
        (side_two, "Sequence-2-Mutated", "Sequence-2 Length-Mutated", "Sequence-2 GC-Mutated"),
    )
    for records, sequence_col, length_col, gc_col in output_columns:
        df_ohnologs_to_mutate[sequence_col] = [record[0] for record in records]
        df_ohnologs_to_mutate[length_col] = [record[1] for record in records]
        df_ohnologs_to_mutate[gc_col] = [record[2] for record in records]

    df_ohnologs_to_mutate["Mutated_Sequence_Nr"] = mutated_side
    return df_ohnologs_to_mutate

In [17]:
def get_ohnologs_mutations_two(df_ohnologs_to_mutate,mutation_rate):
    """
    Mutate both sequences of every pair and record the result.

    For each row "Sequence-1" and then "Sequence-2" are passed through
    ``generate_mutated_sequence``.

    Parameters
    ----------
    df_ohnologs_to_mutate : pandas.DataFrame
        Needs the columns "Sequence-1" and "Sequence-2".
        The frame is modified in place.
    mutation_rate : float
        Rate forwarded to ``generate_mutated_sequence``.

    Returns
    -------
    pandas.DataFrame
        The same frame with the added columns "Sequence-N-Mutated",
        "Sequence-N Length-Mutated", "Sequence-N GC-Mutated" (N = 1, 2) and
        "Mutated_Sequence_Nr", which is 0 for every row.

    Raises
    ------
    ZeroDivisionError
        If a mutated sequence ends up empty (GC fraction is count / length).
    """
    # One (sequence, length, gc) record per row for each side of the pair.
    side_one = []
    side_two = []
    row_count = 0

    for _, pair in df_ohnologs_to_mutate.iterrows():
        for source_col, records in (("Sequence-1", side_one), ("Sequence-2", side_two)):
            changed = generate_mutated_sequence(pair[source_col], mutation_rate)
            records.append((changed, len(changed), get_gc_count(changed) / float(len(changed))))
        row_count += 1

    output_columns = (
        (side_one, "Sequence-1-Mutated", "Sequence-1 Length-Mutated", "Sequence-1 GC-Mutated"),
        (side_two, "Sequence-2-Mutated", "Sequence-2 Length-Mutated", "Sequence-2 GC-Mutated"),
    )
    for records, sequence_col, length_col, gc_col in output_columns:
        df_ohnologs_to_mutate[sequence_col] = [record[0] for record in records]
        df_ohnologs_to_mutate[length_col] = [record[1] for record in records]
        df_ohnologs_to_mutate[gc_col] = [record[2] for record in records]

    # 0 marks rows in which both sequences were mutated.
    df_ohnologs_to_mutate["Mutated_Sequence_Nr"] = [0] * row_count

    return df_ohnologs_to_mutate

In [18]:
def get_ohnologs_mutations(df_animal,level):
    """
    Mutate a fixed 10% sample of ``df_animal`` at the given mutation level.

    Parameters
    ----------
    df_animal : pandas.DataFrame
        Frame of sequence pairs to sample from.
    level : str
        Key into the module-level ``mutation_rate`` dict. "Low" and "Medium"
        mutate one sequence per pair; every other level mutates both.

    Returns
    -------
    pandas.DataFrame
        The sampled rows with the mutation columns added by
        ``get_ohnologs_mutations_one`` / ``get_ohnologs_mutations_two``.
    """
    ## We mutate 10% of the sample (same rows every time: random_state is fixed)
    sample_size = round(len(df_animal) * 0.1)
    df_ohnologs_to_mutate = df_animal.sample(sample_size, random_state=42)

    # NOTE(review): "VeryLow" is not in this tuple, so it takes the
    # mutate-both-sequences path like "High" - confirm this is intended.
    if level in ("Low", "Medium"):
        ## We only mutate one sequence
        return get_ohnologs_mutations_one(df_ohnologs_to_mutate, mutation_rate[level])

    # We mutate both the sequences
    return get_ohnologs_mutations_two(df_ohnologs_to_mutate, mutation_rate[level])

In [19]:
def standarize_df(df):
    """
    Copy the ohnolog columns of ``df`` into a frame with the standard names.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns "Ohnolog-N Id", "Ohnolog-N/Transcript-ID",
        "Ohnolog-N/Sequence", "Ohnolog-N/Sequence-Lenght" and "GC_Percent_N"
        for N = 1, 2.

    Returns
    -------
    pandas.DataFrame
        New frame on the same index with, in this order, the id, sequence,
        length and GC columns of both sequences, "Is_Ohnolog" (always 1) and
        finally the two transcript-id columns.
    """
    # target column -> source column, read in this order
    source_of = {
        'Sequence-1 Id': "Ohnolog-1 Id",
        'Sequence-2 Id': "Ohnolog-2 Id",
        'Sequence-1-Transcript Id': "Ohnolog-1/Transcript-ID",
        'Sequence-2-Transcript Id': "Ohnolog-2/Transcript-ID",
        'Sequence-1': "Ohnolog-1/Sequence",
        'Sequence-2': "Ohnolog-2/Sequence",
        'Sequence-1 Length': "Ohnolog-1/Sequence-Lenght",
        'Sequence-2 Length': "Ohnolog-2/Sequence-Lenght",
        'Sequence-1 GC': "GC_Percent_1",
        'Sequence-2 GC': "GC_Percent_2",
    }
    picked = {target: df[source] for target, source in source_of.items()}

    leading = [
        'Sequence-1 Id', 'Sequence-2 Id',
        'Sequence-1', 'Sequence-2',
        'Sequence-1 Length', 'Sequence-2 Length',
        'Sequence-1 GC', 'Sequence-2 GC',
    ]
    trailing = ['Sequence-1-Transcript Id', 'Sequence-2-Transcript Id']

    standardized = pd.DataFrame({name: picked[name] for name in leading})
    standardized['Is_Ohnolog'] = 1
    # The transcript ids come after the label column.
    for name in trailing:
        standardized[name] = picked[name]
    return standardized

In [20]:
# Mutation rate used for each mutation level. The rate is multiplied by the
# sequence length to get the number of mutation events per sequence
# (see generate_mutated_sequence).
# Original note: "Mutation rate per year for human. Most researched. Similar
# to other mammals" - NOTE(review): confirm the source and unit of these values.
mutation_rate = {
    "VeryLow": 0.01,
    "Low": 0.025,
    "Medium": 0.05,
    "High": 0.1,
}
# Level names, from the least to the most mutated.
mutation_levels = list(mutation_rate)

In [26]:
# Load the three data frames of the "Intermediate" level into the global
# df_dict and preview the first rows of the ohnolog pairs.
df_dict = get_df_dict("Intermediate")
df_dict["Ohnologs"].head()

Unnamed: 0,Is_Ohnolog,Is_Paralog,Sequence-1,Sequence-1 GC,Sequence-1 Id,Sequence-1 Length,Sequence-1-Transcript Id,Sequence-2,Sequence-2 GC,Sequence-2 Id,...,Nr Gap Open_Low,Evalue_Low,Bit Score_Low,Percent Identical Matches_Total,Aligment Length_Total,Nr Mismatch_Total,Nr Gap Open_Total,Evalue_Total,Bit Score_Total,Nr Hits
0,1,0.0,CTTTGGAAGTCCTATGAGGGACCATTTACGGTTTCCTCAGTAATTT...,0.42909,ENSG00000095464,3307,ENST00000371447,AGTATGTTTTGCAGACAAGACCCAGAGAAGTCCAGACTGGACTTGT...,0.469856,ENSG00000132915,...,0.0,4.828,22.273333,96.042529,107.352941,22.941176,2.705882,4.26,51.652941,17
1,1,0.0,CGTTTTGGCAAGGGATTAAAGTGCTCCCCCCTGTGGCAGCAGTGAC...,0.441268,ENSG00000077684,5772,ENST00000226319,ATACAATAGTGCTCCGCGCCGCCTCAGCCGCCGCCGCCGCCCAACC...,0.450953,ENSG00000102221,...,0.272727,2.195455,23.963636,95.077926,35.592593,4.925926,0.518519,5.709458,35.981481,27
2,1,0.0,AGTCAACCTCTGGAAGTAAGTCAACTCCATTCTGAAAAAGAAGAGT...,0.357972,ENSG00000109158,11973,ENST00000264318,ACATAATCTAAGACCACAAACCACCTTGTTCCACGTGAGAAGGAAA...,0.402006,ENSG00000145863,...,0.166667,2.126667,23.9,92.587125,114.25,22.5,1.25,1.595,78.45,8
3,1,0.0,ACCACAGAGGCGTCTGGCTAACTCATCTCCAGACCTAAGTTGGGAA...,0.572634,ENSG00000137216,3318,ENST00000259746,CAGTCTCCTGCCATGCAAAGAAGGCTCATAGAGTTGCTTTGGAAGT...,0.516923,ENSG00000196187,...,0.0,4.424444,22.344444,97.93815,15.2,0.75,0.0,3.982011,25.025,20
4,1,0.0,GGGCCTCCTCCCTGGGGTGTGAGCAGGTCGGCGCGCCACACTTCTC...,0.468779,ENSG00000113396,3219,ENST00000262462,AGTCCTGCCCGGAACCCCCGGCAACGCGCATACGACTACACCTGCT...,0.48914,ENSG00000140284,...,0.0,1.409714,23.271429,90.491909,42.818182,7.818182,0.454545,0.897455,33.854545,11


In [23]:
# Preview the first rows of the paralog pairs held in the global df_dict.
# NOTE(review): this cell's execution count (23) is lower than that of the
# cell that defines df_dict (26); re-run the notebook top to bottom to confirm
# the output shown matches the current data.
df_dict["Paralogs"].head()

Unnamed: 0,Is_Ohnolog,Is_Paralog,Sequence-1,Sequence-1 GC,Sequence-1 Id,Sequence-1 Length,Sequence-1-Transcript Id,Sequence-2,Sequence-2 GC,Sequence-2 Id,...,Nr Gap Open_Low,Evalue_Low,Bit Score_Low,Percent Identical Matches_Total,Aligment Length_Total,Nr Mismatch_Total,Nr Gap Open_Total,Evalue_Total,Bit Score_Total,Nr Hits
0,0,1.0,ATGGCTGAGAGCGCCTCCCCGCCCTCCTCATCTGCAGCAGCCCCAG...,0.613954,ENSG00000100346,10004,ENST00000402142,CCGCCCTCCGCCGCTGCCCCCCTTTTCGTTCGCCCTCTCGGGGCGG...,0.604694,ENSG00000006283,...,0.333333,4.218667,25.327083,94.662741,81.87037,12.703704,1.092593,3.75,73.051852,54
1,0,1.0,CACAGGCTGAGCAGTCAGGCCCACAGCATCTGACCCCAGGCCCAGC...,0.608192,ENSG00000066056,3882,ENST00000372476,GAGCTGGAGCAGCCGCCACCGCCGCCGCCGAGGGAGCCCCGGGACG...,0.441187,ENSG00000185483,...,0.142857,5.523571,22.464286,97.329333,14.066667,0.4,0.266667,5.1556,23.133333,15
2,0,1.0,GTGCTGCGGCGAGCTCCGTCCAAAAGAAAATGGGGTTTGGTGTAAA...,0.54113,ENSG00000122592,2018,ENST00000242159,CGTGAGTGGGGCGGCCAATGGGTGACTGGTGCAGATTTAACTATGT...,0.587106,ENSG00000123407,...,0.0,1.75,22.033333,98.351714,13.428571,0.428571,0.0,1.500143,23.528571,7
3,0,1.0,CTCATTCGGGAAATGCTAAATATTTATAGTTTGGGCTCCTGGGCCC...,0.647368,ENSG00000130812,1900,ENST00000253109,AAGTACCAAGGTCTGCGGCAGGAGGAGACCGGCTCACAGGAGCAGC...,0.561502,ENSG00000120332,...,0.222222,2.433333,22.233333,97.0833,14.7,0.4,0.4,2.1902,23.26,10
4,0,1.0,CCTACACACCCCTGGATCCTCTGAAATGGCAAGGGGTAGGCATGTT...,0.58053,ENSG00000102886,1397,ENST00000406256,AAAGCCCCGGCAGTGACTGGGAGGGGAACAGGAGGAGGGACAGAGG...,0.551272,ENSG00000130055,...,0.75,0.7055,22.8,95.16125,16.75,0.75,0.75,0.7055,22.8,4


In [27]:
# For every level, mutate the "No-Ohnologs" pairs at each mutation level and
# pickle the result to <train_filepath>/<level>/mutated/.
for level in levelList:
    df_animals_dict = get_df_dict(level)
    output_dir = os.path.join(train_filepath, level, "mutated")
    # to_pickle does not create missing folders.
    os.makedirs(output_dir, exist_ok=True)
    for mutation_level in mutation_levels:
        # Use the frames loaded for *this* level; the global df_dict only
        # holds the "Intermediate" data loaded in an earlier cell.
        df_mutated = get_ohnologs_mutations(df_animals_dict["No-Ohnologs"], mutation_level)
        output_name = "no-ohnologs-complete" + "-" + mutation_level + ".pkl"
        df_mutated.to_pickle(os.path.join(output_dir, output_name))
