In [46]:
import os
from matplotlib import pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sb
import pickle as pkl

In [47]:
# Seaborn theme applied to figures drawn later in the notebook.
sb.set_style("darkgrid")

# Root folder of the pickled inputs; the loader appends "<animal>/<level>/" to it.
filepath = "Animals/"

# Animals whose frames are loaded and merged, in processing order.
animalList = [
    "Human",
    "Pig",
    "Chicken",
    "Rat",
    "Mouse",
    "Dog",
]

# Known level names (presumably the valid `level` values for the loader —
# this list is not referenced by the cells visible here).
levelList = [
    "Strict",
    "Relaxed",
    "Intermediate",
]

In [48]:
def get_df_dict(level):
    """Load every animal's pickled frames for one level.

    For each animal in ``animalList`` three files are read from
    ``<filepath><animal>/<level>/``, named ``<animal>-<level>-ohnologs.pkl``,
    ``<animal>-<level>-no-ohnologs.pkl`` and ``<animal>-<level>-paralogues.pkl``.

    Parameters
    ----------
    level : str
        Level name used both as a sub-folder and as part of each file name.

    Returns
    -------
    dict
        ``{"Ohnologs": {...}, "No-Ohnologs": {...}, "Paralogs": {...}}`` where
        each inner dict maps an animal name to the object returned by
        ``pd.read_pickle`` for that file.

    Notes
    -----
    Unpickling executes arbitrary code; only point this at trusted files.
    """
    # Category key in the result -> file-name suffix on disk.
    suffix_by_category = {
        "Ohnologs": "-ohnologs.pkl",
        "No-Ohnologs": "-no-ohnologs.pkl",
        "Paralogs": "-paralogues.pkl",
    }
    frames = {category: {} for category in suffix_by_category}

    for species in animalList:
        # Shared "<root><animal>/<level>/<animal>-<level>" prefix of the three files.
        prefix = filepath + species + "/" + level + "/" + species + "-" + level
        for category, suffix in suffix_by_category.items():
            frames[category][species] = pd.read_pickle(prefix + suffix)

    return frames

In [49]:
def get_ohnologs_complete(ohnologs_df_dict, animals=None):
    """Stack the per-animal ohnolog pair frames into one labelled DataFrame.

    Parameters
    ----------
    ohnologs_df_dict : dict
        Maps an animal name to a DataFrame holding the ``Ohnolog-1 ...`` /
        ``Ohnolog-2 ...`` and ``GC_Percent_1`` / ``GC_Percent_2`` columns
        read below.
    animals : iterable of str, optional
        Keys of ``ohnologs_df_dict`` to include, in output order. Defaults to
        the notebook-level ``animalList``.

    Returns
    -------
    pandas.DataFrame
        All selected rows under generic ``Sequence-*`` column names, with
        ``Is_Ohnolog`` set to 1. Row labels of the input frames are kept
        as-is, so the resulting index may contain duplicates.

    Raises
    ------
    KeyError
        If an animal or one of the expected columns is missing.
    """
    # Output column -> column read from each per-animal frame.
    # 'Lenght' (sic) is the spelling of the key being read; keep it as-is.
    column_map = {
        'Sequence-1 Id': "Ohnolog-1 Id",
        'Sequence-2 Id': "Ohnolog-2 Id",
        'Sequence-1': "Ohnolog-1/Sequence",
        'Sequence-2': "Ohnolog-2/Sequence",
        'Sequence-1 Length': "Ohnolog-1/Sequence-Lenght",
        'Sequence-2 Length': "Ohnolog-2/Sequence-Lenght",
        'Sequence-1 GC': "GC_Percent_1",
        'Sequence-2 GC': "GC_Percent_2",
    }
    output_columns = list(column_map) + ['Is_Ohnolog']

    if animals is None:
        animals = animalList

    frames = []
    for animal in animals:
        source = ohnologs_df_dict[animal]
        df_temp = pd.DataFrame({out: source[src] for out, src in column_map.items()})
        df_temp['Is_Ohnolog'] = 1

        print("Size of " + animal + ": " + str(len(df_temp)))
        frames.append(df_temp)

    if not frames:
        # pd.concat rejects an empty list; return an empty frame with the schema.
        return pd.DataFrame(columns=output_columns)

    # Single concat instead of DataFrame.append in the loop: append was
    # removed in pandas 2.0 and copied the accumulated frame on every pass.
    return pd.concat(frames)

In [50]:
def get_no_ohnologs_complete(ohnologs_df_dict, animals=None):
    """Stack the per-animal non-ohnolog pair frames into one labelled DataFrame.

    Parameters
    ----------
    ohnologs_df_dict : dict
        Maps an animal name to a DataFrame of non-ohnolog pairs holding the
        columns read below.
    animals : iterable of str, optional
        Keys of ``ohnologs_df_dict`` to include, in output order. Defaults to
        the notebook-level ``animalList``.

    Returns
    -------
    pandas.DataFrame
        All selected rows under generic ``Sequence-*`` column names, with
        ``Is_Ohnolog`` set to 0. Row labels of the input frames are kept
        as-is, so the resulting index may contain duplicates.

    Raises
    ------
    KeyError
        If an animal or one of the expected columns is missing.
    """
    # Output column -> column read from each per-animal frame.
    # NOTE(review): the second id key is "Ohnologs-2 Id" (plural), unlike the
    # other keys; the recorded run of this notebook succeeded with it, so it
    # is kept unchanged — confirm against the pickled data.
    # 'Lenght' (sic) is the spelling of the key being read; keep it as-is.
    column_map = {
        'Sequence-1 Id': "Ohnolog-1 Id",
        'Sequence-2 Id': "Ohnologs-2 Id",
        'Sequence-1': "Ohnolog-1/Sequence",
        'Sequence-2': "Ohnolog-2/Sequence",
        'Sequence-1 Length': "Ohnolog-1/Sequence-Lenght",
        'Sequence-2 Length': "Ohnolog-2/Sequence-Lenght",
        'Sequence-1 GC': "GC_Percent_1",
        'Sequence-2 GC': "GC_Percent_2",
    }
    output_columns = list(column_map) + ['Is_Ohnolog']

    if animals is None:
        animals = animalList

    frames = []
    for animal in animals:
        source = ohnologs_df_dict[animal]
        df_temp = pd.DataFrame({out: source[src] for out, src in column_map.items()})
        df_temp['Is_Ohnolog'] = 0

        print("Size of " + animal + ": " + str(len(df_temp)))
        frames.append(df_temp)

    if not frames:
        # pd.concat rejects an empty list; return an empty frame with the schema.
        return pd.DataFrame(columns=output_columns)

    # Single concat instead of DataFrame.append in the loop: append was
    # removed in pandas 2.0 and copied the accumulated frame on every pass.
    return pd.concat(frames)

In [60]:
def get_paralogs_complete(paralogs_df_dict, animals=None):
    """Stack the per-animal paralogue pair frames into one labelled DataFrame.

    Parameters
    ----------
    paralogs_df_dict : dict
        Maps an animal name to a DataFrame holding the ``Paralogue-1 ...`` /
        ``Paralogue-2 ...`` and ``GC_Percent_1`` / ``GC_Percent_2`` columns
        read below.
    animals : iterable of str, optional
        Keys of ``paralogs_df_dict`` to include, in output order. Defaults to
        the notebook-level ``animalList``.

    Returns
    -------
    pandas.DataFrame
        All selected rows under generic ``Sequence-*`` column names, with
        ``Is_Ohnolog`` set to 0. Row labels of the input frames are kept
        as-is, so the resulting index may contain duplicates.

    Raises
    ------
    KeyError
        If an animal or one of the expected columns is missing.
    """
    # Output column -> column read from each per-animal frame.
    # 'Lenght' (sic) is the spelling of the key being read; keep it as-is.
    column_map = {
        'Sequence-1 Id': "Paralogue-1 Id",
        'Sequence-2 Id': "Paralogue-2 Id",
        'Sequence-1': "Paralogue-1/Sequence",
        'Sequence-2': "Paralogue-2/Sequence",
        'Sequence-1 Length': "Paralogue-1/Sequence-Lenght",
        'Sequence-2 Length': "Paralogue-2/Sequence-Lenght",
        'Sequence-1 GC': "GC_Percent_1",
        'Sequence-2 GC': "GC_Percent_2",
    }
    output_columns = list(column_map) + ['Is_Ohnolog']

    if animals is None:
        animals = animalList

    frames = []
    for animal in animals:
        source = paralogs_df_dict[animal]
        df_temp = pd.DataFrame({out: source[src] for out, src in column_map.items()})
        df_temp['Is_Ohnolog'] = 0
        print("Size of " + animal + ": " + str(len(df_temp)))
        frames.append(df_temp)

    if not frames:
        # pd.concat rejects an empty list; return an empty frame with the schema.
        return pd.DataFrame(columns=output_columns)

    # Single concat instead of DataFrame.append in the loop: append was
    # removed in pandas 2.0 and copied the accumulated frame on every pass.
    return pd.concat(frames)

In [52]:
# Pick the level whose pickles are analysed, then load every animal's frames for it.
workingLevel = "Strict"
df_gene_dict = get_df_dict(level=workingLevel)

In [53]:
# Merge the per-animal ohnolog frames; the bare last line displays the total row count.
ohnologs_df_complete = get_ohnologs_complete(ohnologs_df_dict=df_gene_dict["Ohnologs"])
ohnologs_df_complete.shape[0]

Size of Human: 2636
Size of Pig: 2509
Size of Chicken: 833
Size of Rat: 2121
Size of Mouse: 2641
Size of Dog: 2429


13169

In [56]:
# Merge the per-animal non-ohnolog frames; the bare last line displays the total row count.
no_ohnologs_df_complete = get_no_ohnologs_complete(ohnologs_df_dict=df_gene_dict["No-Ohnologs"])
no_ohnologs_df_complete.shape[0]

Size of Human: 2696
Size of Pig: 2510
Size of Chicken: 1622
Size of Rat: 2891
Size of Mouse: 2689
Size of Dog: 2430


14838

In [61]:
# Merge the per-animal paralogue frames; the bare last line displays the total row count.
paralog_df_complete = get_paralogs_complete(paralogs_df_dict=df_gene_dict["Paralogs"])
paralog_df_complete.shape[0]

Size of Human: 2676
Size of Pig: 1851
Size of Chicken: 1452
Size of Rat: 2885
Size of Mouse: 2664
Size of Dog: 2428


13956

In [64]:
# Build the labelled dataset: ohnolog rows (Is_Ohnolog == 1) followed by
# non-ohnolog rows (Is_Ohnolog == 0), then shuffle all rows.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
# A fixed random_state makes the shuffle reproducible across kernel restarts.
df_ohnologs_dataset = pd.concat(
    [ohnologs_df_complete, no_ohnologs_df_complete]
).sample(frac=1, random_state=42)
len(df_ohnologs_dataset)

28007

In [65]:
# Preview the first five shuffled rows (row labels are those of the per-animal frames).
df_ohnologs_dataset.head(n=5)

Unnamed: 0,Sequence-1 Id,Sequence-2 Id,Sequence-1,Sequence-2,Sequence-1 Length,Sequence-2 Length,Sequence-1 GC,Sequence-2 GC,Is_Ohnolog
461,ENSCAFG00000001651,ENSCAFG00000011933,CGACTTCCTGATTCTCCCAGGATTCATAGACTTCATAGCTGATGAG...,TTCGGACACACGCGGCTTTGCCCGGTGCTGGCCATGGCCGACTACC...,2299,1470,0.599391,0.530612,1
1401,ENSMUSG00000067220,ENSMUSG00000026114,CCAAATCCAGTACTGAGCCACGAACATCCTCCCCTGCCAATGCCAT...,TATGCACTCCCTCTTGTCTTCCACCCCCTTCTCCACACCCAATCCA...,2739,3554,0.426068,0.510692,1
1829,ENSRNOG00000018718,ENSRNOG00000018232,ATGTCAGCATGGCGGCGTTCGCAGATTTGGATCTAAGAGCGGGCTC...,CGCCAGCGCGTGGTCCCGGCCCCCTCCACCCGCGGTCTCGGCCGCG...,1438,4104,0.420723,0.596248,0
2865,ENSRNOG00000061832,ENSRNOG00000046227,CCGCGGGAGCCGGCAGCGCTGCGGGTCAGCGGGCGCGGCGGAGCCG...,ATGACTTGGCGAAAAGCCAATCGTGAGACGCAGCCGACGGAGCTGG...,1925,1814,0.40987,0.56505,0
1590,ENSG00000168528,ENSG00000198833,GTATTTGTCTGGTTCCTGTCTGTGTCCGTCGTTCGTCCGACTGTCT...,CCGGGCGCCCGGTTCAGCGCCGCCCCGGCCGGCGCCGGTGCCTGCC...,2807,4290,0.614179,0.400699,0
