In [1]:
import os
import sys

import pandas as pd
import numpy as np
import sklearn
from hmmlearn import hmm
try: # version > 0.2.7
   from hmmlearn.hmm import CategoricalHMM as MultinomialHMM
except: # version <= 0.2.7
   from hmmlearn.hmm import MultinomialHMM
import matplotlib.pyplot as plt
from tqdm import tqdm
import seaborn as sns
from sklearn.model_selection import KFold

sys.path.insert(0, '../scripts')
from flaHMM_functions import *

In [2]:
# Genome assemblies to process. Each entry has the form '<species>.<build>';
# entries are laid out one species per line and the list order is the
# processing order.
species_build = [
    'Dbia.GCF_018148935', 'Dbia.d101g', 'Dbia.d15genomes',
    'Dere.GCF_003286155', 'Dere.d101g', 'Dere.droEre1', 'Dere.d15genomes',
    'Dsuz.GCF_013340165',
    'Dtei.GCF_016746235', 'Dtei.d101g_2733', 'Dtei.d101g_CT02',
    'Dfic.GCF_018152265', 'Dfic.d101g', 'Dfic.GCF_000220665',
    'Dosh.d101g',
    'Dath.GCA_008121215',
    'Dazt.GCA_005876895',
    'Dmir.GCF_003369915',
    'Dper.GCF_003286085', 'Dper.d101g', 'Dper.d15genomes',
    'Dpse.d15genomes', 'Dpse.GCF_009870125',
    'Dinn.GCF_004354385',
    'Damb.d101g',
    'Dbif.GCA_009664405',
    'Dobs.d101g', 'Dobs.GCF_018151105',
    'Dtris.d101g',
]

In [3]:
def _label_regions(bins, flam_bins, centromeres):
    """Return ``(region, region_binary)`` label arrays for the rows of `bins`.

    Parameters
    ----------
    bins : pandas.DataFrame
        Per-bin table with 'chr', 'bin_start' and 'bin_end' columns.
    flam_bins : pandas.DataFrame
        Subset of `bins`; its index labels identify the flam bins.
    centromeres : pandas.DataFrame
        Rows of the centromere coordinate table for the assembly that `bins`
        belongs to ('chr', 'bin_start', 'bin_end' columns).

    Returns
    -------
    tuple of numpy.ndarray
        String labels ('flam', 'centromere' or 'none') and the matching
        integer codes (1, 2 or 0), one entry per row of `bins`. 'flam' takes
        precedence over 'centromere' when a bin qualifies for both.
    """
    in_interval = np.zeros(len(bins), dtype=bool)
    # Only the chromosome 2 and 3 arms are searched for centromere bins.
    for chromosome in ('chr2L', 'chr2R', 'chr3L', 'chr3R'):
        on_chromosome = (bins['chr'] == chromosome).to_numpy()
        intervals = centromeres[centromeres['chr'] == chromosome]
        for start, end in zip(intervals['bin_start'], intervals['bin_end']):
            # A bin counts only when it lies entirely inside the interval.
            in_interval |= (
                on_chromosome
                & (bins['bin_start'] >= start).to_numpy()
                & (bins['bin_end'] <= end).to_numpy()
            )

    # Membership is decided on index labels rather than row positions, so rows
    # sharing an index label are labelled together.
    # NOTE(review): this only matters if the index can contain duplicates --
    # confirm against read_proces_05files.
    is_flam = bins.index.isin(flam_bins.index)
    is_centromere = bins.index.isin(bins.index[in_interval])

    # np.select returns the first matching choice, which gives flam priority.
    conditions = [is_flam, is_centromere]
    region = np.select(conditions, ['flam', 'centromere'], default='none')
    region_binary = np.select(conditions, [1, 2], default=0)
    return region, region_binary


# For every bin size: load the binned data of each assembly in `species_build`,
# attach the cluster annotations, label each bin as flam / centromere / none
# and write the combined table to 'all_data_species_extendedList<size>k.txt'.
for bin_size in ['10', '5', '2.5']:
    species_frames = []
    for species_train in species_build:
        print(species_train)
        species_train_build = '_build_'.join(species_train.split('.'))
        all_data = read_proces_05files(species_train, '../bins/bins_' + bin_size + 'k/')

        # Best effort: when an annotation step fails, the corresponding column
        # is filled with 0 for every bin. Only ordinary exceptions are caught,
        # so KeyboardInterrupt / SystemExit still stop the run.
        try:
            all_data = genome_cluster_annotation(species_train_build, all_data)
        except Exception:
            all_data['cluster'] = 0

        try:
            all_data = genome_cluster_annotation_flamlike(species_train_build, all_data)
        except Exception:
            all_data['cluster_flamlike'] = 0

        # .copy() makes the column subset an independent frame, so the column
        # assignments below do not write into a view of the loaded table.
        all_data = all_data[['chr', 'bin_start', 'bin_end', 'cluster',
                             'cluster_flamlike', 'coverage_plus', 'coverage_minus']].copy()
        all_data['Data'] = species_train

        flam_bins = all_data[(all_data['cluster'] == 1) | (all_data['cluster_flamlike'] == 1)]
        print(flam_bins.shape)

        species_centromeres = centromere_coordinates[centromere_coordinates['species'] == species_train]
        region, region_binary = _label_regions(all_data, flam_bins, species_centromeres)
        all_data['region'] = region
        all_data['region_binary'] = region_binary

        species_frames.append(all_data)

    # Concatenate once per bin size instead of growing a frame in the loop.
    all_data_species = pd.concat(species_frames) if species_frames else pd.DataFrame()
    all_data_species.to_csv('all_data_species_extendedList' + bin_size + 'k.txt', sep='\t')

Dbia.GCF_018148935
(47, 8)
Dbia.d101g
(47, 8)
Dbia.d15genomes
(41, 8)
Dere.GCF_003286155
(57, 8)
Dere.d101g
(39, 8)
Dere.droEre1
(53, 8)
Dere.d15genomes
(57, 8)
Dsuz.GCF_013340165
(138, 8)
Dtei.GCF_016746235
(156, 8)
Dtei.d101g_2733
(37, 8)
Dtei.d101g_CT02
(45, 8)
Dfic.GCF_018152265
(113, 8)
Dfic.d101g
(117, 8)
Dfic.GCF_000220665
(16, 8)
Dosh.d101g
(109, 8)
Dath.GCA_008121215
(159, 8)
Dazt.GCA_005876895
(74, 8)
Dmir.GCF_003369915
(92, 8)
Dper.GCF_003286085
(51, 8)
Dper.d101g
(26, 8)
Dper.d15genomes
(8, 8)
Dpse.d15genomes
(50, 8)
Dpse.GCF_009870125
(65, 8)
Dinn.GCF_004354385
(50, 8)
Damb.d101g
(36, 8)
Dbif.GCA_009664405
(26, 8)
Dobs.d101g
(14, 8)
Dobs.GCF_018151105
(12, 8)
Dtris.d101g
(14, 8)
Dbia.GCF_018148935
(46, 8)
Dbia.d101g
(46, 8)
Dbia.d15genomes
(40, 8)
Dere.GCF_003286155
(56, 8)
Dere.d101g
(38, 8)
Dere.droEre1
(52, 8)
Dere.d15genomes
(56, 8)
Dsuz.GCF_013340165
(137, 8)
Dtei.GCF_016746235
(155, 8)
Dtei.d101g_2733
(37, 8)
Dtei.d101g_CT02
(44, 8)
Dfic.GCF_018152265
(112, 8)
Dfic.d