In [1]:
import os
import sys

import pandas as pd
import numpy as np
import sklearn
from hmmlearn import hmm
try:  # hmmlearn version > 0.2.7: use CategoricalHMM under the old name
   from hmmlearn.hmm import CategoricalHMM as MultinomialHMM
except ImportError:  # hmmlearn version <= 0.2.7: CategoricalHMM is not available
   # Only a failed import should trigger the fallback; a bare `except:` would
   # also hide KeyboardInterrupt and unrelated errors raised while importing.
   from hmmlearn.hmm import MultinomialHMM
import matplotlib.pyplot as plt
from tqdm import tqdm
import seaborn as sns
from sklearn.model_selection import KFold

sys.path.insert(0, '../scripts')
from flaHMM_functions import *

In [2]:
# Identifiers of the genome builds to train on, one per species. Each entry is
# matched against the 'Data' column of the input tables and is embedded in the
# names of the matrix files written further down.
species_build = [
    'Dyak.GCF_016746365',
    'Dsan.GCF_016746245',
    'Dsim.GCF_016746395',
    'Dmau.GCF_004382145',
    'Dmel.dm6',
    'Dsubp.GCF_014743375',
]

In [3]:
# Create the output folders if they don't exist yet.
# os.makedirs(..., exist_ok=True) creates the parent 'matrices' folder as
# needed and is a no-op when the folder is already there, so this cell can be
# re-run safely and has no gap between "check" and "create".
for subdir in ('startprobs', 'transmats', 'emissionprobs'):
    os.makedirs(os.path.join('matrices', subdir), exist_ok=True)

In [4]:
# Build and save the HMM start-probability, transition and emission matrices
# for every combination of bin size, training species and emission threshold.
#
# Inputs : 'all_data_species<bin>k.txt' (tab-separated), one file per bin size.
# Outputs: one tab-separated file per (species, threshold, bin size) in each of
#          matrices/startprobs, matrices/transmats and matrices/emissionprobs.
N_STATES = 3     # written into the '_st_' filename tag
N_EMISSIONS = 3  # written into the '_em_' filename tag
THRESHOLDS = [0.025, 0.05, 0.075, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]

for bin_size in tqdm(['10', '5', '2.5']):
    print('Read Files')
    all_data_species = pd.read_csv(f'all_data_species{bin_size}k.txt', sep='\t')
    for species_train in species_build:
        print(species_train)
        all_data = all_data_species[all_data_species['Data'] == species_train]
        # build2coords is keyed by '<species>_build_<assembly>'; element [1] is
        # passed on as the strand argument of the helper functions.
        strand = build2coords[species_train.replace('.', '_build_')][1]
        # NOTE(review): the second positional argument is presumably the number
        # of hidden states -- confirm against flaHMM_functions.calculate_transmat.
        starprob, transmat = calculate_transmat(all_data, N_STATES, strand, pseudo=True)
        # Start and transition matrices do not depend on the threshold, so the
        # DataFrame conversion is done once per species. They are still written
        # once per threshold so the set of output files stays the same.
        starprob_df = pd.DataFrame(starprob)
        transmat_df = pd.DataFrame(transmat)
        for threshold in THRESHOLDS:
            emission_new = calculate_3emissions(all_data, threshold)
            emissionmat = calculate_emissionprob_em3_st3(all_data, emission_new, strand, pseudo=True)
            # Filename suffix shared by the three outputs of this combination.
            suffix = f'{species_train}_threshold_{threshold}_bin_{bin_size}_st_{N_STATES}_em_{N_EMISSIONS}.txt'
            starprob_df.to_csv('matrices/startprobs/startprob_' + suffix, sep='\t')
            transmat_df.to_csv('matrices/transmats/transmat_' + suffix, sep='\t')
            pd.DataFrame(emissionmat).to_csv('matrices/emissionprobs/emissionmat_' + suffix, sep='\t')

  0%|          | 0/3 [00:00<?, ?it/s]

Read Files
Dyak.GCF_016746365


  transitions_strand_one=np.vstack([df_chr['region_binary'][:-1],df_chr['region_binary'][1:]]).T
  transitions_strand_two=np.vstack([df_chr['region_binary'][:-1].replace(1,0),df_chr['region_binary'][1:].replace(1,0)]).T


Dsan.GCF_016746245


  transitions_strand_one=np.vstack([df_chr['region_binary'][:-1],df_chr['region_binary'][1:]]).T
  transitions_strand_two=np.vstack([df_chr['region_binary'][:-1].replace(1,0),df_chr['region_binary'][1:].replace(1,0)]).T


Dsim.GCF_016746395


  transitions_strand_one=np.vstack([df_chr['region_binary'][:-1],df_chr['region_binary'][1:]]).T
  transitions_strand_two=np.vstack([df_chr['region_binary'][:-1].replace(1,0),df_chr['region_binary'][1:].replace(1,0)]).T


Dmau.GCF_004382145


  transitions_strand_one=np.vstack([df_chr['region_binary'][:-1],df_chr['region_binary'][1:]]).T
  transitions_strand_two=np.vstack([df_chr['region_binary'][:-1].replace(1,0),df_chr['region_binary'][1:].replace(1,0)]).T


Dmel.dm6


  transitions_strand_one=np.vstack([df_chr['region_binary'][:-1],df_chr['region_binary'][1:]]).T
  transitions_strand_two=np.vstack([df_chr['region_binary'][:-1].replace(1,0),df_chr['region_binary'][1:].replace(1,0)]).T


Dsubp.GCF_014743375


  transitions_strand_one=np.vstack([df_chr['region_binary'][:-1].replace(1,0),df_chr['region_binary'][1:].replace(1,0)]).T
  transitions_strand_two=np.vstack([df_chr['region_binary'][:-1],df_chr['region_binary'][1:]]).T
 33%|███▎      | 1/3 [05:10<10:20, 310.21s/it]

Read Files
Dyak.GCF_016746365


  transitions_strand_one=np.vstack([df_chr['region_binary'][:-1],df_chr['region_binary'][1:]]).T
  transitions_strand_two=np.vstack([df_chr['region_binary'][:-1].replace(1,0),df_chr['region_binary'][1:].replace(1,0)]).T


Dsan.GCF_016746245


  transitions_strand_one=np.vstack([df_chr['region_binary'][:-1],df_chr['region_binary'][1:]]).T
  transitions_strand_two=np.vstack([df_chr['region_binary'][:-1].replace(1,0),df_chr['region_binary'][1:].replace(1,0)]).T


Dsim.GCF_016746395


  transitions_strand_one=np.vstack([df_chr['region_binary'][:-1],df_chr['region_binary'][1:]]).T
  transitions_strand_two=np.vstack([df_chr['region_binary'][:-1].replace(1,0),df_chr['region_binary'][1:].replace(1,0)]).T


Dmau.GCF_004382145


  transitions_strand_one=np.vstack([df_chr['region_binary'][:-1],df_chr['region_binary'][1:]]).T
  transitions_strand_two=np.vstack([df_chr['region_binary'][:-1].replace(1,0),df_chr['region_binary'][1:].replace(1,0)]).T


Dmel.dm6


  transitions_strand_one=np.vstack([df_chr['region_binary'][:-1],df_chr['region_binary'][1:]]).T
  transitions_strand_two=np.vstack([df_chr['region_binary'][:-1].replace(1,0),df_chr['region_binary'][1:].replace(1,0)]).T


Dsubp.GCF_014743375


  transitions_strand_one=np.vstack([df_chr['region_binary'][:-1].replace(1,0),df_chr['region_binary'][1:].replace(1,0)]).T
  transitions_strand_two=np.vstack([df_chr['region_binary'][:-1],df_chr['region_binary'][1:]]).T
 67%|██████▋   | 2/3 [10:18<05:09, 309.27s/it]

Read Files
Dyak.GCF_016746365


  transitions_strand_one=np.vstack([df_chr['region_binary'][:-1],df_chr['region_binary'][1:]]).T
  transitions_strand_two=np.vstack([df_chr['region_binary'][:-1].replace(1,0),df_chr['region_binary'][1:].replace(1,0)]).T


Dsan.GCF_016746245


  transitions_strand_one=np.vstack([df_chr['region_binary'][:-1],df_chr['region_binary'][1:]]).T
  transitions_strand_two=np.vstack([df_chr['region_binary'][:-1].replace(1,0),df_chr['region_binary'][1:].replace(1,0)]).T


Dsim.GCF_016746395


  transitions_strand_one=np.vstack([df_chr['region_binary'][:-1],df_chr['region_binary'][1:]]).T
  transitions_strand_two=np.vstack([df_chr['region_binary'][:-1].replace(1,0),df_chr['region_binary'][1:].replace(1,0)]).T


Dmau.GCF_004382145


  transitions_strand_one=np.vstack([df_chr['region_binary'][:-1],df_chr['region_binary'][1:]]).T
  transitions_strand_two=np.vstack([df_chr['region_binary'][:-1].replace(1,0),df_chr['region_binary'][1:].replace(1,0)]).T


Dmel.dm6


  transitions_strand_one=np.vstack([df_chr['region_binary'][:-1],df_chr['region_binary'][1:]]).T
  transitions_strand_two=np.vstack([df_chr['region_binary'][:-1].replace(1,0),df_chr['region_binary'][1:].replace(1,0)]).T


Dsubp.GCF_014743375


  transitions_strand_one=np.vstack([df_chr['region_binary'][:-1].replace(1,0),df_chr['region_binary'][1:].replace(1,0)]).T
  transitions_strand_two=np.vstack([df_chr['region_binary'][:-1],df_chr['region_binary'][1:]]).T
100%|██████████| 3/3 [20:28<00:00, 409.59s/it]
