In [None]:
import sys
sys.path.append("../")

from rbm import fasta_read, get_beta_and_W, all_weights, RBM
import analysis_methods as am

from glob import glob
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import math
import subprocess as sp
import matplotlib.image as mpimg


In [None]:
# --- Notebook configuration -------------------------------------------------
# Round identifiers, in the order they are analysed and plotted below.
rounds = ['np1_c3', 'np2_c3', 'np3_c3', 'n1_c3', 'b3_c3']
# One fasta file and one trained RBM per round, named after the round itself.
data_files = [f"{round_name}.fasta" for round_name in rounds]
rbm_names = list(rounds)

# Locations of the input data and of the trained RBM checkpoints.
data_dir = '../../pig_tissue/gaps_end_4_clusters/'
server_rbm_dir = '../../pig_tissue/gaps_end_4_clusters/trained_rbms/'
# NOTE(review): machine-specific absolute path; adjust when running elsewhere.
local_rbm_dir = '/mnt/D1/globus/pig_trained_rbms/ge4/'

molecule = 'protein'
# Cluster number shown in plot titles (presumably the "_c3" suffix of the rounds; verify).
cluster = 3

In [None]:
# Generate the weight output for every trained RBM, written next to its checkpoint.
for rbm_name in rbm_names:
    # get_checkpoint_path yields the checkpoint to load and the directory it lives in.
    checkp, version_dir = am.get_checkpoint_path(rbm_name, rbmdir=local_rbm_dir)
    loaded_rbm = RBM.load_from_checkpoint(checkp)
    # Output name is version_dir + rbm_name + "_weights" with no separator added,
    # so version_dir is presumably returned with a trailing "/" (verify).
    # NOTE(review): 5, 1, 6, 2 are passed positionally to all_weights; their
    # meaning is not visible here, so confirm against rbm.all_weights.
    # Use the shared `molecule` setting so this cell stays in sync with the config cell.
    all_weights(loaded_rbm, version_dir + rbm_name + "_weights", 5, 1, 6, 2, molecule)


In [None]:
# Load the sequences for every round into a single table ("all_data").
# Later cells select one round at a time with all_data[all_data["round"] == r].
all_data = am.fetch_data(
    rounds,
    dir=data_dir,
    counts=True,
    molecule=molecule,
)


In [None]:
# Build an unweighted and a weighted sequence logo for each round.
# paths_u / paths_w keep whatever am.seq_logo returns; the plotting cells
# append ".freq.png" / ".info.png" to those values to find the images.
logo_outdir = "./generated/"
paths_u = []
paths_w = []
for round_name in rounds:
    in_round = all_data["round"] == round_name
    unweighted_path = am.seq_logo(all_data[in_round], f"{round_name}_seqlogo", weight=False, outdir=logo_outdir)
    paths_u.append(unweighted_path)
    weighted_path = am.seq_logo(all_data[in_round], f"{round_name}_w_seqlogo", weight=True, outdir=logo_outdir)
    paths_w.append(weighted_path)


In [None]:
# Seq logo showing the frequency of each amino acid at each position.
# One row per round: unweighted logo on the left, weighted logo on the right.
# The grid is sized from `rounds` rather than a hard-coded 5 so it follows the
# config cell; squeeze=False keeps `axs` two-dimensional even for a single round.
fig, axs = plt.subplots(len(rounds), 2, squeeze=False)
fig.set_size_inches(15, 12)
for rid, r in enumerate(rounds):
    left_ax, right_ax = axs[rid][0], axs[rid][1]
    left_ax.imshow(mpimg.imread(f"{paths_u[rid]}.freq.png"))
    right_ax.imshow(mpimg.imread(f"{paths_w[rid]}.freq.png"))
    # The logos are pre-rendered images, so the matplotlib axes carry no information.
    left_ax.axis("off")
    right_ax.axis("off")
    left_ax.set_title(f"{r} Frequency Logo")
    right_ax.set_title(f"{r} Weighted Frequency Logo")
plt.show()


In [None]:
# Seq logo showing the information content of each amino acid at each position.
# One row per round: unweighted logo on the left, weighted logo on the right.
# The grid is sized from `rounds` rather than a hard-coded 5 so it follows the
# config cell; squeeze=False keeps `axs` two-dimensional even for a single round.
fig, axs = plt.subplots(len(rounds), 2, squeeze=False)
fig.set_size_inches(15, 12)
for rid, r in enumerate(rounds):
    left_ax, right_ax = axs[rid][0], axs[rid][1]
    left_ax.imshow(mpimg.imread(f"{paths_u[rid]}.info.png"))
    # Fix: the right-hand column previously re-loaded the unweighted image
    # (paths_u), so the "weighted" panel was a duplicate of the left one.
    # Assumes seq_logo writes a ".info.png" for the weighted logo as it does
    # for the unweighted one (the frequency cell relies on the same symmetry).
    right_ax.imshow(mpimg.imread(f"{paths_w[rid]}.info.png"))
    # The logos are pre-rendered images, so the matplotlib axes carry no information.
    left_ax.axis("off")
    right_ax.axis("off")
    # Fix: titles were copied from the frequency cell and mislabelled these plots.
    left_ax.set_title(f"{r} Information Logo")
    right_ax.set_title(f"{r} Weighted Information Logo")
plt.show()


In [None]:
# Score every round's data with the RBM trained on the last round only.
last_rbm_name = rbm_names[-1]
checkp, v_dir = am.get_checkpoint_path(last_rbm_name, rbmdir=local_rbm_dir)
last_round_rbm = RBM.load_from_checkpoint(checkp)

# Slow step (a candidate for future optimisation). The results are saved under
# the given name and read back from "./generated/" in the next cell.
am.generate_likelihoods(
    rounds,
    last_round_rbm,
    all_data,
    f"{last_rbm_name}_all_likelihoods",
)


In [None]:
# Read back the likelihoods produced by the last-round RBM.
last_round_likelihoods = am.get_likelihoods(f"./generated/{rbm_names[-1]}_all_likelihoods.json")


In [None]:

# Plot the log-likelihood of each round's data under the last-round RBM.
last_round_title = f"All data Log-Likelihood From {rbm_names[-1].upper()} RBM Cluster {cluster}"

# `rounds` is passed twice, as the second and third positional arguments.
am.plot_likelihoods(
    last_round_likelihoods["likelihoods"],
    rounds,
    rounds,
    title=last_round_title,
    xlim=(-250, -60),
    cdf=False,
)


In [None]:
# Score every round's data with the RBM trained on the first round only.
first_rbm_name = rbm_names[0]
checkp, v_dir = am.get_checkpoint_path(first_rbm_name, rbmdir=local_rbm_dir)
first_round_rbm = RBM.load_from_checkpoint(checkp)

# Slow step (a candidate for future optimisation). The results are saved under
# the given name and read back from "./generated/" in the next cell.
am.generate_likelihoods(
    rounds,
    first_round_rbm,
    all_data,
    f"{first_rbm_name}_all_likelihoods",
)


In [None]:
# Read back the likelihoods produced by the first-round RBM.
first_round_likelihoods = am.get_likelihoods(f"./generated/{rbm_names[0]}_all_likelihoods.json")


In [None]:

# Plot the log-likelihood of each round's data under the first-round RBM.
first_round_title = f"All data Log-Likelihood From {rbm_names[0].upper()} RBM Cluster {cluster}"

# `rounds` is passed twice, as the second and third positional arguments.
am.plot_likelihoods(
    first_round_likelihoods["likelihoods"],
    rounds,
    rounds,
    title=first_round_title,
    xlim=(-250, -60),
    cdf=False,
)


In [None]:
# Compare the last-round and first-round RBMs on the FIRST round's dataset.
fr_label, lr_label = rounds[0].upper(), rounds[-1].upper()
first_dataset = rounds[0]
am.compare_likelihood_correlation(
    last_round_likelihoods["likelihoods"][first_dataset],
    first_round_likelihoods["likelihoods"][first_dataset],
    f"{lr_label} vs {fr_label} RBMs on {fr_label} dataset",
    [lr_label, fr_label],
)


In [None]:
# Compare the last-round and first-round RBMs on the LAST round's dataset.
lr_label = rounds[-1].upper()
fr_label = rounds[0].upper()
last_dataset = rounds[-1]
am.compare_likelihood_correlation(
    last_round_likelihoods["likelihoods"][last_dataset],
    first_round_likelihoods["likelihoods"][last_dataset],
    # Fix: the title previously named the first-round dataset (fr_label) even
    # though the likelihoods being compared are those of the last round.
    f"{lr_label} vs {fr_label} RBMs on {lr_label} dataset",
    [lr_label, fr_label],
)
