In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import gzip
from Bio import SeqIO
from logomaker import transform_matrix
from pssm_analysis import plot_logomaker 

# Let pandas render up to 100 columns before it starts eliding them.
pd.set_option('display.max_columns', 100)

# Twenty amino-acid one-letter codes followed by '-'.
AA_ALPHABETS = "ACDEFGHIKLMNPQRSTVWY-"

# Integer id -> residue symbol. Ids 0-19 follow the same letter order as
# AA_ALPHABETS; ids 20-23 are 'X', 'Z', '-' and 'B'.
# NOTE(review): '-' is id 22 here but position 20 in AA_ALPHABETS, so the two
# cannot be used interchangeably as index lookups.
id2aa = dict(enumerate("ACDEFGHIKLMNPQRSTVWY" "XZ-B"))

In [None]:
### Specify the sequence and the ESM2 model used
# `name` is both the per-protein directory and the file-name prefix used when
# the cells below build their input and output paths.
name = "CeVSRA-1"
# Identifier for the same protein; no other cell visible here reads it.
# NOTE(review): looks like a UniProt accession - confirm.
gene_id = "Q9XVF1"
# ESM2 model tag; it is embedded in the conservation/coevolution file names.
model = "esm2_t36_3B_UR50D"

# Conservation

In [None]:
esm2_conservation_path = f'/home/moon/projects/AgoAnalysis/esm2/{name}/{name}_conservation_{model}.csv.gz'

# The table is a gzip-compressed CSV whose first column is the row index.
with gzip.open(esm2_conservation_path, mode='rt') as handle:
    esm2_df = pd.read_csv(handle, sep=',', index_col=0)

# Long -> wide: one row per 'Position', one column per 'Amino Acid', cells hold
# 'Probability'. The Position labels are then dropped in favour of a fresh
# 0..N-1 row index.
esm2_pivot_df = (
    esm2_df
    .pivot(index='Position', columns='Amino Acid', values='Probability')
    .reset_index(drop=True)
)
esm2_pivot_df.index = esm2_pivot_df.index + 1  # Convert from 0-index to 1-index

# Uniform background: 20 entries of 1/20, one per column handed to transform_matrix.
background_vals = np.full(20, 1/20)
esm2_ic_df = transform_matrix(
    esm2_pivot_df,
    from_type='probability',
    to_type='information',
    background=background_vals,
)

In [None]:
# Logo of the per-position probabilities (first 20 columns), saved as PNG and shown.
title = f'Probabilities in ESM2 Conservation of {name}'
prob_logo_png = f'/home/moon/projects/AgoAnalysis/esm2/{name}/{name}.{model}.pssm_logo.png'
prob_columns = esm2_pivot_df.iloc[:, :20]
plot_logomaker(prob_columns, title=title, ylim=1, color_name='charge')
# Save first, then display.
plt.savefig(prob_logo_png)
plt.show()

In [None]:
# Logo of the information-content matrix (first 20 columns); displayed only.
title = f'Information Content in ESM2 Conservation of {name}'
ic_columns = esm2_ic_df.iloc[:, :20]
plot_logomaker(ic_columns, title=title, color_name='charge')
# Saving is disabled in this cell; the batch loop cell writes the
# `.info_logo.pdf` file for each protein.
# plt.savefig(f'/home/moon/projects/AgoAnalysis/esm2/{name}/{name}.{model}.info_logo.pdf')
plt.show()

In [None]:
### Loop through all the proteins
# Batch version of the information-content logo: writes one PDF per protein and
# skips any protein whose PDF already exists.
model = 'esm2_t36_3B_UR50D'
names = ['CeVSRA-1', 'CeHRDE-1', 'CePRG-1', 'HsAgo2', 'HsPIWIL2', 
         'CeCSR-1a', 'MIWI', 'BmSIWI', 'AtAgo', 'CeALG-2', 
         'TtAgo', 'CeALG-1', 'DmPIWI', 'CeSAGO-1', 'PfAgo', 'HsAgo1']

# Uniform background: 20 entries of 1/20 for the information transform.
background_vals = np.array([1/20]*20)

# Loop-local names (`protein_name`, `batch_*`) are deliberate: iterating with
# `name` and reassigning `title` / `esm2_*` would leave them pointing at the
# last protein in `names`, silently changing which protein the cells that
# build paths from `name` operate on.
for protein_name in names:
    outfile = f'/home/moon/projects/AgoAnalysis/esm2/{protein_name}/{protein_name}.{model}.info_logo.pdf'
    if os.path.exists(outfile):
        continue  # logo already rendered by a previous run

    conservation_path = f'/home/moon/projects/AgoAnalysis/esm2/{protein_name}/{protein_name}_conservation_{model}.csv.gz'
    with gzip.open(conservation_path, 'rt') as f:
        batch_df = pd.read_csv(f, sep=',', index_col=0)

    # Long -> wide table, then renumber the rows starting at 1.
    batch_pivot_df = (
        batch_df
        .pivot(index='Position', columns='Amino Acid', values='Probability')
        .reset_index(drop=True)
    )
    batch_pivot_df.index = batch_pivot_df.index + 1  # Convert from 0-index to 1-index
    batch_ic_df = transform_matrix(batch_pivot_df, from_type='probability', to_type='information', background=background_vals)

    batch_title = f'Information Content in ESM2 Conservation of {protein_name}'
    plot_logomaker(batch_ic_df.iloc[:, 0:20], title=batch_title, color_name='charge')
    plt.savefig(outfile)
    # Display and then release the figure inside the loop so open figures do
    # not pile up until the whole cell has finished.
    plt.show()
    plt.close()

# Coevolution

In [None]:
import bokeh.plotting
from bokeh.models import BasicTicker, PrintfTickFormatter
from bokeh.palettes import viridis, RdBu
from bokeh.transform import linear_cmap
from bokeh.plotting import figure, show, output_file, save

from matplotlib.colors import to_hex
def _hex_ramp(colormap, steps=256):
    """Sample `colormap` at `steps` evenly spaced points in [0, 1] as hex strings."""
    return [to_hex(colormap(fraction)) for fraction in np.linspace(0, 1, steps)]


# 256-entry hex palettes taken from the matplotlib "bwr_r" and "gray_r" colormaps.
cmap = plt.colormaps["bwr_r"]
bwr_r = _hex_ramp(cmap)
cmap = plt.colormaps["gray_r"]
gray = _hex_ramp(cmap)
# Palette used by the heat-map colour mapper: bokeh's 256-colour viridis.
palette = viridis(256)

In [None]:
esm2_coevolution_path = f'/home/moon/projects/AgoAnalysis/esm2/{name}/{name}_coevolution_{model}.csv.gz'

# The coevolution table is a gzip-compressed CSV whose first column is the row index.
with gzip.open(esm2_coevolution_path, 'rt') as f:
    df = pd.read_csv(f, sep=',', index_col=0)


def get_fasta(infasta):
    """Return the sequence of the first record in a FASTA file as a string.

    Args:
        infasta: Path to the FASTA file.

    Returns:
        str: Sequence of the first record; any further records are ignored.

    Raises:
        ValueError: If the file contains no FASTA record.
    """
    # Open the file here so it is closed deterministically even though parsing
    # stops after the first record.
    with open(infasta) as handle:
        first_record = next(SeqIO.parse(handle, "fasta"), None)
    if first_record is None:
        # Fail here with a clear message instead of returning None and
        # breaking later at len(seq).
        raise ValueError(f"No FASTA record found in {infasta}")
    return str(first_record.seq)

infasta = f'/home/moon/projects/AgoAnalysis/esm2/{name}/{name}.txt'
seq = get_fasta(infasta)

In [None]:
# Inspect the coevolution table; the heat-map cell reads its `i`, `j` and `value` columns.
df

In [None]:
TOOLS = "hover,save,pan,box_zoom,reset,wheel_zoom"

# Categorical axis labels "1".."N", one per residue of `seq`.
position_labels = [str(pos) for pos in range(1, len(seq) + 1)]
hover_fields = [('i', '@i'), ('j', '@j'), ('value', '@value')]

p = figure(
    title="COEVOLUTION",
    x_range=position_labels,
    y_range=position_labels[::-1],  # same labels in reverse order
    width=800, height=800,
    tools=TOOLS, toolbar_location='below',
    tooltips=hover_fields,
)

# Colour scale spans the observed minimum..maximum of the `value` column.
value_mapper = linear_cmap('value', palette, low=df.value.min(), high=df.value.max())
# One unit square per row of `df`, placed at (i, j) and filled by `value`.
# NOTE(review): both ranges are lists of strings; confirm the `i`/`j` columns
# hold matching values so each square lands on its label.
r = p.rect(x="i", y="j", width=1, height=1, source=df,
           fill_color=value_mapper,
           line_color=None)
p.xaxis.visible = False  # Hide the x-axis
p.yaxis.visible = False  # Hide the y-axis
# show(p)

In [None]:
# Write the heat-map figure `p` to a standalone HTML file in the protein's directory.
coevolution_html = f"/home/moon/projects/AgoAnalysis/esm2/{name}/{name}_coevolution_{model}_color.html"
output_file(coevolution_html)
save(p)