In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from bokeh.models import Band, ColumnDataSource, LassoSelectTool, WheelZoomTool, BoxZoomTool, ResetTool
from bokeh.plotting import figure, show
from bokeh.io import output_notebook
#from bokeh.layouts import column
#from bokeh.transform import jitter

output_notebook()

%matplotlib inline

In [7]:
df = pd.read_csv('qmean_reference/qmean_refrence.csv')
qmean_ref = df[df['qmean4_norm'] > 0.5].sort_values(by='protein_size')
#qmean_ref = df.sort_values(by='protein_size')
alphafold_models = pd.read_csv('alpha_fold_models.csv')
modeller_models = pd.read_csv('models_fix.csv')

In [3]:
qmean_ref['y_rolling3_mean'] = qmean_ref.qmean4_norm.rolling(300, min_periods=1).mean()
qmean_ref['y_rolling3_std'] = qmean_ref.qmean4_norm.rolling(300, min_periods=1).std()
qmean_ref['lower-1'] = qmean_ref.y_rolling3_mean - qmean_ref.y_rolling3_std
qmean_ref['upper+1'] = qmean_ref.y_rolling3_mean + qmean_ref.y_rolling3_std
qmean_ref['lower-2'] = qmean_ref.y_rolling3_mean - 2*qmean_ref.y_rolling3_std
qmean_ref['upper+2'] = qmean_ref.y_rolling3_mean + 2*qmean_ref.y_rolling3_std

In [8]:
source = ColumnDataSource(qmean_ref.reset_index())
source2 = ColumnDataSource(alphafold_models)
source3 = ColumnDataSource(modeller_models)

TOOLTIPS = [
            ("model_name", "@model_name"),
            ("qmean4", "@qmean4_zscore"),
            ("protein_size", "@protein_size"),
            ("qmean4 normalized", "@qmean4_norm"),
            ("qmeanDisco", "@qmeandisco"),
            ]

p = figure(width=900, height=600, x_range=(10, 200), tooltips=TOOLTIPS, y_range=(0, 1.5))
p.title.text = "Rolling Standard Deviation"
#p.xgrid.grid_line_color=None
#p.ygrid.grid_line_alpha=0.5

#p.varea(x="protein_size", y1="lower-2", y2="upper+2", source=source, fill_alpha=0.1)
#glyph = VArea(x=qmean_ref["protein_size"], y1=qmean_ref["lower-2"], y2=qmean_ref["upper+2"], fill_alpha=0.1)

band1 = Band(base="protein_size", lower="lower-1", upper="upper+1", source=source,
            fill_alpha=0.1, fill_color='gray', line_color=None)

p.scatter("protein_size", "qmean4_norm", color='red', marker="dot", size=12, source=source2)
p.scatter("protein_size", "qmean4_norm", color='blue', marker="dot", size=12, source=source3)

p.line("protein_size", "y_rolling3_mean", line_dash=(10, 7), line_width=2, source=source)

band2 = Band(base="protein_size", lower="lower-2", upper="upper+2", source=source,
            fill_alpha=0.3, fill_color=None, line_color="black")
p.add_layout(band1)
p.add_layout(band2)
p.add_tools(LassoSelectTool())


show(p)

ERROR:bokeh.core.validation.check:E-1001 (BAD_COLUMN_NAME): Glyph refers to nonexistent column name. This could either be due to a misspelling or typo, or due to an expected column being missing. : y='y_rolling3_mean' [no close matches] {renderer: GlyphRenderer(id='p1225', ...)}


In [5]:
source = ColumnDataSource(qmean_ref.reset_index())
source2 = ColumnDataSource(models)

TOOLTIPS = [
            ("model_name", "@model_name"),
            ("qmean4", "@qmean4_zscore"),
            ("protein_size", "@protein_size"),
            ("qmean4 normalized", "@qmean4_norm"),
            ("qmeanDisco", "@qmeandisco"),
            ]

p = figure(width=900, height=600, x_range=(10, 200), tools='hover,wheel_zoom', tooltips=TOOLTIPS, y_range=(0, 1.5))
p.title.text = "Rolling Standard Deviation"
#p.xgrid.grid_line_color=None
#p.ygrid.grid_line_alpha=0.5

p.scatter("protein_size", "qmean4_norm", color='red', marker="dot", size=18, source=source2)

p.scatter("protein_size", "qmean4_norm", source=source, marker="dot", size=20)

#band = Band(base="protein_size", lower="lower", upper="upper", source=source,
            #fill_alpha=0.3, fill_color="yellow", line_color="black")
#p.add_layout(band)

show(p)

In [None]:
fig, axs = plt.subplots(2, 2, figsize=(12, 8))

n_bins = np.histogram_bin_edges(df.protein_size, bins='fd')
axs[0,0].hist(df.protein_size, bins=n_bins)
axs[0,0].set_title('Protein size')

n_bins = np.histogram_bin_edges(df.qmean4_zscore, bins='fd')
axs[0,1].hist(df.qmean4_zscore, bins=n_bins)
axs[0,1].set_title('qmean4_zscore')

n_bins = np.histogram_bin_edges(df.qmean4_norm, bins='scott')
axs[1,0].hist(df.qmean4_norm, bins=n_bins)
axs[1,0].set_title('qmean4_norm')

n_bins = np.histogram_bin_edges(df.qmeandisco, bins='fd')
axs[1,1].hist(df.qmeandisco, bins=n_bins)
axs[1,1].set_title('qmeandisco')

fig.tight_layout()
plt.show()

In [None]:
fig, ax = plt.subplots()
#ax.boxplot(qmean_ref.qmean4_norm)
ax.plot(qmean_ref.protein_size, qmean_ref.qmean4_norm, 'o')
ax.plot(models.protein_size, models.qmean4_norm, '^', color='red')
ax.set_ylim(0, 1.5)
plt.show()