In [None]:
import pathlib

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import numpy as np
from numpy.polynomial.polynomial import Polynomial
import pandas as pd
import seaborn as sns

In [None]:
# Seed NumPy's legacy global random generator so that any random draws made
# later in the notebook are reproducible across runs.
np.random.seed(917)

In [None]:
def cm2inch(*tupl):
    """Convert lengths from centimetres to inches.

    Accepts either separate numbers, ``cm2inch(13.2, 4)``, or one tuple/list
    of numbers, ``cm2inch((13.2, 4))``. When the first argument is a tuple or
    list, only that sequence is converted and further arguments are ignored.

    Returns:
        tuple: the converted values, in the same order. An empty tuple is
        returned when the function is called without arguments.
    """
    cm_per_inch = 2.54

    # Guard the no-argument call, which would otherwise fail on tupl[0].
    if not tupl:
        return ()

    first = tupl[0]
    values = first if isinstance(first, (tuple, list)) else tupl
    return tuple(value / cm_per_inch for value in values)

In [None]:
# Define color palette (cvd-friendly)
blue = '#005AB5'  # marks the 'positive' species in Figure 2
red = '#DC3220'   # marks the 'negative' species in Figure 2
gray = '#D0D0D0'  # background color for all species in Figure 2

In [None]:
# Scientific name -> common name
_scientific_to_common = {
    'Homo sapiens': 'Human',
    'Anas platyrhynchos': 'Duck',
    'Bos taurus': 'Cow',
    'Camelus dromedarius': 'Dromedary',
    'Canis lupus familiaris': 'Dog',
    'Capra hircus': 'Goat',
    'Carassius auratus': 'Goldfish',
    'Cavia porcellus': 'Guinea pig',
    'Columba livia': 'Pigeon',
    'Crocodylus porosus': 'Crocodile',
    'Equus asinus': 'Donkey',
    'Equus caballus': 'Horse',
    'Erinaceus europaeus': 'Hedgehog',
    'Felis catus': 'Cat',
    'Gallus gallus': 'Chicken',
    'Macaca mulatta': 'Macaque',
    'Manis javanica': 'Pangolin',
    'Mesocricetus auratus': 'Hamster',
    'Mus musculus': 'Mouse',
    'Mustela putorius furo': 'Ferret',
    'Oryctolagus cuniculus': 'Rabbit',
    'Ovis aries': 'Sheep',
    'Paguma larvata': 'Civet',
    'Pan troglodytes': 'Chimpanzee',
    'Panthera tigris altaica': 'Siberian Tiger',
    'Pongo abelii': 'Orangutan',
    'Rattus norvegicus': 'Rat',
    'Rhinolophus sinicus': 'Horseshoe Bat',
    'Serinus canaria': 'Canary',
    'Sus scrofa': 'Pig'
}

# Re-key by the lower-case, underscore-joined scientific name (the form used
# for the species identifiers elsewhere in this notebook, e.g. 'homo_sapiens')
# and map each key to the COMMON name. The previous version stored the
# scientific name as the value, which discarded the common names entirely.
namedict = {
    '_'.join(scientific.lower().split()): common
    for scientific, common in _scientific_to_common.items()
}

In [None]:
# Absolute path of the directory the notebook is run from.
rootdir = pathlib.Path('.').resolve(strict=True)
# Input tables are read from the sibling 'refinement' directory.
datadir = rootdir.parent.joinpath('refinement')

In [None]:
# Load the whitespace-delimited statistics table, indexed by species.
# The separator is a regular expression, so it is written as a raw string:
# '\s' in a normal string literal is an invalid escape sequence.
df = pd.read_csv(
    datadir / 'clusters_top10.stat',
    sep=r'\s+',
    index_col='species',
)
df.info()

In [None]:
# Species identifiers (index labels of `df`) making up the 'positive' group.
positive = [
    'homo_sapiens',
    'felis_catus',
    'manis_javanica',
    'mesocricetus_auratus',
    'mustela_putorius_furo',
    'paguma_larvata',
    'panthera_tigris_altaica',
    'rhinolophus_sinicus',
    'bos_taurus',
    'ovis_aries',
    'camelus_dromedarius',
    'oryctolagus_cuniculus',
    'equus_caballus',
]

# Species identifiers making up the 'negative' group.
negative = [
    'anas_platyrhynchos',
    'gallus_gallus',
    'mus_musculus',
    'cavia_porcellus',
    'rattus_norvegicus',
]

# Row subsets for each group, in the order listed above (all columns kept).
positive_df = df.loc[positive]
negative_df = df.loc[negative]

In [None]:
# Replacing the species index with display names is currently disabled:
# df.index = map(namedict.get, df.index)

# Sort rows by increasing HADDOCK score (ascending is the default order).
# The result is reassigned instead of using inplace=True, so the statement
# reads as a plain transformation of the frame.
df = df.sort_values(by='haddock-score')

In [None]:
# Preview the first rows of the table.
df.head()

## Figure 2

In [None]:
# Figure 2: HADDOCK score plotted against each score component, one panel
# per component, with a least-squares line and its R^2 in the panel title.
fig, (ax1, ax2, ax3, ax4) = plt.subplots(
    nrows=1, ncols=4,
    figsize=cm2inch(13.2, 4),  # (width, height) given in cm
    constrained_layout=True,
    sharey=True,
    dpi=600,  # applied unconditionally
)
axes = (ax1, ax2, ax3, ax4)

# Move to numpy arrays
hs = df['haddock-score'].values
evdw = df['Evdw'].values
eelec = df['Eelec'].values
edesolv = df['Edesolv'].values
bsa = df['BSA'].values

pos_hs = positive_df['haddock-score']
neg_hs = negative_df['haddock-score']

# Overall styles
line_kwargs = {
    'color': 'black',
    'linestyle': '--',
    'linewidth': 0.5
}

# One entry per panel, left to right: (column in df, values for all species).
panels = (
    ('Evdw', evdw),        # van der Waals
    ('Eelec', eelec),      # Electrostatics
    ('Edesolv', edesolv),  # Desolvation Energy
    ('BSA', bsa),          # Buried Surface Area
)

rsq_coeff = []
for ax, (column, values) in zip(axes, panels):
    # Degree-1 least-squares fit of HADDOCK score against this component.
    lstsq = Polynomial.fit(values, hs, 1)
    x_fit, y_fit = lstsq.linspace()

    # All species in gray, then the two highlighted groups drawn on top.
    ax.scatter(values, hs, color=gray)
    ax.scatter(positive_df[column], pos_hs, color=blue)
    ax.scatter(negative_df[column], neg_hs, color=red)
    ax.plot(x_fit, y_fit, **line_kwargs)

    # Squared Pearson correlation between the component and the score.
    rsq_coeff.append(
        np.corrcoef(values, hs)[0, 1] ** 2
    )

# Style axes
ax1.set_ylabel(
    'HADDOCK Score (a.u.)',
    fontsize=9
)
ax1.yaxis.set_major_locator(ticker.MaxNLocator(5))  # at most 5 y intervals

# Number of x ticks per panel; MaxNLocator did not give a usable result
# for the x axes, so the ticks are computed by hand below.
nticks = 3

for idx, (ax, (column, values)) in enumerate(zip(axes, panels)):

    ax.margins(x=0)

    # tick_params is used rather than setting the size on individual label
    # objects, so the size also applies to the ticks installed by set_ticks.
    ax.tick_params(axis='both', labelsize=8)

    # The x range is taken from the data itself. This matches the range of
    # the fitted line, whose default domain spans min..max of the x values.
    datamin, datamax = values.min(), values.max()
    xmin = round(datamin / 10) * 10  # round to the nearest multiple of 10
    xmax = round(datamax / 10) * 10

    xticks = [
        round(x / 5) * 5 for x in
        np.rint(np.linspace(xmin, xmax, num=nticks))
    ]  # n evenly spaced ticks, each snapped to a multiple of 5

    ax.xaxis.set_ticks(xticks)
    ax.set_title(
        r'$R^2$ = {0:3.2f}'.format(rsq_coeff[idx]),
        fontsize=9
    )

In [None]:
# Write Figure 2 to a PDF in the current working directory; the trailing
# semicolon suppresses the cell's output repr.
fig.savefig('Figure_2-new.pdf');