In [None]:
import pathlib

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import numpy as np
from numpy.polynomial.polynomial import Polynomial
import pandas as pd
import seaborn as sns

In [None]:
# Seed NumPy's legacy global random state so that any later np.random calls
# give the same values on every run of the notebook.
np.random.seed(917)

In [None]:
def cm2inch(*tupl):
    """Convert lengths from centimetres to inches.

    The lengths may be passed either as separate positional arguments
    (``cm2inch(10, 6.68)``) or as one tuple/list (``cm2inch((10, 6.68))``).
    When the first argument is a tuple or list, only that sequence is
    converted and any further arguments are ignored.

    Parameters
    ----------
    *tupl : number or sequence of numbers
        Lengths in centimetres.

    Returns
    -------
    tuple
        The same lengths expressed in inches; an empty tuple when called
        without arguments.
    """
    cm_per_inch = 2.54
    # Guard on `tupl` first so a call without arguments does not index into
    # an empty tuple.
    if tupl and isinstance(tupl[0], (tuple, list)):
        lengths = tupl[0]
    else:
        lengths = tupl
    return tuple(length / cm_per_inch for length in lengths)

In [None]:
# Define the color palette (color-vision-deficiency friendly).
# In the figure cell: blue marks the `positive` species, red the `negative`
# species, and gray is used for the scatter of all species.
blue = '#005AB5'
red = '#DC3220'
gray = '#D0D0D0'

In [None]:
# Scientific name -> common English name.
# Kept under its own name so it is not lost when `namedict` is built below.
common_names = {
    'Homo sapiens': 'Human',
    'Anas platyrhynchos': 'Duck',
    'Bos taurus': 'Cow',
    'Camelus dromedarius': 'Dromedary',
    'Canis lupus familiaris': 'Dog',
    'Capra hircus': 'Goat',
    'Carassius auratus': 'Goldfish',
    'Cavia porcellus': 'Guinea pig',
    'Columba livia': 'Pigeon',
    'Crocodylus porosus': 'Crocodile',
    'Equus asinus': 'Donkey',
    'Equus caballus': 'Horse',
    'Erinaceus europaeus': 'Hedgehog',
    'Felis catus': 'Cat',
    'Gallus gallus': 'Chicken',
    'Macaca mulatta': 'Macaque',
    'Manis javanica': 'Pangolin',
    'Mesocricetus auratus': 'Hamster',
    'Mus musculus': 'Mouse',
    'Mustela putorius furo': 'Ferret',
    'Oryctolagus cuniculus': 'Rabbit',
    'Ovis aries': 'Sheep',
    'Paguma larvata': 'Civet',
    'Pan troglodytes': 'Chimpanzee',
    'Panthera tigris altaica': 'Siberian Tiger',
    'Pongo abelii': 'Orangutan',
    'Rattus norvegicus': 'Rat',
    'Rhinolophus sinicus': 'Horseshoe Bat',
    'Serinus canaria': 'Canary',
    'Sus scrofa': 'Pig'
}

# Lower-case, underscore-joined scientific name -> scientific name as written
# above, e.g. 'homo_sapiens' -> 'Homo sapiens'. Only the scientific names
# (the keys of `common_names`) are needed to build this lookup.
namedict = {
    '_'.join(scientific.lower().split()): scientific
    for scientific in common_names
}

In [None]:
# Absolute path of the directory the notebook is run from; strict=True makes
# the resolution fail if the path does not exist.
rootdir = pathlib.Path('.').resolve(strict=True)
# Input tables are read from the sibling directory named 'refinement'.
datadir = rootdir.parent.joinpath('refinement')

In [None]:
# Load the cluster statistics table: whitespace-separated columns, indexed by
# the 'species' column. The separator is written as a raw string because a
# backslash-s inside a normal string literal is an invalid escape sequence.
df = pd.read_csv(datadir / 'clusters_top10.stat', sep=r'\s+', index_col='species')
# Translate each index entry through namedict; dict.get returns None for any
# entry that has no key in namedict.
df.index = df.index.map(namedict.get)
df.head(2)

In [None]:
# Load the sequence analysis table indexed by its 'Species' column and leave
# out the row labelled 'ACE2_HUMAN_6M17'.
seq_df = (
    pd.read_csv(datadir / 'seq.analysis', index_col='Species')
    .drop(index='ACE2_HUMAN_6M17')
)
seq_df.head()

In [None]:
# Copy the interface sequence similarity column from seq_df into df.
# pandas aligns the assigned Series on index labels, so this presumably relies
# on both tables being indexed by the same species names — TODO confirm.
# Index labels of df that are absent from seq_df would receive NaN.
df['Seq Sim Interface'] = seq_df['Seq Sim Interface']
df.head()

In [None]:
# Two groups of species, given as index labels of df. They are drawn in blue
# (positive) and red (negative) in the figure cell.
# NOTE(review): what "positive"/"negative" stands for is not stated in this
# notebook — confirm against the data source.
positive = [
    'Homo sapiens',
    'Felis catus',
    'Manis javanica',
    'Mesocricetus auratus',
    'Mustela putorius furo',
    'Paguma larvata',
    'Panthera tigris altaica',
    'Rhinolophus sinicus',
    'Bos taurus',
    'Ovis aries',
    'Camelus dromedarius',
    'Oryctolagus cuniculus',
    'Equus caballus',
]
negative = [
    'Anas platyrhynchos',
    'Gallus gallus',
    'Mus musculus',
    'Cavia porcellus',
    'Rattus norvegicus',
]

# Row subsets of df for each group, in the order the labels are listed.
positive_df = df.loc[positive]
negative_df = df.loc[negative]

In [None]:
# Order the rows by ascending value of the 'haddock-score' column.
df = df.sort_values(by='haddock-score')

In [None]:
# Preview the first rows of df.
df.head()

## Figure

In [None]:
# Single-panel figure; the size is given in centimetres and converted to inches.
fig, ax1 = plt.subplots(
    nrows=1,
    ncols=1,
    figsize=cm2inch(10, 6.68),  # (width, height)
    constrained_layout=True,
    # dpi=600,  # for notebook viewing only
)

# HADDOCK scores: all rows of df as a NumPy array, the two subsets as Series.
hs = df['haddock-score'].values
pos_hs = positive_df['haddock-score']
neg_hs = negative_df['haddock-score']

# Appearance of the fitted line.
line_kwargs = dict(color='black', linestyle='--', linewidth=0.5)

# Degree-1 least-squares polynomial of score against similarity over all rows,
# sampled on a grid for plotting, plus the squared correlation coefficient of
# the same two variables (shown in the title).
lstsq = Polynomial.fit(df['Seq Sim Interface'], hs, 1)
x_fit, y_fit = lstsq.linspace()
rsq_coeff = np.corrcoef(df['Seq Sim Interface'], hs)[0, 1] ** 2

# All rows in gray first, then the positive (blue) and negative (red) subsets
# drawn over them, and finally the fitted line.
ax1.scatter(df['Seq Sim Interface'], hs, color=gray)
ax1.scatter(positive_df['Seq Sim Interface'], pos_hs, color=blue)
ax1.scatter(negative_df['Seq Sim Interface'], neg_hs, color=red)
ax1.plot(x_fit, y_fit, **line_kwargs)

# Axis labels and title share one font size.
ax1.set_ylabel('HADDOCK Score (a.u.)', fontsize=9)
ax1.set_xlabel('ACE2 Interface Sequence Similarity (%)', fontsize=9)
ax1.set_title(r'$R^2$ = {0:3.2f}'.format(rsq_coeff), fontsize=9)

# Cap the number of major tick intervals on both axes, then shrink the tick labels.
ax1.xaxis.set_major_locator(ticker.MaxNLocator(5))
ax1.yaxis.set_major_locator(ticker.MaxNLocator(5))

for label in [*ax1.get_xticklabels(), *ax1.get_yticklabels()]:
    label.set_fontsize(8)

In [None]:
# Write the figure to 'Figure_S2.pdf'; the relative path is resolved against
# the current working directory.
fig.savefig('Figure_S2.pdf');