In [None]:
#Written by Rita Afriyie Boateng
'''
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import kruskal
'''

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
## Plot analysis for haplotypes: box plot of the overall RMSD per column

# Load the CSV file.
# pandas needs the raw-file address: the github.com/.../tree/main/... URL
# returns GitHub's HTML page rather than the CSV contents.
file_path = 'https://raw.githubusercontent.com/GhedinSGS/SARS-CoV-2_TissueDiversity/main/data/MD_simulation/Overall_RMSD_Haplotypes.csv'
data = pd.read_csv(file_path)

# Calculate the mean of each column except Delta-AY.119
column_means = data.drop(columns=['Delta-AY.119']).mean()

# Sort columns by ascending mean value, keeping Delta-AY.119 first
sorted_columns = ['Delta-AY.119'] + list(column_means.sort_values().index)

# Reorder the data according to the sorted columns
sorted_data = data[sorted_columns]

# Calculate mean values for each column (used for the annotations below)
means = sorted_data.mean()

# Largest value in the table; computed once because both the annotation
# height and the y-axis limit are derived from it.
y_max = sorted_data.max().max()

# Create boxplot
plt.figure(figsize=(10, 6))

# Plot Delta-AY.119 with one color.
# Missing values are dropped per column because plt.boxplot does not skip
# NaNs on its own; with complete data this is a no-op.
plt.boxplot(
    sorted_data['Delta-AY.119'].dropna().to_numpy(),
    positions=[1],
    patch_artist=True,
    boxprops=dict(facecolor="lightblue"),
)

# Plot the rest of the data with another color (one box per remaining column)
other_columns = sorted_columns[1:]
plt.boxplot(
    [sorted_data[col].dropna().to_numpy() for col in other_columns],
    positions=range(2, len(sorted_columns) + 1),
    patch_artist=True,
    boxprops=dict(facecolor="lightgreen"),
)

# Annotate mean values above each box plot (outside the box) with vertical text.
# `means` is in the same order as `sorted_columns`, so index i maps to x = i + 1.
for i, mean_value in enumerate(means):
    plt.text(i + 1, y_max + 0.2, f'{mean_value:.2f}', ha='center', va='bottom', fontsize=8, color='black', rotation=90)

# Set labels and title
plt.xticks(ticks=range(1, len(sorted_columns) + 1), labels=sorted_columns, rotation=45)
plt.title('Box Plot Sorted by Mean Value (Delta-AY.119 First) with Vertical Mean Annotations Outside')
plt.xlabel('Columns')
plt.ylabel('Values')

# Adjust y-axis to make space for annotations
plt.ylim(0, y_max + 0.5)

# Show the plot
plt.tight_layout()
plt.show()


In [None]:
## Plot for local (per-region) RMSD analysis of haplotypes

# Raw-file base address. The github.com/.../tree/main/... form returns
# GitHub's HTML page, which pd.read_csv cannot read as the intended table.
base_url = 'https://raw.githubusercontent.com/GhedinSGS/SARS-CoV-2_TissueDiversity/main/data/MD_simulation/'

# Load the CSV files, one per region.
# NOTE(review): this cell previously read the ACE2 file for all three
# regions. The NTD/RBD file names below are inferred from the naming of the
# isolate files (SARS_COV2_NTD_rmsd_isolate.csv, SARS_COV2_RBD_rmsd_isolate.csv)
# -- confirm that they exist in the repository.
data_ace2 = pd.read_csv(base_url + 'SARS_COV2_ACE2_rmsd_haplotypes.csv')
data_ntd = pd.read_csv(base_url + 'SARS_COV2_NTD_rmsd_haplotypes.csv')
data_rbd = pd.read_csv(base_url + 'SARS_COV2_RBD_rmsd_haplotypes.csv')

# Systems excluded from the figure
systems_to_remove = ['E406D-G446V-L455F-T573I', 'E406D-V445A-S477N-Q493K']

# Remove the specified systems from each dataset and label each dataset.
# .assign returns a new frame, so no column is written onto a filtered slice
# (which would trigger pandas' SettingWithCopyWarning).
data_rbd_filtered = data_rbd[~data_rbd['Systems'].isin(systems_to_remove)].assign(Dataset='RMSD of RBD')
data_ntd_filtered = data_ntd[~data_ntd['Systems'].isin(systems_to_remove)].assign(Dataset='RMSD of NTD')
data_ace2_filtered = data_ace2[~data_ace2['Systems'].isin(systems_to_remove)].assign(Dataset='RMSD of ACE2')

# Combine all filtered datasets into one long table. The list order sets the
# order in which the 'Dataset' hue levels appear; ignore_index gives the
# combined frame a unique index.
combined_filtered_data = pd.concat(
    [data_rbd_filtered, data_ntd_filtered, data_ace2_filtered],
    ignore_index=True,
)

# Define a dictionary for renaming the system names to short haplotype labels
rename_dict = {
    'V445A-S477N-Q493K': 'H1',
    'G446V': 'H2',
    'R19T-V445A-S477N-Q493K': 'H3',
    'R19T-V445A-S477N-K478E-Q493K': 'H4',
    'V445A-S477N-Q493K-P561S': 'H5',
    'K417R-V445A-Y453F-G476S': 'H6',
    'R19T-V445A-S477N': 'H7',
    'R19T-K417R-V445A-Y453F-G476S': 'H8',
    'G446V-F486S': 'H9',
    'V445A-Q493K': 'H10'
}

# Apply the renaming to the 'Systems' column in the combined data.
# Values without an entry in rename_dict (e.g. 'Delta-AY.119') are left as-is.
combined_filtered_data['Systems'] = combined_filtered_data['Systems'].replace(rename_dict)

# Define the desired order of Systems along the x-axis
order = ['Delta-AY.119','H9','H1', 'H3', 'H2', 'H4', 'H10', 'H7', 'H6', 'H5', 'H8']

# Set up the plot with increased figure size
plt.figure(figsize=(8, 6))

# Create the box plot for the renamed data with a specific order,
# one box per (system, dataset) pair
ax = sns.boxplot(x='Systems', y='RMSD (nm)', hue='Dataset', data=combined_filtered_data, palette='Paired', width=0.5, dodge=True, order=order)

plt.title('Local RMSD Patterns')
plt.xlabel('Systems')
plt.ylabel('RMSD (nm)')
plt.xticks(rotation=45, ha='right', fontsize=10)

plt.legend(title='Dataset')
plt.tight_layout()

# Save the figure as a PDF file
plt.savefig('local_RMSD_Patterns.pdf', dpi=300, format='pdf', bbox_inches='tight')

# Show the plot
plt.show()


In [None]:
## Plots for isolates: ACE2 / NTD / RBD RMSD compared per isolate

# Raw-file base address. The github.com/.../tree/main/... form returns
# GitHub's HTML page, which pd.read_csv cannot read as the intended table.
isolate_base_url = 'https://raw.githubusercontent.com/GhedinSGS/SARS-CoV-2_TissueDiversity/main/data/MD_simulation/'

# Load the CSV files into dataframes
ace2_rmsd_df = pd.read_csv(isolate_base_url + 'SARS_COV2_ACE2_rmsd_isolate.csv')
ntd_rmsd_df = pd.read_csv(isolate_base_url + 'SARS_COV2_NTD_rmsd_isolate.csv')
rbd_rmsd_df = pd.read_csv(isolate_base_url + 'SARS_COV2_RBD_rmsd_isolate.csv')

# Combining the three datasets into one for a comparative box plot.
# melt() turns every column into rows: the former column name goes to
# 'Residue' and its values to 'RMSD'; 'Dataset' records the source file.
# ignore_index gives the combined frame a unique index.
combined_df = pd.concat(
    [
        ace2_rmsd_df.melt(var_name='Residue', value_name='RMSD').assign(Dataset='RMSD of ACE2'),
        ntd_rmsd_df.melt(var_name='Residue', value_name='RMSD').assign(Dataset='RMSD of NTD'),
        rbd_rmsd_df.melt(var_name='Residue', value_name='RMSD').assign(Dataset='RMSD of RBD'),
    ],
    ignore_index=True,
)

# Order of the x-axis categories, with Delta first as specified.
# NOTE(review): these labels must match the CSV column names exactly -- confirm.
order_with_delta = ['Delta-AY.119', 'Isolate 2', 'Isolate 1', 'Isolate 3', 'Isolate 4']

# Plotting the combined box plot with the specified order
plt.figure(figsize=(9, 7))
sns.boxplot(data=combined_df, x='Residue', y='RMSD', hue='Dataset', order=order_with_delta)
plt.title('RBD, NTD and ACE2 RMSD pattern')
plt.xlabel('Site Name')
plt.ylabel('RMSD (nm)')
plt.xticks(rotation=45)
plt.legend(title='', loc='upper right')

# Adjust layout and save
plt.tight_layout()
plt.savefig('RMSD_isolate.pdf', format='pdf')

# Show the plot
plt.show()