This code analyzes the unsupervised output of the model (the likelihood of binding to untested proteins).

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

# Both input tables are looked up in an "Intermediate_steps" folder located
# directly under the directory the script/notebook is launched from.
base_path = os.getcwd()
intermediate_steps_path = os.path.join(base_path, "Intermediate_steps")
core_predictions_path = os.path.join(intermediate_steps_path, "core_predictions.csv")
protein_likelihoods_path = os.path.join(intermediate_steps_path, "protein_likelihoods.csv")

# Stop before any parsing if either of the two inputs is absent.
both_inputs_present = (
    os.path.exists(core_predictions_path)
    and os.path.exists(protein_likelihoods_path)
)
if not both_inputs_present:
    raise FileNotFoundError("Required files not found in the specified Intermediate_steps directory.")

# Read the two tables into dataframes.
core_predictions = pd.read_csv(core_predictions_path)
protein_likelihoods = pd.read_csv(protein_likelihoods_path)

# Combine them on the identifier columns they share (default inner join, so
# only rows whose key pair occurs in both tables are kept).
merge_keys = ["Folder_Name", "RNA_Number"]
merged_data = pd.merge(core_predictions, protein_likelihoods, on=merge_keys)

# Iterating a dataframe yields its column labels; keep the per-class
# likelihood columns, recognised by their common prefix.
likelihood_columns = [
    name for name in protein_likelihoods if name.startswith("Protein_Class_")
]

# Normalize likelihoods per row by dividing by the likelihood of the ground truth class
def normalize_likelihoods(row, columns=None):
    """Express a row's class likelihoods as a percentage of its ground-truth class likelihood.

    Args:
        row: One record (a pandas Series when called through
            ``DataFrame.apply(axis=1)``) containing "Protein_Ground_Class"
            and a "Protein_Class_<id>_Likelihood" entry for that class.
        columns: Names of the likelihood entries to normalize. When omitted,
            the module-level ``likelihood_columns`` list is used.

    Returns:
        ``row[columns]`` divided by the ground-truth class likelihood and
        multiplied by 100; the ground-truth class's own entry, if selected,
        therefore becomes 100.

    Raises:
        KeyError: If the row has no "Protein_Ground_Class" entry or no
            likelihood entry for its ground-truth class.
    """
    selected_columns = likelihood_columns if columns is None else columns

    ground_truth_class = row["Protein_Ground_Class"]
    # The class id can arrive as a float (e.g. 3.0) when the row was built
    # from numeric-only columns or the CSV column was parsed as float. Used
    # as-is it would produce the name "Protein_Class_3.0_Likelihood", which
    # does not exist, so whole-number floats are converted back to int.
    if isinstance(ground_truth_class, (float, np.floating)) and float(ground_truth_class).is_integer():
        ground_truth_class = int(ground_truth_class)

    ground_truth_likelihood = row[f"Protein_Class_{ground_truth_class}_Likelihood"]

    # NOTE: a zero ground-truth likelihood is not special-cased here.
    return row[selected_columns] / ground_truth_likelihood * 100

# Normalize every merged row. normalize_likelihoods returns a Series indexed
# by likelihood_columns, so the frame built from those Series already has
# likelihood_columns as its column labels.
normalized_likelihoods = merged_data.apply(normalize_likelihoods, axis=1)

# Store the normalized values next to the raw ones under a "Normalized_" prefix.
for col in likelihood_columns:
    merged_data[f"Normalized_{col}"] = normalized_likelihoods[col]

# Split rows by ground-truth binding label (done after the write-back above so
# that both subsets contain the normalized columns).
binding_data = merged_data[merged_data["Binding_Ground_Truth"] == 1]
non_binding_data = merged_data[merged_data["Binding_Ground_Truth"] == 0]

# Build one tick label per likelihood column from the class id embedded in the
# column name ("Protein_Class_<id>_Likelihood"), so labels stay correct even
# when ids are not 0-based or the columns are not stored in ascending order.
class_labels = []
for class_column in likelihood_columns:
    class_id = class_column[len("Protein_Class_"):]
    if class_id.endswith("_Likelihood"):
        class_id = class_id[: -len("_Likelihood")]
    class_labels.append(f"Class {class_id}")

# Create the dot plot: one x position per class, one dot per row.
plt.figure(figsize=(15, 7))

for x_position, class_column in enumerate(likelihood_columns, start=1):
    normalized_column = f"Normalized_{class_column}"

    # Every dot of a class shares the same x coordinate.
    x_binding = np.full(binding_data.shape[0], x_position)
    y_binding = binding_data[normalized_column]

    x_non_binding = np.full(non_binding_data.shape[0], x_position)
    y_non_binding = non_binding_data[normalized_column]

    # Only the first class carries legend labels; otherwise each group would
    # appear in the legend once per class.
    is_first_class = x_position == 1
    plt.scatter(x_binding, y_binding, color="blue", alpha=0.6,
                label="Binding" if is_first_class else "")
    plt.scatter(x_non_binding, y_non_binding, color="orange", alpha=0.6,
                label="Non-binding" if is_first_class else "")

# Format the plot
plt.xticks(range(1, len(likelihood_columns) + 1), labels=class_labels)
plt.xlabel("Protein Classes")
plt.ylabel("Normalized Likelihood (%)")
plt.title("Binding vs Non-binding Normalized Likelihoods per Protein Class")
plt.legend()
plt.grid(alpha=0.3)
plt.tight_layout()
plt.show()
