In [1]:
import pandas as pd
import glob
import os

# Column names applied to the 12-field tab-separated hit tables
# (the files are read without a header row, so names are supplied here)
columns = ["qseqid", "sseqid", "pident", "length", "mismatch", "gapopen",
           "qstart", "qend", "sstart", "send", "evalue", "bitscore"]

# Dictionary to store DataFrames, keyed "<prefix>df" (e.g. "ajdf")
dfs = {}

# Sort the matches so the load order (and therefore the order of `dfs`) is
# stable; glob itself does not guarantee any particular order.
filepaths = sorted(glob.glob("secrets/*.tab"))
print("Found files:", filepaths)  # Debug statement

# Load each .tab file from the 'secrets' directory and store it in the dictionary
for filepath in filepaths:
    # The first two characters of the file name are used as the prefix
    # (e.g., "aj" from "secrets/ajsec.tab")
    prefix = os.path.basename(filepath)[:2]
    key = f"{prefix}df"
    if key in dfs:
        # Two files sharing a prefix would otherwise silently overwrite each other
        raise ValueError(f"Duplicate prefix '{prefix}' while loading {filepath}")
    # Load the file into a DataFrame
    df = pd.read_csv(filepath, sep='\t', names=columns)
    # Store the DataFrame in the dictionary under "<prefix>df"
    dfs[key] = df

# Print out the keys of the dictionary to verify all DataFrames were loaded
print("DataFrames loaded:", list(dfs.keys()))  # Debug statement

# Print the head of each DataFrame to verify
for name, dataframe in dfs.items():
    print(f"{name} head:")
    print(dataframe.head())

Found files: ['secrets\\ajsec.tab', 'secrets\\drsec.tab', 'secrets\\efsec.tab', 'secrets\\ensec.tab', 'secrets\\mbsec.tab', 'secrets\\mdsec.tab', 'secrets\\mlsec.tab', 'secrets\\mmsec.tab', 'secrets\\mosec.tab', 'secrets\\pasec.tab', 'secrets\\pdsec.tab', 'secrets\\pgsec.tab', 'secrets\\phsec.tab', 'secrets\\pksec.tab', 'secrets\\pnsec.tab', 'secrets\\pvsec.tab', 'secrets\\rasec.tab', 'secrets\\rfsec.tab', 'secrets\\sbsec.tab']
DataFrames loaded: ['ajdf', 'drdf', 'efdf', 'endf', 'mbdf', 'mddf', 'mldf', 'mmdf', 'modf', 'padf', 'pddf', 'pgdf', 'phdf', 'pkdf', 'pndf', 'pvdf', 'radf', 'rfdf', 'sbdf']
ajdf head:
           qseqid            sseqid  pident  length  mismatch  gapopen  \
0     NP_005802.1  ajXP_037000028.1   0.987     407         5        0   
1  XP_006719257.1  ajXP_037000028.1   0.987     407         5        0   
2     NP_004855.2  ajXP_036997943.2   0.618     309       117        0   
3     NP_057288.1  ajXP_037014354.1   0.790     429        90        0   
4     NP_065685

In [2]:
# Expose every loaded DataFrame as a notebook-level variable named after its
# dictionary key (the keys already carry the "df" suffix, e.g. "ajdf").
# A single update() avoids rebinding the notebook-level names `prefix` and `df`
# as a side effect of a loop.
globals().update(dfs)

In [3]:
from Bio import SeqIO
import pandas as pd

# Input and output locations
bat_fasta_file = "realmergedprots.faa"  # merged bat proteome (FASTA)
output_file = "rbh_all_bats_secrets.fasta"


def load_fasta_with_duplicates(fasta_file):
    """Read a FASTA file into a dict of records keyed by record ID.

    When an ID occurs more than once, the first record is kept and each later
    occurrence is reported on stdout and skipped.

    Parameters
    ----------
    fasta_file : path or handle accepted by ``SeqIO.parse``

    Returns
    -------
    dict
        Maps record ID to the first record parsed with that ID.
    """
    first_seen = {}
    for rec in SeqIO.parse(fasta_file, "fasta"):
        if rec.id in first_seen:
            print(f"Duplicate ID found: {rec.id}, ignoring this entry.")
            continue
        first_seen[rec.id] = rec
    return first_seen

# Load the bat sequences
bat_sequences = load_fasta_with_duplicates(bat_fasta_file)

# Hit tables to scan, taken from the `dfs` dictionary in alphabetical key order
# (ajdf, drdf, ...) instead of a hand-maintained list of variable names.
dataframe_list = [dfs[name] for name in sorted(dfs)]

# IDs already handled, so each subject sequence is written (or reported
# missing) once even when several query rows hit the same subject.
written_ids = set()
missing_ids = set()

# Open the output FASTA file
with open(output_file, "w") as output_fasta:
    # Loop over each DataFrame
    for df in dataframe_list:
        # Only the subject ID column is needed; it keeps the species prefix
        for sseqid_full in df["sseqid"]:
            if sseqid_full in written_ids or sseqid_full in missing_ids:
                continue

            record = bat_sequences.get(sseqid_full)
            if record is None:
                missing_ids.add(sseqid_full)
                print(f"SseqID {sseqid_full} not found in bat proteome.")
                continue

            # IDs prefixed "pn" or "en" are tagged GenBank, all others RefSeq
            id_source = "GenBank" if sseqid_full[:2] in ("pn", "en") else "RefSeq"

            # Write the bat protein sequence to the output file
            output_fasta.write(f">{sseqid_full} {record.description} [{id_source}]\n{record.seq}\n")
            written_ids.add(sseqid_full)

print(f"All bat protein sequences written to {output_file}")

All bat protein sequences written to rbh_all_bats_secrets.fasta


In [4]:
import pandas as pd
import glob
import os
# Two-column cluster table without a header row: representative, member
rbh_df = pd.read_csv(
    "rbhbatsecretsclustershighid.tsv",
    sep="\t",
    header=None,
    names=["representative", "member"],
)

# One row per representative, with all of its members gathered into a list
members_by_representative = rbh_df.groupby("representative")["member"]
aggregated_df = members_by_representative.apply(list).reset_index()

print(aggregated_df.head())

# Save the aggregated table as tab-separated text without the index column
aggregated_df.to_csv("aggregated_SECRETS_clusters.tsv", sep="\t", index=False)

     representative                                             member
0  ajXP_036981969.1  [ajXP_036981969.1, ajXP_053514474.1, ajXP_0369...
1  ajXP_036982243.2                                 [ajXP_036982243.2]
2  ajXP_036982257.2               [ajXP_036982257.2, ajXP_036982248.2]
3  ajXP_036982516.2                                 [ajXP_036982516.2]
4  ajXP_036982712.2                                 [ajXP_036982712.2]


In [5]:
import pandas as pd
import numpy as np
from sklearn.metrics import silhouette_score

# Load the alignment data (column names are supplied here, not read from the file)
alignment_df = pd.read_csv("secrets_cluster_aligns.tsv", sep="\t",
                           names=["query", "target", "evalue", "pident", "alnlen", "qstart", "qend", "tstart", "tend"])

# Load clustering information with representative and members
aggregated_df = pd.read_csv("aggregated_SECRETS_clusters.tsv", sep="\t")

# Keep clusters that have more than one member and span at least two species prefixes
filtered_clusters = []
for _, row in aggregated_df.iterrows():
    # "member" holds the text form of a Python list, e.g. "['a', 'b']"
    members = row["member"].strip("[]").replace("'", "").split(", ")
    unique_prefixes = set(m[:2] for m in members)  # first two characters = species prefix

    if len(members) > 1 and len(unique_prefixes) > 1:
        filtered_clusters.append({"representative": row["representative"], "members": members})

# Give every protein of the retained clusters a row/column position.
# Sorting fixes the positions; iterating the bare set gives an order that can
# change from one interpreter session to the next.
all_proteins = {protein for cluster in filtered_clusters for protein in cluster["members"]}
protein_index = {protein: idx for idx, protein in enumerate(sorted(all_proteins))}
num_proteins = len(all_proteins)

# Similarity matrices; pairs without an alignment row keep similarity 0
pident_matrix = np.zeros((num_proteins, num_proteins))
evalue_matrix = np.zeros((num_proteins, num_proteins))

# Populate both matrices symmetrically from the pairwise alignments
for hit in alignment_df.itertuples(index=False):
    if hit.query in protein_index and hit.target in protein_index:
        i, j = protein_index[hit.query], protein_index[hit.target]
        identity = hit.pident / 100  # Convert to fraction
        log_evalue = -np.log(hit.evalue + 1e-300)  # offset keeps log() finite when evalue is 0
        pident_matrix[i, j] = pident_matrix[j, i] = identity
        evalue_matrix[i, j] = evalue_matrix[j, i] = log_evalue

# Create labels based on cluster membership
labels = np.full(num_proteins, -1)  # Initialize with -1 for unassigned
for cluster_id, cluster in enumerate(filtered_clusters):
    for member in cluster["members"]:
        if member in protein_index:
            labels[protein_index[member]] = cluster_id  # Assign cluster ID as label

# Convert similarity to distance for silhouette calculation (done once)
max_log_evalue = np.max(evalue_matrix)
if not max_log_evalue > 0:
    # Dividing by a non-positive maximum would fill the matrix with NaN/inf
    raise ValueError("No positive -log(evalue) similarity found; cannot normalise evalue distances.")
pident_distance = 1 - pident_matrix
evalue_distance = 1 - (evalue_matrix / max_log_evalue)  # Normalize evalue distances

# Set diagonal to zero for both distance matrices
np.fill_diagonal(pident_distance, 0)
np.fill_diagonal(evalue_distance, 0)

# Filter out unassigned proteins from the labels AND from both matrices so
# their sizes always agree
valid_indices = labels != -1
pident_distance = pident_distance[valid_indices][:, valid_indices]
evalue_distance = evalue_distance[valid_indices][:, valid_indices]
labels = labels[valid_indices]

# Calculate silhouette scores for both pident and evalue distances
pident_silhouette_score = silhouette_score(pident_distance, labels, metric="precomputed")
evalue_silhouette_score = silhouette_score(evalue_distance, labels, metric="precomputed")

# Display results
print("Silhouette Score based on pident:", pident_silhouette_score)
print("Silhouette Score based on evalue:", evalue_silhouette_score)

Silhouette Score based on pident: 0.10992582380010277
Silhouette Score based on evalue: 0.07308145066046258


In [6]:
import pandas as pd
import numpy as np
from sklearn.metrics import silhouette_score

# Read the pairwise alignment table; column names are supplied explicitly
alignment_df = pd.read_csv(
    "secrets_cluster_aligns.tsv",
    sep="\t",
    names=["query", "target", "evalue", "pident", "alnlen", "qstart", "qend", "tstart", "tend"],
)

# Read the per-representative member lists
aggregated_df = pd.read_csv("aggregated_SECRETS_clusters.tsv", sep="\t")

# Retain only clusters with more than one member that span more than one species prefix
filtered_clusters = []
for representative, member_text in zip(aggregated_df["representative"], aggregated_df["member"]):
    # The member column is the text form of a list, e.g. "['a', 'b']"
    members = member_text.strip("[]").replace("'", "").split(", ")
    species_seen = {name[:2] for name in members}
    if len(members) <= 1 or len(species_seen) <= 1:
        continue
    filtered_clusters.append({"representative": representative, "members": members})

# Assign each protein of the retained clusters a matrix position
all_proteins = {protein for cluster in filtered_clusters for protein in cluster["members"]}
protein_index = dict(zip(all_proteins, range(len(all_proteins))))
num_proteins = len(protein_index)

# Square similarity matrices, zero where no alignment row exists
matrix_shape = (num_proteins, num_proteins)
pident_matrix = np.zeros(matrix_shape)
evalue_matrix = np.zeros(matrix_shape)

# Fill both matrices symmetrically from the alignment rows
for hit in alignment_df.itertuples(index=False):
    i = protein_index.get(hit.query)
    j = protein_index.get(hit.target)
    if i is None or j is None:
        continue  # at least one side is not part of a retained cluster
    pident_matrix[i, j] = pident_matrix[j, i] = hit.pident / 100  # Convert to fraction
    evalue_matrix[i, j] = evalue_matrix[j, i] = -np.log(hit.evalue + 1e-300)

# Both matrices must be symmetric before they are turned into distances
assert np.allclose(pident_matrix, pident_matrix.T), "pident_matrix is not symmetric"
assert np.allclose(evalue_matrix, evalue_matrix.T), "evalue_matrix is not symmetric"

# Similarity -> distance; the evalue similarity is scaled by its maximum first
pident_distance = 1 - pident_matrix
evalue_distance = 1 - (evalue_matrix / np.max(evalue_matrix))

# A point has zero distance to itself
for distance in (pident_distance, evalue_distance):
    np.fill_diagonal(distance, 0)

# Report the value range of each distance matrix
print("pident_distance min/max:", pident_distance.min(), pident_distance.max())
print("evalue_distance min/max:", evalue_distance.min(), evalue_distance.max())

# Label every protein with the position of its cluster; -1 marks "unassigned"
labels = np.full(num_proteins, -1)
for cluster_id, cluster in enumerate(filtered_clusters):
    for member in cluster["members"]:
        position = protein_index.get(member)
        if position is not None:
            labels[position] = cluster_id

# Drop unassigned proteins from the labels and from both matrices
valid_indices = labels != -1
pident_distance = pident_distance[valid_indices][:, valid_indices]
evalue_distance = evalue_distance[valid_indices][:, valid_indices]
labels = labels[valid_indices]

# Reset the diagonals of the reduced matrices
for distance in (pident_distance, evalue_distance):
    np.fill_diagonal(distance, 0)

# Overall silhouette score for each distance definition
pident_silhouette_score = silhouette_score(pident_distance, labels, metric="precomputed")
evalue_silhouette_score = silhouette_score(evalue_distance, labels, metric="precomputed")

# Display results
print("Silhouette Score based on pident:", pident_silhouette_score)
print("Silhouette Score based on evalue:", evalue_silhouette_score)

pident_distance min/max: 0.0 1.0
evalue_distance min/max: 0.0 1.0
Silhouette Score based on pident: 0.10992582380010277
Silhouette Score based on evalue: 0.07308145066046258


In [7]:
import pandas as pd
import numpy as np
from sklearn.metrics import silhouette_score, silhouette_samples

# Load the alignment data (column names are supplied here, not read from the file)
alignment_df = pd.read_csv("secrets_cluster_aligns.tsv", sep="\t",
                           names=["query", "target", "evalue", "pident", "alnlen", "qstart", "qend", "tstart", "tend"])

# Load clustering information with representative and members
aggregated_df = pd.read_csv("aggregated_SECRETS_clusters.tsv", sep="\t")

# Keep clusters that have more than one member and span at least two species prefixes
filtered_clusters = []
for _, row in aggregated_df.iterrows():
    # "member" holds the text form of a Python list, e.g. "['a', 'b']"
    members = row["member"].strip("[]").replace("'", "").split(", ")
    unique_prefixes = set(m[:2] for m in members)  # first two characters = species prefix

    if len(members) > 1 and len(unique_prefixes) > 1:
        filtered_clusters.append({"representative": row["representative"], "members": members})

# Give every protein of the retained clusters a row/column position.
# Sorting fixes the positions, so the row order of `silhouette_df` below is the
# same on every run; iterating the bare set gives a session-dependent order.
all_proteins = {protein for cluster in filtered_clusters for protein in cluster["members"]}
protein_index = {protein: idx for idx, protein in enumerate(sorted(all_proteins))}
num_proteins = len(all_proteins)

# Similarity matrices; pairs without an alignment row keep similarity 0
pident_matrix = np.zeros((num_proteins, num_proteins))
evalue_matrix = np.zeros((num_proteins, num_proteins))

# Populate both matrices symmetrically from the pairwise alignments
for hit in alignment_df.itertuples(index=False):
    if hit.query in protein_index and hit.target in protein_index:
        i, j = protein_index[hit.query], protein_index[hit.target]
        identity = hit.pident / 100  # Convert to fraction
        log_evalue = -np.log(hit.evalue + 1e-300)  # offset keeps log() finite when evalue is 0
        pident_matrix[i, j] = pident_matrix[j, i] = identity
        evalue_matrix[i, j] = evalue_matrix[j, i] = log_evalue

# Convert similarity to distance for silhouette calculation
max_log_evalue = np.max(evalue_matrix)
if not max_log_evalue > 0:
    # Dividing by a non-positive maximum would fill the matrix with NaN/inf
    raise ValueError("No positive -log(evalue) similarity found; cannot normalise evalue distances.")
pident_distance = 1 - pident_matrix
evalue_distance = 1 - (evalue_matrix / max_log_evalue)

# Set diagonal to zero for both distance matrices
np.fill_diagonal(pident_distance, 0)
np.fill_diagonal(evalue_distance, 0)

# Create labels based on cluster membership
labels = np.full(num_proteins, -1)  # Initialize with -1 for unassigned
for cluster_id, cluster in enumerate(filtered_clusters):
    for member in cluster["members"]:
        if member in protein_index:
            labels[protein_index[member]] = cluster_id  # Assign cluster ID as label

# Filter out unassigned proteins from the labels and from both matrices
valid_indices = labels != -1
pident_distance = pident_distance[valid_indices][:, valid_indices]
evalue_distance = evalue_distance[valid_indices][:, valid_indices]
labels = labels[valid_indices]

# Calculate silhouette scores for each sample based on pident and evalue distances
pident_silhouette_samples = silhouette_samples(pident_distance, labels, metric="precomputed")
evalue_silhouette_samples = silhouette_samples(evalue_distance, labels, metric="precomputed")

# One row per retained protein. `protein_index` iterates in matrix-position
# order, so the IDs line up with `labels` and the per-sample scores.
silhouette_df = pd.DataFrame({
    "protein": [protein for protein, index in protein_index.items() if valid_indices[index]],
    "cluster": labels,
    "pident_silhouette": pident_silhouette_samples,
    "evalue_silhouette": evalue_silhouette_samples
})

# Calculate average silhouette score for each cluster
cluster_quality = silhouette_df.groupby("cluster").agg({
    "pident_silhouette": "mean",
    "evalue_silhouette": "mean"
}).reset_index()

# Display results
print("Overall Silhouette Score based on pident:", silhouette_score(pident_distance, labels, metric="precomputed"))
print("Overall Silhouette Score based on evalue:", silhouette_score(evalue_distance, labels, metric="precomputed"))
print("\nAverage Silhouette Scores for Each Cluster:")
print(cluster_quality)

Overall Silhouette Score based on pident: 0.10992582380010277
Overall Silhouette Score based on evalue: 0.07308145066046258

Average Silhouette Scores for Each Cluster:
     cluster  pident_silhouette  evalue_silhouette
0          0           0.200956           0.079232
1          1           0.000000           0.000000
2          2           0.000000           0.000000
3          3           0.000000           0.000000
4          4           0.000000           0.000000
..       ...                ...                ...
892      892           0.078613           0.066680
893      893           0.000000           0.000000
894      894           0.000000           0.000000
895      895           0.038712           0.036874
896      896           0.000000           0.000000

[897 rows x 3 columns]


In [8]:
from sklearn.metrics import silhouette_score, silhouette_samples

# Ensure diagonal elements are zero for silhouette calculation
np.fill_diagonal(pident_distance, 0)
np.fill_diagonal(evalue_distance, 0)

# Recover the protein IDs for the rows kept by `valid_indices`.
# `labels`, `pident_distance` and `evalue_distance` were already reduced with
# this mask in the preceding cell, so they are NOT masked again here: the mask
# has the pre-filter length and would not match the reduced arrays, and masking
# on every re-run would make this cell non-idempotent.
valid_protein_ids = [protein for protein, index in protein_index.items() if valid_indices[index]]
if not (len(valid_protein_ids) == len(labels) == pident_distance.shape[0] == evalue_distance.shape[0]):
    raise ValueError("Protein IDs, labels and distance matrices are out of sync; re-run the previous cell.")

# Calculate silhouette scores for each sample based on pident and evalue distances
pident_silhouette_samples = silhouette_samples(pident_distance, labels, metric="precomputed")
evalue_silhouette_samples = silhouette_samples(evalue_distance, labels, metric="precomputed")

# Create a DataFrame to store protein IDs, cluster labels, and silhouette scores
silhouette_df = pd.DataFrame({
    "protein": valid_protein_ids,  # Proper protein IDs instead of indexes
    "cluster": labels,
    "pident_silhouette": pident_silhouette_samples,
    "evalue_silhouette": evalue_silhouette_samples
})

# Calculate the average silhouette score for each cluster
cluster_quality = silhouette_df.groupby("cluster").agg({
    "pident_silhouette": "mean",
    "evalue_silhouette": "mean"
}).reset_index()

# Display results
print("Average Silhouette Scores for Each Cluster:")
print(cluster_quality)

Average Silhouette Scores for Each Cluster:
     cluster  pident_silhouette  evalue_silhouette
0          0           0.200956           0.079232
1          1           0.000000           0.000000
2          2           0.000000           0.000000
3          3           0.000000           0.000000
4          4           0.000000           0.000000
..       ...                ...                ...
892      892           0.078613           0.066680
893      893           0.000000           0.000000
894      894           0.000000           0.000000
895      895           0.038712           0.036874
896      896           0.000000           0.000000

[897 rows x 3 columns]


In [9]:
# Cluster ID -> member list; the ID is the cluster's position in `filtered_clusters`
cluster_members_dict = dict(
    enumerate(cluster["members"] for cluster in filtered_clusters)
)

# Attach the full member list of its cluster to every protein row
silhouette_df["cluster_members"] = silhouette_df["cluster"].map(cluster_members_dict)

# Per cluster: mean of each silhouette column, plus the member list
# (taken from the first row, since every row of a cluster carries the same list)
per_cluster_aggregations = {
    "pident_silhouette": "mean",
    "evalue_silhouette": "mean",
    "cluster_members": "first",
}
cluster_quality = (
    silhouette_df
    .groupby("cluster")
    .agg(per_cluster_aggregations)
    .reset_index()
)

# Display results
print("Average Silhouette Scores and Members for Each Cluster:")
print(cluster_quality)

Average Silhouette Scores and Members for Each Cluster:
     cluster  pident_silhouette  evalue_silhouette  \
0          0           0.200956           0.079232   
1          1           0.000000           0.000000   
2          2           0.000000           0.000000   
3          3           0.000000           0.000000   
4          4           0.000000           0.000000   
..       ...                ...                ...   
892      892           0.078613           0.066680   
893      893           0.000000           0.000000   
894      894           0.000000           0.000000   
895      895           0.038712           0.036874   
896      896           0.000000           0.000000   

                                       cluster_members  
0    [ajXP_036982891.1, drXP_045058106.2, ajXP_0369...  
1    [ajXP_036982894.2, phXP_045690325.1, drXP_0244...  
2    [ajXP_036984044.1, ajXP_036984045.1, pdXP_0283...  
3    [ajXP_036984048.1, drXP_024428099.1, phXP_0456...  
4    [ajXP

In [10]:

# Two-letter species prefixes grouped by longevity class
short_living_prefixes = {
    'sb', 'pn', 'pk', 'mo', 'pd', 'pv', 'pa',
}
average_living_prefixes = {
    'en', 'ef', 'ph', 'aj', 'ra', 'pg',
}
long_living_prefixes = {
    'mb', 'md', 'ml', 'mm', 'dr', 'rf',
}


def get_longevity_category(prefix):
    """Return the longevity label for a species prefix.

    Returns "short", "average" or "long" according to which prefix set
    contains `prefix`, and "unknown" when none of them does.
    """
    labelled_groups = (
        ("short", short_living_prefixes),
        ("average", average_living_prefixes),
        ("long", long_living_prefixes),
    )
    # The groups are checked in the order listed; the first match wins
    return next(
        (label for label, group in labelled_groups if prefix in group),
        "unknown",
    )

# Add the longevity category and the species prefix (first two characters of
# the protein ID) to silhouette_df
silhouette_df["longevity"] = silhouette_df["protein"].str[:2].map(get_longevity_category)
silhouette_df["species_prefix"] = silhouette_df["protein"].str[:2]


def _summarize_cluster(group):
    """Summarise the rows of one cluster as a single Series.

    `group` holds the rows of silhouette_df that share one cluster label.
    """
    return pd.Series({
        "members": group["cluster_members"].iloc[0],  # List of cluster members
        "cluster_size": len(group),  # Total number of proteins in the cluster
        "num_species": group["species_prefix"].nunique(),  # Number of unique species (prefixes)
        "short_living_percentage": (group["longevity"] == "short").mean(),  # stored as a 0-1 fraction
        "long_living_percentage": (group["longevity"] == "long").mean(),  # stored as a 0-1 fraction
        "average_silhouette_pident": group["pident_silhouette"].mean(),
        "average_silhouette_evalue": group["evalue_silhouette"].mean()
    })


# Group by cluster and analyze the longevity distribution, species diversity,
# and silhouette score. The columns the summary needs are selected explicitly,
# so apply() never operates on the grouping column (which newer pandas
# versions warn about).
summary_columns = ["cluster_members", "species_prefix", "longevity",
                   "pident_silhouette", "evalue_silhouette"]
cluster_longevity = (
    silhouette_df.groupby("cluster")[summary_columns]
    .apply(_summarize_cluster)
    .reset_index()
)

# Filter clusters to meet the criteria:
# - At least 3 proteins in the cluster
# - At least 3 different species (prefixes)
# - Either at least 75% short-living or long-living members
# - At least one of the silhouette scores is higher than 0.25
dominated_by_one_class = (
    (cluster_longevity["short_living_percentage"] >= 0.75) |
    (cluster_longevity["long_living_percentage"] >= 0.75)
)
large_and_diverse = (
    (cluster_longevity["cluster_size"] >= 3) &
    (cluster_longevity["num_species"] >= 3)
)
well_separated = (
    (cluster_longevity["average_silhouette_pident"] > 0.25) |
    (cluster_longevity["average_silhouette_evalue"] > 0.25)
)
longevity_associated_clusters = cluster_longevity[
    dominated_by_one_class & large_and_diverse & well_separated
]

# Display the longevity-associated clusters with silhouette scores
print("Longevity-Associated Clusters with at least 3 Proteins, 3 Species, and Silhouette Score > 0.25 for either measure:")
print(longevity_associated_clusters)


Longevity-Associated Clusters with at least 3 Proteins, 3 Species, and Silhouette Score > 0.25 for either measure:
     cluster                                            members  cluster_size  \
72        72  [efXP_008150328.2, mmXP_036185700.1, mbXP_0058...             5   
82        82  [efXP_008158474.1, efXP_008158474.1, efXP_0081...             4   
89        89  [efXP_027997859.1, mmXP_036194648.1, mlXP_0236...             4   
90        90  [efXP_027997861.1, mmXP_036194650.1, mlXP_0236...             4   
142      142  [mbXP_005860280.1, mbXP_005860280.1, mmXP_0361...             3   
144      144  [mbXP_005860933.1, mlXP_006088898.1, mmXP_0362...             4   
145      145  [mbXP_005861536.1, mlXP_006101181.1, mdXP_0595...             4   
149      149  [mbXP_005865977.1, mmXP_036157647.1, efXP_0081...             5   
151      151  [mbXP_005867728.1, mmXP_036198121.1, mbXP_0143...             5   
153      153  [mbXP_005870616.1, mlXP_006089124.1, mmXP_0361...            

  cluster_longevity = silhouette_df.groupby("cluster").apply(


In [11]:
# Save the filtered cluster table as tab-separated text without the index column.
# NOTE(review): the "members" column holds Python lists, so it is presumably
# written in its text form; later cells parse such text with ast.literal_eval.
longevity_associated_clusters.to_csv("longevitysecretsclust.txt", sep="\t", index=False)

In [12]:
%matplotlib qt5

In [14]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import ast

# Two-letter species prefixes grouped by longevity class
short_living_prefixes = {
    'sb', 'pn', 'pk', 'mo', 'pd', 'pv', 'pa',
}
average_living_prefixes = {
    'en', 'ef', 'ph', 'aj', 'ra', 'pg',
}
long_living_prefixes = {
    'mb', 'md', 'ml', 'mm', 'dr', 'rf',
}


def get_longevity_category(prefix):
    """Return the marker colour for a species prefix.

    "green" for short-living, "yellow" for average-living and "red" for
    long-living prefixes; "gray" for any other prefix.

    Note: this returns colour names. It reuses the name of the function defined
    in cell In [10], which returns the labels "short"/"average"/"long", and
    replaces it in the kernel once this cell has run.
    """
    coloured_groups = (
        ("green", short_living_prefixes),
        ("yellow", average_living_prefixes),
        ("red", long_living_prefixes),
    )
    # The groups are checked in the order listed; the first match wins
    return next(
        (colour for colour, group in coloured_groups if prefix in group),
        "gray",
    )

# Load your data
# NOTE(review): neither table is read by the plotting code below; the loads are
# kept so the notebook-level names stay defined exactly as before.
alignment_df = pd.read_csv("secrets_cluster_aligns.tsv", sep="\t", 
                           names=["query", "target", "evalue", "pident", "alnlen", "qstart", "qend", "tstart", "tend"])

aggregated_df = pd.read_csv("aggregated_SECRETS_clusters.tsv", sep="\t")

# This cell expects `longevity_associated_clusters` to exist already, with the
# columns "cluster", "members", and "average_silhouette_pident"

# Set up the plot with a grid layout
fig, ax = plt.subplots(figsize=(20, 20))  # Larger figure size for better spacing
grid_size = int(np.ceil(np.sqrt(len(longevity_associated_clusters))))  # Determine grid size (e.g., 5x5 for 25 clusters)
spacing = 7  # Adjust this value for more space between clusters

for idx, (index, cluster) in enumerate(longevity_associated_clusters.iterrows()):
    # Convert members column to list if it's in string format
    raw_members = cluster["members"]
    if isinstance(raw_members, str):
        members = ast.literal_eval(raw_members)  # Safely convert string to list
    else:
        members = raw_members

    # Named `cluster_silhouette` so the function `silhouette_score` imported
    # from sklearn in earlier cells is not overwritten by a number
    cluster_silhouette = cluster["average_silhouette_pident"]

    # Define grid position for each cluster
    row, col = divmod(idx, grid_size)
    cluster_position = (col * spacing, -row * spacing)  # Adjust y-axis to go downward for each row

    circle_radius = 0.6 + 0.1 * len(members)  # Adjusted radius based on cluster size
    # Outline width scales with the silhouette score. The score can be negative
    # (a cluster may qualify through its evalue score alone), so it is clamped
    # at 0 to keep the width non-negative.
    circle = plt.Circle(cluster_position, circle_radius, color='black', fill=False,
                        linewidth=max(cluster_silhouette, 0) * 4)
    ax.add_artist(circle)

    # Set the first member as the representative
    representative = members[0] if members else None

    # Evenly spaced positions on an inner ring (70% of the circle radius),
    # computed once per cluster instead of once per drawn edge
    angles = np.linspace(0, 2 * np.pi, len(members), endpoint=False)
    ring_x = cluster_position[0] + circle_radius * 0.7 * np.cos(angles)
    ring_y = cluster_position[1] + circle_radius * 0.7 * np.sin(angles)

    for i, protein in enumerate(members):
        protein_color = get_longevity_category(protein[:2])  # Color by longevity
        x, y = ring_x[i], ring_y[i]

        # Make the representative protein larger
        marker_size = 12 if protein == representative else 8
        ax.plot(x, y, 'o', color=protein_color, markersize=marker_size)

        # Add labels to each protein
        ax.text(x, y, protein, fontsize=6, ha='center', va='center')

        # Draw connections between all proteins in the cluster
        for j in range(i + 1, len(members)):
            ax.plot([x, ring_x[j]], [y, ring_y[j]], color="gray", linewidth=1)  # Uniform line width for all connections

# Display plot
ax.axis('off')
ax.set_title("Longevity-Associated Clusters Network (Grid Layout, Only Within-Cluster Connections)")
plt.show()


In [21]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import ast
import matplotlib.cm as cm
import matplotlib.colors as mcolors

# Two-letter species prefixes grouped by longevity class
short_living_prefixes = {
    'sb', 'pn', 'pk', 'mo', 'pd', 'pv', 'pa',
}
average_living_prefixes = {
    'en', 'ef', 'ph', 'aj', 'ra', 'pg',
}
long_living_prefixes = {
    'mb', 'md', 'ml', 'mm', 'dr', 'rf',
}


def get_longevity_category(prefix):
    """Return the marker colour for a species prefix.

    "green" for short-living, "yellow" for average-living and "red" for
    long-living prefixes; "gray" for any other prefix.
    """
    coloured_groups = (
        ("green", short_living_prefixes),
        ("yellow", average_living_prefixes),
        ("red", long_living_prefixes),
    )
    # The groups are checked in the order listed; the first match wins
    return next(
        (colour for colour, group in coloured_groups if prefix in group),
        "gray",
    )

# Load your data
# NOTE(review): neither table is read by the plotting code below; the loads are
# kept so the notebook-level names stay defined exactly as before.
alignment_df = pd.read_csv("secrets_cluster_aligns.tsv", sep="\t", 
                           names=["query", "target", "evalue", "pident", "alnlen", "qstart", "qend", "tstart", "tend"])

aggregated_df = pd.read_csv("aggregated_SECRETS_clusters.tsv", sep="\t")

# This cell expects `longevity_associated_clusters` to exist already, with the
# columns "cluster", "members", and "average_silhouette_pident"

# Set up the plot with a grid layout
fig, ax = plt.subplots(figsize=(20, 20))  # Larger figure size for better spacing
grid_size = int(np.ceil(np.sqrt(len(longevity_associated_clusters))))  # Determine grid size (e.g., 5x5 for 25 clusters)
spacing = 7  # Adjust this value for more space between clusters

# Set up color mapping for silhouette scores, spanning the observed score range
pident_scores = longevity_associated_clusters["average_silhouette_pident"]
norm = mcolors.Normalize(vmin=pident_scores.min(), vmax=pident_scores.max())
# plt.get_cmap replaces cm.get_cmap, which emits a deprecation warning
cmap = plt.get_cmap('viridis')

# Mappable that carries the colormap and normalisation for the color bar
sm = plt.cm.ScalarMappable(cmap=cmap, norm=norm)
sm.set_array([])  # This is needed to create a color bar

for idx, (index, cluster) in enumerate(longevity_associated_clusters.iterrows()):
    # Convert members column to list if it's in string format
    raw_members = cluster["members"]
    if isinstance(raw_members, str):
        members = ast.literal_eval(raw_members)  # Safely convert string to list
    else:
        members = raw_members

    # Named `cluster_silhouette` so the function `silhouette_score` imported
    # from sklearn in earlier cells is not overwritten by a number
    cluster_silhouette = cluster["average_silhouette_pident"]

    # Define grid position for each cluster
    row, col = divmod(idx, grid_size)
    cluster_position = (col * spacing, -row * spacing)  # Adjust y-axis to go downward for each row

    circle_radius = 0.6 + 0.1 * len(members)  # Adjusted radius based on cluster size
    outline_color = cmap(norm(cluster_silhouette))  # Map silhouette score to color
    circle = plt.Circle(cluster_position, circle_radius, color=outline_color, fill=False,
                        linewidth=2 + cluster_silhouette * 2)  # Outline width based on silhouette score
    ax.add_artist(circle)

    # Evenly spaced positions on an inner ring (70% of the circle radius),
    # computed once per cluster instead of once per drawn edge
    angles = np.linspace(0, 2 * np.pi, len(members), endpoint=False)
    ring_x = cluster_position[0] + circle_radius * 0.7 * np.cos(angles)
    ring_y = cluster_position[1] + circle_radius * 0.7 * np.sin(angles)

    for i, protein in enumerate(members):
        protein_color = get_longevity_category(protein[:2])  # Color by longevity
        x, y = ring_x[i], ring_y[i]

        # Every protein uses the same marker size in this version
        marker_size = 8
        ax.plot(x, y, 'o', color=protein_color, markersize=marker_size)

        # Add labels to each protein
        ax.text(x, y, protein, fontsize=6, ha='center', va='center')

        # Draw connections between all proteins in the cluster
        for j in range(i + 1, len(members)):
            ax.plot([x, ring_x[j]], [y, ring_y[j]], color="gray", linewidth=1)  # Uniform line width for all connections

# Add color bar outside the plot
cbar = fig.colorbar(sm, ax=ax, orientation='vertical', fraction=0.02, pad=0.04)
cbar.set_label("Silhouette Score (pident-based)", rotation=90)

# Disable axis labels and ticks
ax.axis('off')

ax.set_title("Longevity-Associated Clusters Network (Grid Layout, Only Within-Cluster Connections)")
plt.show()




  cmap = cm.get_cmap('viridis')


In [22]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import ast
import matplotlib.cm as cm
import matplotlib.colors as mcolors

# Two-character prefixes grouped by longevity class
short_living_prefixes = {'sb', 'pn', 'pk', 'mo', 'pd', 'pv', 'pa'}
average_living_prefixes = {'en', 'ef', 'ph', 'aj', 'ra', 'pg'}
long_living_prefixes = {'mb', 'md', 'ml', 'mm', 'dr', 'rf'}


def get_longevity_category(prefix):
    """Return the plot color associated with the longevity group of ``prefix``.

    Parameters
    ----------
    prefix : str
        Prefix to classify (callers in this notebook pass the first two
        characters of a protein identifier).

    Returns
    -------
    str
        ``"green"`` for a short-living prefix, ``"yellow"`` for an
        average-living prefix, ``"red"`` for a long-living prefix and
        ``"gray"`` when the prefix belongs to none of the groups.
    """
    # Groups are consulted in declaration order; the first match wins.
    color_by_group = (
        ("green", short_living_prefixes),
        ("yellow", average_living_prefixes),
        ("red", long_living_prefixes),
    )
    for color, group in color_by_group:
        if prefix in group:
            return color
    return "gray"

# Load the pairwise alignments. Column names are supplied via `names=`, so pandas
# treats every row of the file (including the first) as data.
alignment_df = pd.read_csv(
    "secrets_cluster_aligns.tsv",
    sep="\t",
    names=["query", "target", "evalue", "pident", "alnlen", "qstart", "qend", "tstart", "tend"],
)

# Load the aggregated cluster table (header row taken from the file)
aggregated_df = pd.read_csv("aggregated_SECRETS_clusters.tsv", sep="\t")

# Precondition: `longevity_associated_clusters` must already exist (built by an
# earlier cell) with the columns "cluster", "members" and "average_silhouette_pident".

# One large figure; clusters are laid out on a near-square grid
fig, ax = plt.subplots(figsize=(20, 20))
grid_size = int(np.ceil(np.sqrt(len(longevity_associated_clusters))))  # clusters per grid row
spacing = 7  # distance between neighbouring cluster centres, in data units

# Map the observed silhouette-score range onto the viridis colormap
norm = mcolors.Normalize(vmin=longevity_associated_clusters["average_silhouette_pident"].min(),
                         vmax=longevity_associated_clusters["average_silhouette_pident"].max())
# plt.get_cmap replaces matplotlib.cm.get_cmap, which was deprecated in
# Matplotlib 3.7 and removed in 3.9.
cmap = plt.get_cmap('viridis')

# Stand-alone mappable so a color bar can be drawn without a real scatter/image
sm = plt.cm.ScalarMappable(cmap=cmap, norm=norm)
sm.set_array([])

# Draw every cluster as a circle on the grid, with its member proteins placed
# evenly around an inner ring and fully connected to each other.
for idx, (index, cluster) in enumerate(longevity_associated_clusters.iterrows()):
    # The members column may hold a Python-literal string (e.g. after a TSV
    # round trip) or an actual collection.
    raw_members = cluster["members"]
    if isinstance(raw_members, str):
        raw_members = ast.literal_eval(raw_members)
    # De-duplicate while keeping first-seen order. list(set(...)) would reorder
    # string members differently on every kernel restart (string hashes are
    # randomized per process), making the layout non-reproducible.
    members = list(dict.fromkeys(raw_members))

    silhouette_score = cluster["average_silhouette_pident"]

    # Grid cell of this cluster; rows grow downward, hence the negative y
    row, col = divmod(idx, grid_size)
    cluster_position = (col * spacing, -row * spacing)

    # Circle grows with the cluster size; outline color and thickness both
    # encode the cluster's average silhouette score.
    circle_radius = 0.6 + 0.1 * len(members)
    outline_color = cmap(norm(silhouette_score))
    circle = plt.Circle(cluster_position, circle_radius, color=outline_color, fill=False,
                        linewidth=2 + silhouette_score * 2)
    # add_patch (unlike add_artist) includes the circle in the axes' data
    # limits, so autoscaling does not clip the outermost circles.
    ax.add_patch(circle)

    # Spread the proteins evenly on a ring at 70% of the circle radius
    angles = np.linspace(0, 2 * np.pi, len(members), endpoint=False)
    protein_positions = {}

    for i, protein in enumerate(members):
        protein_color = get_longevity_category(protein[:2])  # color by the 2-char prefix
        x = cluster_position[0] + circle_radius * 0.7 * np.cos(angles[i])
        y = cluster_position[1] + circle_radius * 0.7 * np.sin(angles[i])
        protein_positions[protein] = (x, y)
        ax.plot(x, y, 'o', color=protein_color, markersize=8)
        ax.text(x, y, protein, fontsize=6, ha='center', va='center')

    # Connect every unordered pair of proteins in the cluster with a uniform line width
    for i, protein1 in enumerate(members):
        x1, y1 = protein_positions[protein1]
        for protein2 in members[i + 1:]:
            x2, y2 = protein_positions[protein2]
            ax.plot([x1, x2], [y1, y2], color="gray", linewidth=1)

# Place a slim vertical color bar beside the plot to decode the outline colors
cbar = fig.colorbar(sm, ax=ax, orientation='vertical', fraction=0.02, pad=0.04)
cbar.set_label("Silhouette Score (pident-based)", rotation=90)

# Title the axes, then hide axis lines, ticks and tick labels: the coordinates
# are only a layout grid.
ax.set_title("Longevity-Associated Clusters Network (Grid Layout, Only Within-Cluster Connections)")
ax.set_axis_off()

plt.show()


  cmap = cm.get_cmap('viridis')
