In [1]:
import pandas as pd
import glob
import os

# Define columns
columns = ["qseqid", "sseqid", "pident", "length", "mismatch", "gapopen",  
           "qstart", "qend", "sstart", "send", "evalue", "bitscore"]

# Dictionary to store DataFrames
dfs = {}

# Print to debug which files are being processed
filepaths = glob.glob("ecmaffil/*.tab")
print("Found files:", filepaths)  # Debug statement

# Load each .tab file from the 'rbhresults' directory and store it in the dictionary
for filepath in filepaths:
    # Extract the prefix from the filename (e.g., "aj" from "rbhresults/ajrbh.tab")
    prefix = os.path.basename(filepath)[:2]
    # Load the file into a DataFrame
    df = pd.read_csv(filepath, sep='\t', names=columns)
    # Store the DataFrame in the dictionary with the prefix as the key
    dfs[f"{prefix}df"] = df

# Print out the keys of the dictionary to verify all DataFrames were loaded
print("DataFrames loaded:", list(dfs.keys()))  # Debug statement

# Example: Print the head of each DataFrame to verify
for name, dataframe in dfs.items():
    print(f"{name} head:")
    print(dataframe.head())


Found files: ['ecmaffil\\ajecm.tab', 'ecmaffil\\drecm.tab', 'ecmaffil\\efecm.tab', 'ecmaffil\\enecm.tab', 'ecmaffil\\mbecm.tab', 'ecmaffil\\mdecm.tab', 'ecmaffil\\mlecm.tab', 'ecmaffil\\mmecm.tab', 'ecmaffil\\moecm.tab', 'ecmaffil\\paecm.tab', 'ecmaffil\\pdecm.tab', 'ecmaffil\\pgecm.tab', 'ecmaffil\\phecm.tab', 'ecmaffil\\pkecm.tab', 'ecmaffil\\pnecm.tab', 'ecmaffil\\pvecm.tab', 'ecmaffil\\raecm.tab', 'ecmaffil\\rfecm.tab', 'ecmaffil\\sbecm.tab']
DataFrames loaded: ['ajdf', 'drdf', 'efdf', 'endf', 'mbdf', 'mddf', 'mldf', 'mmdf', 'modf', 'padf', 'pddf', 'pgdf', 'phdf', 'pkdf', 'pndf', 'pvdf', 'radf', 'rfdf', 'sbdf']
ajdf head:
        qseqid            sseqid  pident  length  mismatch  gapopen  qstart  \
0  NP_002988.3  ajXP_053523075.1   0.722     310        85        0       1   
1  NP_002989.2  ajXP_036986998.2   0.905     201        19        0       1   
2  NP_055469.3  ajXP_037005331.2   0.914     442        38        0       1   
3  NP_005561.1  ajXP_037007848.2   0.905     518  

In [2]:
# Define DataFrames in the global namespace using the prefix as variable names
for prefix, df in dfs.items():
    globals()[prefix] = df

In [3]:
sbdf.head()

Unnamed: 0,qseqid,sseqid,pident,length,mismatch,gapopen,qstart,qend,sstart,send,evalue,bitscore
0,XP_006715792.1,sbXP_066129620.1,0.857,828,118,0,1,828,48,873,0.0,1454
1,XP_006715792.1,sbXP_066129621.1,0.857,828,118,0,1,828,48,873,0.0,1454
2,XP_006715792.1,sbXP_066129622.1,0.857,828,118,0,1,828,48,873,0.0,1454
3,NP_443138.2,sbXP_066114975.1,0.961,824,32,0,1,820,1,824,0.0,1646
4,NP_443138.2,sbXP_066115007.1,0.961,824,32,0,1,820,1,824,0.0,1646


In [4]:
from Bio import SeqIO
import pandas as pd

# File paths
bat_fasta_file = "realmergedprots.faa"  # This is the merged bat proteome file
output_file = "rbh_all_bats_ecmaffil.fasta"

# Load bat sequences with duplicates handled
def load_fasta_with_duplicates(fasta_file):
    sequences = {}
    for record in SeqIO.parse(fasta_file, "fasta"):
        if record.id not in sequences:
            sequences[record.id] = record
        else:
            print(f"Duplicate ID found: {record.id}, ignoring this entry.")
    return sequences

# Load the bat sequences
bat_sequences = load_fasta_with_duplicates(bat_fasta_file)

# List of DataFrames (replace 'dfs' with your dictionary or variable holding the DataFrames)
dataframe_list = [ajdf, drdf, efdf, endf, mbdf, mddf, mldf, mmdf, modf, padf, pddf, pgdf, phdf, pkdf, pndf, pvdf, radf, rfdf, sbdf]

# Open the output FASTA file
with open(output_file, "w") as output_fasta:
    # Loop over each DataFrame
    for df in dataframe_list:
        # Process each sseqid in the DataFrame
        for _, row in df.iterrows():
            sseqid_full = row["sseqid"]  # Keep the full ID with prefix
            
            # Check if the sseqid_full is in the bat sequences
            if sseqid_full in bat_sequences:
                sseq = bat_sequences[sseqid_full].seq
                sdesc = bat_sequences[sseqid_full].description
                
                # Determine the ID source based on the prefix if needed
                prefix = sseqid_full[:2]
                if prefix == "pn" or prefix == 'en':
                    id_source = "GenBank"
                else:
                    id_source = "RefSeq"
                
                # Write the bat protein sequence to the output file
                output_fasta.write(f">{sseqid_full} {sdesc} [{id_source}]\n{sseq}\n")
            else:
                print(f"SseqID {sseqid_full} not found in bat proteome.")

print(f"All bat protein sequences written to {output_file}")

All bat protein sequences written to rbh_all_bats_ecmaffil.fasta


In [2]:
import pandas as pd
import glob
import os
rbh_df = pd.read_csv("rbhecmaffilclustershighid.tsv", sep="\t", header=None, names=["representative", "member"])

aggregated_df = rbh_df.groupby("representative")["member"].apply(list).reset_index()

print(aggregated_df.head())

aggregated_df.to_csv("aggregated_ECMAFFIL_clusters.tsv", sep="\t", index=False)

     representative                                             member
0  ajXP_036981807.2  [ajXP_036981807.2, ajXP_036981816.2, ajXP_0369...
1  ajXP_036981962.2  [ajXP_036981962.2, pdXP_028358868.1, pdXP_0358...
2  ajXP_036983531.2                                 [ajXP_036983531.2]
3  ajXP_036984287.2               [ajXP_036984287.2, ajXP_036984287.2]
4  ajXP_036984952.1  [ajXP_036984952.1, pdXP_028374642.2, rfXP_0329...


In [7]:
%pip install scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.5.2-cp312-cp312-win_amd64.whl.metadata (13 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Downloading joblib-1.4.2-py3-none-any.whl.metadata (5.4 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Downloading threadpoolctl-3.5.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.5.2-cp312-cp312-win_amd64.whl (11.0 MB)
   ---------------------------------------- 0.0/11.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/11.0 MB ? eta -:--:--
   ---------------------------------------- 0.1/11.0 MB 1.5 MB/s eta 0:00:08
    --------------------------------------- 0.3/11.0 MB 2.0 MB/s eta 0:00:06
   - -------------------------------------- 0.4/11.0 MB 2.3 MB/s eta 0:00:05
   - -------------------------------------- 0.5/11.0 MB 2.2 MB/s eta 0:00:05
   -- ------------------------------------- 0.7/11.0 MB 2.6 MB/s eta 0:00:05
   -- ------------------------------------- 0.8/11.0 MB 2.6 MB/s eta 0:00:04
   -

In [3]:
import pandas as pd
import numpy as np
from sklearn.metrics import silhouette_score

# Load the alignment data
alignment_df = pd.read_csv("ecmaffil_cluster_aligns.tsv", sep="\t", 
                           names=["query", "target", "evalue", "pident", "alnlen", "qstart", "qend", "tstart", "tend"])

# Load clustering information with representative and members
aggregated_df = pd.read_csv("aggregated_ECMAFFIL_clusters.tsv", sep="\t")

# Filter clusters with more than 1 unique protein and at least 2 different species
filtered_clusters = []
for _, row in aggregated_df.iterrows():
    members = row["member"].strip("[]").replace("'", "").split(", ")
    unique_prefixes = set(m[:2] for m in members)  # Get unique species prefixes

    if len(members) > 1 and len(unique_prefixes) > 1:
        filtered_clusters.append({"representative": row["representative"], "members": members})

# Create a combined list of all proteins across clusters
all_proteins = {protein for cluster in filtered_clusters for protein in cluster["members"]}
protein_index = {protein: idx for idx, protein in enumerate(all_proteins)}
num_proteins = len(all_proteins)

# Initialize distance matrices for pident and evalue
pident_matrix = np.zeros((num_proteins, num_proteins))
evalue_matrix = np.zeros((num_proteins, num_proteins))

# Populate matrices based on pairwise alignments
for _, row in alignment_df.iterrows():
    if row["query"] in protein_index and row["target"] in protein_index:
        i, j = protein_index[row["query"]], protein_index[row["target"]]
        pident_matrix[i, j] = row["pident"] / 100  # Convert to fraction
        pident_matrix[j, i] = row["pident"] / 100
        evalue_matrix[i, j] = -np.log(row["evalue"] + 1e-300)
        evalue_matrix[j, i] = -np.log(row["evalue"] + 1e-300)

# Convert similarity to distance for silhouette calculation
pident_distance = 1 - pident_matrix
evalue_distance = 1 - (evalue_matrix / np.max(evalue_matrix))

# Create labels based on cluster membership
labels = np.full(num_proteins, -1)  # Initialize with -1 for unassigned
for cluster_id, cluster in enumerate(filtered_clusters):
    for member in cluster["members"]:
        if member in protein_index:
            labels[protein_index[member]] = cluster_id  # Assign cluster ID as label

# Filter out unassigned proteins
valid_indices = labels != -1
# Convert similarity to distance for silhouette calculation
pident_distance = 1 - pident_matrix
evalue_distance = 1 - (evalue_matrix / np.max(evalue_matrix))  # Normalize evalue distances

# Set diagonal to zero for both distance matrices
np.fill_diagonal(pident_distance, 0)
np.fill_diagonal(evalue_distance, 0)

# Assign labels for silhouette calculation
labels = labels[valid_indices]  # Ensure labels are only for valid indices

# Calculate silhouette scores for both pident and evalue distances
pident_silhouette_score = silhouette_score(pident_distance, labels, metric="precomputed")
evalue_silhouette_score = silhouette_score(evalue_distance, labels, metric="precomputed")

# Display results
print("Silhouette Score based on pident:", pident_silhouette_score)
print("Silhouette Score based on evalue:", evalue_silhouette_score)




Silhouette Score based on pident: 0.1308272679803392
Silhouette Score based on evalue: 0.09561641401269973


In [26]:
import pandas as pd
import numpy as np
from sklearn.metrics import silhouette_score

# Load the alignment data
alignment_df = pd.read_csv("ecmaffil_cluster_aligns.tsv", sep="\t", 
                           names=["query", "target", "evalue", "pident", "alnlen", "qstart", "qend", "tstart", "tend"])

# Load clustering information with representative and members
aggregated_df = pd.read_csv("aggregated_ECMAFFIL_clusters.tsv", sep="\t")

# Filter clusters with more than 1 unique protein and at least 2 different species
filtered_clusters = []
for _, row in aggregated_df.iterrows():
    members = row["member"].strip("[]").replace("'", "").split(", ")
    unique_prefixes = set(m[:2] for m in members)  # Get unique species prefixes

    if len(members) > 1 and len(unique_prefixes) > 1:
        filtered_clusters.append({"representative": row["representative"], "members": members})

# Create a combined list of all proteins across clusters
all_proteins = {protein for cluster in filtered_clusters for protein in cluster["members"]}
protein_index = {protein: idx for idx, protein in enumerate(all_proteins)}
num_proteins = len(all_proteins)

# Initialize distance matrices for pident and evalue
pident_matrix = np.zeros((num_proteins, num_proteins))
evalue_matrix = np.zeros((num_proteins, num_proteins))

# Populate matrices based on pairwise alignments
for _, row in alignment_df.iterrows():
    if row["query"] in protein_index and row["target"] in protein_index:
        i, j = protein_index[row["query"]], protein_index[row["target"]]
        pident_matrix[i, j] = row["pident"] / 100  # Convert to fraction
        pident_matrix[j, i] = row["pident"] / 100
        evalue_matrix[i, j] = -np.log(row["evalue"] + 1e-300)
        evalue_matrix[j, i] = -np.log(row["evalue"] + 1e-300)

# Symmetry checks
assert np.allclose(pident_matrix, pident_matrix.T), "pident_matrix is not symmetric"
assert np.allclose(evalue_matrix, evalue_matrix.T), "evalue_matrix is not symmetric"

# Convert similarity to distance for silhouette calculation
pident_distance = 1 - pident_matrix
evalue_distance = 1 - (evalue_matrix / np.max(evalue_matrix))

# Ensure the diagonal is zero
np.fill_diagonal(pident_distance, 0)
np.fill_diagonal(evalue_distance, 0)

# Check distance matrix ranges
print("pident_distance min/max:", pident_distance.min(), pident_distance.max())
print("evalue_distance min/max:", evalue_distance.min(), evalue_distance.max())

# Create labels based on cluster membership
labels = np.full(num_proteins, -1)  # Initialize with -1 for unassigned
for cluster_id, cluster in enumerate(filtered_clusters):
    for member in cluster["members"]:
        if member in protein_index:
            labels[protein_index[member]] = cluster_id  # Assign cluster ID as label

# Filter out unassigned proteins only once
valid_indices = labels != -1
pident_distance = pident_distance[valid_indices][:, valid_indices]
evalue_distance = evalue_distance[valid_indices][:, valid_indices]
labels = labels[valid_indices]

# Re-check diagonal for zero values
np.fill_diagonal(pident_distance, 0)
np.fill_diagonal(evalue_distance, 0)

# Calculate silhouette scores for both pident and evalue distances
pident_silhouette_score = silhouette_score(pident_distance, labels, metric="precomputed")
evalue_silhouette_score = silhouette_score(evalue_distance, labels, metric="precomputed")

# Display results
print("Silhouette Score based on pident:", pident_silhouette_score)
print("Silhouette Score based on evalue:", evalue_silhouette_score)


pident_distance min/max: 0.0 1.0
evalue_distance min/max: 0.0 1.0
Silhouette Score based on pident: 0.1308272679803392
Silhouette Score based on evalue: 0.09561641401269973


In [4]:
import pandas as pd
import numpy as np
from sklearn.metrics import silhouette_score, silhouette_samples

# Load the alignment data
alignment_df = pd.read_csv("ecmaffil_cluster_aligns.tsv", sep="\t", 
                           names=["query", "target", "evalue", "pident", "alnlen", "qstart", "qend", "tstart", "tend"])

# Load clustering information with representative and members
aggregated_df = pd.read_csv("aggregated_ECMAFFIL_clusters.tsv", sep="\t")

# Filter clusters with more than 1 unique protein and at least 2 different species
filtered_clusters = []
for _, row in aggregated_df.iterrows():
    members = row["member"].strip("[]").replace("'", "").split(", ")
    unique_prefixes = set(m[:2] for m in members)  # Get unique species prefixes

    if len(members) > 1 and len(unique_prefixes) > 1:
        filtered_clusters.append({"representative": row["representative"], "members": members})

# Create a combined list of all proteins across clusters
all_proteins = {protein for cluster in filtered_clusters for protein in cluster["members"]}
protein_index = {protein: idx for idx, protein in enumerate(all_proteins)}
num_proteins = len(all_proteins)

# Initialize distance matrices for pident and evalue
pident_matrix = np.zeros((num_proteins, num_proteins))
evalue_matrix = np.zeros((num_proteins, num_proteins))

# Populate matrices based on pairwise alignments
for _, row in alignment_df.iterrows():
    if row["query"] in protein_index and row["target"] in protein_index:
        i, j = protein_index[row["query"]], protein_index[row["target"]]
        pident_matrix[i, j] = row["pident"] / 100  # Convert to fraction
        pident_matrix[j, i] = row["pident"] / 100
        evalue_matrix[i, j] = -np.log(row["evalue"] + 1e-300)
        evalue_matrix[j, i] = -np.log(row["evalue"] + 1e-300)

# Convert similarity to distance for silhouette calculation
pident_distance = 1 - pident_matrix
evalue_distance = 1 - (evalue_matrix / np.max(evalue_matrix))

# Set diagonal to zero for both distance matrices
np.fill_diagonal(pident_distance, 0)
np.fill_diagonal(evalue_distance, 0)

# Create labels based on cluster membership
labels = np.full(num_proteins, -1)  # Initialize with -1 for unassigned
for cluster_id, cluster in enumerate(filtered_clusters):
    for member in cluster["members"]:
        if member in protein_index:
            labels[protein_index[member]] = cluster_id  # Assign cluster ID as label

# Filter out unassigned proteins
valid_indices = labels != -1
pident_distance = pident_distance[valid_indices][:, valid_indices]
evalue_distance = evalue_distance[valid_indices][:, valid_indices]
labels = labels[valid_indices]

# Calculate silhouette scores for each sample based on pident and evalue distances
pident_silhouette_samples = silhouette_samples(pident_distance, labels, metric="precomputed")
evalue_silhouette_samples = silhouette_samples(evalue_distance, labels, metric="precomputed")

# Create a DataFrame to store individual silhouette scores with cluster labels
silhouette_df = pd.DataFrame({
    "protein": [protein for protein, index in protein_index.items() if valid_indices[index]],
    "cluster": labels,
    "pident_silhouette": pident_silhouette_samples,
    "evalue_silhouette": evalue_silhouette_samples
})

# Calculate average silhouette score for each cluster
cluster_quality = silhouette_df.groupby("cluster").agg({
    "pident_silhouette": "mean",
    "evalue_silhouette": "mean"
}).reset_index()

# Display results
print("Overall Silhouette Score based on pident:", silhouette_score(pident_distance, labels, metric="precomputed"))
print("Overall Silhouette Score based on evalue:", silhouette_score(evalue_distance, labels, metric="precomputed"))
print("\nAverage Silhouette Scores for Each Cluster:")
print(cluster_quality)


Overall Silhouette Score based on pident: 0.1308272679803392
Overall Silhouette Score based on evalue: 0.09561641401269973

Average Silhouette Scores for Each Cluster:
     cluster  pident_silhouette  evalue_silhouette
0          0           0.000000           0.000000
1          1           0.157764           0.114562
2          2           0.000000           0.000000
3          3           0.000000           0.000000
4          4           0.476833           0.500000
..       ...                ...                ...
410      410           0.033798           0.035573
411      411           0.000000           0.000000
412      412           0.024565           0.032569
413      413           0.057053           0.023718
414      414           0.008234           0.005363

[415 rows x 3 columns]


In [5]:
from sklearn.metrics import silhouette_score, silhouette_samples

# Ensure diagonal elements are zero for silhouette calculation
np.fill_diagonal(pident_distance, 0)
np.fill_diagonal(evalue_distance, 0)

# Filter out unassigned proteins and keep original protein IDs
valid_protein_ids = [protein for protein, index in protein_index.items() if valid_indices[index]]
labels = labels[valid_indices]
pident_distance = pident_distance[valid_indices][:, valid_indices]
evalue_distance = evalue_distance[valid_indices][:, valid_indices]

# Calculate silhouette scores for each sample based on pident and evalue distances
pident_silhouette_samples = silhouette_samples(pident_distance, labels, metric="precomputed")
evalue_silhouette_samples = silhouette_samples(evalue_distance, labels, metric="precomputed")

# Create a DataFrame to store protein IDs, cluster labels, and silhouette scores
silhouette_df = pd.DataFrame({
    "protein": valid_protein_ids,  # Proper protein IDs instead of indexes
    "cluster": labels,
    "pident_silhouette": pident_silhouette_samples,
    "evalue_silhouette": evalue_silhouette_samples
})

# Calculate the average silhouette score for each cluster
cluster_quality = silhouette_df.groupby("cluster").agg({
    "pident_silhouette": "mean",
    "evalue_silhouette": "mean"
}).reset_index()

# Display results
print("Average Silhouette Scores for Each Cluster:")
print(cluster_quality)


Average Silhouette Scores for Each Cluster:
     cluster  pident_silhouette  evalue_silhouette
0          0           0.000000           0.000000
1          1           0.157764           0.114562
2          2           0.000000           0.000000
3          3           0.000000           0.000000
4          4           0.476833           0.500000
..       ...                ...                ...
410      410           0.033798           0.035573
411      411           0.000000           0.000000
412      412           0.024565           0.032569
413      413           0.057053           0.023718
414      414           0.008234           0.005363

[415 rows x 3 columns]


In [6]:
# Create a dictionary that maps each cluster ID to the list of members (protein IDs) in that cluster
cluster_members_dict = {
    cluster_id: cluster["members"]
    for cluster_id, cluster in enumerate(filtered_clusters)
}

# Map each row in silhouette_df to its respective cluster members
silhouette_df["cluster_members"] = silhouette_df["cluster"].map(cluster_members_dict)

# Group by cluster to get average silhouette scores and list of cluster members
cluster_quality = silhouette_df.groupby("cluster").agg({
    "pident_silhouette": "mean",
    "evalue_silhouette": "mean",
    "cluster_members": "first"  # List of members in each cluster
}).reset_index()

# Display results
print("Average Silhouette Scores and Members for Each Cluster:")
print(cluster_quality)


Average Silhouette Scores and Members for Each Cluster:
     cluster  pident_silhouette  evalue_silhouette  \
0          0           0.000000           0.000000   
1          1           0.157764           0.114562   
2          2           0.000000           0.000000   
3          3           0.000000           0.000000   
4          4           0.476833           0.500000   
..       ...                ...                ...   
410      410           0.033798           0.035573   
411      411           0.000000           0.000000   
412      412           0.024565           0.032569   
413      413           0.057053           0.023718   
414      414           0.008234           0.005363   

                                       cluster_members  
0    [ajXP_036981962.2, pdXP_028358868.1, pdXP_0358...  
1    [ajXP_036984952.1, pdXP_028374642.2, rfXP_0329...  
2    [ajXP_036986482.1, drXP_045051959.1, pdXP_0283...  
3    [ajXP_036995238.2, drXP_024412515.3, phXP_0456...  
4    [ajXP

In [7]:
silhouette_df.head(10)

Unnamed: 0,protein,cluster,pident_silhouette,evalue_silhouette,cluster_members
0,raXP_016014857.2,392,-0.010123,-0.012346,"[raXP_016014858.2, raXP_016014855.2, raXP_0160..."
1,phXP_045685890.1,20,0.056235,0.030453,"[drXP_024431115.1, phXP_045685890.1, pvXP_0113..."
2,drXP_053779399.1,31,0.0,0.0,"[drXP_053779400.1, drXP_024416922.2, drXP_0537..."
3,enKAK1329177.1,40,0.0,0.0,"[efXP_027987183.2, efXP_027987183.2, efXP_0279..."
4,pnCAK6440858.1,320,0.0,0.0,"[pkXP_036301078.1, pnCAK6440858.1]"
5,pkXP_036265229.1,302,0.969,0.709389,"[pkXP_036265229.1, pnCAK6444114.1]"
6,raXP_015995103.1,413,0.029212,0.012315,"[sbXP_066133315.1, moXP_036138040.1, mdXP_0595..."
7,pvXP_011363016.1,240,0.089909,0.090909,"[pgXP_039693759.1, pgXP_039693760.1, pgXP_0396..."
8,raXP_016018719.1,12,-0.056,-0.058824,"[ajXP_053526079.1, ajXP_053526079.1, pdXP_0283..."
9,pgXP_039697283.1,191,-0.015625,-0.005184,"[paXP_024894251.1, paXP_024894252.1, paXP_0248..."


In [8]:

# Define longevity-associated prefixes
short_living_prefixes = {'sb', 'pn', 'pk', 'mo', 'pd', 'pv', 'pa'}
average_living_prefixes = {'en', 'ef', 'ph', 'aj', 'ra', 'pg'}
long_living_prefixes = {'mb', 'md', 'ml', 'mm', 'dr', 'rf'}

# Function to determine the longevity category based on prefix
def get_longevity_category(prefix):
    if prefix in short_living_prefixes:
        return "short"
    elif prefix in average_living_prefixes:
        return "average"
    elif prefix in long_living_prefixes:
        return "long"
    else:
        return "unknown"

# Add a new column to silhouette_df with the longevity category and species prefix
silhouette_df["longevity"] = silhouette_df["protein"].apply(lambda x: get_longevity_category(x[:2]))
silhouette_df["species_prefix"] = silhouette_df["protein"].apply(lambda x: x[:2])

# Group by cluster and analyze the longevity distribution, species diversity, and silhouette score
cluster_longevity = silhouette_df.groupby("cluster").apply(
    lambda df: pd.Series({
        "members": df["cluster_members"].iloc[0],  # List of cluster members
        "cluster_size": len(df),  # Total number of proteins in the cluster
        "num_species": df["species_prefix"].nunique(),  # Number of unique species (prefixes)
        "short_living_percentage": (df["longevity"] == "short").mean(),
        "long_living_percentage": (df["longevity"] == "long").mean(),
        "average_silhouette_pident": df["pident_silhouette"].mean(),
        "average_silhouette_evalue": df["evalue_silhouette"].mean()
    })
).reset_index()

# Filter clusters to meet the criteria:
# - At least 3 proteins in the cluster
# - At least 3 different species (prefixes)
# - Either at least 75% short-living or long-living members
# - At least one of the silhouette scores is higher than 0.25
longevity_associated_clusters = cluster_longevity[
    ((cluster_longevity["short_living_percentage"] >= 0.75) | 
     (cluster_longevity["long_living_percentage"] >= 0.75)) & 
    (cluster_longevity["cluster_size"] >= 3) & 
    (cluster_longevity["num_species"] >= 3) &
    ((cluster_longevity["average_silhouette_pident"] > 0.25) |
     (cluster_longevity["average_silhouette_evalue"] > 0.25))
]

# Display the longevity-associated clusters with silhouette scores
print("Longevity-Associated Clusters with at least 3 Proteins, 3 Species, and Silhouette Score > 0.25 for either measure:")
print(longevity_associated_clusters)



Longevity-Associated Clusters with at least 3 Proteins, 3 Species, and Silhouette Score > 0.25 for either measure:
     cluster                                            members  cluster_size  \
51        51  [efXP_054585911.1, efXP_054585911.1, mmXP_0361...             7   
61        61  [mbXP_005859726.1, mbXP_014389815.1, mdXP_0595...             6   
64        64  [mbXP_005865977.1, efXP_008151575.2, mmXP_0361...             5   
65        65  [mbXP_005869664.1, mbXP_005869664.1, mlXP_0236...             5   
67        67  [mbXP_005871410.1, mlXP_006084398.1, mdXP_0595...             3   
68        68  [mbXP_005875093.1, mlXP_023614664.1, mmXP_0361...             3   
74        74  [mbXP_014386174.1, mdXP_059512329.1, mmXP_0361...             3   
88        88  [mdXP_059525433.1, mlXP_006107817.3, mmXP_0361...             3   
92        92  [mdXP_059536911.1, mdXP_059536910.1, mlXP_0143...             5   
96        96  [mdXP_059549656.1, mlXP_006081093.1, mmXP_0361...            

  cluster_longevity = silhouette_df.groupby("cluster").apply(


Basically no longevity-related clusters. Maybe due to --min-seq-id 0.95 being too high. However, lowering this parameter leads to unspecific clusters.


In [28]:
longevity_associated_clusters.to_csv("longevityecmaffilclust.txt", sep="\t", index=False)

In [9]:
longevity_associated_clusters

Unnamed: 0,cluster,members,cluster_size,num_species,short_living_percentage,long_living_percentage,average_silhouette_pident,average_silhouette_evalue
51,51,"[efXP_054585911.1, efXP_054585911.1, mmXP_0361...",7,3,0.0,0.857143,0.273429,0.285714
61,61,"[mbXP_005859726.1, mbXP_014389815.1, mdXP_0595...",6,4,0.0,1.0,0.2556,0.142307
64,64,"[mbXP_005865977.1, efXP_008151575.2, mmXP_0361...",5,5,0.0,0.8,0.2857,0.217733
65,65,"[mbXP_005869664.1, mbXP_005869664.1, mlXP_0236...",5,5,0.0,0.8,0.3887,0.4
67,67,"[mbXP_005871410.1, mlXP_006084398.1, mdXP_0595...",3,3,0.0,1.0,0.660333,0.357502
68,68,"[mbXP_005875093.1, mlXP_023614664.1, mmXP_0361...",3,3,0.0,1.0,0.658,0.277214
74,74,"[mbXP_014386174.1, mdXP_059512329.1, mmXP_0361...",3,3,0.0,1.0,0.268641,0.638889
88,88,"[mdXP_059525433.1, mlXP_006107817.3, mmXP_0361...",3,3,0.0,1.0,0.647,0.520361
92,92,"[mdXP_059536911.1, mdXP_059536910.1, mlXP_0143...",5,4,0.0,1.0,0.3835,0.231022
96,96,"[mdXP_059549656.1, mlXP_006081093.1, mmXP_0361...",3,3,0.0,1.0,0.641,0.666667


In [16]:
from sklearn.metrics import silhouette_samples

# Ensure the distance matrices have zero on the diagonal for correct silhouette calculations
np.fill_diagonal(pident_distance, 0)
np.fill_diagonal(evalue_distance, 0)

# Calculate silhouette scores for each sample based on pident and evalue distances
pident_silhouette_samples = silhouette_samples(pident_distance, labels, metric="precomputed")
evalue_silhouette_samples = silhouette_samples(evalue_distance, labels, metric="precomputed")

# Create a DataFrame to store individual silhouette scores with cluster labels
silhouette_df = pd.DataFrame({
    "protein": [protein for protein, index in protein_index.items() if valid_indices[index]],
    "cluster": labels,
    "pident_silhouette": pident_silhouette_samples,
    "evalue_silhouette": evalue_silhouette_samples
})

# Calculate average silhouette score for each cluster
cluster_quality = silhouette_df.groupby("cluster").agg({
    "pident_silhouette": "mean",
    "evalue_silhouette": "mean"
}).reset_index()

# Display results
print("Average Silhouette Scores for Each Cluster")
print(cluster_quality)


Average Silhouette Scores for Each Cluster
     cluster  pident_silhouette  evalue_silhouette
0          0           0.000000           0.000000
1          1           0.157764           0.114562
2          2           0.000000           0.000000
3          3           0.000000           0.000000
4          4           0.476833           0.500000
..       ...                ...                ...
410      410           0.033798           0.035573
411      411           0.000000           0.000000
412      412           0.024565           0.032569
413      413           0.057053           0.023718
414      414           0.008234           0.005363

[415 rows x 3 columns]


**LOWER MIN SEQ ID**

In [29]:
rbh_df = pd.read_csv("rbhecmaffilclusterslowid.tsv", sep="\t", header=None, names=["representative", "member"])

aggregated_df = rbh_df.groupby("representative")["member"].apply(list).reset_index()

print(aggregated_df.head())

aggregated_df.to_csv("aggregated_ECMAFFIL_clusterslowid.tsv", sep="\t", index=False)

     representative                                             member
0  ajXP_036987728.2  [ajXP_036987728.2, drXP_045041782.1, pdXP_0358...
1  ajXP_036994466.2                                 [ajXP_036994466.2]
2  ajXP_036995581.2  [ajXP_036995581.2, ajXP_036995581.2, ajXP_0369...
3  ajXP_036997277.2  [ajXP_036997277.2, drXP_024413463.2, pdXP_0283...
4  ajXP_037007824.2  [ajXP_037007824.2, phXP_045676151.1, pdXP_0358...


In [30]:
import pandas as pd
import numpy as np
from sklearn.metrics import silhouette_score

# Load the alignment data
alignment_df = pd.read_csv("ecmaffillow_cluster_aligns.tsv", sep="\t", 
                           names=["query", "target", "evalue", "pident", "alnlen", "qstart", "qend", "tstart", "tend"])

# Load clustering information with representative and members
aggregated_df = pd.read_csv("aggregated_ECMAFFIL_clusterslowid.tsv", sep="\t")

# Filter clusters with more than 1 unique protein and at least 2 different species
filtered_clusters = []
for _, row in aggregated_df.iterrows():
    members = row["member"].strip("[]").replace("'", "").split(", ")
    unique_prefixes = set(m[:2] for m in members)  # Get unique species prefixes

    if len(members) > 1 and len(unique_prefixes) > 1:
        filtered_clusters.append({"representative": row["representative"], "members": members})

# Create a combined list of all proteins across clusters
all_proteins = {protein for cluster in filtered_clusters for protein in cluster["members"]}
protein_index = {protein: idx for idx, protein in enumerate(all_proteins)}
num_proteins = len(all_proteins)

# Initialize distance matrices for pident and evalue
pident_matrix = np.zeros((num_proteins, num_proteins))
evalue_matrix = np.zeros((num_proteins, num_proteins))

# Populate matrices based on pairwise alignments
for _, row in alignment_df.iterrows():
    if row["query"] in protein_index and row["target"] in protein_index:
        i, j = protein_index[row["query"]], protein_index[row["target"]]
        pident_matrix[i, j] = row["pident"] / 100  # Convert to fraction
        pident_matrix[j, i] = row["pident"] / 100
        evalue_matrix[i, j] = -np.log(row["evalue"] + 1e-300)
        evalue_matrix[j, i] = -np.log(row["evalue"] + 1e-300)

# Convert similarity to distance for silhouette calculation
pident_distance = 1 - pident_matrix
evalue_distance = 1 - (evalue_matrix / np.max(evalue_matrix))

# Create labels based on cluster membership
labels = np.full(num_proteins, -1)  # Initialize with -1 for unassigned
for cluster_id, cluster in enumerate(filtered_clusters):
    for member in cluster["members"]:
        if member in protein_index:
            labels[protein_index[member]] = cluster_id  # Assign cluster ID as label

# Filter out unassigned proteins
valid_indices = labels != -1
# Convert similarity to distance for silhouette calculation
pident_distance = 1 - pident_matrix
evalue_distance = 1 - (evalue_matrix / np.max(evalue_matrix))  # Normalize evalue distances

# Set diagonal to zero for both distance matrices
np.fill_diagonal(pident_distance, 0)
np.fill_diagonal(evalue_distance, 0)

# Assign labels for silhouette calculation
labels = labels[valid_indices]  # Ensure labels are only for valid indices

# Calculate silhouette scores for both pident and evalue distances
pident_silhouette_score = silhouette_score(pident_distance, labels, metric="precomputed")
evalue_silhouette_score = silhouette_score(evalue_distance, labels, metric="precomputed")

# Display results
print("Silhouette Score based on pident:", pident_silhouette_score)
print("Silhouette Score based on evalue:", evalue_silhouette_score)


Silhouette Score based on pident: 0.06641958380570624
Silhouette Score based on evalue: 0.05392867750899812


In [31]:
import pandas as pd
import numpy as np
from sklearn.metrics import silhouette_score, silhouette_samples

# Load the alignment data
alignment_df = pd.read_csv("ecmaffillow_cluster_aligns.tsv", sep="\t", 
                           names=["query", "target", "evalue", "pident", "alnlen", "qstart", "qend", "tstart", "tend"])

# Load clustering information with representative and members
aggregated_df = pd.read_csv("aggregated_ECMAFFIL_clusterslowid.tsv", sep="\t")

# Filter clusters with more than 1 unique protein and at least 2 different species
filtered_clusters = []
for _, row in aggregated_df.iterrows():
    members = row["member"].strip("[]").replace("'", "").split(", ")
    unique_prefixes = set(m[:2] for m in members)  # Get unique species prefixes

    if len(members) > 1 and len(unique_prefixes) > 1:
        filtered_clusters.append({"representative": row["representative"], "members": members})

# Create a combined list of all proteins across clusters
all_proteins = {protein for cluster in filtered_clusters for protein in cluster["members"]}
protein_index = {protein: idx for idx, protein in enumerate(all_proteins)}
num_proteins = len(all_proteins)

# Initialize distance matrices for pident and evalue
pident_matrix = np.zeros((num_proteins, num_proteins))
evalue_matrix = np.zeros((num_proteins, num_proteins))

# Populate matrices based on pairwise alignments
for _, row in alignment_df.iterrows():
    if row["query"] in protein_index and row["target"] in protein_index:
        i, j = protein_index[row["query"]], protein_index[row["target"]]
        pident_matrix[i, j] = row["pident"] / 100  # Convert to fraction
        pident_matrix[j, i] = row["pident"] / 100
        evalue_matrix[i, j] = -np.log(row["evalue"] + 1e-300)
        evalue_matrix[j, i] = -np.log(row["evalue"] + 1e-300)

# Convert similarity to distance for silhouette calculation
pident_distance = 1 - pident_matrix
evalue_distance = 1 - (evalue_matrix / np.max(evalue_matrix))

# Set diagonal to zero for both distance matrices
np.fill_diagonal(pident_distance, 0)
np.fill_diagonal(evalue_distance, 0)

# Create labels based on cluster membership
labels = np.full(num_proteins, -1)  # Initialize with -1 for unassigned
for cluster_id, cluster in enumerate(filtered_clusters):
    for member in cluster["members"]:
        if member in protein_index:
            labels[protein_index[member]] = cluster_id  # Assign cluster ID as label

# Filter out unassigned proteins
valid_indices = labels != -1
pident_distance = pident_distance[valid_indices][:, valid_indices]
evalue_distance = evalue_distance[valid_indices][:, valid_indices]
labels = labels[valid_indices]

# Calculate silhouette scores for each sample based on pident and evalue distances
pident_silhouette_samples = silhouette_samples(pident_distance, labels, metric="precomputed")
evalue_silhouette_samples = silhouette_samples(evalue_distance, labels, metric="precomputed")

# Create a DataFrame to store individual silhouette scores with cluster labels
silhouette_df = pd.DataFrame({
    "protein": [protein for protein, index in protein_index.items() if valid_indices[index]],
    "cluster": labels,
    "pident_silhouette": pident_silhouette_samples,
    "evalue_silhouette": evalue_silhouette_samples
})

# Calculate average silhouette score for each cluster
cluster_quality = silhouette_df.groupby("cluster").agg({
    "pident_silhouette": "mean",
    "evalue_silhouette": "mean"
}).reset_index()

# Display results
print("Overall Silhouette Score based on pident:", silhouette_score(pident_distance, labels, metric="precomputed"))
print("Overall Silhouette Score based on evalue:", silhouette_score(evalue_distance, labels, metric="precomputed"))
print("\nAverage Silhouette Scores for Each Cluster:")
print(cluster_quality)

Overall Silhouette Score based on pident: 0.06641958380570624
Overall Silhouette Score based on evalue: 0.05392867750899812

Average Silhouette Scores for Each Cluster:
     cluster  pident_silhouette  evalue_silhouette
0          0           0.551980           0.625000
1          1           0.403000           0.062791
2          2           0.135667           0.078834
3          3          -0.223427          -0.226867
4          4          -0.407292          -0.082816
..       ...                ...                ...
261      261           0.214611           0.047046
262      262           0.004279           0.002513
263      263           0.052156           0.021752
264      264           0.179178           0.054967
265      265           0.399214           0.263694

[266 rows x 3 columns]


In [32]:
from sklearn.metrics import silhouette_score, silhouette_samples

# Ensure diagonal elements are zero for silhouette calculation
np.fill_diagonal(pident_distance, 0)
np.fill_diagonal(evalue_distance, 0)

# Filter out unassigned proteins and keep original protein IDs
valid_protein_ids = [protein for protein, index in protein_index.items() if valid_indices[index]]
labels = labels[valid_indices]
pident_distance = pident_distance[valid_indices][:, valid_indices]
evalue_distance = evalue_distance[valid_indices][:, valid_indices]

# Calculate silhouette scores for each sample based on pident and evalue distances
pident_silhouette_samples = silhouette_samples(pident_distance, labels, metric="precomputed")
evalue_silhouette_samples = silhouette_samples(evalue_distance, labels, metric="precomputed")

# Create a DataFrame to store protein IDs, cluster labels, and silhouette scores
silhouette_df = pd.DataFrame({
    "protein": valid_protein_ids,  # Proper protein IDs instead of indexes
    "cluster": labels,
    "pident_silhouette": pident_silhouette_samples,
    "evalue_silhouette": evalue_silhouette_samples
})

# Calculate the average silhouette score for each cluster
cluster_quality = silhouette_df.groupby("cluster").agg({
    "pident_silhouette": "mean",
    "evalue_silhouette": "mean"
}).reset_index()

# Display results
print("Average Silhouette Scores for Each Cluster:")
print(cluster_quality)


Average Silhouette Scores for Each Cluster:
     cluster  pident_silhouette  evalue_silhouette
0          0           0.551980           0.625000
1          1           0.403000           0.062791
2          2           0.135667           0.078834
3          3          -0.223427          -0.226867
4          4          -0.407292          -0.082816
..       ...                ...                ...
261      261           0.214611           0.047046
262      262           0.004279           0.002513
263      263           0.052156           0.021752
264      264           0.179178           0.054967
265      265           0.399214           0.263694

[266 rows x 3 columns]


In [33]:
# Create a dictionary that maps each cluster ID to the list of members (protein IDs) in that cluster
cluster_members_dict = {
    cluster_id: cluster["members"]
    for cluster_id, cluster in enumerate(filtered_clusters)
}

# Map each row in silhouette_df to its respective cluster members
silhouette_df["cluster_members"] = silhouette_df["cluster"].map(cluster_members_dict)

# Group by cluster to get average silhouette scores and list of cluster members
cluster_quality = silhouette_df.groupby("cluster").agg({
    "pident_silhouette": "mean",
    "evalue_silhouette": "mean",
    "cluster_members": "first"  # List of members in each cluster
}).reset_index()

# Display results
print("Average Silhouette Scores and Members for Each Cluster:")
print(cluster_quality)


Average Silhouette Scores and Members for Each Cluster:
     cluster  pident_silhouette  evalue_silhouette  \
0          0           0.551980           0.625000   
1          1           0.403000           0.062791   
2          2           0.135667           0.078834   
3          3          -0.223427          -0.226867   
4          4          -0.407292          -0.082816   
..       ...                ...                ...   
261      261           0.214611           0.047046   
262      262           0.004279           0.002513   
263      263           0.052156           0.021752   
264      264           0.179178           0.054967   
265      265           0.399214           0.263694   

                                       cluster_members  
0    [ajXP_036987728.2, drXP_045041782.1, pdXP_0358...  
1    [ajXP_036997277.2, drXP_024413463.2, pdXP_0283...  
2    [ajXP_037007824.2, phXP_045676151.1, pdXP_0358...  
3    [ajXP_037021117.2, phXP_045687379.1, phXP_0456...  
4    [ajXP

In [36]:

# Define longevity-associated prefixes
short_living_prefixes = {'sb', 'pn', 'pk', 'mo', 'pd', 'pv', 'pa'}
average_living_prefixes = {'en', 'ef', 'ph', 'aj', 'ra', 'pg'}
long_living_prefixes = {'mb', 'md', 'ml', 'mm', 'dr', 'rf'}

# Function to determine the longevity category based on prefix
def get_longevity_category(prefix):
    if prefix in short_living_prefixes:
        return "short"
    elif prefix in average_living_prefixes:
        return "average"
    elif prefix in long_living_prefixes:
        return "long"
    else:
        return "unknown"

# Add a new column to silhouette_df with the longevity category and species prefix
silhouette_df["longevity"] = silhouette_df["protein"].apply(lambda x: get_longevity_category(x[:2]))
silhouette_df["species_prefix"] = silhouette_df["protein"].apply(lambda x: x[:2])

# Group by cluster and analyze the longevity distribution, species diversity, and silhouette score
cluster_longevity = silhouette_df.groupby("cluster").apply(
    lambda df: pd.Series({
        "members": df["cluster_members"].iloc[0],  # List of cluster members
        "cluster_size": len(df),  # Total number of proteins in the cluster
        "num_species": df["species_prefix"].nunique(),  # Number of unique species (prefixes)
        "short_living_percentage": (df["longevity"] == "short").mean(),
        "long_living_percentage": (df["longevity"] == "long").mean(),
        "average_silhouette_pident": df["pident_silhouette"].mean(),
        "average_silhouette_evalue": df["evalue_silhouette"].mean()
    })
).reset_index()

# Filter clusters to meet the criteria:
# - At least 3 proteins in the cluster
# - At least 3 different species (prefixes)
# - Either at least 75% short-living or long-living members
# - At least one of the silhouette scores is higher than 0.25
longevity_associated_clusters = cluster_longevity[
    ((cluster_longevity["short_living_percentage"] >= 0.75) | 
     (cluster_longevity["long_living_percentage"] >= 0.75)) & 
    (cluster_longevity["cluster_size"] >= 3) & 
    (cluster_longevity["num_species"] >= 3) &
    ((cluster_longevity["average_silhouette_pident"] > 0.1) |
     (cluster_longevity["average_silhouette_evalue"] > 0.1))
]

# Display the longevity-associated clusters with silhouette scores
print("Longevity-Associated Clusters with at least 3 Proteins, 3 Species, and Silhouette Score > 0.25 for either measure:")
print(longevity_associated_clusters)

Longevity-Associated Clusters with at least 3 Proteins, 3 Species, and Silhouette Score > 0.25 for either measure:
     cluster                                            members  cluster_size  \
65        65  [mbXP_014385769.1, mlXP_006097188.1, mdXP_0595...             3   
67        67  [mbXP_014394036.1, mdXP_059551677.1, mmXP_0361...             3   
79        79  [mdXP_059536866.1, mbXP_014399304.1, mmXP_0361...             4   
89        89  [mdXP_059568003.1, mmXP_036184401.1, mmXP_0361...             8   
261      261  [sbXP_066124233.1, moXP_036129550.1, moXP_0361...             6   

     num_species  short_living_percentage  long_living_percentage  \
65             3                 0.000000                1.000000   
67             3                 0.000000                1.000000   
79             4                 0.000000                1.000000   
89             4                 0.000000                1.000000   
261            3                 0.833333            

  cluster_longevity = silhouette_df.groupby("cluster").apply(


In [37]:
longevity_associated_clusters.to_csv("longevityecmaffilclustLOW.txt", sep="\t", index=False)

regenerating islet-derived protein 4 is similar within short-living Saccopteryx bileniata and Molossus molossus (also, sadly, long-living Rhinolophus ferrumequinum)

In [10]:
%matplotlib qt5

In [11]:
import networkx as nx
import matplotlib.pyplot as plt

# Filter silhouette_df to include only longevity-associated clusters
longevity_clusters = longevity_associated_clusters["cluster"].tolist()
filtered_silhouette_df = silhouette_df[silhouette_df["cluster"].isin(longevity_clusters)]

# Create a new NetworkX graph
G = nx.Graph()

# Add nodes for each protein, color by longevity category
for _, row in filtered_silhouette_df.iterrows():
    protein = row["protein"]
    cluster_id = row["cluster"]
    longevity = row["longevity"]
    color = 'green' if longevity == 'short' else 'yellow' if longevity == 'average' else 'red'
    G.add_node(protein, cluster=cluster_id, color=color)

# Add intra-cluster edges based on pident within each cluster
for cluster_id in longevity_clusters:
    cluster_proteins = filtered_silhouette_df[filtered_silhouette_df["cluster"] == cluster_id]
    for i, row1 in cluster_proteins.iterrows():
        for j, row2 in cluster_proteins.iterrows():
            if i < j:  # Avoid duplicate edges and self-loops
                query, target = row1["protein"], row2["protein"]
                # Get the pident score between query and target (or a representative score)
                alignment_row = alignment_df[((alignment_df["query"] == query) & (alignment_df["target"] == target)) |
                                             ((alignment_df["query"] == target) & (alignment_df["target"] == query))]
                if not alignment_row.empty:
                    pident = alignment_row["pident"].iloc[0]
                    G.add_edge(query, target, weight=pident / 10)  # Scale down pident to avoid excessive line thickness

# Add inter-cluster edges based on average silhouette score for each cluster
for i, cluster1 in enumerate(longevity_clusters):
    for j, cluster2 in enumerate(longevity_clusters):
        if i < j:  # Avoid duplicate edges
            cluster1_members = filtered_silhouette_df[filtered_silhouette_df["cluster"] == cluster1]["protein"].tolist()
            cluster2_members = filtered_silhouette_df[filtered_silhouette_df["cluster"] == cluster2]["protein"].tolist()
            silhouette_score = longevity_associated_clusters[
                longevity_associated_clusters["cluster"] == cluster1
            ]["average_silhouette_pident"].values[0]

            # Connect all proteins between clusters with silhouette score as weight
            for protein1 in cluster1_members:
                for protein2 in cluster2_members:
                    G.add_edge(protein1, protein2, weight=silhouette_score, style="bold")

# Draw the network
plt.figure(figsize=(12, 12))
pos = nx.spring_layout(G, seed=42)

# Draw nodes with color by longevity
node_colors = [G.nodes[node]['color'] for node in G.nodes()]
nx.draw_networkx_nodes(G, pos, node_size=100, node_color=node_colors)

# Draw edges with thickness based on weights
for u, v, data in G.edges(data=True):
    width = data['weight'] if 'style' not in data else data['weight'] * 3  # Make inter-cluster edges thicker
    nx.draw_networkx_edges(G, pos, edgelist=[(u, v)], width=width, edge_color="black" if 'style' in data else "gray")

# Add node labels
nx.draw_networkx_labels(G, pos, font_size=8)

plt.title("Longevity-Associated Clusters Network")
plt.show()


In [17]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import ast

# Define longevity-associated prefixes
short_living_prefixes = {'sb', 'pn', 'pk', 'mo', 'pd', 'pv', 'pa'}
average_living_prefixes = {'en', 'ef', 'ph', 'aj', 'ra', 'pg'}
long_living_prefixes = {'mb', 'md', 'ml', 'mm', 'dr', 'rf'}

# Function to determine the longevity category based on prefix
def get_longevity_category(prefix):
    if prefix in short_living_prefixes:
        return "green"
    elif prefix in average_living_prefixes:
        return "yellow"
    elif prefix in long_living_prefixes:
        return "red"
    else:
        return "gray"

# Load your data
alignment_df = pd.read_csv("ecmaffillow_cluster_aligns.tsv", sep="\t", 
                           names=["query", "target", "evalue", "pident", "alnlen", "qstart", "qend", "tstart", "tend"])

aggregated_df = pd.read_csv("aggregated_ECMAFFIL_clusterslowid.tsv", sep="\t")

# Calculate silhouette scores, filter clusters, and prepare longevity_associated_clusters as in previous code
# Assume longevity_associated_clusters is prepared and has the columns "cluster", "members", and "average_silhouette_pident"

# Set up the plot
plt.figure(figsize=(16, 16))  # Increased figure size
expanded_radius = 10  # Increase this value for more space between clusters
cluster_positions = np.linspace(0, 2 * np.pi, len(longevity_associated_clusters), endpoint=False)

for idx, (index, cluster) in enumerate(longevity_associated_clusters.iterrows()):
    # Convert members column to list if it's in string format
    if isinstance(cluster["members"], str):
        members = ast.literal_eval(cluster["members"])  # Safely convert string to list
    else:
        members = cluster["members"]
    
    silhouette_score = cluster["average_silhouette_pident"]
    
    # Define cluster circle properties
    cluster_position = (expanded_radius * np.cos(cluster_positions[idx]), expanded_radius * np.sin(cluster_positions[idx]))
    circle_radius = 0.6 + 0.1 * len(members)  # Adjusted radius
    circle = plt.Circle(cluster_position, circle_radius, color='black', fill=False, 
                        linewidth=silhouette_score * 4)  # Scale outline by silhouette score
    plt.gca().add_artist(circle)
    
    # Set the first member as the representative
    representative = members[0] if members else None

    # Position proteins within the cluster circle
    angles = np.linspace(0, 2 * np.pi, len(members), endpoint=False)
    for i, protein in enumerate(members):
        protein_color = get_longevity_category(protein[:2])  # Color by longevity
        x = cluster_position[0] + circle_radius * 0.7 * np.cos(angles[i])
        y = cluster_position[1] + circle_radius * 0.7 * np.sin(angles[i])

        # Make the representative protein larger
        marker_size = 12 if protein == representative else 8
        plt.plot(x, y, 'o', color=protein_color, markersize=marker_size)
        
        # Add labels to each protein
        plt.text(x, y, protein, fontsize=6, ha='center', va='center')

        # Draw connections between all proteins in the cluster
        for j in range(i + 1, len(members)):
            x2 = cluster_position[0] + circle_radius * 0.7 * np.cos(angles[j])
            y2 = cluster_position[1] + circle_radius * 0.7 * np.sin(angles[j])
            plt.plot([x, x2], [y, y2], color="gray", linewidth=1)  # Uniform line width for all connections

# Display plot
plt.axis('off')
plt.title("Longevity-Associated Clusters Network (Only Within-Cluster Connections)")
plt.show()




In [36]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import ast

# Define longevity-associated prefixes
short_living_prefixes = {'sb', 'pn', 'pk', 'mo', 'pd', 'pv', 'pa'}
average_living_prefixes = {'en', 'ef', 'ph', 'aj', 'ra', 'pg'}
long_living_prefixes = {'mb', 'md', 'ml', 'mm', 'dr', 'rf'}

# Function to determine the longevity category based on prefix
def get_longevity_category(prefix):
    if prefix in short_living_prefixes:
        return "green"
    elif prefix in average_living_prefixes:
        return "yellow"
    elif prefix in long_living_prefixes:
        return "red"
    else:
        return "gray"

# Load your data
alignment_df = pd.read_csv("ecmaffillow_cluster_aligns.tsv", sep="\t", 
                           names=["query", "target", "evalue", "pident", "alnlen", "qstart", "qend", "tstart", "tend"])

aggregated_df = pd.read_csv("aggregated_ECMAFFIL_clusters.tsv", sep="\t")

# Calculate silhouette scores, filter clusters, and prepare longevity_associated_clusters as in previous code
# Assume longevity_associated_clusters is prepared and has the columns "cluster", "members", and "average_silhouette_pident"

# Set up the plot with a grid layout
plt.figure(figsize=(20, 20))  # Larger figure size for better spacing
grid_size = int(np.ceil(np.sqrt(len(longevity_associated_clusters))))  # Determine grid size (e.g., 5x5 for 25 clusters)
spacing = 7  # Adjust this value for more space between clusters

for idx, (index, cluster) in enumerate(longevity_associated_clusters.iterrows()):
    # Convert members column to list if it's in string format
    if isinstance(cluster["members"], str):
        members = ast.literal_eval(cluster["members"])  # Safely convert string to list
    else:
        members = cluster["members"]
    
    silhouette_score = cluster["average_silhouette_pident"]
    
    # Define grid position for each cluster
    row, col = divmod(idx, grid_size)
    cluster_position = (col * spacing, -row * spacing)  # Adjust y-axis to go downward for each row

    circle_radius = 0.6 + 0.1 * len(members)  # Adjusted radius based on cluster size
    circle = plt.Circle(cluster_position, circle_radius, color='black', fill=False, 
                        linewidth=silhouette_score * 4)  # Scale outline by silhouette score
    plt.gca().add_artist(circle)
    
    # Set the first member as the representative
    representative = members[0] if members else None

    # Position proteins within the cluster circle
    angles = np.linspace(0, 2 * np.pi, len(members), endpoint=False)
    for i, protein in enumerate(members):
        protein_color = get_longevity_category(protein[:2])  # Color by longevity
        x = cluster_position[0] + circle_radius * 0.7 * np.cos(angles[i])
        y = cluster_position[1] + circle_radius * 0.7 * np.sin(angles[i])

        # Make the representative protein larger
        marker_size = 12 if protein == representative else 8
        plt.plot(x, y, 'o', color=protein_color, markersize=marker_size)
        
        # Add labels to each protein
        plt.text(x, y, protein, fontsize=6, ha='center', va='center')

        # Draw connections between all proteins in the cluster
        for j in range(i + 1, len(members)):
            x2 = cluster_position[0] + circle_radius * 0.7 * np.cos(angles[j])
            y2 = cluster_position[1] + circle_radius * 0.7 * np.sin(angles[j])
            plt.plot([x, x2], [y, y2], color="gray", linewidth=1)  # Uniform line width for all connections

# Display plot
plt.axis('off')
plt.title("Longevity-Associated Clusters Network (Grid Layout, Only Within-Cluster Connections)")
plt.show()


In [37]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import ast
import matplotlib.cm as cm
import matplotlib.colors as mcolors

# Define longevity-associated prefixes
short_living_prefixes = {'sb', 'pn', 'pk', 'mo', 'pd', 'pv', 'pa'}
average_living_prefixes = {'en', 'ef', 'ph', 'aj', 'ra', 'pg'}
long_living_prefixes = {'mb', 'md', 'ml', 'mm', 'dr', 'rf'}

# Function to determine the longevity category based on prefix
def get_longevity_category(prefix):
    if prefix in short_living_prefixes:
        return "green"
    elif prefix in average_living_prefixes:
        return "yellow"
    elif prefix in long_living_prefixes:
        return "red"
    else:
        return "gray"

# Load your data
alignment_df = pd.read_csv("ecmaffillow_cluster_aligns.tsv", sep="\t", 
                           names=["query", "target", "evalue", "pident", "alnlen", "qstart", "qend", "tstart", "tend"])

aggregated_df = pd.read_csv("aggregated_ECMAFFIL_clusters.tsv", sep="\t")

# Calculate silhouette scores, filter clusters, and prepare longevity_associated_clusters as in previous code
# Assume longevity_associated_clusters is prepared and has the columns "cluster", "members", and "average_silhouette_pident"

# Set up the plot with a grid layout
fig, ax = plt.subplots(figsize=(20, 20))  # Larger figure size for better spacing
grid_size = int(np.ceil(np.sqrt(len(longevity_associated_clusters))))  # Determine grid size (e.g., 5x5 for 25 clusters)
spacing = 7  # Adjust this value for more space between clusters

# Set up color mapping for silhouette scores
norm = mcolors.Normalize(vmin=longevity_associated_clusters["average_silhouette_pident"].min(),
                         vmax=longevity_associated_clusters["average_silhouette_pident"].max())
cmap = cm.get_cmap('viridis')

# Dummy scatter for color bar
sm = plt.cm.ScalarMappable(cmap=cmap, norm=norm)
sm.set_array([])  # This is needed to create a color bar

for idx, (index, cluster) in enumerate(longevity_associated_clusters.iterrows()):
    # Convert members column to list if it's in string format
    if isinstance(cluster["members"], str):
        members = ast.literal_eval(cluster["members"])  # Safely convert string to list
    else:
        members = cluster["members"]
    
    silhouette_score = cluster["average_silhouette_pident"]
    
    # Define grid position for each cluster
    row, col = divmod(idx, grid_size)
    cluster_position = (col * spacing, -row * spacing)  # Adjust y-axis to go downward for each row

    circle_radius = 0.6 + 0.1 * len(members)  # Adjusted radius based on cluster size
    outline_color = cmap(norm(silhouette_score))  # Map silhouette score to color
    circle = plt.Circle(cluster_position, circle_radius, color=outline_color, fill=False, 
                        linewidth=2 + silhouette_score * 2)  # Outline width based on silhouette score
    ax.add_artist(circle)
    
    # Set the first member as the representative
    representative = members[0] if members else None

    # Position proteins within the cluster circle
    angles = np.linspace(0, 2 * np.pi, len(members), endpoint=False)
    for i, protein in enumerate(members):
        protein_color = get_longevity_category(protein[:2])  # Color by longevity
        x = cluster_position[0] + circle_radius * 0.7 * np.cos(angles[i])
        y = cluster_position[1] + circle_radius * 0.7 * np.sin(angles[i])

        # Make the representative protein larger
        marker_size = 8 #12 if protein == representative else 8
        ax.plot(x, y, 'o', color=protein_color, markersize=marker_size)
        
        # Add labels to each protein
        ax.text(x, y, protein, fontsize=6, ha='center', va='center')

        # Draw connections between all proteins in the cluster
        for j in range(i + 1, len(members)):
            x2 = cluster_position[0] + circle_radius * 0.7 * np.cos(angles[j])
            y2 = cluster_position[1] + circle_radius * 0.7 * np.sin(angles[j])
            ax.plot([x, x2], [y, y2], color="gray", linewidth=1)  # Uniform line width for all connections

# Add color bar outside the plot
cbar = fig.colorbar(sm, ax=ax, orientation='vertical', fraction=0.02, pad=0.04)
cbar.set_label("Silhouette Score (pident-based)", rotation=90)

# Disable axis labels and ticks
ax.axis('off')

plt.title("Longevity-Associated Clusters Network (Grid Layout, Only Within-Cluster Connections)")
plt.show()


  cmap = cm.get_cmap('viridis')


In [39]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import ast
import matplotlib.cm as cm
import matplotlib.colors as mcolors

# Define longevity-associated prefixes
short_living_prefixes = {'sb', 'pn', 'pk', 'mo', 'pd', 'pv', 'pa'}
average_living_prefixes = {'en', 'ef', 'ph', 'aj', 'ra', 'pg'}
long_living_prefixes = {'mb', 'md', 'ml', 'mm', 'dr', 'rf'}

# Function to determine the longevity category based on prefix
def get_longevity_category(prefix):
    if prefix in short_living_prefixes:
        return "green"
    elif prefix in average_living_prefixes:
        return "yellow"
    elif prefix in long_living_prefixes:
        return "red"
    else:
        return "gray"

# Load your data
alignment_df = pd.read_csv("ecmaffillow_cluster_aligns.tsv", sep="\t", 
                           names=["query", "target", "evalue", "pident", "alnlen", "qstart", "qend", "tstart", "tend"])

aggregated_df = pd.read_csv("aggregated_ECMAFFIL_clusters.tsv", sep="\t")

# Assume longevity_associated_clusters is prepared and has the columns "cluster", "members", and "average_silhouette_pident"

# Set up the plot with a grid layout
fig, ax = plt.subplots(figsize=(20, 20))
grid_size = int(np.ceil(np.sqrt(len(longevity_associated_clusters))))
spacing = 7

# Set up color mapping for silhouette scores
norm = mcolors.Normalize(vmin=longevity_associated_clusters["average_silhouette_pident"].min(),
                         vmax=longevity_associated_clusters["average_silhouette_pident"].max())
cmap = cm.get_cmap('viridis')

# Dummy scatter for color bar
sm = plt.cm.ScalarMappable(cmap=cmap, norm=norm)
sm.set_array([])

for idx, (index, cluster) in enumerate(longevity_associated_clusters.iterrows()):
    # Convert members column to list if it's in string format
    if isinstance(cluster["members"], str):
        members = list(set(ast.literal_eval(cluster["members"])))  # Remove duplicates
    else:
        members = list(set(cluster["members"]))

    silhouette_score = cluster["average_silhouette_pident"]
    
    # Define grid position for each cluster
    row, col = divmod(idx, grid_size)
    cluster_position = (col * spacing, -row * spacing)

    circle_radius = 0.6 + 0.1 * len(members)
    outline_color = cmap(norm(silhouette_score))
    circle = plt.Circle(cluster_position, circle_radius, color=outline_color, fill=False, linewidth=2 + silhouette_score * 2)
    ax.add_artist(circle)
    
    # Position proteins within the cluster circle
    angles = np.linspace(0, 2 * np.pi, len(members), endpoint=False)
    protein_positions = {}

    for i, protein in enumerate(members):
        protein_color = get_longevity_category(protein[:2])
        x = cluster_position[0] + circle_radius * 0.7 * np.cos(angles[i])
        y = cluster_position[1] + circle_radius * 0.7 * np.sin(angles[i])
        protein_positions[protein] = (x, y)
        ax.plot(x, y, 'o', color=protein_color, markersize=8)
        ax.text(x, y, protein, fontsize=6, ha='center', va='center')

    # Draw connections between all proteins in the cluster with a uniform line width
    for i, protein1 in enumerate(members):
        for j in range(i + 1, len(members)):
            protein2 = members[j]
            x1, y1 = protein_positions[protein1]
            x2, y2 = protein_positions[protein2]
            ax.plot([x1, x2], [y1, y2], color="gray", linewidth=1)  # Uniform line width for all connections

# Add color bar outside the plot
cbar = fig.colorbar(sm, ax=ax, orientation='vertical', fraction=0.02, pad=0.04)
cbar.set_label("Silhouette Score (pident-based)", rotation=90)

# Disable axis labels and ticks
ax.axis('off')

plt.title("Longevity-Associated Clusters Network (Grid Layout, Only Within-Cluster Connections)")
plt.show()



  cmap = cm.get_cmap('viridis')


In [29]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import ast

# Define longevity-associated prefixes and color mapping
short_living_prefixes = {'sb', 'pn', 'pk', 'mo', 'pd', 'pv', 'pa'}
average_living_prefixes = {'en', 'ef', 'ph', 'aj', 'ra', 'pg'}
long_living_prefixes = {'mb', 'md', 'ml', 'mm', 'dr', 'rf'}

# Function to determine color based on longevity category
def get_longevity_category(prefix):
    if prefix in short_living_prefixes:
        return "green"
    elif prefix in average_living_prefixes:
        return "yellow"
    elif prefix in long_living_prefixes:
        return "red"
    else:
        return "gray"

# Assuming `longevity_associated_clusters` and `pident_distance` are prepared and ready for use

# Calculate inter-cluster average identity scores
cluster_similarities = {}
for i, cluster1 in enumerate(longevity_associated_clusters["cluster"].unique()):
    for j, cluster2 in enumerate(longevity_associated_clusters["cluster"].unique()):
        if i < j:  # Only calculate for unique pairs
            cluster1_members = longevity_associated_clusters[longevity_associated_clusters["cluster"] == cluster1]["members"].values[0]
            cluster2_members = longevity_associated_clusters[longevity_associated_clusters["cluster"] == cluster2]["members"].values[0]
            cluster1_indices = [protein_index[member] for member in cluster1_members]
            cluster2_indices = [protein_index[member] for member in cluster2_members]
            inter_cluster_identities = [
                pident_matrix[i, j] for i in cluster1_indices for j in cluster2_indices if i != j
            ]
            # Calculate average identity between clusters
            avg_identity = np.mean(inter_cluster_identities) if inter_cluster_identities else 0
            cluster_similarities[(cluster1, cluster2)] = avg_identity

# Plot clusters in a grid with inter-cluster connections
plt.figure(figsize=(16, 16))
grid_size = int(np.ceil(np.sqrt(len(longevity_associated_clusters))))
cluster_positions = {}

# Plot each cluster
for idx, (index, cluster) in enumerate(longevity_associated_clusters.iterrows()):
    row, col = divmod(idx, grid_size)
    cluster_position = (col * 3, -row * 3)  # Adjust spacing for grid layout
    cluster_positions[cluster["cluster"]] = cluster_position
    
    # Define circle properties
    circle_radius = 0.6 + 0.1 * len(cluster["members"])
    circle = plt.Circle(cluster_position, circle_radius, color='black', fill=False, 
                        linewidth=cluster["average_silhouette_pident"] * 4)
    plt.gca().add_artist(circle)
    
    # Position proteins within the cluster circle
    angles = np.linspace(0, 2 * np.pi, len(cluster["members"]), endpoint=False)
    for i, protein in enumerate(cluster["members"]):
        protein_color = get_longevity_category(protein[:2])
        x = cluster_position[0] + circle_radius * 0.7 * np.cos(angles[i])
        y = cluster_position[1] + circle_radius * 0.7 * np.sin(angles[i])
        
        # Make representative protein larger
        marker_size = 12 if protein == cluster["members"][0] else 8
        plt.plot(x, y, 'o', color=protein_color, markersize=marker_size)
        plt.text(x, y, protein, fontsize=6, ha='center', va='center')
        
        # Draw internal connections within cluster based on pairwise identities
        for j in range(i + 1, len(cluster["members"])):
            x2 = cluster_position[0] + circle_radius * 0.7 * np.cos(angles[j])
            y2 = cluster_position[1] + circle_radius * 0.7 * np.sin(angles[j])
            plt.plot([x, x2], [y, y2], color="gray", linewidth=1)

# Add inter-cluster connections based on average identity scores
for (cluster1, cluster2), avg_identity in cluster_similarities.items():
    pos1 = cluster_positions[cluster1]
    pos2 = cluster_positions[cluster2]
    line_width = max(0.5, avg_identity * 5)  # Set minimum line width for visibility
    plt.plot([pos1[0], pos2[0]], [pos1[1], pos2[1]], color="blue", linewidth=line_width, alpha=0.6)

# Display the plot
plt.axis('off')
plt.title("Longevity-Associated Clusters Network (With Inter-Cluster Connections by Identity)")
plt.show()




In [32]:
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
from scipy.spatial import ConvexHull, QhullError

# Define longevity-associated colors
color_map = {
    "short": "green",
    "average": "yellow",
    "long": "red"
}

# Function to categorize longevity based on prefix
def get_longevity_category(prefix):
    if prefix in {"sb", "pn", "pk", "mo", "pd", "pv", "pa"}:
        return "short"
    elif prefix in {"en", "ef", "ph", "aj", "ra", "pg"}:
        return "average"
    elif prefix in {"mb", "md", "ml", "mm", "dr", "rf"}:
        return "long"
    return "unknown"

# Build a NetworkX graph for clusters
G = nx.Graph()

# Add nodes and intra-cluster edges
for cluster in filtered_clusters:
    members = cluster["members"]
    representative = members[0]
    
    # Get the longevity category, then map it to a color
    longevity_category = get_longevity_category(representative[:2])
    cluster_color = color_map.get(longevity_category, "gray")  # Default to gray if unknown
    
    # Add the representative node to the graph
    G.add_node(representative, color=cluster_color, size=300, label=representative)
    
    # Add each protein in the cluster to the graph and connect them with intra-cluster edges
    for i, protein1 in enumerate(members):
        # Add each protein node with the cluster color
        G.add_node(protein1, color=cluster_color, size=150, label=protein1)
        
        # Connect proteins within the cluster based on pairwise identities
        for j in range(i + 1, len(members)):
            protein2 = members[j]
            identity = pident_matrix[protein_index[protein1], protein_index[protein2]]
            if identity > 0:
                G.add_edge(protein1, protein2, weight=identity * 2)  # Scale for visualization

# Add inter-cluster edges based on silhouette scores or average identities between clusters
# `cluster_similarities` should be defined as {(cluster1, cluster2): avg_identity, ...}
for (cluster1, cluster2), avg_identity in cluster_similarities.items():
    if avg_identity > 0:  # Only draw edges with meaningful identity
        G.add_edge(cluster1, cluster2, weight=avg_identity, style="dotted")  # Dotted style for inter-cluster

# Draw the network with clusters arranged in a spring layout
plt.figure(figsize=(15, 15))
pos = nx.spring_layout(G, k=0.5)  # Adjust layout to avoid overlap

# Draw nodes with colors and labels
node_colors = [G.nodes[n]["color"] for n in G.nodes]
node_sizes = [G.nodes[n]["size"] for n in G.nodes]
nx.draw_networkx_nodes(G, pos, node_color=node_colors, node_size=node_sizes, alpha=0.8, edgecolors='black')
nx.draw_networkx_labels(G, pos, labels={n: G.nodes[n]["label"] for n in G.nodes}, font_size=8, font_color="white")

# Draw intra-cluster edges
intra_edges = [(u, v) for u, v, d in G.edges(data=True) if d.get("style") != "dotted"]
intra_weights = [G[u][v]["weight"] for u, v in intra_edges]
nx.draw_networkx_edges(G, pos, edgelist=intra_edges, width=intra_weights, alpha=0.6, edge_color="gray")

# Draw inter-cluster edges with dotted style
inter_edges = [(u, v) for u, v, d in G.edges(data=True) if d.get("style") == "dotted"]
inter_weights = [G[u][v]["weight"] for u, v in inter_edges]
nx.draw_networkx_edges(G, pos, edgelist=inter_edges, width=inter_weights, style="dotted", edge_color="blue")

# Optionally add convex hull around clusters, handling potential errors
for idx, cluster in enumerate(filtered_clusters):
    members = cluster["members"]
    cluster_positions = np.array([pos[protein] for protein in members])
    if len(cluster_positions) > 2:  # ConvexHull requires at least 3 points
        try:
            hull = ConvexHull(cluster_positions)
            hull_points = cluster_positions[hull.vertices]
            plt.plot(hull_points[:, 0], hull_points[:, 1], 'k--', lw=1)
        except QhullError:
            print(f"Skipping convex hull for cluster with members {members} due to Qhull precision error.")

plt.title("Longevity-Associated Clusters Network (With Inter-Cluster Connections by Identity)")
plt.axis("off")
plt.show()




Skipping convex hull for cluster with members ['efXP_008143519.2', 'efXP_008143519.2', 'efXP_008143519.2', 'efXP_008143519.2', 'efXP_008143519.2', 'pnCAK6436446.1', 'pnCAK6436446.1', 'pnCAK6436446.1', 'pnCAK6436446.1', 'pnCAK6436446.1'] due to Qhull precision error.
Skipping convex hull for cluster with members ['efXP_027987183.2', 'efXP_027987183.2', 'efXP_027987183.2', 'enKAK1329177.1', 'enKAK1329177.1', 'enKAK1329177.1'] due to Qhull precision error.
Skipping convex hull for cluster with members ['enKAK1332890.1', 'enKAK1332890.1', 'enKAK1332890.1', 'enKAK1332890.1', 'enKAK1332890.1', 'efXP_054584419.1', 'efXP_054584419.1', 'efXP_054584419.1', 'efXP_054584419.1', 'efXP_054584419.1'] due to Qhull precision error.
Skipping convex hull for cluster with members ['mdXP_059516705.1', 'mdXP_059516705.1', 'mdXP_059516705.1', 'mdXP_059516705.1', 'mmXP_036189764.1', 'mmXP_036189764.1', 'mmXP_036189764.1', 'mmXP_036189764.1', 'mmXP_036189764.1', 'mdXP_059516705.1'] due to Qhull precision error

In [34]:
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd

# Example setup assuming `filtered_clusters` is a list of dictionaries with cluster data
# Each dictionary in `filtered_clusters` contains "members" and "representative" keys

# Define longevity-associated colors
color_map = {
    "short": "green",
    "average": "yellow",
    "long": "red"
}

# Function to categorize longevity based on prefix
def get_longevity_category(prefix):
    if prefix in {"sb", "pn", "pk", "mo", "pd", "pv", "pa"}:
        return "short"
    elif prefix in {"en", "ef", "ph", "aj", "ra", "pg"}:
        return "average"
    elif prefix in {"mb", "md", "ml", "mm", "dr", "rf"}:
        return "long"
    return "unknown"

# Build a NetworkX graph for representative proteins only
G = nx.Graph()

# Add representative nodes based on filtered_clusters
for cluster in filtered_clusters:
    representative = cluster["members"][0]  # Representative protein of the cluster
    longevity_category = get_longevity_category(representative[:2])
    cluster_color = color_map.get(longevity_category, "gray")
    
    # Add representative protein node to the graph
    G.add_node(representative, color=cluster_color, size=500, label=representative)

# Add inter-cluster edges based on average identities between clusters
# Assuming `cluster_similarities` is a dictionary {(cluster1, cluster2): avg_identity, ...}
for (cluster1, cluster2), avg_identity in cluster_similarities.items():
    if avg_identity > 0:  # Only draw edges with meaningful identity
        G.add_edge(cluster1, cluster2, weight=avg_identity)

# Draw the network with only representative nodes and inter-cluster connections
plt.figure(figsize=(10, 10))
pos = nx.spring_layout(G, k=0.5)  # Adjust layout for spacing

# Draw representative nodes with colors and labels
node_colors = [G.nodes[n]["color"] for n in G.nodes]
node_sizes = [G.nodes[n]["size"] for n in G.nodes]
nx.draw_networkx_nodes(G, pos, node_color=node_colors, node_size=node_sizes, alpha=0.8, edgecolors='black')
nx.draw_networkx_labels(G, pos, labels={n: G.nodes[n]["label"] for n in G.nodes}, font_size=8, font_color="white")

# Draw inter-cluster edges with weights representing average identity
edges = G.edges(data=True)
weights = [d['weight'] for _, _, d in edges]
nx.draw_networkx_edges(G, pos, edgelist=edges, width=weights, edge_color="blue", alpha=0.5)

plt.title("Longevity-Associated Clusters Network (Only Representative Proteins)")
plt.axis("off")
plt.show()



In [50]:
import networkx as nx
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
# Define color mapping
color_mapping = {
    'green': {'sb', 'pn', 'pk', 'mo', 'pd', 'pv', 'pa'},
    'yellow': {'en', 'ef', 'ph', 'aj', 'ra', 'pg'},
    'red': {'mb', 'md', 'ml', 'mm', 'dr', 'rf'}
}

# Function to determine color based on prefix
def get_color(prefix):
    for color, prefixes in color_mapping.items():
        if prefix in prefixes:
            return color
    return 'gray'  # Default color if prefix is not found
# Load the silhouette scores
silhouette_df = silhouette_df  # remember to use the one that is for high id

# Load the alignment data
alignment_df = pd.read_csv("ecmaffil_cluster_aligns.tsv", sep="\t", 
                           names=["query", "target", "evalue", "pident", "alnlen", "qstart", "qend", "tstart", "tend"])

# Load clustering information
aggregated_df = pd.read_csv("aggregated_ECMAFFIL_clusters.tsv", sep="\t")

G = nx.Graph()

# Add nodes for each protein and assign them to clusters with color
for _, row in silhouette_df.iterrows():
    protein = row["protein"]
    prefix = protein[:2]
    cluster_id = row["cluster"]
    color = get_color(prefix)
    G.add_node(protein, cluster=cluster_id, color=color, pident_silhouette=row["pident_silhouette"], evalue_silhouette=row["evalue_silhouette"])

# Add edges within clusters based on pident
for _, row in alignment_df.iterrows():
    query, target = row["query"], row["target"]
    if query in G.nodes and target in G.nodes and G.nodes[query]["cluster"] == G.nodes[target]["cluster"]:
        # Add an edge within the same cluster weighted by pident
        G.add_edge(query, target, weight=row["pident"], style="solid")

# Add edges between clusters based on silhouette scores
for cluster_id in silhouette_df["cluster"].unique():
    cluster_members = silhouette_df[silhouette_df["cluster"] == cluster_id]["protein"].tolist()
    cluster_silhouette = silhouette_df[silhouette_df["cluster"] == cluster_id][["pident_silhouette", "evalue_silhouette"]].mean().mean()
    
    for other_cluster_id in silhouette_df["cluster"].unique():
        if cluster_id < other_cluster_id:  # Avoid duplicate edges and self-loops
            other_cluster_members = silhouette_df[silhouette_df["cluster"] == other_cluster_id]["protein"].tolist()
            other_cluster_silhouette = silhouette_df[silhouette_df["cluster"] == other_cluster_id][["pident_silhouette", "evalue_silhouette"]].mean().mean()
            avg_silhouette_score = (cluster_silhouette + other_cluster_silhouette) / 2

            # Connect all proteins between clusters with silhouette score as weight
            for protein1 in cluster_members:
                for protein2 in other_cluster_members:
                    G.add_edge(protein1, protein2, weight=avg_silhouette_score, style="bold")

# Draw the graph
plt.figure(figsize=(12, 12))
pos = nx.spring_layout(G, k=0.15, seed=42)

# Draw nodes with color based on prefix mapping
node_colors = [G.nodes[node]['color'] for node in G.nodes()]
nx.draw_networkx_nodes(G, pos, node_size=100, node_color=node_colors)

# Draw edges with thickness based on weights
for u, v, data in G.edges(data=True):
    if data["style"] == "solid":
        nx.draw_networkx_edges(G, pos, edgelist=[(u, v)], width=data["weight"] / 10, edge_color="black")  # Scale pident for intra-cluster
    else:
        nx.draw_networkx_edges(G, pos, edgelist=[(u, v)], width=data["weight"], edge_color="gray")  # Scale silhouette score for inter-cluster

# Show node labels
nx.draw_networkx_labels(G, pos, font_size=8)

# Add title
plt.title("Protein Clusters as Fully Connected Network (Within and Between Clusters)")
plt.show()


: 