In [None]:
# Path of the uploaded alignment file; we inspect it to learn its format and to
# spot where the similarity score might be.
file_path = '/mnt/data/aln'

# Load the complete text of the file into memory.
with open(file_path, 'r') as aln_handle:
    contents = aln_handle.read()

# Show only the leading 1000 characters: enough for an overview of the layout.
head_of_file = contents[:1000]
print(head_of_file)


In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.cluster.hierarchy import dendrogram, linkage

# NOTE(review): `df` is not defined in the cells visible here; this cell assumes
# an earlier cell built it with 'Query ID', 'Target ID' and 'Similarity Score'
# columns -- confirm before running on a fresh kernel.

# Assuming similarity scores range from 0 (no similarity) to 1 (identical),
# use (1 - similarity score) as the pairwise distance.
df['Distance'] = 1 - df['Similarity Score']

# Every ID that occurs as a query or as a target becomes one leaf.
unique_ids = pd.unique(df[['Query ID', 'Target ID']].values.ravel('K'))
n_taxa = len(unique_ids)

# Square distance matrix. Pairs with no score default to 1 (maximum distance)
# and the diagonal to 0. It is created as float so that fractional distances
# can be stored without changing the column dtype.
distance_matrix = pd.DataFrame(1.0 - np.eye(n_taxa), index=unique_ids, columns=unique_ids)

# Fill both triangles so the matrix stays symmetric; when a pair occurs more
# than once, the last row wins.
for query_id, target_id, distance in zip(df['Query ID'], df['Target ID'], df['Distance']):
    distance_matrix.at[query_id, target_id] = distance
    distance_matrix.at[target_id, query_id] = distance

# scipy's linkage takes the condensed form: the strict upper triangle read row
# by row. Indexing the array directly keeps the length at n*(n-1)/2 regardless
# of how pandas treats NaN when stacking.
upper_rows, upper_cols = np.triu_indices(n_taxa, k=1)
condensed_matrix = distance_matrix.to_numpy(dtype=float)[upper_rows, upper_cols]

# Hierarchical clustering with average linkage. This is UPGMA, not
# neighbor-joining (scipy's linkage does not provide neighbor-joining).
Z = linkage(condensed_matrix, 'average')

# Plotting the dendrogram
plt.figure(figsize=(10, 8))
dendrogram(Z, labels=unique_ids, leaf_rotation=90, leaf_font_size=10)
plt.title('Dendrogram using average linkage (UPGMA)')
plt.xlabel('Species')
plt.ylabel('Distance')
plt.show()


In [None]:
import numpy as np

# numpy is required for the upper-triangle extraction below.
# `distance_matrix`, `unique_ids`, `linkage`, `dendrogram` and `plt` come from
# the preceding cell.

# scipy's linkage takes the condensed form: the strict upper triangle read row
# by row. Indexing the array directly keeps the length at n*(n-1)/2 regardless
# of how pandas treats NaN when stacking.
upper_rows, upper_cols = np.triu_indices(distance_matrix.shape[0], k=1)
condensed_matrix = distance_matrix.to_numpy(dtype=float)[upper_rows, upper_cols]

# Hierarchical clustering with average linkage. This is UPGMA, not
# neighbor-joining (scipy's linkage does not provide neighbor-joining).
Z = linkage(condensed_matrix, 'average')

# Plotting the dendrogram
plt.figure(figsize=(10, 8))
dendrogram(Z, labels=unique_ids, leaf_rotation=90, leaf_font_size=10)
plt.title('Dendrogram using average linkage (UPGMA)')
plt.xlabel('Species')
plt.ylabel('Distance')
plt.show()


In [None]:
from scipy.cluster.hierarchy import to_tree

# Convert linkage matrix to a Newick string
def linkage_to_newick(Z, labels):
    """Convert a SciPy linkage matrix into a topology-only Newick string.

    Parameters
    ----------
    Z : array-like
        Linkage matrix accepted by ``scipy.cluster.hierarchy.to_tree``.
    labels : sequence
        Indexable by leaf id; ``labels[i]`` is the name written for leaf ``i``.

    Returns
    -------
    str
        Nested ``(left,right)`` groups terminated by ``;``. Branch lengths are
        not written.

    Notes
    -----
    A label containing whitespace or any of ``( ) [ ] ' : ; ,`` is wrapped in
    single quotes (embedded single quotes are doubled) so the result stays
    parseable Newick; other labels are emitted unchanged. The tree is walked
    with an explicit stack, so deep, chain-like trees do not hit Python's
    recursion limit.
    """
    reserved = set("()[]':;,")

    def quote(label):
        # Quote only when needed so plain labels are emitted unchanged.
        text = str(label)
        if any(ch in reserved or ch.isspace() for ch in text):
            return "'" + text.replace("'", "''") + "'"
        return text

    root = to_tree(Z, rd=False)

    # Post-order traversal: a node is rendered only after both children are.
    rendered = {}  # node id -> Newick text of that subtree
    pending = [(root, False)]
    while pending:
        node, children_ready = pending.pop()
        if node.is_leaf():
            rendered[node.id] = quote(labels[node.id])
        elif children_ready:
            left_text = rendered.pop(node.left.id)
            right_text = rendered.pop(node.right.id)
            rendered[node.id] = f"({left_text},{right_text})"
        else:
            # Revisit this node once its two subtrees have been rendered.
            pending.append((node, True))
            pending.append((node.right, False))
            pending.append((node.left, False))
    return rendered[root.id] + ";"

# Destination of the exported tree.
newick_file_path = '/mnt/data/phylogenetic_tree.newick'

# Render the clustering result as Newick text, labelled with the original IDs.
newick_str = linkage_to_newick(Z, unique_ids)

# Write the Newick text to disk.
with open(newick_file_path, 'w') as newick_out:
    newick_out.write(newick_str)

# Echo the path as the cell result.
newick_file_path


In [None]:
# Function to modify labels: remove everything after the second underscore and replace underscores with spaces
def modify_labels(label):
    """Return the first two underscore-separated fields of ``label``, space-joined.

    Anything after the second field is dropped. A label with two or fewer
    fields simply has its underscores replaced by spaces.
    """
    # Slicing to two fields covers both the "truncate" and the "keep all" case.
    leading_fields = label.split('_')[:2]
    return ' '.join(leading_fields)

# Destination of the tree exported with shortened labels.
modified_newick_file_path = '/mnt/data/modified_phylogenetic_tree.newick'

# Shorten every leaf label, keeping the order of unique_ids.
modified_labels = list(map(modify_labels, unique_ids))

# Render the same clustering result, this time with the shortened labels.
modified_newick_str = linkage_to_newick(Z, modified_labels)

# Write the Newick text to disk.
with open(modified_newick_file_path, 'w') as modified_out:
    modified_out.write(modified_newick_str)

# Echo the path as the cell result.
modified_newick_file_path
