# In [None]:
import pandas as pd
import numpy as np
import os
from sklearn.metrics.pairwise import cosine_similarity


# Step 1: Load data

file_path = 'Cosine_similarity_calculation_data.csv'
df = pd.read_csv(file_path)

# Echo the loaded table so the column layout can be checked by eye.
print("Columns in the dataframe:", df.columns.tolist())
print("\nFirst 5 rows:\n", df.head())

# Select domain columns
# The slice is label-based, so it includes both boundary columns and every
# column positioned between them in the CSV.
FIRST_DOMAIN_COL = '2-Hacid_dh'
LAST_DOMAIN_COL = 'zf-dskA_traR'
selected_cols = df.loc[:, FIRST_DOMAIN_COL:LAST_DOMAIN_COL].columns
print(f"\nNumber of selected columns: {selected_cols.size}")
print("Selected column names:", selected_cols.tolist())

# One row of domain values per row of the table, plus the matching SeqID
# (used as the plasmid ID) in the same row order.
domain_data = df[selected_cols].values
plasmid_ids = df['SeqID'].values


# Step 2: Compute cosine similarity

# Called with a single matrix, so every row of domain_data is compared with
# every row of the same matrix; the square result is labelled with the
# plasmid IDs on both axes.
cosine_sim = cosine_similarity(X=domain_data)
cosine_sim_df = pd.DataFrame(
    data=cosine_sim,
    index=plasmid_ids,
    columns=plasmid_ids,
)


# Step 3: Save cosine similarity matrix

output_dir = "Cosine_similarity"
output_path = os.path.join(output_dir, "cosine_similarity.csv")

# exist_ok=True makes re-running the script safe when the folder is already there.
os.makedirs(output_dir, exist_ok=True)

# Row labels (plasmid IDs) are written as the first CSV column.
cosine_sim_df.to_csv(path_or_buf=output_path)
print(f"Cosine similarity matrix saved at: {output_path}")


# Step 4: Convert cosine similarity to edge list

output_edge_list_dir = "Edge_lists"
output_file_full = os.path.join(output_edge_list_dir, "edge_list_full.csv")
os.makedirs(output_edge_list_dir, exist_ok=True)

# Exclude self-edges and duplicates
# k=1 keeps only the cells strictly above the main diagonal, so the diagonal
# (self-pairs) and the mirrored lower triangle are both left out.
mask = np.triu(np.ones(cosine_sim_df.shape, dtype=bool), k=1)
i, j = np.nonzero(mask)  # row / column positions of the kept cells, row by row

edge_list_df = pd.DataFrame({
    'Source': cosine_sim_df.index[i],
    'Target': cosine_sim_df.columns[j],
    'Weight': cosine_sim_df.values[i, j],
})

# Save full edge list
edge_list_df.to_csv(output_file_full, index=False)
print(f"Full edge list saved at: {output_file_full}")
print(f"Number of edges included: {edge_list_df.shape[0]}")


# Step 5: Filter edges by threshold (>=0.5)

threshold = 0.5

# The weights are floating-point results, so a pair whose exact cosine
# similarity equals the threshold can be stored a hair below it (e.g.
# 0.4999999999999999 instead of 0.5). A strict `>= threshold` would silently
# drop such edges, so compare against the threshold minus a tiny tolerance to
# keep the cut-off genuinely inclusive.
threshold_tolerance = 1e-9
keep_edge = edge_list_df['Weight'] >= (threshold - threshold_tolerance)
filtered_edges = edge_list_df[keep_edge]

# Save filtered edge list
output_file_filtered = os.path.join(output_edge_list_dir, f"edge_list_threshold_{threshold}.csv")
filtered_edges.to_csv(output_file_filtered, index=False)
print(f"Filtered edge list with threshold >= {threshold} saved at: {output_file_filtered}")
print(f"Number of edges after filtering: {len(filtered_edges)}")
