In [None]:
import os
import pandas as pd

# Excel workbook listing the virus pairs to compare. The path is relative, so
# it is resolved against the current working directory at run time.
file_path = r'Virus_pair_combination.xlsx' # Keep this workbook in the directory the script is run from

# Load the pair table into a DataFrame. The processing loop further down reads
# the 'Virus_A' and 'Virus_B' columns of every row, so both must be present.
df = pd.read_excel(file_path)

# Function to calculate overlap coefficient
def overlap_coefficient(set_A, set_B):
    """Return the overlap coefficient of two sets.

    The coefficient is the number of shared elements divided by the size of
    the smaller set. When either set is empty the result is 0 rather than a
    division by zero.
    """
    shared = len(set_A.intersection(set_B))
    smaller = min(len(set_A), len(set_B))
    return shared / smaller if smaller else 0

# Function to calculate Jaccard index
def jaccard_index(set_A, set_B):
    """Return the Jaccard index of two sets.

    The index is the number of shared elements divided by the number of
    distinct elements across both sets. Two empty sets yield 0 rather than a
    division by zero.
    """
    shared = len(set_A.intersection(set_B))
    combined = len(set_A.union(set_B))
    if not combined:
        return 0
    return shared / combined

def _read_gene_set(path):
    """Return the set of gene identifiers listed one per line in *path*.

    Surrounding whitespace is stripped and blank lines are skipped, so a stray
    empty line or trailing space cannot register as a gene of its own and
    distort the similarity metrics.
    """
    # The context manager closes the handle as soon as the file is read,
    # instead of leaving one open handle per virus file to the garbage collector.
    with open(path) as handle:
        lines = handle.read().splitlines()
    stripped = (line.strip() for line in lines)
    return {gene for gene in stripped if gene}


# One dict per virus pair, in the order the pairs appear in the table
results = []

# Directory containing individual virus gene list text files
file_dir = '.'  # Current folder (place all virus txt files here)

# Iterate over each row in the dataframe
for _, row in df.iterrows():
    # Each virus's gene list lives in '<virus name>.txt' inside file_dir
    vir_a_file = os.path.join(file_dir, row['Virus_A'] + '.txt')
    vir_b_file = os.path.join(file_dir, row['Virus_B'] + '.txt')

    virus_A = _read_gene_set(vir_a_file)
    virus_B = _read_gene_set(vir_b_file)

    # Calculate overlap coefficient (C) and Jaccard index (J)
    C = overlap_coefficient(virus_A, virus_B)
    J = jaccard_index(virus_A, virus_B)

    # Store the results
    results.append({'Virus_A': row['Virus_A'], 'Virus_B': row['Virus_B'], 'Overlap Coefficient': C, 'Jaccard Index': J})

# Convert results to dataframe. The columns are named explicitly so the CSV
# still gets its header row when the pair table contains no rows.
results_df = pd.DataFrame(
    results,
    columns=['Virus_A', 'Virus_B', 'Overlap Coefficient', 'Jaccard Index'],
)

# Write results to a new CSV file
results_df.to_csv(
    os.path.join(file_dir, 'virus_pairs_results.csv'),
    index=False
)
