In [None]:
import pandas as pd
import networkx as nx
from collections import defaultdict
import matplotlib.pyplot as plt


# Load the virus-host interaction table and the human protein-protein
# interaction (PPI) edge list.
krogan_df = pd.read_excel("2020-03-18_Krogan_SARSCoV2_27baits.xlsx")
human_df = pd.read_csv("human_ppi_edges.csv")

# One undirected graph holds both layers; the 'interaction' edge attribute
# records which table an edge came from.
G = nx.Graph()

# Bulk insertion replaces the row-by-row iterrows() loop: same edges and
# attributes, without building a Series object for every row.
G.add_edges_from(
    zip(krogan_df['Bait'], krogan_df['PreyGene']),
    interaction='viral-human',
)

# Added second, so if a pair occurs in both tables its 'interaction' label
# ends up as 'human-human' (later insertion overwrites the attribute).
G.add_edges_from(
    zip(human_df['protein1_gene'], human_df['protein2_gene']),
    interaction='human-human',
)



In [None]:
# Read the file without a header: header=None keeps the header line as row 0.
# NOTE(review): assumes every line is a single quoted field that itself holds
# the comma-separated record, so the frame has exactly one column - confirm
# against the actual file.
ortholog_raw = pd.read_csv("human_mouse_orthologs.csv", header=None)

# Strip the stray quotes and split the only column into actual columns.
# The result is labelled 0, 1, 2, ... - it does NOT carry the real names yet.
ortholog_split = (
    ortholog_raw[0]
    .str.replace('"', '', regex=False)
    .str.split(",", expand=True)
)

# Row 0 holds the real column names. Promote it to the header and keep only
# the data rows; without this step, later lookups by column name
# (e.g. "Mouse homology type") raise KeyError.
ortholog_df = ortholog_split.iloc[1:].reset_index(drop=True)
ortholog_df.columns = ortholog_split.iloc[0].str.strip().tolist()


In [None]:
# Optional: restrict the table to one-to-one orthologs.
is_one_to_one = ortholog_df["Mouse homology type"] == "ortholog_one2one"
ortholog_df = ortholog_df.loc[is_one_to_one]

# Lookup from the "Gene name" column to the "Mouse gene name" column.
# If a gene name occurs on several rows, the last row wins.
ortholog_map = {
    human_gene: mouse_gene
    for human_gene, mouse_gene in zip(
        ortholog_df["Gene name"], ortholog_df["Mouse gene name"]
    )
}


In [None]:
# Every distinct bait in the interaction table is treated as a viral node.
viral_nodes = krogan_df['Bait'].unique()

# For each bait, keep the subgraph induced by the bait and its direct
# neighbours in G (this also retains edges between those neighbours).
viral_subnetworks = {
    bait: G.subgraph([bait, *G.neighbors(bait)])
    for bait in viral_nodes
}


In [None]:
from operator import itemgetter

results = []


def is_conserved(h1, h2):
    """Report whether the pair (h1, h2) maps onto a known mouse edge.

    Both genes are translated through the module-level ``ortholog_map``.
    If either has no (or an empty) mapping the pair is not conserved.
    Otherwise the two mapped names are sorted, turned into a tuple and
    looked up in the module-level ``mouse_edges``.

    NOTE(review): ``mouse_edges`` is not defined in any visible cell; it
    must exist before this is called, otherwise a NameError is raised.
    Presumably it is a collection of sorted (gene_a, gene_b) tuples -
    confirm.

    Returns:
        bool: True only when both mappings exist and the sorted pair is
        found in ``mouse_edges``.
    """
    mouse_pair = [ortholog_map.get(h1), ortholog_map.get(h2)]
    if not all(mouse_pair):
        return False
    mouse_pair.sort()
    return tuple(mouse_pair) in mouse_edges

# Rank every prey gene by (1) how many distinct baits pull it down and
# (2) its degree in the combined graph G, and flag whether at least one of
# its human-human edges passes is_conserved().

# Count distinct baits per prey gene in a single pass instead of re-filtering
# krogan_df once per gene. sort=False keeps first-appearance order, so the
# iteration order (and therefore the order of tied rows after sorting) is the
# same on every run, unlike iterating over a set.
viral_partner_counts = krogan_df.groupby('PreyGene', sort=False)['Bait'].nunique()

# Start from a fresh list so re-running this cell never duplicates rows.
results = []
for human, viral_partners in viral_partner_counts.items():
    # G[human] maps each neighbour to the attribute dict of the shared edge.
    # any() stops at the first conserved human-human edge.
    conserved = any(
        attrs['interaction'] == 'human-human' and is_conserved(human, neighbor)
        for neighbor, attrs in G[human].items()
    )

    results.append({
        'gene': human,
        'viral_partners': int(viral_partners),
        'degree': G.degree(human),
        'conserved': conserved,
    })

# Explicit columns keep the sort below valid even when there are no rows.
df_results = pd.DataFrame(
    results, columns=['gene', 'viral_partners', 'degree', 'conserved']
)
df_results = df_results.sort_values(by=['viral_partners', 'degree'], ascending=False)


In [None]:
# Write the ranked table to disk (row index omitted).
df_results.to_csv("ranked_targets.csv", index=False)

# Draw the neighbourhood of one bait.
# NOTE(review): the key must match a value of the 'Bait' column exactly,
# otherwise this lookup raises KeyError - confirm the spelling.
bait_subnetwork = viral_subnetworks['SARS-CoV2 E']
nx.draw(bait_subnetwork, node_size=300, with_labels=True)
plt.show()


In [None]:
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import mannwhitneyu

# Degree centrality of every gene in the human-only PPI network.
# NOTE(review): the following cell repeats this computation and also writes
# it to disk; this copy looks redundant.
human_df = pd.read_csv("human_ppi_edges.csv")  # must have protein1_gene, protein2_gene

# WARNING: this rebinds G. The earlier combined graph (viral nodes and the
# 'interaction' edge attribute) is gone afterwards, so cells that rely on
# either will fail if re-run after this one.
G = nx.Graph()
# Bulk insertion instead of a per-row iterrows() loop; same edges.
G.add_edges_from(zip(human_df['protein1_gene'], human_df['protein2_gene']))

centrality = nx.degree_centrality(G)

# Two-column frame: 'gene' (the node) and 'centrality' (its score).
centrality_df = (
    pd.DataFrame.from_dict(centrality, orient='index', columns=['centrality'])
    .rename_axis('gene')
    .reset_index()
)


In [None]:
import pandas as pd
import networkx as nx

# Compute degree centrality on the human-only PPI network and save it.
human_df = pd.read_csv("human_ppi_edges.csv")

# WARNING: this rebinds G to a graph with no viral nodes and no
# 'interaction' edge attribute.
G = nx.Graph()
# Bulk insertion instead of a per-row iterrows() loop; same edges.
G.add_edges_from(zip(human_df['protein1_gene'], human_df['protein2_gene']))

centrality = nx.degree_centrality(G)

# Two-column frame: 'gene' (the node) and 'centrality' (its score).
centrality_df = (
    pd.DataFrame.from_dict(centrality, orient='index', columns=['centrality'])
    .rename_axis('gene')
    .reset_index()
)

# Saved so the scoring cells can reload it from disk.
centrality_df.to_csv("hub_genes.csv", index=False)


In [None]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# Centrality scores: keep only the gene and centrality columns.
centrality_df = pd.read_csv("hub_genes.csv")[['gene', 'centrality']]

# Conservation scores: keep only the gene and conservation_ratio columns.
# NOTE(review): no visible cell writes this file; it must already exist.
conservation_df = pd.read_csv("viral_target_conservation_ratios.csv")[
    ['gene', 'conservation_ratio']
]


In [None]:
# Inner join: only genes present in both tables survive.
merged_df = centrality_df.merge(conservation_df, how='inner', on='gene')

n_merged = len(merged_df)
print(f"✅ Merged {n_merged} genes with both centrality and conservation scores")
print(merged_df.head())



In [None]:
# Rescale both scores with MinMaxScaler so they can be combined.
scaler = MinMaxScaler()

# fit_transform returns an array whose columns follow the input column order.
scaled_scores = scaler.fit_transform(merged_df[['centrality', 'conservation_ratio']])
merged_df['centrality_norm'] = scaled_scores[:, 0]
merged_df['conservation_norm'] = scaled_scores[:, 1]


In [None]:
# Final score is an equal-weight blend of the two normalised scores;
# rows are ordered with the highest score first.
merged_df = merged_df.assign(
    final_score=0.5 * merged_df['centrality_norm'] + 0.5 * merged_df['conservation_norm']
).sort_values(by='final_score', ascending=False)


In [None]:
# Save the full prioritised table (row index omitted).
merged_df.to_csv("final_prioritized_viral_targets.csv", index=False)

# Show the first ten rows of the already-sorted table.
top_candidates = merged_df[['gene', 'centrality', 'conservation_ratio', 'final_score']].head(10)
print("🎯 Top candidates:")
print(top_candidates)


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Reload the saved table so this cell can run on its own.
merged_df = pd.read_csv("final_prioritized_viral_targets.csv")

# Scatter plot: one point per gene, coloured and sized by final score.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(
    data=merged_df,
    x='centrality',
    y='conservation_ratio',
    hue='final_score',
    size='final_score',
    sizes=(20, 200),
    palette='viridis',
    edgecolor='k',
    alpha=0.7,
    ax=ax,
)

ax.set_title("Conservation Ratio vs Centrality for SARS-CoV-2 Targets", fontsize=14)
ax.set_xlabel("Degree Centrality (Human PPI)", fontsize=12)
ax.set_ylabel("Conservation Ratio (Human-Mouse)", fontsize=12)
# Anchor the legend just outside the right edge of the axes.
ax.legend(title="Final Score", bbox_to_anchor=(1.05, 1), loc='upper left')
ax.grid(True)
fig.tight_layout()
fig.savefig("scatter_conservation_centrality.png", dpi=300)
plt.show()
