In [None]:
import pandas as pd

# Load the space-delimited protein-link table (presumably the STRING export
# for taxon 10090 — TODO confirm the file's provenance).
ppi = pd.read_csv("10090.protein.links.v12.0.txt", sep=" ")

# Keep only edges whose combined_score exceeds 700. The explicit .copy() makes
# the filtered frame independent of the original, so the column assignment
# below does not write into a slice (which triggers SettingWithCopyWarning).
ppi = ppi[ppi['combined_score'] > 700].copy()

# Each ID is "<prefix>.<id>"; keep the segment after the first dot.
ppi[['protein1', 'protein2']] = ppi[['protein1', 'protein2']].apply(
    lambda col: col.str.split('.').str[1]
)
ppi.to_csv("mouse_ppi_edges.csv", index=False)


In [None]:
! pip install openpyxl

In [None]:
import pandas as pd

# Load the bait/prey spreadsheet and print its first five rows as a sanity check.
df = pd.read_excel(io="2020-03-18_Krogan_SARSCoV2_27baits.xlsx")
print(df.head(n=5))


In [None]:
! pip install biopython


In [None]:
from time import sleep
import requests

def safe_get_gene_symbol(enspid, max_retries=3):
    """Look up a gene symbol on mygene.info for the given query ID.

    Args:
        enspid: Identifier sent as the ``q`` query parameter.
        max_retries: Maximum number of HTTP attempts.

    Returns:
        The ``symbol`` field of the first hit, or ``None`` when the service
        answered without a usable hit or when every attempt failed.
    """
    for attempt in range(max_retries):
        try:
            url = f"https://mygene.info/v3/query?q={enspid}&fields=symbol"
            response = requests.get(url, timeout=10)
            if response.ok:
                # "hits" may be missing, null, or an empty list; all of these
                # mean "no match" and must not raise on the [0] lookup.
                hits = response.json().get("hits") or [{}]
                return hits[0].get("symbol")
            print(f"Attempt {attempt + 1} failed for {enspid}: HTTP {response.status_code}")
        except (requests.exceptions.RequestException, ValueError) as e:
            # ValueError covers a body that is not valid JSON.
            print(f"Attempt {attempt + 1} failed for {enspid}: {e}")
        # Exponential backoff between attempts; pointless after the last one.
        if attempt < max_retries - 1:
            sleep(2 ** attempt)
    return None

In [None]:
import pandas as pd
import requests
import json
from time import sleep
from pathlib import Path

CACHE_FILE = "ensp_gene_cache.json"

def load_cache():
    """Return the mapping stored in CACHE_FILE, or an empty dict if the file is absent."""
    cache_path = Path(CACHE_FILE)
    if not cache_path.exists():
        return {}
    with cache_path.open('r') as fh:
        return json.load(fh)

def save_cache(cache):
    """Write ``cache`` to CACHE_FILE as JSON, replacing the file atomically.

    The data is first written to a sibling temporary file and then moved over
    CACHE_FILE, so an interruption or a serialization error never leaves a
    truncated cache behind.

    Args:
        cache: JSON-serializable mapping to persist.

    Raises:
        Whatever ``json.dump`` or the file operations raise; the previous
        contents of CACHE_FILE are left untouched in that case.
    """
    import os

    tmp_path = f"{CACHE_FILE}.tmp"
    try:
        with open(tmp_path, 'w') as f:
            json.dump(cache, f)
        os.replace(tmp_path, CACHE_FILE)
    except BaseException:
        # Includes KeyboardInterrupt: drop the partial temp file, keep the old cache.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
        raise

# Module-level ID -> symbol map, seeded from CACHE_FILE; safe_get_gene_symbol
# below reads and mutates this dict.
gene_cache = load_cache()

def safe_get_gene_symbol(enspid, max_retries=3):
    """Resolve an ID to a gene symbol, using the module-level ``gene_cache``.

    Lookup order per attempt: mygene.info (``symbol`` of the first hit), then
    the Ensembl REST ``lookup/id`` endpoint (``display_name``).

    Retries, with exponential backoff, happen only when an attempt hit a
    transient problem (an exception, HTTP 429, or HTTP 5xx). When both services
    answered and neither had a symbol, the miss is definitive: the loop stops
    and ``None`` is cached. Misses caused purely by transient failures are NOT
    cached, so a later run can try them again.

    Args:
        enspid: Identifier to resolve (used as the cache key).
        max_retries: Maximum number of lookup rounds.

    Returns:
        The resolved symbol, or ``None`` if it could not be resolved.
    """
    if enspid in gene_cache:
        return gene_cache[enspid]

    def _retryable(resp):
        # Rate limiting and server-side errors may succeed on a later attempt.
        return resp.status_code == 429 or resp.status_code >= 500

    attempts_made = 0
    definitive_miss = True  # stays True when max_retries <= 0, as before
    for attempt in range(max_retries):
        attempts_made = attempt + 1
        retry_worthwhile = False
        try:
            url = f"https://mygene.info/v3/query?q={enspid}&fields=symbol"
            response = requests.get(url, timeout=10)

            if response.ok:
                hits = response.json().get("hits", [])
                if hits:
                    symbol = hits[0].get("symbol")
                    if symbol:  # Only cache valid symbols
                        gene_cache[enspid] = symbol
                        return symbol
            elif _retryable(response):
                retry_worthwhile = True

            ensembl_url = f"https://rest.ensembl.org/lookup/id/{enspid}?content-type=application/json"
            ensembl_response = requests.get(ensembl_url, timeout=10)

            if ensembl_response.ok:
                symbol = ensembl_response.json().get("display_name")
                if symbol:  # Only cache valid symbols
                    gene_cache[enspid] = symbol
                    return symbol
            elif _retryable(ensembl_response):
                retry_worthwhile = True

        except Exception as e:
            # Deliberately broad: one bad ID must not abort a long batch run.
            print(f"Attempt {attempt + 1} failed for {enspid}: {e}")
            retry_worthwhile = True

        definitive_miss = not retry_worthwhile
        if definitive_miss:
            break
        # Exponential backoff between attempts; pointless after the last one.
        if attempt < max_retries - 1:
            sleep(2 ** attempt)

    print(f"Could not resolve {enspid} after {attempts_made} attempts")
    if definitive_miss:
        gene_cache[enspid] = None  # Cache definitive misses to avoid retrying
    return None

# Edge list whose protein1/protein2 columns hold the IDs to resolve.
# NOTE(review): no cell visible here writes human_ppi_edges.csv — confirm where it comes from.
human_df = pd.read_csv("human_ppi_edges.csv")

# Every distinct ID that appears at either end of an edge.
unique_ids = set(human_df['protein1'].unique()) | set(human_df['protein2'].unique())
print(f"Total unique ENSP IDs to resolve: {len(unique_ids)}")

# Resolve each ID (the results land in gene_cache), checkpointing to disk
# after every 100 IDs.
for i, enspid in enumerate(unique_ids, start=1):
    safe_get_gene_symbol(enspid)
    if i % 100:
        continue
    save_cache(gene_cache)
    print(f"Processed {i}/{len(unique_ids)} IDs")

save_cache(gene_cache)

# Attach the resolved symbol for both endpoints of every edge.
for endpoint in ("protein1", "protein2"):
    human_df[f"{endpoint}_gene"] = human_df[endpoint].map(gene_cache)

# Keep only the edges where both endpoints resolved.
human_filtered = human_df.dropna(subset=["protein1_gene", "protein2_gene"])
human_filtered.to_csv("human_ppi_edges_filtered.csv", index=False)

print("\nProcessing complete!")
print(f"Original edges: {len(human_df)}")
print(f"Valid edges after mapping: {len(human_filtered)}")
print(f"Cache size: {len(gene_cache)} entries")

In [None]:
import json

# Sanity check: count the entries in the persisted human cache.
# The path is relative to the working directory, matching the
# "ensp_gene_cache.json" the cache-writing cell uses, instead of a
# machine-specific absolute path.
with open('ensp_gene_cache.json') as f:
    data = json.load(f)

print(len(data))


In [None]:
! pip install networkx

In [None]:
import pandas as pd
import requests
import json
from time import sleep
from pathlib import Path

# Cache setup
CACHE_FILE = "ensp_gene_cache_mouse.json"

def load_cache():
    """Return the mapping stored in CACHE_FILE, or an empty dict if the file is absent."""
    cache_path = Path(CACHE_FILE)
    if not cache_path.exists():
        return {}
    with cache_path.open('r') as fh:
        return json.load(fh)

def save_cache(cache):
    """Write ``cache`` to CACHE_FILE as JSON, replacing the file atomically.

    The data is first written to a sibling temporary file and then moved over
    CACHE_FILE, so an interruption or a serialization error never leaves a
    truncated cache behind.

    Args:
        cache: JSON-serializable mapping to persist.

    Raises:
        Whatever ``json.dump`` or the file operations raise; the previous
        contents of CACHE_FILE are left untouched in that case.
    """
    import os

    tmp_path = f"{CACHE_FILE}.tmp"
    try:
        with open(tmp_path, 'w') as f:
            json.dump(cache, f)
        os.replace(tmp_path, CACHE_FILE)
    except BaseException:
        # Includes KeyboardInterrupt: drop the partial temp file, keep the old cache.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
        raise

# Rebinds the module-level ID -> symbol map to the contents of the mouse
# CACHE_FILE; safe_get_gene_symbol reads and mutates this dict.
gene_cache = load_cache()

In [None]:

def safe_get_gene_symbol(enspid, max_retries=3):
    """Resolve an ID to a gene symbol, using the module-level ``gene_cache``.

    Lookup order per attempt: mygene.info (``symbol`` of the first hit), then
    the Ensembl REST ``lookup/id`` endpoint (``display_name``).

    Retries, with exponential backoff, happen only when an attempt hit a
    transient problem (an exception, HTTP 429, or HTTP 5xx). When both services
    answered and neither had a symbol, the miss is definitive: the loop stops
    and ``None`` is cached. Misses caused purely by transient failures are NOT
    cached, so a later run can try them again.

    Args:
        enspid: Identifier to resolve (used as the cache key).
        max_retries: Maximum number of lookup rounds.

    Returns:
        The resolved symbol, or ``None`` if it could not be resolved.
    """
    if enspid in gene_cache:
        return gene_cache[enspid]

    def _retryable(resp):
        # Rate limiting and server-side errors may succeed on a later attempt.
        return resp.status_code == 429 or resp.status_code >= 500

    attempts_made = 0
    definitive_miss = True  # stays True when max_retries <= 0, as before
    for attempt in range(max_retries):
        attempts_made = attempt + 1
        retry_worthwhile = False
        try:
            url = f"https://mygene.info/v3/query?q={enspid}&fields=symbol"
            response = requests.get(url, timeout=10)

            if response.ok:
                hits = response.json().get("hits", [])
                if hits:
                    symbol = hits[0].get("symbol")
                    if symbol:  # Only cache valid symbols
                        gene_cache[enspid] = symbol
                        return symbol
            elif _retryable(response):
                retry_worthwhile = True

            ensembl_url = f"https://rest.ensembl.org/lookup/id/{enspid}?content-type=application/json"
            ensembl_response = requests.get(ensembl_url, timeout=10)

            if ensembl_response.ok:
                symbol = ensembl_response.json().get("display_name")
                if symbol:  # Only cache valid symbols
                    gene_cache[enspid] = symbol
                    return symbol
            elif _retryable(ensembl_response):
                retry_worthwhile = True

        except Exception as e:
            # Deliberately broad: one bad ID must not abort a long batch run.
            print(f"Attempt {attempt + 1} failed for {enspid}: {e}")
            retry_worthwhile = True

        definitive_miss = not retry_worthwhile
        if definitive_miss:
            break
        # Exponential backoff between attempts; pointless after the last one.
        if attempt < max_retries - 1:
            sleep(2 ** attempt)

    print(f"Could not resolve {enspid} after {attempts_made} attempts")
    if definitive_miss:
        gene_cache[enspid] = None  # Cache definitive misses to avoid retrying
    return None

# Edge list written by the earlier filtering cell (mouse_ppi_edges.csv).
mouse_df = pd.read_csv("mouse_ppi_edges.csv")

# Every distinct ID that appears at either end of an edge.
unique_ids = set(mouse_df['protein1'].unique()) | set(mouse_df['protein2'].unique())
print(f"Total unique ENSP IDs to resolve: {len(unique_ids)}")

# Resolve each ID (the results land in gene_cache), checkpointing to disk
# after every 100 IDs.
for i, enspid in enumerate(unique_ids, start=1):
    safe_get_gene_symbol(enspid)
    if i % 100:
        continue
    save_cache(gene_cache)
    print(f"Processed {i}/{len(unique_ids)} IDs")

save_cache(gene_cache)

# Attach the resolved symbol for both endpoints of every edge.
for endpoint in ("protein1", "protein2"):
    mouse_df[f"{endpoint}_gene"] = mouse_df[endpoint].map(gene_cache)

# Keep only the edges where both endpoints resolved.
mouse_filtered = mouse_df.dropna(subset=["protein1_gene", "protein2_gene"])
mouse_filtered.to_csv("mouse_ppi_edges_filtered.csv", index=False)

print("\nProcessing complete!")
print(f"Original edges: {len(mouse_df)}")
print(f"Valid edges after mapping: {len(mouse_filtered)}")
print(f"Cache size: {len(gene_cache)} entries")

In [None]:
import pandas as pd
import networkx as nx
from collections import defaultdict
import matplotlib.pyplot as plt


krogan_df = pd.read_excel("2020-03-18_Krogan_SARSCoV2_27baits.xlsx")
human_df = pd.read_csv("human_ppi_edges_filtered.csv")

G = nx.Graph()

# Bait -> prey edges. add_edges_from over zipped columns inserts the same
# edges in the same order as a row-by-row iterrows() loop, without building a
# Series object per row.
G.add_edges_from(
    zip(krogan_df['Bait'], krogan_df['PreyGene']),
    interaction='viral-human',
)

# Human-human edges are added second, so a pair present in both tables ends up
# labelled 'human-human', exactly as with the original insertion order.
G.add_edges_from(
    zip(human_df['protein1_gene'], human_df['protein2_gene']),
    interaction='human-human',
)



In [None]:
viral_nodes = krogan_df['Bait'].unique()

# For each bait: the subgraph of G induced by the bait and its direct neighbours.
viral_subnetworks = {
    bait: G.subgraph([bait, *G.neighbors(bait)])
    for bait in viral_nodes
}


In [None]:
from operator import itemgetter

results = []


def is_conserved(h1, h2):
    """Report whether the pair (h1, h2) maps onto an entry of ``mouse_edges``.

    Both names are translated through ``ortholog_map``; if either translation
    is missing or falsy the result is False. Otherwise the translated pair is
    sorted and looked up, as a tuple, in ``mouse_edges``.

    NOTE(review): ``ortholog_map`` and ``mouse_edges`` are not defined in any
    cell visible here — they must exist in the kernel before this is called.
    """
    mouse_pair = [ortholog_map.get(h1), ortholog_map.get(h2)]
    if not all(mouse_pair):
        return False
    mouse_pair.sort()
    return tuple(mouse_pair) in mouse_edges

# Distinct baits per prey gene, computed once for the whole table instead of
# re-filtering krogan_df inside the loop for every gene.
bait_counts = krogan_df.groupby('PreyGene')['Bait'].nunique()

for human in set(krogan_df['PreyGene']):
    # Conserved if at least one human-human edge of this gene passes
    # is_conserved(); any() stops at the first match, like the original break.
    conserved = any(
        G.edges[human, neighbor]['interaction'] == 'human-human'
        and is_conserved(human, neighbor)
        for neighbor in G.neighbors(human)
    )

    results.append({
        'gene': human,
        # .get(..., 0) covers keys groupby drops (e.g. a missing PreyGene),
        # for which the per-gene filter also counted zero baits.
        'viral_partners': bait_counts.get(human, 0),
        'degree': G.degree(human),
        'conserved': conserved
    })

# Rank: most viral partners first, ties broken by higher degree.
df_results = pd.DataFrame(results)
df_results = df_results.sort_values(by=['viral_partners', 'degree'], ascending=False)


In [None]:
# Persist the ranked table.
df_results.to_csv(path_or_buf="ranked_targets.csv", index=False)

# Draw the neighbourhood of the 'SARS-CoV2 E' bait.
bait_net = viral_subnetworks['SARS-CoV2 E']
nx.draw(bait_net, with_labels=True, node_size=300)
plt.show()


In [None]:
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import mannwhitneyu

human_df = pd.read_csv("human_ppi_edges_filtered.csv")  # must have protein1_gene, protein2_gene

# Rebinds G to a graph built only from this edge list. Zipping the two columns
# inserts the same edges in the same order as a row-by-row iterrows() loop,
# without building a Series object per row.
G = nx.Graph()
G.add_edges_from(zip(human_df['protein1_gene'], human_df['protein2_gene']))

# Degree centrality per node, reshaped to a two-column (gene, centrality) frame.
centrality = nx.degree_centrality(G)
centrality_df = (
    pd.DataFrame.from_dict(centrality, orient='index', columns=['centrality'])
    .reset_index()
    .rename(columns={'index': 'gene'})
)


In [None]:
import pandas as pd
import networkx as nx

human_df = pd.read_csv("human_ppi_edges_filtered.csv")

# Rebinds G to a graph built only from this edge list. Zipping the two columns
# inserts the same edges in the same order as a row-by-row iterrows() loop,
# without building a Series object per row.
G = nx.Graph()
G.add_edges_from(zip(human_df['protein1_gene'], human_df['protein2_gene']))

# Degree centrality per node, reshaped to a two-column (gene, centrality) frame.
centrality = nx.degree_centrality(G)
centrality_df = (
    pd.DataFrame.from_dict(centrality, orient='index', columns=['centrality'])
    .reset_index()
    .rename(columns={'index': 'gene'})
)

centrality_df.to_csv("hub_genes.csv", index=False)


In [None]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# Per-gene degree centrality written by the hub_genes.csv export cell.
centrality_df = pd.read_csv("hub_genes.csv").loc[:, ['gene', 'centrality']]

# Per-gene conservation ratio.
# NOTE(review): no cell visible here writes viral_target_conservation_ratios.csv — confirm its source.
conservation_df = pd.read_csv("viral_target_conservation_ratios.csv").loc[
    :, ['gene', 'conservation_ratio']
]


In [None]:
# Inner join: keep only the genes present in both tables.
merged_df = centrality_df.merge(conservation_df, how='inner', on='gene')
print(f"✅ Merged {len(merged_df)} genes with both centrality and conservation scores")
print(merged_df.head())



In [None]:
# Rescale both raw scores with MinMaxScaler and store them as new *_norm columns.
scaler = MinMaxScaler()

raw_scores = merged_df[['centrality', 'conservation_ratio']]
norm_columns = ['centrality_norm', 'conservation_norm']
merged_df[norm_columns] = scaler.fit_transform(raw_scores)


In [None]:
# Equal-weight combination of the two normalized scores.
centrality_part = merged_df['centrality_norm'] * 0.5
conservation_part = merged_df['conservation_norm'] * 0.5
merged_df['final_score'] = centrality_part + conservation_part

# Highest combined score first.
merged_df = merged_df.sort_values('final_score', ascending=False)


In [None]:
# Persist the full ranked table, then show the ten best-scoring genes.
merged_df.to_csv(path_or_buf="final_prioritized_viral_targets.csv", index=False)

summary_columns = ['gene', 'centrality', 'conservation_ratio', 'final_score']
print("🎯 Top candidates:")
print(merged_df.loc[:, summary_columns].head(10))


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

merged_df = pd.read_csv("final_prioritized_viral_targets.csv")

# Centrality vs. conservation, with colour and marker size both encoding final_score.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(
    data=merged_df,
    x='centrality',
    y='conservation_ratio',
    hue='final_score',
    size='final_score',
    palette='viridis',
    sizes=(20, 200),
    edgecolor='k',
    alpha=0.7,
    ax=ax,
)

ax.set_title("Conservation Ratio vs Centrality for SARS-CoV-2 Targets", fontsize=14)
ax.set_xlabel("Degree Centrality (Human PPI)", fontsize=12)
ax.set_ylabel("Conservation Ratio (Human-Mouse)", fontsize=12)
# Legend is anchored outside the axes, to the right of the plot area.
ax.legend(title="Final Score", bbox_to_anchor=(1.05, 1), loc='upper left')
ax.grid(True)
fig.tight_layout()
fig.savefig("scatter_conservation_centrality.png", dpi=300)
plt.show()
