In [24]:
import gzip

alias_file = "/Volumes/my_expansion/9606.protein.aliases.v12.0.txt.gz"

# Values of the third (source) column accepted for each mapping.
# The empty-string entry that used to sit in the symbol set was removed: every
# line is strip()ped before splitting, so the last of three tab-separated
# fields can never be empty and that entry could not match anything.
SYMBOL_SOURCES = {"Ensembl_HGNC_symbol", "BioMart_HUGO"}
GENE_ID_SOURCES = {"Ensembl_HGNC_ensembl_gene_id", "Ensembl_gene"}

# ENSP → HUGO Gene Symbol (keyed by the first column of the alias file)
protein_to_symbol = {}
# ENSP → ENSG (same key; only aliases that start with "ENSG" are kept)
ENSP_to_ENSG = {}

# Build both lookups in a single pass so the gzip file is decompressed and
# parsed once instead of twice. Rows are still visited in file order, so when
# a protein id has several matching rows the last one wins, as before.
# The encoding is pinned so decoding does not depend on the machine's locale.
with gzip.open(alias_file, 'rt', encoding='utf-8') as f:
    for line in f:
        parts = line.strip().split('\t')
        if len(parts) != 3:
            # Skip anything that is not exactly <id>\t<alias>\t<source>.
            continue
        protein_id, alias, source = parts
        if source in SYMBOL_SOURCES:
            protein_to_symbol[protein_id] = alias
        elif source in GENE_ID_SOURCES and alias.startswith("ENSG"):
            # The two source sets are disjoint, so `elif` matches the same
            # rows the separate second loop used to match.
            ENSP_to_ENSG[protein_id] = alias

print(f"✅ Got {len(protein_to_symbol)} ENSP → HUGO gene symbol mappings.")
print(f"✅ Got {len(ENSP_to_ENSG)} ENSP → ENSG mappings.")

✅ Got 19409 ENSP → HUGO gene symbol mappings.
✅ Got 19699 ENSP → ENSG mappings.


In [26]:
import pandas as pd

ppi_file = "/Volumes/my_expansion/9606.protein.physical.links.full.v12.0.txt.gz"
ppi_outfile = "/Volumes/my_expansion/ppi_hugo.tsv"

# Read only the three needed columns, keep the rows whose two endpoints both
# have an entry in protein_to_symbol, then attach the mapped gene names.
ppi_df = (
    pd.read_csv(ppi_file, sep=' ', usecols=['protein1', 'protein2', 'combined_score'])
    .loc[
        lambda links: links['protein1'].isin(protein_to_symbol)
        & links['protein2'].isin(protein_to_symbol)
    ]
    .assign(
        GeneA=lambda links: links['protein1'].map(protein_to_symbol),
        GeneB=lambda links: links['protein2'].map(protein_to_symbol),
    )
)

# Keep only the three output columns: GeneA, GeneB and the score.
ppi_final = ppi_df.loc[:, ['GeneA', 'GeneB', 'combined_score']]

# Tab-separated output without the pandas row index.
ppi_final.to_csv(ppi_outfile, sep='\t', index=False)

print(f"✅ Done! Output saved to: {ppi_outfile} with {len(ppi_final)} edges.")


✅ Done! Output saved to: /Volumes/my_expansion/ppi_hugo.tsv with 1466856 edges.
