# STRING-DB Data Collection

Main link: https://string-db.org/

Schema: https://version-10-5.string-db.org/download/database.schema.v10.5.pdf

FAQ: https://version11.string-db.org/help/database/

Prerequisites:
- (TXT) file with all pairwise interactions: https://stringdb-downloads.org/download/protein.links.full.vXX.X.txt.gz
    - To speed up processing, we keep only biocurated connections, i.e. rows satisfying `database > 0`. For background on filtering the links file by score, see: https://string-db.org/help//faq/#how-to-extract-high-confidence-07-interactions-from-information-on-combined-score-in-proteinlinkstxtgz
- (Table) Mappings: STRING ID -> UniProt ID
- (Table) Mappings: UniProt ID -> HGNC

In [1]:
import psycopg2
import pandas as pd

from tqdm import tqdm
import time
import json

import numpy as np

from itertools import combinations

In [2]:
# Register tqdm's pandas integration (adds `progress_apply` to pandas objects).
tqdm.pandas()

## Opening STRING-filtered `.txt` file

In [3]:
# Load the pre-filtered STRING links file; fields are separated by a single space.
nn_links_table_df = pd.read_csv(
    'proteins_links_curated_experiments_v11.0.txt',
    sep=' ',
)

  nn_links_table_df = pd.read_table(


In [7]:
# Inspect the parsed column types (two protein ID columns plus integer score channels).
nn_links_table_df.dtypes

protein1                    object
protein2                    object
neighborhood                 int64
neighborhood_transferred     int64
fusion                       int64
cooccurence                  int64
homology                     int64
coexpression                 int64
coexpression_transferred     int64
experiments                  int64
experiments_transferred      int64
database                     int64
database_transferred         int64
textmining                   int64
textmining_transferred       int64
combined_score               int64
dtype: object

In [10]:
# Keep only biocurated interactions (database score > 0) and only the two
# protein ID columns. A single `.loc` selection replaces the chained
# `df[mask][cols]` indexing, and `.copy()` makes the result an independent
# frame so that columns assigned to it afterwards neither raise
# SettingWithCopyWarning nor depend on view/copy semantics.
nn_links_table_curated_df = nn_links_table_df.loc[
    nn_links_table_df['database'] > 0,
    ['protein1', 'protein2'],
].copy()

In [11]:
# Preview the curated STRING ID pairs.
nn_links_table_curated_df

Unnamed: 0,protein1,protein2
0,287.DR97_4287,287.DR97_1873
1,287.DR97_4287,287.DR97_3824
2,287.DR97_4288,287.DR97_4009
3,287.DR97_4288,287.DR97_5630
4,287.DR97_4288,287.DR97_1528
...,...,...
33722567,1549858.MC45_18470,1549858.MC45_02390
33722568,1549858.MC45_18470,1549858.MC45_07065
33722569,1549858.MC45_18470,1549858.MC45_10445
33722570,1549858.MC45_18470,1549858.MC45_08425


## Mappings 

Schema:

STRING ID -> Uniprot ID -> HGNC -> CUI

### Opening uniprot ID mapping

Bash command used to extract the STRING mappings from the UniProt ID-mapping file: `gunzip -c idmapping.txt.gz | grep "STRING" > idmapping_uniprot.txt`

In [12]:
# Load the UniProt ID-mapping extract used for HGNC lookups.
# The file is tab-separated with no header row, so columns get integer labels.
idmapping_hgnc_df = pd.read_csv(
    'uniprot_mappings/idmapping_uniprot_hgnc.txt',
    sep='\t',
    header=None,
)

In [13]:
# Retain only the rows whose ID-type column (label 1) is exactly 'HGNC'.
idmapping_hgnc_df = idmapping_hgnc_df.loc[idmapping_hgnc_df[1].eq('HGNC')]

In [14]:
# Load the UniProt -> STRING ID-mapping extract (tab-separated, no header row).
idmapping_string_df = pd.read_csv(
    'uniprot_mappings/idmapping_string.txt',
    sep='\t',
    header=None,
)

In [15]:
# Preview the mapping rows: column 0 = UniProt accession, 1 = ID type, 2 = STRING ID.
idmapping_string_df

Unnamed: 0,0,1,2
0,Q43495,STRING,4081.Solyc01g009590.2.1
1,P32234,STRING,7227.FBpp0087084
2,Q92AT0,STRING,272626.lin1839
3,P81928,STRING,7227.FBpp0082370
4,P48347,STRING,3702.AT1G22300.1
...,...,...,...
33827405,A0A4W4ETD8,STRING,8005.ENSEEEP00000014429
33827406,A0A4W4FE84,STRING,8005.ENSEEEP00000022104
33827407,A0A4W4FLL4,STRING,8005.ENSEEEP00000024840
33827408,A0A4W4HUB7,STRING,8005.ENSEEEP00000052554


#### Adding mappings to STRING interactions (from Uniprot)

In [17]:
# Lookup table: STRING protein ID (column 2) -> UniProt accession (column 0).
# If a STRING ID occurs more than once, the last row wins.
idmapping_string_dict = {
    string_id: uniprot_acc
    for uniprot_acc, string_id in zip(
        idmapping_string_df[0],
        idmapping_string_df[2],
    )
}

In [18]:
# Lookup table: UniProt accession (column 0) -> HGNC ID (column 2).
# If an accession occurs more than once, the last row wins.
idmapping_hgnc_dict = {
    uniprot_acc: hgnc_id
    for uniprot_acc, hgnc_id in zip(
        idmapping_hgnc_df[0],
        idmapping_hgnc_df[2],
    )
}

In [21]:
# Translate each STRING protein ID into its UniProt accession.
# IDs that are absent from the mapping produce a missing value.
nn_links_table_curated_df['prot1_uniprot'] = (
    nn_links_table_curated_df['protein1'].map(idmapping_string_dict.get)
)

nn_links_table_curated_df['prot2_uniprot'] = (
    nn_links_table_curated_df['protein2'].map(idmapping_string_dict.get)
)

In [22]:
# Translate each UniProt accession into its HGNC ID.
# Accessions without an HGNC entry produce a missing value.
nn_links_table_curated_df['prot1_hgnc'] = (
    nn_links_table_curated_df['prot1_uniprot'].map(idmapping_hgnc_dict.get)
)

nn_links_table_curated_df['prot2_hgnc'] = (
    nn_links_table_curated_df['prot2_uniprot'].map(idmapping_hgnc_dict.get)
)

In [23]:
# Preview the pairs together with the added UniProt and HGNC columns.
nn_links_table_curated_df

Unnamed: 0,protein1,protein2,prot1_uniprot,prot2_uniprot,prot1_hgnc,prot2_hgnc
0,287.DR97_4287,287.DR97_1873,Q9HXY2,Q9HVL5,,
1,287.DR97_4287,287.DR97_3824,Q9HXY2,Q9HWY4,,
2,287.DR97_4288,287.DR97_4009,Q59640,Q9HXE9,,
3,287.DR97_4288,287.DR97_5630,Q59640,Q9I0U3,,
4,287.DR97_4288,287.DR97_1528,Q59640,Q9HW50,,
...,...,...,...,...,...,...
33722567,1549858.MC45_18470,1549858.MC45_02390,A0A097ELT7,A0A097ECZ2,,
33722568,1549858.MC45_18470,1549858.MC45_07065,A0A097ELT7,A0A097EF84,,
33722569,1549858.MC45_18470,1549858.MC45_10445,A0A097ELT7,A0A097EGM8,,
33722570,1549858.MC45_18470,1549858.MC45_08425,A0A097ELT7,A0A097EFN8,,


In [30]:
# Pairs for which both proteins resolved to an HGNC ID
# (rows with a missing value on either side are dropped).
nn_links_table_hugo_df = (
    nn_links_table_curated_df[['prot1_hgnc', 'prot2_hgnc']]
    .dropna(axis=0, how='any')
)

In [31]:
# Pairs for which both proteins resolved to a UniProt accession
# (rows with a missing value on either side are dropped).
nn_links_table_uniprot_df = (
    nn_links_table_curated_df[['prot1_uniprot', 'prot2_uniprot']]
    .dropna(axis=0, how='any')
)

In [46]:
# Distinct UniProt accessions that appear on either side of a mapped pair.
string_db_id_set = (
    set(nn_links_table_uniprot_df['prot1_uniprot'])
    | set(nn_links_table_uniprot_df['prot2_uniprot'])
)
len(string_db_id_set)

2261133

In [47]:
# Preview the pairs where both sides have an HGNC ID.
nn_links_table_hugo_df

Unnamed: 0,prot1_hgnc,prot2_hgnc
5909958,HGNC:658,HGNC:1689
5909959,HGNC:658,HGNC:9758
5909960,HGNC:658,HGNC:6081
5909961,HGNC:658,HGNC:28889
5909962,HGNC:658,HGNC:2234
...,...,...
6311391,HGNC:15302,HGNC:711
6311392,HGNC:15302,HGNC:712
6311393,HGNC:15302,HGNC:4410
6311394,HGNC:15302,HGNC:9382


### Mapping to UMLS

In [24]:
# Location of the pickled UMLS MRCONSO + semantic-types DataFrame.
mrconso_path = '../../UMLS_Metathesaurus/mrconso_and_semtypes_2022AA_df.pkl'

In [25]:
# Load the UMLS table from its pickle.
# NOTE(review): unpickling executes arbitrary code, so only load a file from a trusted source.
mrconso_st_df = pd.read_pickle(mrconso_path)

In [26]:
# Restrict the UMLS rows to those sourced from HGNC (SAB == 'HGNC'),
# keeping the concept ID, the term string and the source code.
mrconso_hgnc_df = mrconso_st_df.loc[
    mrconso_st_df['SAB'] == 'HGNC',
    ['CUI', 'STR', 'CODE'],
]

In [28]:
# Preview the HGNC-sourced UMLS rows (CODE holds the HGNC ID).
mrconso_hgnc_df

Unnamed: 0,CUI,STR,CODE
235919,C0008288,CIPC gene,HGNC:20365
235921,C0008288,CIPC,HGNC:20365
235923,C0008288,"CLOCK-interacting protein, circadian",HGNC:20365
235925,C0008288,CLOCK interacting pacemaker,HGNC:20365
235926,C0008288,KIAA1737,HGNC:20365
...,...,...,...
16596953,C5446057,TPM2P1,HGNC:55137
16596954,C5446057,TPM2 pseudogene 1,HGNC:55137
16596955,C5446058,TRMT1P1 gene,HGNC:55145
16596957,C5446058,TRMT1 pseudogene 1,HGNC:55145


In [29]:
# Lookup table: HGNC code -> set of all CUIs that carry that code.
mrconso_hgnc_to_cui_dict = {
    hgnc_code: set(cui_values)
    for hgnc_code, cui_values in mrconso_hgnc_df.groupby('CODE')['CUI']
}

In [32]:
# Attach to each side of a pair the set of CUIs for its HGNC ID.
# HGNC IDs with no UMLS entry produce a missing value.
nn_links_table_hugo_df['prot1_cui'] = (
    nn_links_table_hugo_df['prot1_hgnc'].map(mrconso_hgnc_to_cui_dict.get)
)

nn_links_table_hugo_df['prot2_cui'] = (
    nn_links_table_hugo_df['prot2_hgnc'].map(mrconso_hgnc_to_cui_dict.get)
)

In [33]:
# Expand the CUI sets into one row per (prot1_cui, prot2_cui) combination:
# exploding one column after the other yields the cross product of both sets.
nn_links_table_cui_df = nn_links_table_hugo_df.explode('prot1_cui')
nn_links_table_cui_df = nn_links_table_cui_df.explode('prot2_cui')
# Drop pairs where either side has no CUI.
nn_links_table_cui_df = nn_links_table_cui_df.dropna()
len(nn_links_table_cui_df)

699128

In [34]:
# Preview the CUI-level pairs.
nn_links_table_cui_df

Unnamed: 0,prot1_hgnc,prot2_hgnc,prot1_cui,prot2_cui
8562676,HGNC:658,HGNC:11276,C1412510,C1420389
8562677,HGNC:658,HGNC:14405,C1412510,C1422625
8562679,HGNC:658,HGNC:19349,C1412510,C1426181
8562680,HGNC:658,HGNC:24253,C1412510,C1539988
8562681,HGNC:658,HGNC:15852,C1412510,C1423774
...,...,...,...,...
9631099,HGNC:15302,HGNC:26176,C1423386,C1826795
9631100,HGNC:15302,HGNC:289,C1423386,C1366490
9631101,HGNC:15302,HGNC:4396,C1423386,C1333687
9631102,HGNC:15302,HGNC:15572,C1423386,C1826953


In [35]:
# Collapse the directed (prot1_cui, prot2_cui) rows into unique unordered pairs.
# Sorting inside each pair makes (a, b) and (b, a) identical, and the set
# removes the resulting duplicates.
# The outer `sorted` replaces `list(...)` so the order no longer depends on
# set iteration (which varies between interpreter runs), making the saved
# JSON reproducible. `zip` is consumed directly instead of being copied
# into a temporary list first.
string_all_cui_mapped_pairs = sorted({
    tuple(sorted(cui_pair))
    for cui_pair in zip(
        nn_links_table_cui_df['prot1_cui'],
        nn_links_table_cui_df['prot2_cui'],
    )
})
len(string_all_cui_mapped_pairs)

349357

## Saving

In [36]:
# Persist the unique CUI pairs as a JSON array of two-element arrays.
with open('../data_collection/01_cui_pairs_json/stringdb_cui_pairs.json', 'w') as json_file:
    json_file.write(json.dumps(string_all_cui_mapped_pairs))