In [None]:
"""
This notebook maps the drug descriptions found in FAERS (FDA Adverse Event Reporting System) to their active ingredients.
"""

In [1]:
import sys

# Make the project code importable from the notebook's working directory.
# The import cell uses `from src.dataset.preprocessing import ...`, which resolves only when the
# directory that CONTAINS `src` is on sys.path; '../src' alone exposes `dataset`, not `src`.
# (Assumes the notebook is run from a folder one level below the repository root -- TODO confirm.)
# '../src' is inserted last so it keeps the highest priority, as before, and each entry is added
# only once so re-running this cell does not pile up duplicates.
for extra_path in ('..', '../src'):
    if extra_path not in sys.path:
        sys.path.insert(0, extra_path)

In [2]:
import joblib
import pandas as pd
from xml.etree import ElementTree
from src.dataset.preprocessing import DridProcessor2, Drug

  from .autonotebook import tqdm as notebook_tqdm


### Drugbank

In [None]:
"""
Due to license restrictions, we are only allowed to provide the drug names found in DrugBank.
If you have access to the full database, place the "full database.xml" file in ../data/files
(the location the code below reads from) and run the code below.
"""

# Parse the licensed DrugBank XML export, build one Drug per child element of the XML root,
# and persist the resulting list with joblib.
# The XML cannot be shipped with the repository, so a missing file is reported instead of
# raising: this keeps "Restart Kernel -> Run All" working for users without DrugBank access.
# NOTE(review): this cell writes 'drugs.joblib', whereas the placeholder cell further down
# writes 'drugbank.joblib' -- confirm which file name DridProcessor2 actually reads.
try:
    tree = ElementTree.parse('../data/files/full database.xml')
except FileNotFoundError:
    print('DrugBank XML not found at "../data/files/full database.xml"; skipping DrugBank parsing.')
else:
    root = tree.getroot()
    drugs = [Drug(child) for child in root]
    joblib.dump(drugs, '../data/files/drugs.joblib')

In [6]:
"""
DridProcessor2 can also be used without the DrugBank files: the code below stores an empty list as a placeholder DrugBank.
"""
# Placeholder DrugBank: an empty list persisted with joblib under the drugbank file name.
# NOTE(review): this writes to a fixed path and would replace a drugbank.joblib already there.
fake_drugbank = list()
joblib.dump(value=fake_drugbank, filename='../data/files/drugbank.joblib')

['../data/files/drugbank.joblib']

### Descriptions -> APIs

In [3]:
"""
The DridProcessor2 class expects a pandas DataFrame as its input.
The dataframe must contain the following three columns:
    - name: str - drug description
    - tokens: List[str] - tokenized descriptions (or just a single string in a list)
    - active: List[str] - initially empty lists (later filled with the mapped active ingredients)
"""

# Toy example input.
# NOTE(review): the file has a .tsv extension but is read with read_csv's default (comma)
# separator -- confirm this matches the file's actual delimiter.
example_df = pd.read_csv('../data/files/my_drug_descriptions.tsv')
# Tokenize each description by splitting it on the ' | ' delimiter.
example_df['tokens'] = [description.split(' | ') for description in example_df['name']]
# One separate empty list per row (not a single shared list object).
example_df['active'] = [list() for _ in example_df.index]

In [4]:
"""
To reuse previously made changes, first load the saved final Processor instance, then update its dataframes and provide the DrugBank data.
"""
# Restore the previously saved processor state from disk.
processor = DridProcessor2()
processor.load('../data/files/processor.joblib')

# Attach copies rather than the notebook-level frame itself: processor.df is shown to change
# after auto_process() further down, and sharing one object with example_df would let those
# changes leak back into the input, so re-running this cell would no longer start from raw data.
# NOTE(review): DataFrame.copy() does not recursively copy the Python lists stored in the cells;
# if the processor mutates those lists in place, example_df can still be affected -- verify.
processor.df = example_df.copy()
# head(0) keeps the columns but no rows: an empty frame with the same layout as the input.
processor.processed_df = example_df.head(0).copy()

In [5]:
# Preview the first ten rows the processor will work on (bare expression -> rich display).
processor.df.head(10)

Unnamed: 0,name,tokens,active
0,kelp | supplement,"[kelp, supplement]",[]
1,risperodal,[risperodal],[]
2,lyrica,[lyrica],[]
3,aggrenox,[aggrenox],[]
4,aprednislon,[aprednislon],[]
5,calcium | replacement,"[calcium, replacement]",[]
6,mononitrite | isosorbite,"[mononitrite, isosorbite]",[]
7,lotor,[lotor],[]
8,prenatal | iron,"[prenatal, iron]",[]
9,cisplatine | my,"[cisplatine, my]",[]


In [7]:
# Automatically apply all known mappings. This can be done iteratively, hence several passes.
AUTO_PASSES = 3
for _ in range(AUTO_PASSES):
    processor.auto_process()


| Auto-executing remove_by_string with [''] |

| Auto-executing remove_by_string with [''] |


In [8]:
# Preview the first ten rows again, now after the auto_process() passes.
processor.df.head(10)

Unnamed: 0,name,tokens,active
0,kelp | supplement,[],[]
1,risperodal,[],[risperidone]
2,lyrica,[],[pregabalin]
3,aggrenox,[],"[acetylsalicylic acid, dipyridamole]"
4,aprednislon,[aprednislon],[]
5,calcium | replacement,[replacement],[calcium]
6,mononitrite | isosorbite,"[isosorbite, mononitrite]",[]
7,lotor,[lotor],[]
8,prenatal | iron,[],[iron]
9,cisplatine | my,[cisplatine],[]


In [74]:
# Finally, the tokens that remain have to be processed manually; executing "help" will show
# all available commands.
# NOTE(review): this step appears to be interactive, so it will likely not finish unattended
# under "Run All" -- confirm.

processor.process()


Current token: < aprednislon >
Original name: < aprednislon >

---< DrugBank >---
The similarity of < aprednislon > to < prednisolone > : 0.87
The similarity of < aprednislon > to < prednisone > : 0.857

---< PME >---
The < aprednislon > matches < prednisolone >

---< ChEMBL >---
The < aprednislon > matches < prednisolone >

---< Previous queries >---
Token matches previous query < ('substitute_by_regex', ['aprednislon', 'prednisolone']) >
