# Get Synonym Lists
<br>
James Chapman<br>
CIS 830 Advanced Topics in AI – Term Project<br>
Kansas State University<br><br>

This notebook collects synonym lists from 5 sources.

- [PubChem's Python API - PubChemPy](https://pubchempy.readthedocs.io/en/latest/)

- [Slang Terms and Code Words:  A Reference for Law Enforcement Personnel](https://www.dea.gov/sites/default/files/2018-07/DIR-022-18.pdf)

- [GitHub Repo for GPT 3 Paper](https://github.com/kristycarp/gpt3-lexicon)

    - Carpenter KA, Altman RB. Using GPT-3 to Build a Lexicon of Drugs of Abuse Synonyms for Social Media Pharmacovigilance. Biomolecules. 2023; 13(2):387. https://doi.org/10.3390/biom13020387

- [GitHub Repo for RedMed Paper](https://github.com/alavertu/redmed)

    - Lavertu, A. & Altman, R. B. RedMed: Extending drug lexicons for social media applications. bioRxiv (2019). doi:10.1101/663625

- [DEA emoji drug codes](https://www.dea.gov/sites/default/files/2021-12/Emoji%20Decoded.pdf)


Creates dataframe and saves as CSV - "synonym_lists.csv"<br><br><br>

In [1]:
import os
import json
import ast
import csv
import re
import ftfy
import time
import pandas as pd
import pubchempy as pcp
import numpy as np
from openai import OpenAI
from anthropic import Anthropic
from fuzzywuzzy import process
from tqdm import tqdm
from dotenv import load_dotenv
tqdm.pandas()
load_dotenv() 


True

#### Shared List of Substances of Interest (Flattened, removed duplicates, etc.)

In [2]:
substances_of_interest = pd.read_csv('../data/substances_of_interest_list.csv', encoding="utf-8-sig")
substances_of_interest

Unnamed: 0,drug
0,"2,6-Xylidine"
1,2-amino-5-chloropyridine
2,2-fluoro-2-oxo PCE
3,2-Oxo-3-hydroxy-LSD
4,3-hydroxy flubromazepam
...,...
72,Speciociliatine
73,Temazepam
74,Xylazine
75,Zolpidem


# PubChem Synonym set

In [3]:
synonyms = pcp.get_synonyms("2,6-Xylidine", "name")
print(synonyms[0]["CID"])
print(synonyms[0]["Synonym"])

6896
['2,6-Dimethylaniline', '87-62-7', '2,6-XYLIDINE', 'o-Xylidine', '2-Amino-m-xylene', '2,6-Dimethylbenzenamine', 'Benzenamine, 2,6-dimethyl-', '2-Amino-1,3-dimethylbenzene', '2-Amino-1,3-xylene', '2,6-Dimethylphenylamine', '2,6-Xylylamine', '1-Amino-2,6-dimethylbenzene', 'Aniline, 2,6-dimethyl-', 'NCI-C56188', 'vic-m-xylidine', 'NSC 7098', 'DTXSID8026307', '2,6-Dimethyl-aniline', 'MFCD00007747', '4FT62OX08D', 'CHEBI:28738', '2,6-DMA', 'NSC-7098', 'DTXCID006307', 'CAS-87-62-7', 'CCRIS 2373', 'HSDB 2094', 'Benzene, 2-amino-1,3-dimethyl-', 'EINECS 201-758-7', 'BRN 0636332', 'UNII-4FT62OX08D', 'AI3-52358', '2,6 dimethylaniline', '2.6-dimethylaniline', 'Aniline,6-dimethyl-', '2,6 dimethyl aniline', '2,6-dimethyl aniline', 'Benzenamine,6-dimethyl-', '2,6-dimethylbenzeneamine', '2,6-dimethyl-phenylamine', '2,6-dimethyl benzeneamine', '(2,6-dimethylphenyl)amine', 'EC 201-758-7', 'XYLIDENE, 2,6-', '2,6-Dimethylaniline, 99%', '4-12-00-02521 (Beilstein Handbook Reference)', 'SCHEMBL106244', '

In [4]:
synonym_lists = substances_of_interest.copy()
synonym_lists["CID"] = None
synonym_lists["pubchem_synonyms"] = None

def fetch_pubchem_info(drug_name):
    try:
        results = pcp.get_synonyms(drug_name, "name")
        if results:
            cid = results[0].get("CID", None)
            synonyms = results[0].get("Synonym", None)
            return cid, synonyms
    except Exception as e:
        print(f"Error retrieving data for '{drug_name}': {e}")
    return None, None

# each row of substances and add PubChem data
for index, row in synonym_lists.iterrows():
    drug_name = row["drug"].strip()  
    cid, synonyms = fetch_pubchem_info(drug_name)
    synonym_lists.at[index, "CID"] = cid
    synonym_lists.at[index, "pubchem_synonyms"] = synonyms

synonym_lists

Unnamed: 0,drug,CID,pubchem_synonyms
0,"2,6-Xylidine",6896,"[2,6-Dimethylaniline, 87-62-7, 2,6-XYLIDINE, o..."
1,2-amino-5-chloropyridine,66174,"[2-Amino-5-chloropyridine, 1072-98-6, 5-Chloro..."
2,2-fluoro-2-oxo PCE,,
3,2-Oxo-3-hydroxy-LSD,10155149,"[2-Oxo-3-hydroxy-lysergide, 2-Oxo-3-hydroxy-LS..."
4,3-hydroxy flubromazepam,13126012,"[3-hydroxy Flubromazepam, 3-Hydroxyflubromazep..."
...,...,...,...
72,Speciociliatine,15560576,"[Speciociliatine, 14382-79-7, Speciociliatin, ..."
73,Temazepam,5391,"[temazepam, Hydroxydiazepam, Methyloxazepam, O..."
74,Xylazine,5707,"[xylazine, 7361-61-7, N-(2,6-Dimethylphenyl)-5..."
75,Zolpidem,5732,"[zolpidem, 82626-48-0, Zolpidemum, Zolpidemum ..."


In [5]:
# ##############################################################################
# ## substances without pubchem synonyms or CID ####################################
# ##############################################################################
none_results = synonym_lists[
    synonym_lists["CID"].isnull() | synonym_lists["pubchem_synonyms"].isnull()]

print(none_results)

                                   drug   CID pubchem_synonyms
2                    2-fluoro-2-oxo PCE  None             None
5   3-hydroxy flubromazepam glucuronide  None             None
7                                4-HIAA  None             None
11                 7-OH-CBD glucuronide  None             None
13                          8R-OH-R-HHC  None             None
14                          8S-OH-R-HHC  None             None
16              alpha-hydroxybromazolam  None             None
25                     delta-8-THC-COOH  None             None
26                     delta-9-THC-COOH  None             None
37                                  MDA  None             None
49               N-desethylmetonitazene  None             None
57             N-Pyrrolidinoetonitazene  None             None
64               para-Fluoronorfentanyl  None             None
70                           R-HHC-COOH  None             None
71                           S-HHC-COOH  None          

In [6]:
# ##############################################################################
# ## Add handpicked pubchem IDs & add synonyms ####################################
# ##############################################################################
hand_picked_IDs_dictionary = {
  "2-fluoro-2-oxo PCE": 168323041,
  "3-hydroxy flubromazepam glucuronide": 13126012,
  "4-HIAA ": 7061393,
  "7-OH-CBD glucuronide": 11301963,
  "8R-OH-R-HHC": 168323034,
#   "8S-OH-R-HHC": None,
#   "alpha-hydroxybromazolam": None,
  "delta-8-THC-COOH": 162389,
  "delta-9-THC-COOH": 107885,
  "MDA": 1614,
#   "N-desethylmetonitazene": "",
  "N-Pyrrolidinoetonitazene": 155804760,
  "para-Fluoronorfentanyl": 10610682,
#   "R-HHC-COOH": "R-hexahydrocannabinol carboxylic acid",
#   "S-HHC-COOH": "S-hexahydrocannabinol carboxylic acid"
}

for drug, hand_picked_ID in hand_picked_IDs_dictionary.items():
    hand_picked_ID = hand_picked_IDs_dictionary[drug]
    compound = pcp.Compound.from_cid(hand_picked_ID)
    synonyms = compound.synonyms
    matching_rows = synonym_lists[synonym_lists["drug"] == drug]
    if not matching_rows.empty:
        idx = matching_rows.index[0]
        
        synonym_lists.at[idx, "CID"] = hand_picked_ID
        synonym_lists.at[idx, "pubchem_synonyms"] = synonyms

In [7]:
none_results = synonym_lists[
    synonym_lists["CID"].isnull() | synonym_lists["pubchem_synonyms"].isnull()]

print(none_results)

                       drug   CID pubchem_synonyms
7                    4-HIAA  None             None
14              8S-OH-R-HHC  None             None
16  alpha-hydroxybromazolam  None             None
49   N-desethylmetonitazene  None             None
70               R-HHC-COOH  None             None
71               S-HHC-COOH  None             None


## DEA synonym_lists

https://www.dea.gov/sites/default/files/2018-07/DIR-022-18.psynonym_lists

In [8]:
# ##############################################################################
# From preprocessed data for DEA PDF
#       process_DEA_PDF.ipynb
# ##############################################################################
DEA_drug_slang = pd.read_csv('../data/DEA_slang.csv', encoding='utf-8-sig')
DEA_drug_slang

Unnamed: 0,index_term,alt_term
0,Acetaminophen and Oxycodone Combination (Perco...,512s; Bananas; Blue; Blue Dynamite; Blueberrie...
1,Alprazolam (Xanax®),Bars; Benzos; Bicycle Handle Bars; Bicycle Par...
2,Amphetamine,Acelerador; Amy; Amps; Bam; B-Bombs; Beans; Be...
3,Amphetamine and Dextroamphetamine Combination ...,A-Train; Abby; Addy; Amps; Christmas Trees; Co...
4,Buprenorphine and Naloxone Combination (Suboxo...,"Boxes, Bupes; Oranges; Sobos; Stop Signs; Stop..."
5,Clonazepam (Klonopin®),Benzos; K; K-Pin; Pin; Super Valium; Tranks
6,Cocaine,7; 62; 77; 777; 921; A-1; Adidas; All-American...
7,Crack Cocaine,51s; 151s; 501s; Apple Jack; Baby T; Base; Bas...
8,Ecstasy/MDMA/Molly,Adam; Baby Slits; Beans; Blue Kisses; Blue Sup...
9,Fentanyl and Fentanyl Derivatives,Apache; Birria (fentanyl mixed with heroin); B...


In [9]:
# ##############################################################################
# ## For each DEA term (which has synonym/slang list), 
# #### find matching term in our drug list (shared drugs of interest) 
# ##############################################################################
for index, row in DEA_drug_slang.iterrows():
    index_term = row["index_term"]  
    # Find the top three closest matches 
    top_matches = process.extract(index_term, synonym_lists["drug"].tolist(), limit=3)
    print("------------------------------------------------------------")
    for match, score in top_matches:
        ##################################
        if score > 50:  ################################## 
            print(f"{index_term} | {match} | {score}")

# I skimmed through these 2 find a threshold make sure the matches are good.

------------------------------------------------------------
Acetaminophen and Oxycodone Combination (Percocet®) | Oxycodone | 90
Acetaminophen and Oxycodone Combination (Percocet®) | Noroxycodone | 75
Acetaminophen and Oxycodone Combination (Percocet®) | Ketamine | 68
------------------------------------------------------------
Alprazolam (Xanax®) | Bromazolam | 63
Alprazolam (Xanax®) | Lorazepam | 60
------------------------------------------------------------
Amphetamine | Amphetamine | 100
Amphetamine | Methamphetamine | 85
Amphetamine | Ketamine | 74
------------------------------------------------------------
Amphetamine and Dextroamphetamine Combination (Adderall®) | Amphetamine | 90
Amphetamine and Dextroamphetamine Combination (Adderall®) | Ketamine | 79
Amphetamine and Dextroamphetamine Combination (Adderall®) | Methamphetamine | 72
------------------------------------------------------------
Buprenorphine and Naloxone Combination (Suboxone®) | Buprenorphine | 90
Buprenorphin

In [10]:
# ##############################################################################
# ## Add matching DEA drug name & synonyms
# ##############################################################################

synonym_lists["DEA_name"] = None
synonym_lists["DEA_synonyms"] = None
for index, row in DEA_drug_slang.iterrows():
    index_term = row["index_term"]  
    alt_term = row["alt_term"]
    
    # Find the closest match in the synonym_lists DataFrame
    match, score = process.extractOne(index_term, synonym_lists["drug"].tolist())
    ##################################
    if score > 88:  ############################### 
        matched_index = synonym_lists[synonym_lists["drug"] == match].index[0]
        synonym_lists.at[matched_index, "DEA_name"] = index_term
        synonym_lists.at[matched_index, "DEA_synonyms"] = alt_term
    else:
        print(index_term)

# DEA names without match to our dataset

Alprazolam (Xanax®)
Clonazepam (Klonopin®)
Cocaine
Crack Cocaine
Flunitrazepam (Rohypnol®)
GHB (Gamma-Hydroxybutyric Acid)
Heroin
Khat
LSD (Lysergic Acid Diethylamide)
Marijuana
Marijuana Concentrates/Hash Oil
Mescaline/Peyote
Methylphenidate (Ritalin®, Concerta®, Daytrana®)
Opium
Promethazine
Synthetic Cannabinoids
Synthetic Cathinones
Steroids
U-47700


In [11]:
# fix 3

# Amphetamine and Dextroamphetamine Combination (Adderall®) | Amphetamine | 90
synonym_lists.loc[synonym_lists['drug']=='Amphetamine', "DEA_name"] = None
synonym_lists.loc[synonym_lists['drug']=='Amphetamine', "DEA_synonyms"] = None

# Buprenorphine and Naloxone Combination (Suboxone®) | Buprenorphine | 90
synonym_lists.loc[synonym_lists['drug']=='Buprenorphine', "DEA_name"] = None
synonym_lists.loc[synonym_lists['drug']=='Buprenorphine', "DEA_synonyms"] = None

# LSD (Lysergic Acid Diethylamide) | LSD | 60
synonym_lists.loc[synonym_lists['drug']=='LSD', "DEA_name"] = "LSD (Lysergic Acid Diethylamide)"
synonym_lists.loc[synonym_lists['drug']=='LSD', "DEA_synonyms"] = DEA_drug_slang.loc[DEA_drug_slang['index_term'] == 'LSD (Lysergic Acid Diethylamide)', 'alt_term'].iloc[0]

# marijuana
synonym_lists.loc[synonym_lists['drug']=='delta-9-THC-COOH', "DEA_name"] = "Marijuana"
synonym_lists.loc[synonym_lists['drug']=='delta-9-THC-COOH', "DEA_synonyms"] = DEA_drug_slang.loc[DEA_drug_slang['index_term'] == 'Marijuana', 'alt_term'].iloc[0]

synonym_lists.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 77 entries, 0 to 76
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   drug              77 non-null     object
 1   CID               71 non-null     object
 2   pubchem_synonyms  71 non-null     object
 3   DEA_name          12 non-null     object
 4   DEA_synonyms      12 non-null     object
dtypes: object(5)
memory usage: 3.1+ KB


In [12]:
# Sanity check (these should be the same)
DEA_drug_slang[DEA_drug_slang['index_term'] == 'LSD (Lysergic Acid Diethylamide)']['alt_term']

17    Aceite; Acelide; Acid; Acido; Alice; Angels in...
Name: alt_term, dtype: object

In [13]:
synonym_lists[synonym_lists['drug']=='LSD']["DEA_synonyms"]

36    Aceite; Acelide; Acid; Acido; Alice; Angels in...
Name: DEA_synonyms, dtype: object

# GPT 3 Lexicon<br>

### https://github.com/kristycarp/gpt3-lexicon

Carpenter KA, Altman RB. Using GPT-3 to Build a Lexicon of Drugs of Abuse Synonyms for Social Media Pharmacovigilance. Biomolecules. 2023; 13(2):387. https://doi.org/10.3390/biom13020387

In [14]:
lexicon = pd.read_csv('../data/gpt3_lexicon_synonyms.csv', encoding="utf-8-sig")
lexicon 

Unnamed: 0,index term,DrugBank ID,widely discussed,GPT-3 synonyms
0,bromazepam,DB01558,True,"'bromaze','lexotan','bromazolam','bromaz','zep..."
1,dexfenfluramine,DB01191,False,"'appetite-decreasing_medication','fenfen','fen..."
2,oxymorphone,DB01192,True,"'endocet','oxymorphone_hydrochloride','numorph..."
3,methylphenidate,DB00422,True,"'ritalin','concerta','metadate','attention_def..."
4,methamphetamine,DB01577,True,"'methamphet','methamphetamine_hydrochloride','..."
...,...,...,...,...
93,alprazolam,DB00404,True,"'xanax','a-prazolam','a-pra-zolam','alprazoram..."
94,fenfluramine,DB00574,False,"'plegine','levofenfluramine','ionamin','fen-ph..."
95,zolpidem,DB00425,False,"'zolpidem_tartrate','zolpidem_10_mg','zolpidem..."
96,modafinil,DB00745,True,"'moadfinil','modifanil','moddafinil','modafini..."


In [15]:
# ##############################################################################
# ## For each GPT 3 term (which has synonym/slang list), 
# #### find matching term in our drug list (shared drugs of interest) 
# ##############################################################################
for index, row in lexicon.iterrows():
    index_term = row["index term"]  
    DrugBank_ID = row["DrugBank ID"]
    widely_discussed = row["widely discussed"]
    GPT_synonyms = row["GPT-3 synonyms"]
    # Find the top three closest matches 
    top_matches = process.extract(index_term, synonym_lists["drug"].tolist(), limit=3)
    print("------------------------------------------------------------")
    for match, score in top_matches:
        if score > 88:  ################################## 
            print(f"{index_term} | {match} | {score}")

------------------------------------------------------------
bromazepam | 3-hydroxy flubromazepam | 90
bromazepam | 3-hydroxy flubromazepam glucuronide | 90
------------------------------------------------------------
------------------------------------------------------------
oxymorphone | Oxymorphone | 100
------------------------------------------------------------
------------------------------------------------------------
methamphetamine | Methamphetamine | 100
------------------------------------------------------------
------------------------------------------------------------
morphine | Morphine | 100
morphine | 6-acetylmorphine | 90
------------------------------------------------------------
phencyclidine | Phencyclidine | 100
------------------------------------------------------------
dihydrocodeine | Codeine | 90
------------------------------------------------------------
------------------------------------------------------------
------------------------------------

In [16]:
# ##############################################################################
# ## Add GPT 3 data, for matching drugs
# ##############################################################################
synonym_lists["DrugBank_ID"] = None
synonym_lists["widely_discussed"] = None
synonym_lists["GPT_synonyms"] = None
for index, row in lexicon.iterrows():
    index_term = row["index term"]  
    DrugBank_ID = row["DrugBank ID"]
    widely_discussed = row["widely discussed"]
    GPT_synonyms = row["GPT-3 synonyms"]
    # Find the top 
    match, score = process.extractOne(index_term, synonym_lists["drug"].tolist())
    
    if score > 92:  ############################### 
        matched_index = synonym_lists[synonym_lists["drug"] == match].index[0]
        synonym_lists.at[matched_index, "DrugBank_ID"] = DrugBank_ID
        synonym_lists.at[matched_index, "widely_discussed"] = widely_discussed
        synonym_lists.at[matched_index, "GPT_synonyms"] = GPT_synonyms

synonym_lists.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 77 entries, 0 to 76
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   drug              77 non-null     object
 1   CID               71 non-null     object
 2   pubchem_synonyms  71 non-null     object
 3   DEA_name          12 non-null     object
 4   DEA_synonyms      12 non-null     object
 5   DrugBank_ID       22 non-null     object
 6   widely_discussed  22 non-null     object
 7   GPT_synonyms      22 non-null     object
dtypes: object(8)
memory usage: 4.9+ KB


In [17]:
unmatched = synonym_lists[synonym_lists['GPT_synonyms'].isna()]

print("Drugs with NO GPT-3 match:")
print(unmatched[['drug', 'CID']].to_string(index=False))

Drugs with NO GPT-3 match:
                               drug       CID
                       2,6-Xylidine      6896
           2-amino-5-chloropyridine     66174
                 2-fluoro-2-oxo PCE 168323041
                2-Oxo-3-hydroxy-LSD  10155149
            3-hydroxy flubromazepam  13126012
3-hydroxy flubromazepam glucuronide  13126012
                             4-ANPP     88890
                             4-HIAA      None
                   6-acetylmorphine   5462507
                  7-aminoclonazepam    188298
               7-hydroxymitragynine  44301524
               7-OH-CBD glucuronide  11301963
                  8-aminoclonazolam  12562515
                        8R-OH-R-HHC 168323034
                        8S-OH-R-HHC      None
            alpha-hydroxyalprazolam    162244
            alpha-hydroxybromazolam      None
                    Benzoylecgonine    448223
                         Bromazolam  12562546
                                CBD    644019
       

In [18]:
unmatched_terms = []

for _, row in lexicon.iterrows():
    index_term = row["index term"]
    match, score = process.extractOne(index_term, synonym_lists["drug"].tolist())
    if score <= 92:
        unmatched_terms.append((index_term, score))

print("Index terms without a strong GPT-3 match (>92):")
for term, score in unmatched_terms:
    print(f"{term}  (best score: {score})")


Index terms without a strong GPT-3 match (>92):
bromazepam  (best score: 90)
dexfenfluramine  (best score: 60)
methylphenidate  (best score: 62)
pentazocine  (best score: 70)
dihydrocodeine  (best score: 90)
pethidine  (best score: 74)
flunitrazepam  (best score: 77)
butalbital  (best score: 54)
normethadone  (best score: 86)
clotiazepam  (best score: 80)
diphenoxylate  (best score: 64)
flurazepam  (best score: 87)
midazolam  (best score: 74)
ketobemidone  (best score: 70)
dextromoramide  (best score: 56)
tapentadol  (best score: 57)
thiopental  (best score: 63)
lisdexamfetamine  (best score: 79)
diethyltryptamine  (best score: 69)
methaqualone  (best score: 76)
perampanel  (best score: 60)
phendimetrazine  (best score: 68)
barbital  (best score: 53)
ethchlorvynol  (best score: 45)
fenethylline  (best score: 68)
estazolam  (best score: 63)
nabilone  (best score: 67)
cathine  (best score: 69)
lacosamide  (best score: 60)
clorazepate  (best score: 80)
cloxazolam  (best score: 81)
amobarb

# RedMed Lexicon<br>

### https://github.com/alavertu/redmed

Lavertu, A. & Altman, R. B. RedMed: Extending drug lexicons for social media applications. bioRxiv (2019). doi:10.1101/663625

In [19]:
redmed_lexicon = pd.read_csv('../data/redmed_lexicon.csv', encoding="utf-8-sig")
redmed_lexicon.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2997 entries, 0 to 2996
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   dbid             2997 non-null   object
 1   drug             2997 non-null   object
 2   known            2975 non-null   object
 3   misspellingPhon  484 non-null    object
 4   edOne            1317 non-null   object
 5   edTwo            1519 non-null   object
 6   pillMark         19 non-null     object
 7   google_ms        28 non-null     object
 8   google_title     702 non-null    object
 9   google_snippet   727 non-null    object
 10  ud_slang         40 non-null     object
 11  missed           2856 non-null   object
dtypes: object(12)
memory usage: 281.1+ KB


In [20]:
# ##############################################################################
# ## For each redmed term (which has synonym/slang list), 
# #### find matching term in our drug list (shared drugs of interest) 
# ##############################################################################
# Add redmed data, for matching drugs
# ##############################################################################
synonym_lists["dbid"]            = None
synonym_lists["known"]           = None
synonym_lists["misspellingPhon"] = None
synonym_lists["edOne"]           = None
synonym_lists["edTwo"]           = None
synonym_lists["pillMark"]        = None
synonym_lists["google_ms"]       = None
synonym_lists["google_title"]    = None
synonym_lists["google_snippet"]  = None
synonym_lists["ud_slang"]        = None
synonym_lists["missed"]          = None
for index, row in redmed_lexicon.iterrows():
    drug_redmed_lexicon = row["drug"]  
    if drug_redmed_lexicon == "etonitazene":
        continue
    dbid            = row["dbid"]
    known           = row["known"]
    misspellingPhon = row["misspellingPhon"]
    edOne           = row["edOne"]
    edTwo           = row["edTwo"]
    pillMark        = row["pillMark"]
    google_ms       = row["google_ms"]
    google_title    = row["google_title"]
    google_snippet  = row["google_snippet"]
    ud_slang        = row["ud_slang"]
    missed          = row["missed"]
    # Find top 1
    match, score = process.extractOne(drug_redmed_lexicon, synonym_lists["drug"].tolist())
    ###############################
    if score > 92:  ############################### 
        matched_index = synonym_lists[synonym_lists["drug"] == match].index[0]
        if synonym_lists.at[matched_index, "dbid"] is not None:
            print("------------------------------------------------------------")
            print(drug_redmed_lexicon)
            print(match)
            print(score)
            print("Already matched with a redmed ID")
        synonym_lists.at[matched_index, "dbid"] = dbid
        synonym_lists.at[matched_index, "known"] = known
        synonym_lists.at[matched_index, "misspellingPhon"] = misspellingPhon
        synonym_lists.at[matched_index, "edOne"] = edOne
        synonym_lists.at[matched_index, "edTwo"] = edTwo
        synonym_lists.at[matched_index, "pillMark"] = pillMark
        synonym_lists.at[matched_index, "google_ms"] = google_ms
        synonym_lists.at[matched_index, "google_title"] = google_title
        synonym_lists.at[matched_index, "google_snippet"] = google_snippet
        synonym_lists.at[matched_index, "ud_slang"] = ud_slang
        synonym_lists.at[matched_index, "missed"] = missed
        if score < 100:
            print("------------------------------------------------------------")
            print(match)
            print(drug_redmed_lexicon)
            print(score)
    
synonym_lists.info(verbose=True)

------------------------------------------------------------
Psilocybin
psilocybine
95
------------------------------------------------------------
Nordiazepam
nordazepam
95
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 77 entries, 0 to 76
Data columns (total 19 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   drug              77 non-null     object
 1   CID               71 non-null     object
 2   pubchem_synonyms  71 non-null     object
 3   DEA_name          12 non-null     object
 4   DEA_synonyms      12 non-null     object
 5   DrugBank_ID       22 non-null     object
 6   widely_discussed  22 non-null     object
 7   GPT_synonyms      22 non-null     object
 8   dbid              29 non-null     object
 9   known             29 non-null     object
 10  misspellingPhon   11 non-null     object
 11  edOne             25 non-null     object
 12  edTwo             28 non-null     object
 13  pillMark          4 non-null

In [21]:
synonym_lists

Unnamed: 0,drug,CID,pubchem_synonyms,DEA_name,DEA_synonyms,DrugBank_ID,widely_discussed,GPT_synonyms,dbid,known,misspellingPhon,edOne,edTwo,pillMark,google_ms,google_title,google_snippet,ud_slang,missed
0,"2,6-Xylidine",6896,"[2,6-Dimethylaniline, 87-62-7, 2,6-XYLIDINE, o...",,,,,,,,,,,,,,,,
1,2-amino-5-chloropyridine,66174,"[2-Amino-5-chloropyridine, 1072-98-6, 5-Chloro...",,,,,,,,,,,,,,,,
2,2-fluoro-2-oxo PCE,168323041,"[2-fluoro-2-oxo PCE (hydrochloride), 2850352-6...",,,,,,,,,,,,,,,,
3,2-Oxo-3-hydroxy-LSD,10155149,"[2-Oxo-3-hydroxy-lysergide, 2-Oxo-3-hydroxy-LS...",,,,,,,,,,,,,,,,
4,3-hydroxy flubromazepam,13126012,"[3-hydroxy Flubromazepam, 3-Hydroxyflubromazep...",,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72,Speciociliatine,15560576,"[Speciociliatine, 14382-79-7, Speciociliatin, ...",,,,,,,,,,,,,,,,
73,Temazepam,5391,"[temazepam, Hydroxydiazepam, Methyloxazepam, O...",,,DB00231,False,"'temaze','restoril','temazepan','temazepane','...",DB00231,"restoril,temaze,temazepams,normison,temazepam",,"tamazepam,temazpam,tempazepam,temazipam,temazi...","temanu,tamazapam,teaave,fenazepam,tamzepam,med...",temaz,,temazzies,,,"clonzalam,sedates_me,avoid_taking_drugs,diazap..."
74,Xylazine,5707,"[xylazine, 7361-61-7, N-(2,6-Dimethylphenyl)-5...",,,,,,DB11477,"rompun,xylazine",,,"wompuz,bolazine",,,detomidine,,,"tranquilizer,vetrinary,anaesthetizing,tranquil..."
75,Zolpidem,5732,"[zolpidem, 82626-48-0, Zolpidemum, Zolpidemum ...",,,DB00425,False,"'zolpidem_tartrate','zolpidem_10_mg','zolpidem...",DB00425,"sublinox,zolpidems,hypnogen,edluar,ambiens,zol...",,"zopidem,zolpiden,zolipidem,nesen,ambie,stillno...","ambein,ambii,zolpdeim,zolpediem,zolipdem,zoldi...",,,"cavalleria,imidazopyridine,hypnotics","sleeping_tablets,zolfresh",,"jakke,feeling_real_nice,benzopines,incredibly_..."


# Convert all synonym lists to the same format ['','']


In [22]:
# ##############################################################################
# ## some are; separated, some are [ ] separated, some are [a,b,c] separated etc.
# ##############################################################################
def standardize_synonyms(cell):
    if isinstance(cell, (list, tuple, np.ndarray)):
        return [str(x).strip() for x in cell if pd.notna(x)]
    if pd.isna(cell):
        return []
    s = str(cell).strip()
    if s.startswith('[') and s.endswith(']'):
        try:
            items = ast.literal_eval(s)
        except Exception:
            items = s[1:-1].split(',')
        return [str(x).strip().strip("'\"") for x in items if str(x).strip()]
    parts = s.split(';') if ';' in s else s.split(',')
    return [p.strip().strip("'\"") for p in parts if p.strip()]

skip_cols = ['drug', 'CID', 'DrugBank_ID', 'widely_discussed', 'dbid']
syn_cols  = [c for c in synonym_lists.columns if c not in skip_cols]
for col in syn_cols:
    synonym_lists[col] = (
        synonym_lists[col]
          .apply(standardize_synonyms)
          .apply(lambda lst: json.dumps(lst, ensure_ascii=False) if lst else None)
    )

# ##############################################################################
# Remove DEA name column
# ##############################################################################
def merge_dea_names(raw_syn, dea_name):
    # parse existing JSON list
    try:
        base = json.loads(raw_syn) if isinstance(raw_syn, str) else []
    except json.JSONDecodeError:
        base = []
    # append the DEA_name if present
    if pd.notna(dea_name):
        name = str(dea_name).strip()
        if name and name not in base:
            base.append(name)
    return base

synonym_lists['DEA_synonyms'] = (
    synonym_lists
      .apply(lambda r: merge_dea_names(r['DEA_synonyms'], r['DEA_name']), axis=1)
      .apply(lambda lst: json.dumps(lst, ensure_ascii=False) if lst else None)
)
synonym_lists.drop(columns=['DEA_name'], inplace=True)

# ##############################################################################
# Remove underscores & combine most redmed columns -> redmed_synonyms
# ##############################################################################
columns_to_fix = [ # red med
    'known',
    'misspellingPhon',
    'edOne',
    'edTwo',
    'pillMark',
    'google_ms',
    'google_title',
    'google_snippet',
    'ud_slang',
    'missed'
]
def remove_underscores_from_json_list(cell):
    if pd.isna(cell):
        return cell
    try:
        items = json.loads(cell)
        if isinstance(items, list):
            items = [s.replace('_', ' ') if isinstance(s, str) else s for s in items]
        return json.dumps(items, ensure_ascii=False)
    except Exception:
        return cell

for col in columns_to_fix:
    if col in synonym_lists.columns:
        synonym_lists[col] = synonym_lists[col].apply(remove_underscores_from_json_list)


def combine_redmed_columns(row):
    combined = []
    for col in [ # red med
                'known',
                'misspellingPhon',
                'edOne',
                'edTwo',
                'pillMark',
                # 'google_ms',
                # 'google_title',
                # 'google_snippet',
                # 'ud_slang',
                # 'missed'
                ]:
        val = row.get(col)
        if pd.isna(val):
            continue
        try:
            items = json.loads(val) if isinstance(val, str) else val
        except Exception:
            items = val
        if isinstance(items, list):
            combined.extend([str(x).strip() for x in items if x and str(x).strip()])
        elif isinstance(items, str) and items.strip():
            combined.append(items.strip())
    # Remove duplicates while preserving order
    seen = set()
    result = []
    for x in combined:
        if x not in seen:
            seen.add(x)
            result.append(x)
    return json.dumps(result, ensure_ascii=False) if result else None

synonym_lists['redmed_synonyms'] = synonym_lists.apply(combine_redmed_columns, axis=1)


# ##############################################################################
# collect all synonyms in one list  all_synonyms
# ##############################################################################
def collect_all(row):
    pool = {}
    for col in syn_cols:
        vals = row.get(col)
        # decode JSON strings back to lists
        if isinstance(vals, str):
            try:
                vals = json.loads(vals)
            except json.JSONDecodeError:
                vals = [vals]
        # skip if empty/None
        if not vals:
            continue
        # ensure iterable
        candidates = vals if isinstance(vals, (list, tuple)) else [vals]
        for s in candidates:
            if not s:
                continue
            key = str(s).lower().strip()
            if key and key not in pool:
                pool[key] = str(s).strip()
    return list(pool.values())

synonym_lists['all_synonyms'] = synonym_lists.apply(collect_all, axis=1)
synonym_lists.info(verbose=True)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 77 entries, 0 to 76
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   drug              77 non-null     object
 1   CID               71 non-null     object
 2   pubchem_synonyms  71 non-null     object
 3   DEA_synonyms      12 non-null     object
 4   DrugBank_ID       22 non-null     object
 5   widely_discussed  22 non-null     object
 6   GPT_synonyms      22 non-null     object
 7   dbid              29 non-null     object
 8   known             29 non-null     object
 9   misspellingPhon   11 non-null     object
 10  edOne             25 non-null     object
 11  edTwo             28 non-null     object
 12  pillMark          4 non-null      object
 13  google_ms         3 non-null      object
 14  google_title      18 non-null     object
 15  google_snippet    18 non-null     object
 16  ud_slang          5 non-null      object
 17  missed            

# EMOJIs

In [23]:
emoji_map = {
    'Oxycodone'      : ['🍌', '🔵', '🅿️', '💊'],
    'MDMA'           : ['🍬', '❌', '⚡', '💊', '❤️'],
    'Psilocybin'     : ['🍄'],
    'delta-9-THC-COOH': ['🌲', '🍀', '🚬', '🌴', '🪴'],
    'Methamphetamine': ['💙', '🔮', '💎', '🧪'],
}

#others
# Cocaine	['🐡', '💎', '🎱', '😛', '🔑', '🦜', '🌨️', '❄️', '⛄']
# Adderall	['🅰️', '🚆', '💊']
# Fake prescription pills (general)	['💊']
# Xanax (benzodiazepines)	['🚌', '🍫', '💊']
# Cough syrup / “lean”	['🍇', '💜', '🍼'] #diphenhydramine?
# Heroin	['🤎', '🐉']

synonym_lists['DEA_emojis'] = synonym_lists['drug'].map(emoji_map)
synonym_lists.to_csv('../data/synonym_lists.csv', encoding='utf-8-sig', index=False)
synonym_lists.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 77 entries, 0 to 76
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   drug              77 non-null     object
 1   CID               71 non-null     object
 2   pubchem_synonyms  71 non-null     object
 3   DEA_synonyms      12 non-null     object
 4   DrugBank_ID       22 non-null     object
 5   widely_discussed  22 non-null     object
 6   GPT_synonyms      22 non-null     object
 7   dbid              29 non-null     object
 8   known             29 non-null     object
 9   misspellingPhon   11 non-null     object
 10  edOne             25 non-null     object
 11  edTwo             28 non-null     object
 12  pillMark          4 non-null      object
 13  google_ms         3 non-null      object
 14  google_title      18 non-null     object
 15  google_snippet    18 non-null     object
 16  ud_slang          5 non-null      object
 17  missed            

# Collect each synonyms list as one row

In [24]:
# ##############################################################################
# Collect each synonyms list as one row
# ##############################################################################
synonym_cols = ['GPT_synonyms', 'pubchem_synonyms', 'redmed_synonyms', 'DEA_synonyms']#,  'DEA_emojis']
records = []
for _, row in synonym_lists.iterrows():
    base = {
        'drug':             row['drug'],
        'CID':              row['CID'],
        'DrugBank_ID':      row['DrugBank_ID'],
        'widely_discussed': row['widely_discussed']
    }
    for col in synonym_cols:
        raw = row[col]
        if pd.isna(raw):
            continue

        # Parse `raw` into a Python list of strings
        try:
            syn_list = json.loads(raw)
        except (json.JSONDecodeError, TypeError):
            s = str(raw).strip()
            if s.startswith('[') and s.endswith(']'):
                try:
                    syn_list = ast.literal_eval(s)
                except Exception:
                    syn_list = [s]
            else:
                parts = s.split(';') if ';' in s else s.split(',')
                syn_list = [p.strip().strip("'\"") for p in parts if p.strip()]

        # Only keep non‑empty lists
        if syn_list:
            records.append({
                **base,
                'synonyms': syn_list,
                'source':   col
            })

#
synonym_lists = pd.DataFrame(records, columns=['drug', 'CID', 'DrugBank_ID', 'widely_discussed', 'synonyms', 'source'])
#synonym_lists.to_csv('../data/synonym_lists_ long, index=False, encoding='utf-8-sig')
print(synonym_lists.info())
print(synonym_lists.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 134 entries, 0 to 133
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   drug              134 non-null    object
 1   CID               134 non-null    int64 
 2   DrugBank_ID       74 non-null     object
 3   widely_discussed  74 non-null     object
 4   synonyms          134 non-null    object
 5   source            134 non-null    object
dtypes: int64(1), object(5)
memory usage: 6.4+ KB
None
                       drug        CID DrugBank_ID widely_discussed  \
0              2,6-Xylidine       6896        None             None   
1  2-amino-5-chloropyridine      66174        None             None   
2        2-fluoro-2-oxo PCE  168323041        None             None   
3       2-Oxo-3-hydroxy-LSD   10155149        None             None   
4   3-hydroxy flubromazepam   13126012        None             None   

                                            sy