# Get Side Effects

In [66]:
import pickle

with open('vector_data/side_effects_processed.pkl', 'rb') as f:
    dense = pickle.load(f)

with open('data/side_effects.txt', 'r') as f:
    side_effects = f.readlines()
side_effects = [x.strip() for x in side_effects]

In [67]:
print(side_effects[:10])

['Acute abdomen', 'Syndrome abdominal acute', 'Abdominal syndrome acute', 'Abdominal cramps', 'Abdominal cramp', 'Cramp abdominal', 'Abdominal crampy pains', 'Griping abdominal', 'Griping abdomen', 'Distended abdomen']


# Get Drugs

In [68]:
# read data from data/meddra.tsv
import pandas as pd

side_effects_path = 'data/meddra.tsv'
columns = ['UMLS_concept_id', 'MedDRA_concept_type', 'MedRA_id', 'Side_effect_name']

df = pd.read_csv(side_effects_path, sep='\t', header=None)
df.columns = columns
df.head()

Unnamed: 0,UMLS_concept_id,MedDRA_concept_type,MedRA_id,Side_effect_name
0,C0000727,LT,10000647,Acute abdomen
1,C0000727,PT,10000647,Acute abdomen
2,C0000727,LT,10042784,Syndrome abdominal acute
3,C0000727,LT,10000096,Abdominal syndrome acute
4,C0000729,LT,10000057,Abdominal cramps


In [69]:
# show unique UMLS_concept_id
print(df['UMLS_concept_id'].nunique())

49561


# Find Similarity of Unknown Side Effects

In [80]:
from utils.embeddings import genEmbs

unknown = 'Abbonimal pain'
unknown_vector, _ = genEmbs(unknown)
print(unknown_vector)

[ 0.0144888  -0.00026168 -0.02003238 ...  0.0483966  -0.06477924
 -0.01412994]


In [81]:
# Model
import torch.nn as nn
import torch

class PostNet(nn.Module):
    def __init__(self):
        super(PostNet, self).__init__()
        self.fc1 = nn.Linear(1024, 1024)
        self.bn = nn.BatchNorm1d(1024)
        self.fc2 = nn.Linear(1024, 1024)
        self.output = nn.Linear(1024, 1024)
        self.relu = nn.ReLU()
    
    def forward(self, x):
        x = self.fc1(x)
        x = self.bn(x)
        x = self.relu(x)
        x = self.fc2(x)
        x = self.relu(x)
        x = self.output(x)
        return x
    
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = PostNet()
model.to(device)
# load weights
model.load_state_dict(torch.load('model/model_300.pth'))
model.eval()

PostNet(
  (fc1): Linear(in_features=1024, out_features=1024, bias=True)
  (bn): BatchNorm1d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc2): Linear(in_features=1024, out_features=1024, bias=True)
  (output): Linear(in_features=1024, out_features=1024, bias=True)
  (relu): ReLU()
)

In [82]:
import numpy as np

dummy_batch = np.stack([unknown_vector, unknown_vector], axis=0)
dummy_batch = torch.tensor(dummy_batch, dtype=torch.float32).to(device)
with torch.no_grad():
    unknown_vector_postnet = model(dummy_batch)[0].detach().cpu().numpy()
print(unknown_vector_postnet)

[-1.7908159e-04 -5.9611872e-02  2.5857890e-02 ... -8.8794867e-04
 -1.8171874e-01 -1.6177643e-02]


In [86]:
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

similarities = []
for vector in tqdm(dense):
    sim = cosine_similarity(unknown_vector_postnet.reshape(1,-1), vector.reshape(1,-1))
    similarities.append(sim[0,0])

100%|██████████| 75604/75604 [00:25<00:00, 3014.77it/s]


In [None]:
# find at .97 similarity
threshold = 0.97
similarities = np.array(similarities)
similar_indices = np.where(similarities > threshold)[0]
print(similar_indices)

# show similar side effects
similar_side_effects = [side_effects[i] for i in similar_indices]
print(similar_side_effects)

[   23    24  8122 21895 25212 27751 27752 33930 33931 36356 38893 39172
 39173 40180 43949 68040]
['Abdominal pain NOS', 'Abd. pain', 'Pain assessment', 'Injury to abdominal aorta', 'Ache stomach', 'Anal pain', 'Pain anal', 'Abdominal pain generalised', 'Abdominal pain generalized', 'Central abdominal pain', 'Administration site pain', 'Abdominal pain localised', 'Abdominal pain localized', 'Abdominal pain aggravated', 'Gastrointestinal and abdominal pains (excl oral and throat)', 'Functional abdominal pain']


In [95]:
# map back to UMLS_concept_id

possible_drugs = []
for row in tqdm(df.iterrows()):
    if row[1]['Side_effect_name'] in similar_side_effects and row[1]['UMLS_concept_id'] not in possible_drugs:
        possible_drugs.append(row[1]['UMLS_concept_id'])

95912it [00:04, 23146.79it/s]


In [96]:
print(possible_drugs)

['C0000737', 'C0030198', 'C0160704', 'C0221512', 'C0238637', 'C0344304', 'C0423644', 'C0521491', 'C0522061', 'C0549273', 'C0851977', 'C1609533']
