In [None]:
# Import data set

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

data = pd.read_csv('crowd_final.csv')
data = data.drop(data[data['relation']== 'IDK'].index).reset_index()
data['relation'].value_counts()

In [None]:
print(data[data['text'].isnull()]['text'])
print(data[data['relation'].isnull()]['relation'])
print(data[data['DOID'].isnull()]['DOID'])
print(data[data['DBID'].isnull()]['DBID'])

In [None]:
text =  data['text']
labels = data['relation']
disease = data['DOID']
drug = data['DBID']

In [125]:
# Functions for cleaning 

def remove (x):
    no_punct = ""
    for char in x:
            if char in '''qwertyuiopasdfghjklzxcvbnmQWERTYUIOPASDFGHJKLZXCVBNM ''':
                    no_punct = no_punct + char
    return no_punct

def remove_non_digits (x):
    no_punct = ""
    for char in x:
            if char in '''1234567890qwertyuiopasdfghjklzxcvbnmQWERTYUIOPASDFGHJKLZXCVBNM ''':
                    no_punct = no_punct + char
    return no_punct

def lower (x):
    return x.lower()

# Preprocess

In [231]:
# --- Encode Labels ---

from sklearn import preprocessing
le = preprocessing.LabelEncoder()
le.fit(data['relation'])
le.classes_
labels_en = le.transform(data['relation']) 

# Check the encoding

zero = list(le.inverse_transform([0]))
one = list(le.inverse_transform([1]))
two = list(le.inverse_transform([2]))
three = list(le.inverse_transform([3]))

print(zero, 'is encoded as 0')
print(one, 'is encoded as 1')
print(two, 'is encoded as 2')
print(three, 'is encoded as 3')



['contraindication'] is encoded as 0
['effect'] is encoded as 1
['relief'] is encoded as 2
['treatment'] is encoded as 3


In [232]:
# --- Encode Drugs ---

le.fit(data['DBID'])
le.classes_
drug_labbeled = le.transform(data['DBID']) 

# --- Encode Disease ---

le.fit(data['DOID'])
le.classes_
disease_labbeled = le.transform(data['DOID'])

In [233]:
# --- Remove stop words and clean the Text ---

from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 

stop_words = set(stopwords.words('english'))
data['text'] = data['text'].apply(remove)
data['text'] = data['text'].apply(lower)

def remove_stopwords(sentence):
    word_tokens = word_tokenize(sentence)
    filtered_sentence = []
    for w in word_tokens:
        if w not in stop_words and len(w)>=2:
            filtered_sentence.append(w)
    return filtered_sentence

corpus = data['text'].apply(remove_stopwords)

# New Data Frame with tokenized and clean sentences
d = {'label': labels_en, 'text': corpus, 'disease': disease_labbeled, 'drug':drug_labbeled}
df = pd.DataFrame(data=d)

# Embeddings

## Word2Vec

In [209]:
# Word2Vec
import gensim
from gensim.models import Word2Vec

In [210]:
# Encode the labels 
y = labels_en

In [211]:
# Train the model of vector representation
model = Word2Vec(sentences = corpus, size = 100, sg = 1, window = 3, 
                 min_count = 1, iter = 10, workers = 3)

In [212]:
# Create a copy of the initial df so that we work on that 
df_en = df.copy()

In [213]:
# For each instance/sentence compute the average of all the words 

def avg_vector(list_of_words):
    vector_sum = model.wv[list_of_words[0]]
    for i in range(1,len(list_of_words)):
        vector_sum = vector_sum +  model.wv[list_of_words[i]]
    return vector_sum/len(list_of_words)

df_en['text'] = df_en['text'].apply(avg_vector)

In [214]:
# Create a data frame with the embedded text
d2 = {'label': y, 'text':df_en['text'] , 'disease': disease_labbeled, 'drug':drug_labbeled}
df2 = pd.DataFrame(data=d2)
embedded = df2.to_csv('embedded.csv')

In [221]:
# Create the features

X_features = []
for i in range(len(df_en['text'])):
    a = df_en['text'][i].tolist()
    b = int(drug_labbeled[i])
    c = int(disease_labbeled[i])
    a.append(b)
    d = a
    d.append(c)
    e = d
    X_features.append(e)


## BERT

In [101]:
from bert_embedding import BertEmbedding
bert_embedding = BertEmbedding()

def get_embeddings (sentence):
    result = bert_embedding(sentence)
    average_vec = result[0][1][0]
    for i in range(1,len(result)):
        average_vec = np.add(average_vec,result[i][1][0])
    return average_vec/len(result)

In [None]:
final_bert= pd.DataFrame()
count = 0

for i in range(len(df['text'])):
    count = count +1
    print(count,i)
    final_bert[i]= get_embeddings(corpus[i])

1 0
2 1
3 2
4 3
5 4
6 5
7 6
8 7
9 8
10 9
11 10
12 11
13 12
14 13
15 14
16 15
17 16
18 17
19 18
20 19
21 20
22 21
23 22
24 23
25 24
26 25
27 26
28 27
29 28
30 29
31 30
32 31
33 32
34 33
35 34
36 35
37 36
38 37
39 38
40 39
41 40
42 41
43 42
44 43
45 44
46 45
47 46
48 47
49 48
50 49
51 50
52 51
53 52
54 53
55 54
56 55
57 56
58 57
59 58
60 59
61 60
62 61
63 62
64 63
65 64
66 65
67 66
68 67
69 68
70 69
71 70
72 71
73 72
74 73
75 74
76 75
77 76
78 77
79 78
80 79
81 80
82 81
83 82
84 83
85 84
86 85
87 86
88 87
89 88
90 89
91 90
92 91
93 92
94 93
95 94
96 95
97 96
98 97
99 98
100 99
101 100
102 101
103 102
104 103
105 104
106 105
107 106
108 107
109 108
110 109
111 110
112 111
113 112
114 113
115 114
116 115
117 116
118 117
119 118
120 119
121 120
122 121
123 122
124 123
125 124
126 125
127 126
128 127
129 128
130 129
131 130
132 131
133 132
134 133
135 134
136 135
137 136
138 137
139 138
140 139
141 140
142 141
143 142
144 143
145 144
146 145
147 146
148 147
149 148
150 149
151 150
152 151
15

# Baseline Models

In [216]:
# Scaling

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix,classification_report
from sklearn.model_selection import cross_val_score, GridSearchCV

scaler = StandardScaler()

# Plotting
import seaborn as sns

## SVM

In [217]:
from sklearn import svm 
from sklearn.svm import SVC

from sklearn.model_selection import StratifiedKFold, KFold
import pycm
from pycm import *

skf = StratifiedKFold(n_splits=5)
X = scaler.fit_transform(X_features)
#X = X_features #or not scaled

all_cm = []

for train_index, test_index in skf.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    class_weight = dict({0:4, 1:5, 2:4, 3:1})
    #, class_weight = class_weight
    
    
    model = svm.SVC(gamma= 'scale', class_weight = class_weight)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    cm = ConfusionMatrix(y_pred, y_test)
    
    all_cm.append(cm)
    

In [218]:
# Combine the different confusion matrices from the k validation sets

confusion_matrix = pd.DataFrame()
for i in range(len(all_cm)):
      confusion_matrix= confusion_matrix.append(pd.DataFrame(all_cm[i].table))
        
confusion_matrix = confusion_matrix.groupby(confusion_matrix.index).sum()
confusion_matrix.columns=['Contraindication', 'Effect', 'Relief', 'Treatment']
confusion_matrix.index = ['Contraindication', 'Effect', 'Relief', 'Treatment']
confusion_matrix


Unnamed: 0,Contraindication,Effect,Relief,Treatment
Contraindication,275,48,97,34
Effect,40,146,44,28
Relief,108,40,321,34
Treatment,155,143,182,1884


In [219]:
# Metrics from confusion matrix
FP = confusion_matrix.sum(axis=0) - np.diag(confusion_matrix)  
FN = confusion_matrix.sum(axis=1) - np.diag(confusion_matrix)
TP = np.diag(confusion_matrix)
TN = confusion_matrix.values.sum() - (FP + FN + TP)

# Overall accuracy
ACC = (TP+TN)/(TP+FP+FN+TN)
PRE = (TP)/(TP+FP)
REC = (TP)/(TP+FN)
F1 = 2*(TP)/(2*TP+FP+FN)


print('---Accuracy---')
print(ACC)
print('---Precision---')
print(PRE)
print('---Recall---')
print(REC)
print('---F1-score---')
print(F1)

---Accuracy---
Contraindication    0.865326
Effect              0.904163
Relief              0.858899
Treatment           0.839061
dtype: float64
---Precision---
Contraindication    0.475779
Effect              0.387268
Relief              0.498447
Treatment           0.951515
dtype: float64
---Recall---
Contraindication    0.605727
Effect              0.565891
Relief              0.638171
Treatment           0.796954
dtype: float64
---F1-score---
Contraindication    0.532946
Effect              0.459843
Relief              0.559721
Treatment           0.867403
dtype: float64


In [220]:
mistakes  = FP.sum()
mistakes

953

## Decision Tree

In [159]:
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
from sklearn.model_selection import StratifiedKFold
from pycm import *

In [180]:
skf = StratifiedKFold(n_splits=5)
dt_model = DecisionTreeClassifier()

X = scaler.fit_transform(X_features)
#X = X_features
all_cm = []


for train_index, test_index in skf.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    #print (len(X_train), len(X_test))
    class_weight = dict({0:4, 1:5, 2:4, 3:1})
    #, class_weight = class_weight
    
    
    dt_model = DecisionTreeClassifier( criterion = 'entropy', random_state = 42)
    dt_model = dt_model.fit(X_train, y_train)
    
    
    y_pred = dt_model.predict(X_test)
    
    cm = ConfusionMatrix(y_test, y_pred)
    
    all_cm.append(cm)

In [181]:
# Combine the different confusion matrices from the k validation sets

confusion_matrix = pd.DataFrame()
for i in range(len(all_cm)):
      confusion_matrix= confusion_matrix.append(pd.DataFrame(all_cm[i].table))
        
confusion_matrix = confusion_matrix.groupby(confusion_matrix.index).sum()
confusion_matrix.columns=['Contraindication', 'Effect', 'Relief', 'Treatment']
confusion_matrix.index = ['Contraindication', 'Effect', 'Relief', 'Treatment']
confusion_matrix

Unnamed: 0,Contraindication,Effect,Relief,Treatment
Contraindication,261,41,67,83
Effect,31,162,36,54
Relief,80,22,323,114
Treatment,82,33,77,2113


In [182]:
# Metrics from confusion matrix
FP = confusion_matrix.sum(axis=0) - np.diag(confusion_matrix)  
FN = confusion_matrix.sum(axis=1) - np.diag(confusion_matrix)
TP = np.diag(confusion_matrix)
TN = confusion_matrix.values.sum() - (FP + FN + TP)

# Overall accuracy
ACC = (TP+TN)/(TP+FP+FN+TN)
PRE = (TP)/(TP+FP)
REC = (TP)/(TP+FN)
F1 = 2*(TP)/(2*TP+FP+FN)

In [183]:
print('---Accuracy---')
print(ACC)
print('---Precision---')
print(PRE)
print('---Recall---')
print(REC)
print('---F1-score---')
print(F1)

---Accuracy---
Contraindication    0.892707
Effect              0.939369
Relief              0.889355
Treatment           0.876222
dtype: float64
---Precision---
Contraindication    0.574890
Effect              0.627907
Relief              0.642147
Treatment           0.893824
dtype: float64
---Recall---
Contraindication    0.577434
Effect              0.572438
Relief              0.599258
Treatment           0.916703
dtype: float64
---F1-score---
Contraindication    0.576159
Effect              0.598891
Relief              0.619962
Treatment           0.905119
dtype: float64


In [184]:
mistakes  = FP.sum()
mistakes

720

## Random forest

In [197]:
from sklearn.ensemble import RandomForestClassifier
X = scaler.fit_transform(X_features)
skf = StratifiedKFold(n_splits=10)
#X = X_features
all_cm = []



for train_index, test_index in skf.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    #print (len(X_train), len(X_test))
    
    class_weight = dict({0:4, 1:5, 2:4, 3:1})
    #, class_weight = class_weight
    
    rf_model = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 42)
   
    rf_model = rf_model.fit(X_train, y_train)
    
    
    y_pred = rf_model.predict(X_test)
    cm = ConfusionMatrix(y_test, y_pred)
    
    all_cm.append(cm)

In [198]:
# Combine the different confusion matrices from the k validation sets

confusion_matrix = pd.DataFrame()
for i in range(len(all_cm)):
      confusion_matrix= confusion_matrix.append(pd.DataFrame(all_cm[i].table))
        
confusion_matrix = confusion_matrix.groupby(confusion_matrix.index).sum()
confusion_matrix.columns=['Contraindication', 'Effect', 'Relief', 'Treatment']
confusion_matrix.index = ['Contraindication', 'Effect', 'Relief', 'Treatment']
confusion_matrix

Unnamed: 0,Contraindication,Effect,Relief,Treatment
Contraindication,248,42,94,109
Effect,27,148,22,52
Relief,81,21,302,91
Treatment,98,47,85,2112


In [199]:
# Compute Accuracy
FP = confusion_matrix.sum(axis=0) - np.diag(confusion_matrix)  
FN = confusion_matrix.sum(axis=1) - np.diag(confusion_matrix)
TP = np.diag(confusion_matrix)
TN = confusion_matrix.values.sum() - (FP + FN + TP)

# Overall accuracy
ACC = (TP+TN)/(TP+FP+FN+TN)
PRE = (TP)/(TP+FP)
REC = (TP)/(TP+FN)
F1 = 2*(TP)/(2*TP+FP+FN)

print('---Accuracy---')
print(ACC)
print('---Precision---')
print(PRE)
print('---Recall---')
print(REC)
print('---F1-score---')
print(F1)

---Accuracy---
Contraindication    0.873987
Effect              0.941045
Relief              0.889913
Treatment           0.865326
dtype: float64
---Precision---
Contraindication    0.546256
Effect              0.573643
Relief              0.600398
Treatment           0.893401
dtype: float64
---Recall---
Contraindication    0.503043
Effect              0.594378
Relief              0.610101
Treatment           0.901793
dtype: float64
---F1-score---
Contraindication    0.523759
Effect              0.583826
Relief              0.605210
Treatment           0.897578
dtype: float64


In [200]:
mistakes  = FP.sum()
mistakes

769