In [143]:
# Import data set

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

data = pd.read_csv('crowd_final.csv')
data = data.drop(data[data['relation']== 'IDK'].index).reset_index()
data['relation'].value_counts()

In [None]:
# Check if there is anything missing
print(data[data['text'].isnull()]['text'])
print(data[data['relation'].isnull()]['relation'])
print(data[data['DOID'].isnull()]['DOID'])
print(data[data['DBID'].isnull()]['DBID'])

In [147]:
# Functions for cleaning 

def remove (x):
    no_punct = ""
    for char in x:
            if char in '''qwertyuiopasdfghjklzxcvbnmQWERTYUIOPASDFGHJKLZXCVBNM ''':
                    no_punct = no_punct + char
    return no_punct

def remove_non_digits (x):
    no_punct = ""
    for char in x:
            if char in '''1234567890qwertyuiopasdfghjklzxcvbnmQWERTYUIOPASDFGHJKLZXCVBNM ''':
                    no_punct = no_punct + char
    return no_punct

def lower (x):
    return x.lower()

In [None]:
# --- Encode Labels ---

from sklearn import preprocessing
le = preprocessing.LabelEncoder()
le.fit(data['relation'])
le.classes_
labels_en = le.transform(data['relation']) 

# Check the encoding
zero = list(le.inverse_transform([0]))
one = list(le.inverse_transform([1]))
two = list(le.inverse_transform([2]))
three = list(le.inverse_transform([3]))

print(zero, 'is encoded as 0')
print(one, 'is encoded as 1')
print(two, 'is encoded as 2')
print(three, 'is encoded as 3')

In [149]:
# --- Encode Drugs ---

le.fit(data['DBID'])
le.classes_
drug_labbeled = le.transform(data['DBID']) 

# --- Encode Disease ---

le.fit(data['DOID'])
le.classes_
disease_labbeled = le.transform(data['DOID'])

In [151]:
# Clean text

from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 

stop_words = set(stopwords.words('english'))
data['text'] = data['text'].apply(remove)
data['text'] = data['text'].apply(lower)
data['disease'] = data['disease'].apply(lower)
data['drug'] = data['drug'].apply(lower)

In [152]:
d = {'label': labels_en, 'text': data['text'],'disease_name':data['disease'],  'disease': disease_labbeled,'drug_name':data['drug'], 'drug':drug_labbeled}
data = pd.DataFrame(data=d)

In [154]:
# The rule phrases

phrases =  ["hypersensitivity reactions",
"associated with the risk of",
"to the risk of",
"a high risk for",
"a high risk of",
"high incidence of", 
"higher incidence of", 
" cause ",
" causes ",
"symptoms occure",
"teratogenic",
"site reaction",
"the risk of development",
"is associated with a risk of",
"symptoms of the poisoning",
"symptoms of poisoning" ,
"not administrated to",
"contraindicated in",
"contraindicatedin",
"should not be used",
"is contraindication for",
"is contraindication when",
"is contraindicated when",
"must not be used for",
"do not administer",
"should not initiate",
"not be administered to",
"do not initiate patients",
"contraindication for",
"should not be given",
"do not use",
"patients with a history of",
"relief of the signs",
"relief of the signs and symptoms of",
"relief of signs",
"relief of symptoms",
"relief of the symptoms",
"help",
"helps",
"relief of signs and symptoms of",
"reduction of symptoms of",
"treatment of the symptoms of",
"for the relief",
"management of the signs and symptoms of", 
" indicated for the treatment of",
" indicated in the management of",
" indicated for the management of",
"for the management of",
"management of",
" indicated for the maintenance of remission", 
"or the treatment of",
"in the treatment of",
" indicated as",
" indicated in",
"be effective",
"active treatment of",
" indicated for",
"treatment of",
" indicated as an adjunct",
" indicated for use in the treatment of", 
" indicated for the intermittent treatment", 
" indicated to reduce the rate of",
" indicated for the rapid control",
" indicated for the control",
"reduce the risk of",
" indicated as adjunctive treatment",
"for the treatment of",
" indicated as an adjunct",
"areindicatedas",
"treatment is indicated",
"prophylaxis"]

In [159]:
# Create the features as distances of disease from each rule phrase
def get_features (text, disease):
    str = text
    position_disease = str.find(disease)
    
    feature = []
    for i in range(len(phrases)):
        position_phrase = str.find(phrases[i])
        if position_phrase != -1:
            distance = abs(position_disease - position_phrase)
            feature.append(distance)
        else:
            feature.append(0)
    return feature


def collect_features(df):
    X_features =[]
    for i in range(len(df)):
        new_feature = get_features (df['text'][i], df['disease_name'][i])
        X_features.append(new_feature)
    return X_features

X_features = collect_features(data)
y = labels_en

## SVM

In [None]:
from sklearn import svm 
from sklearn.svm import SVC

from sklearn.model_selection import StratifiedKFold, KFold
import pycm
from pycm import *

skf = StratifiedKFold(n_splits=10) #or n_splits=5

X = X_features
all_cm = []

for train_index, test_index in skf.split(X, y):
    X_train, X_test = [X[i] for i in train_index] , [X[i] for i in test_index]
    y_train, y_test = [y[i] for i in train_index], [y[i] for i in test_index]
    
    #class_weight = dict({0:4, 1:5, 2:4, 3:1})
    #, class_weight = class_weight
    model = svm.SVC(gamma= 'scale')
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    cm = ConfusionMatrix(y_pred, y_test)
    
    all_cm.append(cm)
    
confusion_matrix = pd.DataFrame()
for i in range(len(all_cm)):
      confusion_matrix= confusion_matrix.append(pd.DataFrame(all_cm[i].table))
        
confusion_matrix = confusion_matrix.groupby(confusion_matrix.index).sum()
confusion_matrix.columns=['Contraindication', 'Effect', 'Relief', 'Treatment']
confusion_matrix.index = ['Contraindication', 'Effect', 'Relief', 'Treatment']
confusion_matrix

In [None]:
# Metrics from confusion matrix
FP = confusion_matrix.sum(axis=0) - np.diag(confusion_matrix)  
FN = confusion_matrix.sum(axis=1) - np.diag(confusion_matrix)
TP = np.diag(confusion_matrix)
TN = confusion_matrix.values.sum() - (FP + FN + TP)

# Overall accuracy
ACC = (TP+TN)/(TP+FP+FN+TN)
PRE = (TP)/(TP+FP)
REC = (TP)/(TP+FN)
F1 = 2*(TP)/(2*TP+FP+FN)


print('---Accuracy---')
print(ACC)
print('---Precision---')
print(PRE)
print('---Recall---')
print(REC)
print('---F1-score---')
print(F1)

In [None]:
mistakes  = FP.sum()
mistakes

## Decision Tree

In [129]:
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
from sklearn.model_selection import StratifiedKFold
from pycm import *


skf = StratifiedKFold(n_splits=10) #or n_splits=5
dt_model = DecisionTreeClassifier()

X = X_features
all_cm = []


for train_index, test_index in skf.split(X, y):
    X_train, X_test = [X[i] for i in train_index] , [X[i] for i in test_index]
    y_train, y_test = [y[i] for i in train_index], [y[i] for i in test_index]
    
    #class_weight = dict({0:4, 1:5, 2:4, 3:1})
    #, class_weight = class_weight
    
    dt_model = DecisionTreeClassifier( criterion = 'entropy', random_state = 42)
    dt_model = dt_model.fit(X_train, y_train)
    
    y_pred = dt_model.predict(X_test)
    cm = ConfusionMatrix(y_test, y_pred)
    
    all_cm.append(cm)
    
# Combine the different confusion matrices from the k validation sets

confusion_matrix = pd.DataFrame()
for i in range(len(all_cm)):
      confusion_matrix= confusion_matrix.append(pd.DataFrame(all_cm[i].table))
        
confusion_matrix = confusion_matrix.groupby(confusion_matrix.index).sum()
confusion_matrix.columns=['Contraindication', 'Effect', 'Relief', 'Treatment']
confusion_matrix.index = ['Contraindication', 'Effect', 'Relief', 'Treatment']
confusion_matrix

In [135]:
# Metrics from confusion matrix
FP = confusion_matrix.sum(axis=0) - np.diag(confusion_matrix)  
FN = confusion_matrix.sum(axis=1) - np.diag(confusion_matrix)
TP = np.diag(confusion_matrix)
TN = confusion_matrix.values.sum() - (FP + FN + TP)

# Overall accuracy
ACC = (TP+TN)/(TP+FP+FN+TN)
PRE = (TP)/(TP+FP)
REC = (TP)/(TP+FN)
F1 = 2*(TP)/(2*TP+FP+FN)

print('---Accuracy---')
print(ACC)
print('---Precision---')
print(PRE)
print('---Recall---')
print(REC)
print('---F1-score---')
print(F1)

In [None]:
mistakes  = FP.sum()
mistakes

 ## Random Forest

In [138]:
from sklearn.ensemble import RandomForestClassifier

skf = StratifiedKFold(n_splits=10)
X = X_features
all_cm = []



for train_index, test_index in skf.split(X, y):
    X_train, X_test = [X[i] for i in train_index] , [X[i] for i in test_index]
    y_train, y_test = [y[i] for i in train_index], [y[i] for i in test_index]
    #print (len(X_train), len(X_test))
    
    class_weight = dict({0:4, 1:5, 2:4, 3:1})
    #, class_weight = class_weight
    
    rf_model = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 42)
   
    rf_model = rf_model.fit(X_train, y_train)
    
    
    y_pred = rf_model.predict(X_test)
    cm = ConfusionMatrix(y_test, y_pred)
    
    all_cm.append(cm)

In [None]:
# Combine the different confusion matrices from the k validation sets

confusion_matrix = pd.DataFrame()
for i in range(len(all_cm)):
      confusion_matrix= confusion_matrix.append(pd.DataFrame(all_cm[i].table))
        
confusion_matrix = confusion_matrix.groupby(confusion_matrix.index).sum()
confusion_matrix.columns=['Contraindication', 'Effect', 'Relief', 'Treatment']
confusion_matrix.index = ['Contraindication', 'Effect', 'Relief', 'Treatment']
confusion_matrix

In [None]:
# Compute Accuracy
FP = confusion_matrix.sum(axis=0) - np.diag(confusion_matrix)  
FN = confusion_matrix.sum(axis=1) - np.diag(confusion_matrix)
TP = np.diag(confusion_matrix)
TN = confusion_matrix.values.sum() - (FP + FN + TP)

# Overall accuracy
ACC = (TP+TN)/(TP+FP+FN+TN)
PRE = (TP)/(TP+FP)
REC = (TP)/(TP+FN)
F1 = 2*(TP)/(2*TP+FP+FN)

print('---Accuracy---')
print(ACC)
print('---Precision---')
print(PRE)
print('---Recall---')
print(REC)
print('---F1-score---')
print(F1)

In [141]:
mistakes  = FP.sum()
mistakes

668