In [147]:
# Import train_data set

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

train_data = pd.read_csv('crowd_final.csv')
test_data = pd.read_csv('experts.csv')

In [148]:
def rep(text):
    a = text.replace('Indication: Treatment', 'treatment')
    b = a.replace('Contraindication', 'contraindication')
    c = b.replace('Indication: Symptomatic Relief', 'relief')
    d = c.replace('Effect', 'effect')
    return d

def rep2(text):
    a = text.replace('INDICATION AND USAGE', '')

    return a

def lower (x):
    return x.lower()


# Functions for cleaning 

def remove (x):
    no_punct = ""
    for char in x:
            if char in '''qwertyuiopasdfghjklzxcvbnmQWERTYUIOPASDFGHJKLZXCVBNM ''':
                    no_punct = no_punct + char
    return no_punct

def remove_non_digits (x):
    no_punct = ""
    for char in x:
            if char in '''1234567890qwertyuiopasdfghjklzxcvbnmQWERTYUIOPASDFGHJKLZXCVBNM ''':
                    no_punct = no_punct + char
    return no_punct

def lower (x):
    return x.lower()

## Test Data

In [149]:
print(test_data[test_data['context'].isnull()]['context'])
print(test_data[test_data['disease_name'].isnull()]['disease_name'])
print(test_data[test_data['expert_consensus'].isnull()]['expert_consensus'])

107    NaN
Name: context, dtype: object
Series([], Name: disease_name, dtype: object)
Series([], Name: expert_consensus, dtype: object)


In [150]:
test_data = test_data.drop(test_data.index[107]).reset_index()

test_data['expert_consensus'] = test_data['expert_consensus'].apply(rep)
test_data['disease_name'] = test_data['disease_name'].apply(lower)
test_data['drug_name'] = test_data['drug_name'].apply(lower)
test_data['context'] = test_data['context'].apply(lower)
test_data['context'] = test_data['context'].apply(remove_non_digits)

test_data = test_data[test_data['expert_consensus'] != 'No consensus'].reset_index()
test_data = test_data[['context', 'do_id', 'disease_name','drug_id','drug_name', 'expert_consensus' ]]
test_data = test_data.rename(columns={"context": "text", "do_id": "disease",  "drug_id": "drug", "expert_consensus": "label"})

## Train Data

In [151]:
print(train_data[train_data['text'].isnull()]['text'])
print(train_data[train_data['relation'].isnull()]['relation'])
print(train_data[train_data['disease'].isnull()]['disease'])

Series([], Name: text, dtype: object)
Series([], Name: relation, dtype: object)
Series([], Name: disease, dtype: object)


In [152]:
train_data['text'] = train_data['text'].apply(rep2)
train_data['text'] = train_data['text'].apply(lower)
train_data['drug'] = train_data['drug'].apply(lower)
train_data['disease'] = train_data['disease'].apply(lower)
test_data['text'] = test_data['text'].apply(remove_non_digits)
train_data = train_data[train_data['relation'] != 'IDK'].reset_index()
train_data = train_data[['text', 'DOID','disease', 'DBID','drug', 'relation' ]]
train_data = train_data.rename(columns={ "DOID": "disease","disease": "disease_name",  "DBID": "drug", "drug": "drug_name", "relation": "label"})

## Preprocess

In [155]:
# Merge them to have consistent encoding 
frames = [train_data, test_data]

merged_data = pd.concat(frames).reset_index()

In [159]:
data = merged_data.copy()

In [162]:
# Encode labels 
from sklearn import preprocessing
le = preprocessing.LabelEncoder()

le.fit(data['label'])
le.classes_
labels_en = le.transform(data['label']) 

# --- Encode Disease ---

le.fit(data['disease'])
le.classes_
disease_encoded = le.transform(data['disease'])

# --- Encode Drugs ---

le.fit(data['drug'])
le.classes_
drug_encoded = le.transform(data['drug']) 

# --- Remove stop words and clean the Text ---
data['text'] = data['text'].apply(remove)
data['text'] = data['text'].apply(lower)

# The final Set

d = {'index':data['index'], 'label': labels_en, 'text': data['text'],'disease_name': data['disease_name'],  'disease': disease_encoded, 'drug':drug_encoded, 'drug_name':data['drug_name'] }
df = pd.DataFrame(data=d)

In [164]:
# The rule phrases

phrases =  ["hypersensitivity reactions",
"associated with the risk of",
"to the risk of",
"a high risk for",
"a high risk of",
"high incidence of", 
"higher incidence of", 
" cause ",
" causes ",
"symptoms occure",
"teratogenic",
"site reaction",
"the risk of development",
"is associated with a risk of",
"symptoms of the poisoning",
"symptoms of poisoning" ,
"not administrated to",
"contraindicated in",
"contraindicatedin",
"should not be used",
"is contraindication for",
"is contraindication when",
"is contraindicated when",
"must not be used for",
"do not administer",
"should not initiate",
"not be administered to",
"do not initiate patients",
"contraindication for",
"should not be given",
"do not use",
"patients with a history of",
"relief of the signs",
"relief of the signs and symptoms of",
"relief of signs",
"relief of symptoms",
"relief of the symptoms",
"help",
"helps",
"relief of signs and symptoms of",
"reduction of symptoms of",
"treatment of the symptoms of",
"for the relief",
"management of the signs and symptoms of", 
" indicated for the treatment of",
" indicated in the management of",
" indicated for the management of",
"for the management of",
"management of",
" indicated for the maintenance of remission", 
"or the treatment of",
"in the treatment of",
" indicated as",
" indicated in",
"be effective",
"active treatment of",
" indicated for",
"treatment of",
" indicated as an adjunct",
" indicated for use in the treatment of", 
" indicated for the intermittent treatment", 
" indicated to reduce the rate of",
" indicated for the rapid control",
" indicated for the control",
"reduce the risk of",
" indicated as adjunctive treatment",
"for the treatment of",
" indicated as an adjunct",
"areindicatedas",
"treatment is indicated",
"prophylaxis"]

In [165]:
# Create the features as distances of disease from each rule phrase
def get_features (text, disease):
    str = text
    position_disease = str.find(disease)
    
    feature = []
    for i in range(len(phrases)):
        position_phrase = str.find(phrases[i])
        if position_phrase != -1:
            distance = abs(position_disease - position_phrase)
            feature.append(distance)
        else:
            feature.append(0)
    return feature


def collect_features(df):
    X_features =[]
    for i in range(len(df)):
        new_feature = get_features (df['text'][i], df['disease_name'][i])
        X_features.append(new_feature)
    return X_features

X_features = collect_features(data)
y = labels_en

## SVM

In [166]:
df['index'][0:3580]

0          0
1          1
2          2
3          3
4          4
        ... 
3575    3575
3576    3576
3577    3577
3578    3578
3579       0
Name: index, Length: 3580, dtype: int64

In [None]:
from sklearn import svm 
from sklearn.svm import SVC

from sklearn.model_selection import StratifiedKFold, KFold
import pycm
from pycm import *


X = X_features

X_train, X_test = X[0:3579], X[3580:]
y_train, y_test = y[0:3579], y[3580:]

class_weight = dict({0:4, 1:5, 2:4, 3:1})
#, class_weight = class_weight


model = svm.SVC(gamma= 'scale')
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

cm = ConfusionMatrix(y_pred, y_test)
confusion_matrix = pd.DataFrame(data=cm.table)
confusion_matrix.columns=['Contraindication', 'Effect', 'Relief', 'Treatment']
confusion_matrix.index = ['Contraindication', 'Effect', 'Relief', 'Treatment']
confusion_matrix

In [None]:
# Metrics from confusion matrix
FP = confusion_matrix.sum(axis=0) - np.diag(confusion_matrix)  
FN = confusion_matrix.sum(axis=1) - np.diag(confusion_matrix)
TP = np.diag(confusion_matrix)
TN = confusion_matrix.values.sum() - (FP + FN + TP)

# Overall accuracy
ACC = (TP+TN)/(TP+FP+FN+TN)
PRE = (TP)/(TP+FP)
REC = (TP)/(TP+FN)
F1 = 2*(TP)/(2*TP+FP+FN)


print('---Accuracy---')
print(ACC)
print('---Precision---')
print(PRE)
print('---Recall---')
print(REC)
print('---F1-score---')
print(F1)

In [170]:
mistakes  = FP.sum()
mistakes

137

## Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
from sklearn.model_selection import StratifiedKFold
from pycm import *

dt_model = DecisionTreeClassifier()

X = X_features

X_train, X_test = X[0:3579], X[3580:]
y_train, y_test = y[0:3579], y[3580:]

class_weight = dict({0:4, 1:5, 2:4, 3:1})
    #, class_weight = class_weight
    
dt_model = DecisionTreeClassifier( criterion = 'entropy', random_state = 42, class_weight = class_weight)
dt_model = dt_model.fit(X_train, y_train)
    
y_pred = dt_model.predict(X_test)

cm = ConfusionMatrix(y_test, y_pred)

confusion_matrix = pd.DataFrame(data=cm.table)
confusion_matrix.columns=['Contraindication', 'Effect', 'Relief', 'Treatment']
confusion_matrix.index = ['Contraindication', 'Effect', 'Relief', 'Treatment']
confusion_matrix

In [None]:
# Metrics from confusion matrix
FP = confusion_matrix.sum(axis=0) - np.diag(confusion_matrix)  
FN = confusion_matrix.sum(axis=1) - np.diag(confusion_matrix)
TP = np.diag(confusion_matrix)
TN = confusion_matrix.values.sum() - (FP + FN + TP)

# Overall accuracy
ACC = (TP+TN)/(TP+FP+FN+TN)
PRE = (TP)/(TP+FP)
REC = (TP)/(TP+FN)
F1 = 2*(TP)/(2*TP+FP+FN)

print('---Accuracy---')
print(ACC)
print('---Precision---')
print(PRE)
print('---Recall---')
print(REC)
print('---F1-score---')
print(F1)

In [None]:
mistakes  = FP.sum()
mistakes

 ## Random Forest

In [138]:
from sklearn.ensemble import RandomForestClassifier
X = X_features

X_train, X_test = X[0:3579], X[3580:]
y_train, y_test = y[0:3579], y[3580:]
    
class_weight = dict({0:4, 1:5, 2:4, 3:1})
   
rf_model = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 42, class_weight = class_weight)
rf_model = rf_model.fit(X_train, y_train)


y_pred = rf_model.predict(X_test)
cm = ConfusionMatrix(y_test, y_pred)

confusion_matrix = pd.DataFrame(data=cm.table)
confusion_matrix.columns=['Contraindication', 'Effect', 'Relief', 'Treatment']
confusion_matrix.index = ['Contraindication', 'Effect', 'Relief', 'Treatment']
confusion_matrix

In [None]:
# Compute Accuracy
FP = confusion_matrix.sum(axis=0) - np.diag(confusion_matrix)  
FN = confusion_matrix.sum(axis=1) - np.diag(confusion_matrix)
TP = np.diag(confusion_matrix)
TN = confusion_matrix.values.sum() - (FP + FN + TP)

# Overall accuracy
ACC = (TP+TN)/(TP+FP+FN+TN)
PRE = (TP)/(TP+FP)
REC = (TP)/(TP+FN)
F1 = 2*(TP)/(2*TP+FP+FN)

print('---Accuracy---')
print(ACC)
print('---Precision---')
print(PRE)
print('---Recall---')
print(REC)
print('---F1-score---')
print(F1)

In [141]:
mistakes  = FP.sum()
mistakes

668