In [69]:
# Import train_data set

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

train_data = pd.read_csv('crowd_final.csv')
test_data = pd.read_csv('experts.csv')

In [70]:
def rep(text):
    a = text.replace('Indication: Treatment', 'treatment')
    b = a.replace('Contraindication', 'contraindication')
    c = b.replace('Indication: Symptomatic Relief', 'relief')
    d = c.replace('Effect', 'effect')
    return d

def rep2(text):
    a = text.replace('INDICATION AND USAGE', '')

    return a

def lower (x):
    return x.lower()


# Functions for cleaning 

def remove (x):
    no_punct = ""
    for char in x:
            if char in '''qwertyuiopasdfghjklzxcvbnmQWERTYUIOPASDFGHJKLZXCVBNM ''':
                    no_punct = no_punct + char
    return no_punct

def remove_non_digits (x):
    no_punct = ""
    for char in x:
            if char in '''1234567890qwertyuiopasdfghjklzxcvbnmQWERTYUIOPASDFGHJKLZXCVBNM ''':
                    no_punct = no_punct + char
    return no_punct

def lower (x):
    return x.lower()

## Test Data 

In [71]:
print(test_data[test_data['context'].isnull()]['context'])
print(test_data[test_data['disease_name'].isnull()]['disease_name'])
print(test_data[test_data['expert_consensus'].isnull()]['expert_consensus'])

107    NaN
Name: context, dtype: object
Series([], Name: disease_name, dtype: object)
Series([], Name: expert_consensus, dtype: object)


In [72]:
test_data = test_data.drop(test_data.index[107]).reset_index()

test_data['expert_consensus'] = test_data['expert_consensus'].apply(rep)
test_data['disease_name'] = test_data['disease_name'].apply(lower)
test_data['context'] = test_data['context'].apply(lower)
test_data['context'] = test_data['context'].apply(remove_non_digits)

test_data = test_data[test_data['expert_consensus'] != 'No consensus'].reset_index()
test_data = test_data[['context', 'do_id','drug_id', 'expert_consensus' ]]
test_data = test_data.rename(columns={"context": "text", "do_id": "disease", "drug_id": "drug", "expert_consensus": "label"})

## Train Data

In [73]:
print(train_data[train_data['text'].isnull()]['text'])
print(train_data[train_data['relation'].isnull()]['relation'])
print(train_data[train_data['disease'].isnull()]['disease'])

Series([], Name: text, dtype: object)
Series([], Name: relation, dtype: object)
Series([], Name: disease, dtype: object)


In [74]:
train_data['text'] = train_data['text'].apply(rep2)
train_data['text'] = train_data['text'].apply(lower)
test_data['text'] = test_data['text'].apply(remove_non_digits)
train_data = train_data[train_data['relation'] != 'IDK'].reset_index()
train_data = train_data[['text', 'DOID','DBID', 'relation' ]]
train_data = train_data.rename(columns={ "DOID": "disease", "DBID": "drug", "relation": "label"})

# Preprocess

In [12]:
# Merge them to have consistent encoding 
frames = [train_data, test_data]

merged_data = pd.concat(frames).reset_index()

In [14]:
data = merged_data.copy()

In [15]:
# Encode labels 
from sklearn import preprocessing
le = preprocessing.LabelEncoder()

le.fit(data['label'])
le.classes_
labels_en = le.transform(data['label']) 

# --- Encode Disease ---

le.fit(data['disease'])
le.classes_
disease_encoded = le.transform(data['disease'])

# --- Encode Drugs ---

le.fit(data['drug'])
le.classes_
drug_encoded = le.transform(data['drug']) 

# --- Remove stop words and clean the Text ---

from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 

stop_words = set(stopwords.words('english'))
data['text'] = data['text'].apply(remove)
data['text'] = data['text'].apply(lower)

def remove_stopwords(sentence):
    word_tokens = word_tokenize(sentence)
    filtered_sentence = []
    for w in word_tokens:
        if w not in stop_words and len(w)>=2:
            filtered_sentence.append(w)
    return filtered_sentence

data['text'] = merged_data['text'].apply(remove_stopwords)

# The final Set

d = {'index':data['index'], 'label': labels_en, 'text': data['text'], 'disease': disease_encoded, 'drug':drug_encoded }
df = pd.DataFrame(data=d)

# Embeddings

## Word2Vec

In [19]:
import gensim
# Word2Vec
from gensim.models import Word2Vec

In [20]:
# Encode the labels 
y = labels_en

In [21]:
# Train the model of vector representation
model = Word2Vec(sentences = data['text'], size = 100, sg = 1, window = 3, 
                 min_count = 1, iter = 10, workers = 3)

In [22]:
# For each instance/sentence compute the average of all the words 

def avg_vector(list_of_words):
    vector_sum = model.wv[list_of_words[0]]
    for i in range(1,len(list_of_words)):
        vector_sum = vector_sum +  model.wv[list_of_words[i]]
    return vector_sum/len(list_of_words)

In [23]:
df['text'] = df['text'].apply(avg_vector)

In [24]:
# Create the features

X_features = []
for i in range(len(df['text'])):
    a = df['text'][i].tolist()
    b = int(df['disease'][i])
    c = int(df['drug'][i])
    a.append(b)
    d = a
    d.append(c)
    e = d
    X_features.append(e)


# Baseline Models

In [25]:
# Scaling

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix,classification_report
from sklearn.model_selection import cross_val_score, GridSearchCV

scaler = StandardScaler()

# Plotting
import seaborn as sns

In [26]:
df['index'][0:3580]

0          0
1          1
2          2
3          3
4          4
        ... 
3575    3575
3576    3576
3577    3577
3578    3578
3579       0
Name: index, Length: 3580, dtype: int64

## SVM

In [31]:
from sklearn import svm 
from sklearn.svm import SVC

from sklearn.model_selection import StratifiedKFold, KFold
import pycm
from pycm import *


X = scaler.fit_transform(X_features)

X_train, X_test = X[0:3579], X[3580:]
y_train, y_test = y[0:3579], y[3580:]

class_weight = dict({0:4, 1:5, 2:4, 3:1})
#, class_weight = class_weight


model = svm.SVC(gamma= 'scale')
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

cm = ConfusionMatrix(y_pred, y_test)

In [None]:
confusion_matrix = pd.DataFrame(data=cm.table)
confusion_matrix.columns=['Contraindication', 'Effect', 'Relief', 'Treatment']
confusion_matrix.index = ['Contraindication', 'Effect', 'Relief', 'Treatment']
confusion_matrix

In [None]:
# Metrics from confusion matrix
FP = confusion_matrix.sum(axis=0) - np.diag(confusion_matrix)  
FN = confusion_matrix.sum(axis=1) - np.diag(confusion_matrix)
TP = np.diag(confusion_matrix)
TN = confusion_matrix.values.sum() - (FP + FN + TP)

# Overall accuracy
ACC = (TP+TN)/(TP+FP+FN+TN)
PRE = (TP)/(TP+FP)
REC = (TP)/(TP+FN)
F1 = 2*(TP)/(2*TP+FP+FN)


print('---Accuracy---')
print(ACC)
print('---Precision---')
print(PRE)
print('---Recall---')
print(REC)
print('---F1-score---')
print(F1)

In [None]:
mistakes  = FP.sum()
mistakes

## Decision Tree

In [36]:
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
from sklearn.model_selection import StratifiedKFold
from pycm import *

dt_model = DecisionTreeClassifier()

X = scaler.fit_transform(X_features)
#X = X_features

X_train, X_test = X[0:3579], X[3580:]
y_train, y_test = y[0:3579], y[3580:]

class_weight = dict({0:4, 1:5, 2:4, 3:1})
    #, class_weight = class_weight
    
dt_model = DecisionTreeClassifier( criterion = 'entropy', random_state = 42, class_weight = class_weight)
dt_model = dt_model.fit(X_train, y_train)
    
y_pred = dt_model.predict(X_test)

cm = ConfusionMatrix(y_test, y_pred)

confusion_matrix = pd.DataFrame(data=cm.table)
confusion_matrix.columns=['Contraindication', 'Effect', 'Relief', 'Treatment']
confusion_matrix.index = ['Contraindication', 'Effect', 'Relief', 'Treatment']
confusion_matrix

In [44]:
# Metrics from confusion matrix
FP = confusion_matrix.sum(axis=0) - np.diag(confusion_matrix)  
FN = confusion_matrix.sum(axis=1) - np.diag(confusion_matrix)
TP = np.diag(confusion_matrix)
TN = confusion_matrix.values.sum() - (FP + FN + TP)

# Overall accuracy
ACC = (TP+TN)/(TP+FP+FN+TN)
PRE = (TP)/(TP+FP)
REC = (TP)/(TP+FN)
F1 = 2*(TP)/(2*TP+FP+FN)

In [None]:
print('---Accuracy---')
print(ACC)
print('---Precision---')
print(PRE)
print('---Recall---')
print(REC)
print('---F1-score---')
print(F1)

In [46]:
mistakes  = FP.sum()
mistakes

132

## Random forest

In [79]:
from sklearn.ensemble import RandomForestClassifier
X = scaler.fit_transform(X_features)

X_train, X_test = X[0:3579], X[3580:]
y_train, y_test = y[0:3579], y[3580:]
    
class_weight = dict({0:4, 1:5, 2:4, 3:1})
   
rf_model = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 42, class_weight = class_weight)
rf_model = rf_model.fit(X_train, y_train)


y_pred = rf_model.predict(X_test)
cm = ConfusionMatrix(y_test, y_pred)

confusion_matrix = pd.DataFrame(data=cm.table)
confusion_matrix.columns=['Contraindication', 'Effect', 'Relief', 'Treatment']
confusion_matrix.index = ['Contraindication', 'Effect', 'Relief', 'Treatment']
confusion_matrix

In [None]:
# Compute Accuracy
FP = confusion_matrix.sum(axis=0) - np.diag(confusion_matrix)  
FN = confusion_matrix.sum(axis=1) - np.diag(confusion_matrix)
TP = np.diag(confusion_matrix)
TN = confusion_matrix.values.sum() - (FP + FN + TP)

# Overall accuracy
ACC = (TP+TN)/(TP+FP+FN+TN)
PRE = (TP)/(TP+FP)
REC = (TP)/(TP+FN)
F1 = 2*(TP)/(2*TP+FP+FN)

print('---Accuracy---')
print(ACC)
print('---Precision---')
print(PRE)
print('---Recall---')
print(REC)
print('---F1-score---')
print(F1)

In [82]:
mistakes  = FP.sum()
mistakes

120