In [246]:
# Import data set

import pandas as pd
import numpy as np

data = pd.read_csv('data.csv')
text =  data['text']
labels = data['relation_type']
disease = data['disease_name']
drug = data['drug_name']


In [247]:
# Functions for cleaning 

def remove (x):
    no_punct = ""
    for char in x:
            if char in '''qwertyuiopasdfghjklzxcvbnmQWERTYUIOPASDFGHJKLZXCVBNM ''':
                    no_punct = no_punct + char
    return no_punct

def remove_non_digits (x):
    no_punct = ""
    for char in x:
            if char in '''1234567890qwertyuiopasdfghjklzxcvbnmQWERTYUIOPASDFGHJKLZXCVBNM ''':
                    no_punct = no_punct + char
    return no_punct

def lower (x):
    return x.lower()

# Preprocess

In [248]:
# --- Encode Labels ---

from sklearn import preprocessing
le = preprocessing.LabelEncoder()
le.fit(labels)
le.classes_
labels_en = le.transform(labels) 

# --- Encode Drugs ---
drug_new = drug.apply(lower)
le.fit(drug_new)
le.classes_
drugs_labbeled = le.transform(drug_new) 

# --- Encode Disease ---
disease_new = disease.apply(lower)
le.fit(disease_new)
le.classes_
disease_labbeled = le.transform(disease_new) 

In [249]:
# --- Remove stop words and clean the Text ---

from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 

stop_words = set(stopwords.words('english'))
text_new = text.apply(remove)
text_new = text_new.apply(lower)

def remove_stopwords(sentence):
    word_tokens = word_tokenize(sentence)
    filtered_sentence = []
    for w in word_tokens:
        if w not in stop_words and len(w)>=2:
            filtered_sentence.append(w)
    return filtered_sentence

In [250]:
corpus = text_new.apply(remove_stopwords)

In [251]:
# New Data Frame with tokenized and clean sentences

d = {'label': labels_en, 'text': corpus, 'disease': disease_labbeled, 'drug':drugs_labbeled}
df = pd.DataFrame(data=d)

# Embeddings

## Word2Vec

In [252]:
# Word2Vec
from gensim.models import Word2Vec

# Encode the labels 
y = labels_en


In [253]:
# Train the model of vector representation
model = Word2Vec(sentences = corpus, size = 100, sg = 1, window = 3, 
                 min_count = 1, iter = 10, workers = 3)

In [254]:
# Create a copy of the initial df so that we work on that 
df_en = df.copy()

In [255]:
# For each instance/sentence compute the average of all the words 

def avg_vector(list_of_words):
    vector_sum = model.wv[list_of_words[0]]
    for i in range(1,len(list_of_words)):
        vector_sum = vector_sum +  model.wv[list_of_words[i]]
    return vector_sum/len(list_of_words)

In [256]:
df_en['text'] = df_en['text'].apply(avg_vector)

In [257]:
# Create a data frame with the embedded text
d2 = {'label': y, 'text':df_en['text'] , 'disease': disease_labbeled, 'drug':drugs_labbeled}
df2 = pd.DataFrame(data=d2)
embedded = df2.to_csv('embedded.csv')

In [258]:
# Create the features

X_features = []
for i in range(len(df_en['text'])):
    a = df_en['text'][i].tolist()
    b = int(drugs_labbeled[i])
    c = int(disease_labbeled[i])
    a.append(b)
    d = a
    d.append(c)
    e = d
    X_features.append(e)


# Baseline Models

In [259]:
# Scaling

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix,classification_report
from sklearn.model_selection import cross_val_score, GridSearchCV

scaler = StandardScaler()

# Plotting
import seaborn as sns

## SVM

In [260]:
from sklearn import svm 
from sklearn.svm import SVC

from sklearn.model_selection import StratifiedKFold, KFold
from pycm import *

skf = StratifiedKFold(n_splits=5)
X = scaler.fit_transform(X_features)
#X = X_features #or not scaled

all_cm = []

for train_index, test_index in skf.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    #print (len(X_train), len(X_test))
    
    
    model = svm.SVC(gamma= 'scale')
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    cm = ConfusionMatrix(y_test, y_pred)
    
    all_cm.append(cm)
    

In [261]:
for i in range(5):
    print(all_cm[i].table)

{0: {0: 1, 1: 0, 2: 0, 3: 15}, 1: {0: 0, 1: 5, 2: 0, 3: 7}, 2: {0: 0, 1: 0, 2: 18, 3: 16}, 3: {0: 2, 1: 12, 2: 1, 3: 188}}
{0: {0: 1, 1: 0, 2: 1, 3: 14}, 1: {0: 0, 1: 2, 2: 0, 3: 10}, 2: {0: 1, 1: 0, 2: 16, 3: 17}, 3: {0: 6, 1: 0, 2: 0, 3: 197}}
{0: {0: 1, 1: 1, 2: 0, 3: 14}, 1: {0: 0, 1: 7, 2: 0, 3: 5}, 2: {0: 0, 1: 0, 2: 16, 3: 18}, 3: {0: 0, 1: 14, 2: 1, 3: 187}}
{0: {0: 2, 1: 0, 2: 1, 3: 13}, 1: {0: 0, 1: 1, 2: 0, 3: 11}, 2: {0: 1, 1: 0, 2: 16, 3: 17}, 3: {0: 5, 1: 0, 2: 0, 3: 197}}
{0: {0: 2, 1: 0, 2: 0, 3: 14}, 1: {0: 0, 1: 4, 2: 0, 3: 8}, 2: {0: 0, 1: 0, 2: 14, 3: 20}, 3: {0: 2, 1: 0, 2: 0, 3: 200}}


In [262]:
# Combine the different confusion matrices from the k validation sets

confusion_matrix = pd.DataFrame()
for i in range(len(all_cm)):
      confusion_matrix= confusion_matrix.append(pd.DataFrame(all_cm[i].table))
        
confusion_matrix = confusion_matrix.groupby(confusion_matrix.index).sum()
confusion_matrix.columns=['Contraindication', 'Effect', 'Syptomatic Relief', 'Treatment']
confusion_matrix.index = ['Contraindication', 'Effect', 'Syptomatic Relief', 'Treatment']
confusion_matrix


Unnamed: 0,Contraindication,Effect,Syptomatic Relief,Treatment
Contraindication,7,0,2,15
Effect,1,19,0,26
Syptomatic Relief,2,0,80,2
Treatment,70,41,88,969


In [263]:
# The output matrix is: Predicted/Actual; 

In [264]:
# Metrics from confusion matrix
FP = confusion_matrix.sum(axis=0) - np.diag(confusion_matrix)  
FN = confusion_matrix.sum(axis=1) - np.diag(confusion_matrix)
TP = np.diag(confusion_matrix)
TN = confusion_matrix.values.sum() - (FP + FN + TP)

# Overall accuracy
ACC = (TP+TN)/(TP+FP+FN+TN)
PRE = (TP)/(TP+FP)
REC = (TP)/(TP+FN)
F1 = 2*(TP)/(2*TP+FP+FN)

In [269]:
REC

Contraindication     0.291667
Effect               0.413043
Syptomatic Relief    0.952381
Treatment            0.829623
dtype: float64

## Decision Tree

In [270]:
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
from sklearn.model_selection import StratifiedKFold
from pycm import *

In [271]:
skf = StratifiedKFold(n_splits=5)
dt_model = DecisionTreeClassifier()

X = scaler.fit_transform(X_features)
#X = X_features
all_cm = []


for train_index, test_index in skf.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    #print (len(X_train), len(X_test))
    
    dt_model = DecisionTreeClassifier()
    dt_model = dt_model.fit(X_train, y_train)
    
    
    y_pred = dt_model.predict(X_test)
    
    cm = ConfusionMatrix(y_test, y_pred)
    
    all_cm.append(cm)

In [272]:
for i in range(5):
    print(all_cm[i].table)

{0: {0: 16, 1: 0, 2: 0, 3: 0}, 1: {0: 0, 1: 12, 2: 0, 3: 0}, 2: {0: 0, 1: 0, 2: 34, 3: 0}, 3: {0: 0, 1: 0, 2: 0, 3: 203}}
{0: {0: 16, 1: 0, 2: 0, 3: 0}, 1: {0: 0, 1: 12, 2: 0, 3: 0}, 2: {0: 0, 1: 0, 2: 34, 3: 0}, 3: {0: 0, 1: 0, 2: 0, 3: 203}}
{0: {0: 16, 1: 0, 2: 0, 3: 0}, 1: {0: 0, 1: 12, 2: 0, 3: 0}, 2: {0: 0, 1: 0, 2: 34, 3: 0}, 3: {0: 0, 1: 0, 2: 0, 3: 202}}
{0: {0: 16, 1: 0, 2: 0, 3: 0}, 1: {0: 0, 1: 12, 2: 0, 3: 0}, 2: {0: 0, 1: 0, 2: 34, 3: 0}, 3: {0: 0, 1: 0, 2: 0, 3: 202}}
{0: {0: 15, 1: 0, 2: 0, 3: 1}, 1: {0: 0, 1: 12, 2: 0, 3: 0}, 2: {0: 0, 1: 0, 2: 34, 3: 0}, 3: {0: 0, 1: 0, 2: 0, 3: 202}}


In [273]:
# Combine the different confusion matrices from the k validation sets

confusion_matrix = pd.DataFrame()
for i in range(len(all_cm)):
      confusion_matrix= confusion_matrix.append(pd.DataFrame(all_cm[i].table))
        
confusion_matrix = confusion_matrix.groupby(confusion_matrix.index).sum()
confusion_matrix.columns=['Contraindication', 'Effect', 'Syptomatic Relief', 'Treatment']
confusion_matrix.index = ['Contraindication', 'Effect', 'Syptomatic Relief', 'Treatment']
confusion_matrix

Unnamed: 0,Contraindication,Effect,Syptomatic Relief,Treatment
Contraindication,79,0,0,0
Effect,0,60,0,0
Syptomatic Relief,0,0,170,0
Treatment,1,0,0,1012


In [274]:
# Metrics from confusion matrix
FP = confusion_matrix.sum(axis=0) - np.diag(confusion_matrix)  
FN = confusion_matrix.sum(axis=1) - np.diag(confusion_matrix)
TP = np.diag(confusion_matrix)
TN = confusion_matrix.values.sum() - (FP + FN + TP)

# Overall accuracy
ACC = (TP+TN)/(TP+FP+FN+TN)
PRE = (TP)/(TP+FP)
REC = (TP)/(TP+FN)
F1 = 2*(TP)/(2*TP+FP+FN)

In [275]:
ACC

Contraindication     0.999244
Effect               1.000000
Syptomatic Relief    1.000000
Treatment            0.999244
dtype: float64

## Random forest

In [281]:
from sklearn.ensemble import RandomForestClassifier
X = scaler.fit_transform(X_features)
#X = X_features
all_cm = []


for train_index, test_index in skf.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    #print (len(X_train), len(X_test))
    
    rf_model = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 42)
    rf_model = rf_model.fit(X_train, y_train)
    
    
    y_pred = rf_model.predict(X_test)
    cm = ConfusionMatrix(y_test, y_pred)
    
    all_cm.append(cm)

In [282]:
for i in range(5):
    print(all_cm[i].table)

{0: {0: 13, 1: 0, 2: 1, 3: 2}, 1: {0: 0, 1: 12, 2: 0, 3: 0}, 2: {0: 0, 1: 0, 2: 32, 3: 2}, 3: {0: 0, 1: 2, 2: 2, 3: 199}}
{0: {0: 15, 1: 0, 2: 0, 3: 1}, 1: {0: 0, 1: 12, 2: 0, 3: 0}, 2: {0: 0, 1: 1, 2: 32, 3: 1}, 3: {0: 2, 1: 0, 2: 1, 3: 200}}
{0: {0: 15, 1: 0, 2: 0, 3: 1}, 1: {0: 0, 1: 12, 2: 0, 3: 0}, 2: {0: 0, 1: 0, 2: 31, 3: 3}, 3: {0: 0, 1: 2, 2: 3, 3: 197}}
{0: {0: 14, 1: 1, 2: 0, 3: 1}, 1: {0: 0, 1: 11, 2: 1, 3: 0}, 2: {0: 1, 1: 0, 2: 33, 3: 0}, 3: {0: 0, 1: 0, 2: 0, 3: 202}}
{0: {0: 10, 1: 0, 2: 2, 3: 4}, 1: {0: 0, 1: 12, 2: 0, 3: 0}, 2: {0: 0, 1: 0, 2: 32, 3: 2}, 3: {0: 2, 1: 0, 2: 0, 3: 200}}


In [283]:
# Combine the different confusion matrices from the k validation sets

confusion_matrix = pd.DataFrame()
for i in range(len(all_cm)):
      confusion_matrix= confusion_matrix.append(pd.DataFrame(all_cm[i].table))
        
confusion_matrix = confusion_matrix.groupby(confusion_matrix.index).sum()
confusion_matrix.columns=['Contraindication', 'Effect', 'Syptomatic Relief', 'Treatment']
confusion_matrix.index = ['Contraindication', 'Effect', 'Syptomatic Relief', 'Treatment']
confusion_matrix

Unnamed: 0,Contraindication,Effect,Syptomatic Relief,Treatment
Contraindication,67,0,1,4
Effect,1,59,1,4
Syptomatic Relief,3,1,160,6
Treatment,9,0,8,998


In [284]:
# Metrics from confusion matrix
FP = confusion_matrix.sum(axis=0) - np.diag(confusion_matrix)  
FN = confusion_matrix.sum(axis=1) - np.diag(confusion_matrix)
TP = np.diag(confusion_matrix)
TN = confusion_matrix.values.sum() - (FP + FN + TP)

# Overall accuracy
ACC = (TP+TN)/(TP+FP+FN+TN)
PRE = (TP)/(TP+FP)
REC = (TP)/(TP+FN)
F1 = 2*(TP)/(2*TP+FP+FN)

In [285]:
ACC

Contraindication     0.986384
Effect               0.994705
Syptomatic Relief    0.984871
Treatment            0.976551
dtype: float64