In [1]:
# Packages needed

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('stopwords')
nltk.download('punkt')
import numpy as np
import pandas as pd

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/kristi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/kristi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Import data set

data = pd.read_csv('data.csv')
text =  data['text']
labels = data['relation_type']
disease = data['disease_name']
drug = data['drug_name']


In [3]:
# Functions for cleaning 

def remove (x):
    no_punct = ""
    for char in x:
            if char in '''qwertyuiopasdfghjklzxcvbnmQWERTYUIOPASDFGHJKLZXCVBNM ''':
                    no_punct = no_punct + char
    return no_punct

def remove_non_digits (x):
    no_punct = ""
    for char in x:
            if char in '''1234567890qwertyuiopasdfghjklzxcvbnmQWERTYUIOPASDFGHJKLZXCVBNM ''':
                    no_punct = no_punct + char
    return no_punct

def lower (x):
    return x.lower()

# Preprocess

In [4]:
# --- Encode Labels ---

from sklearn import preprocessing
le = preprocessing.LabelEncoder()
le.fit(labels)
le.classes_
labels_en = le.transform(labels) 

In [5]:
# --- Clean Drug and Disease ---

#Disease
disease_new_1 = disease.apply(remove_non_digits)
disease_new_2 = disease_new_1.apply(lower)
disease_new = disease_new_2.apply(word_tokenize)


#Drug
drug_new_1 = drug.apply(remove_non_digits)
drug_new_2 = drug_new_1.apply(lower)
drug_new = drug_new_2.apply(word_tokenize)




In [6]:
# --- Remove stop words and clean Text ---

#lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))
corpus_prep= []
text_new=[]


for i in range(len(text)):
    text_new.append(remove(text[i]))
    
for i in range (len(text_new)):
    sentence = text_new[i].lower()
    word_tokens = word_tokenize(sentence)
    filtered_sentence = [w for w in word_tokens if not w in stop_words]
    filtered_sentence = []
    
    for w in word_tokens:
        if w not in stop_words and len(w)>2:
            filtered_sentence.append(w)
    corpus_prep.append(filtered_sentence)

In [7]:
# New Data Frame with tokenized and clean sentences

d = {'label': labels_en, 'text': corpus_prep, 'disease': disease_new, 'drug':drug_new}
df = pd.DataFrame(data=d)

# Embeddings

## TFIDF

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()

In [12]:
seq = (text_new[0], disease[0], drug[0])

for i in range(len(text_new)):
    seq = seq + (text_new[i], disease[i], drug[i])
rows = " ".join(seq)

In [13]:
for i in range(len(text_new)):
    rows = rows + text_new[i] + drug[i] +disease[i]

In [44]:
response = tfidf.fit_transform([rows])

## Word2Vec

In [8]:
# Word2Vec
from gensim.models import Word2Vec

# Encode the labels 
y = labels_en

# Create the total corpus where the word2vec is going to be trained 
corpus_final = corpus_prep + disease_new + drug_new

In [9]:
# Train the model of vector representation
model = Word2Vec(sentences = corpus_final, size = 100, sg = 1, window = 3, 
                 min_count = 1, iter = 10, workers = 3)

In [10]:
# Create a copy of the initial df so that we work on that 
df_en = df.copy()

In [11]:
# For each instance/sentence compute the average of all the words 

def avg_vector(list_of_words):
    vector_sum = model.wv[list_of_words[0]]
    for i in range(1,len(list_of_words)):
        vector_sum = vector_sum +  model.wv[list_of_words[i]]
    return vector_sum/len(list_of_words)


#def avg_vector_test():
    #text = ['nisoldipine', 'extendedrelease', 'tablets', 'indicated', 'treatment', 'hypertension', 'may', 'used', 'alone', 'combination', 'antihypertensive', 'agents']
    #print (text)
    #print (avg_vector(text))
    



In [12]:
# 
df_en['text'] = df_en['text'].apply(avg_vector)
df_en['disease'] = df_en['disease'].apply(avg_vector)
df_en['drug'] = df_en['drug'].apply(avg_vector)


In [13]:
arr1 =  df_en['text'].values
arr2 = np.array( df_en['drug'])
arr3 = np.array( df_en['disease'] )

In [14]:
X_arr1 = np.array(list(map(np.array, arr1)))
X_arr2 = np.array(list(map(np.array, arr2)))
X_arr3 = np.array(list(map(np.array, arr3)))

# The instances and the labels
X_features =np.hstack((X_arr1, X_arr2, X_arr3))
y = list(y)

In [15]:
# Check the shape, we have 1322 data points of 3 different features (text, drug, disease) each having a shape of 100
# so we should have a total ok (1322,300)
X_features.shape

(1322, 300)

In [16]:
y = np.array(y)

In [None]:
d1 = {'feature': X_features, 'class': y}
df1 = pd.DataFrame(data=d1)
embedded = df1.to_csv('embedded.csv')

# Baseline Models

## Split 

In [17]:
#from sklearn.model_selection import train_test_split

In [18]:
#X_train, X_test, y_train, y_test = train_test_split(X_features, y, test_size=0.2)

## Scaling

In [98]:
# Scaling

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix,classification_report
from sklearn.model_selection import cross_val_score, GridSearchCV

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

## SVM

In [35]:
params_grid = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],
                     'C': [1, 10, 100, 1000]},
                    {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]

In [None]:
from sklearn import svm 
from sklearn.svm import SVC

from sklearn.model_selection import StratifiedKFold, KFold
from pycm import *

skf = StratifiedKFold(n_splits=5)
#kf = KFold(n_splits=2)
X = scaler.fit_transform(X_features)
#X = X_features

all_cm = []

for train_index, test_index in skf.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    print (len(X_train), len(X_test))
    
    #svm_model = GridSearchCV(SVC(), params_grid, cv=5)
    model = svm.SVC(gamma= 'scale')
    model.fit(X_train, y_train)
    
    
    y_pred = model.predict(X_test)
    
    cm = ConfusionMatrix(y_test, y_pred)
    print(cm)
    all_cm.append(c)
    

In [23]:
# print('Best score for training data:', svm_model.best_score_,"\n") 

# View the best parameters for the model found using grid search
# print('Best C:',svm_model.best_estimator_.C,"\n") 
# print('Best Kernel:',svm_model.best_estimator_.kernel,"\n")
# print('Best Gamma:',svm_model.best_estimator_.gamma,"\n")


#final_model = svm_model.best_estimator_
#y_pred = final_model.predict(X_test_scaled)
#y_pred_label = list(le.inverse_transform(y_pred))

## Decision Tree

In [112]:
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
from sklearn.model_selection import StratifiedKFold
from pycm import *

In [113]:
skf = StratifiedKFold(n_splits=5)
dt_model = DecisionTreeClassifier()

X = scaler.fit_transform(X_features)
#X = X_features
all_cm = []


for train_index, test_index in skf.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    print (len(X_train), len(X_test))
    
    dt_model = DecisionTreeClassifier()
    dt_model = dt_model.fit(X_train, y_train)
    
    
    y_pred = dt_model.predict(X_test)
    
    cm = ConfusionMatrix(y_test, y_pred)
    print(cm)
    all_cm.append(c)

1057 265
Predict   0         1         2         3         
Actual
0         16        0         0         0         

1         0         12        0         0         

2         0         0         34        0         

3         0         0         0         203       





Overall Statistics : 

95% CI                                                            (1.0,1.0)
ACC Macro                                                         1.0
AUNP                                                              1.0
AUNU                                                              1.0
Bennett S                                                         1.0
CBA                                                               1.0
Chi-Squared                                                       795.0
Chi-Squared DF                                                    9
Conditional Entropy                                               -0.0
Cramer V                                                     

In [114]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       1.00      0.94      0.97        16
           1       1.00      1.00      1.00        12
           2       1.00      0.97      0.99        34
           3       0.99      1.00      1.00       202

    accuracy                           0.99       264
   macro avg       1.00      0.98      0.99       264
weighted avg       0.99      0.99      0.99       264



## Random forest

In [115]:
from sklearn.ensemble import RandomForestClassifier
X = scaler.fit_transform(X_features)
#X = X_features
all_cm = []


for train_index, test_index in skf.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    print (len(X_train), len(X_test))
    
    rf_model = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 42)
    rf_model = rf_model.fit(X_train, y_train)
    
    
    y_pred = rf_model.predict(X_test)
    
    cm = ConfusionMatrix(y_test, y_pred)
    print(cm)
    all_cm.append(c)

1057 265
Predict   0         1         2         3         
Actual
0         13        0         1         2         

1         0         12        0         0         

2         0         0         32        2         

3         0         0         0         203       





Overall Statistics : 

95% CI                                                            (0.96475,0.99751)
ACC Macro                                                         0.99057
AUNP                                                              0.96558
AUNU                                                              0.9606
Bennett S                                                         0.97484
CBA                                                               0.93359
Chi-Squared                                                       718.01811
Chi-Squared DF                                                    9
Conditional Entropy                                               0.09384
Cramer V                   

In [116]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       1.00      0.69      0.81        16
           1       0.92      1.00      0.96        12
           2       1.00      0.97      0.99        34
           3       0.98      1.00      0.99       202

    accuracy                           0.98       264
   macro avg       0.97      0.91      0.94       264
weighted avg       0.98      0.98      0.98       264

