In [18]:
# DEPENDENCIES Python3
import os
import re
import numpy as np
import pickle
from nltk import word_tokenize, pos_tag, bigrams, trigrams
from nltk.tokenize import WordPunctTokenizer
from sklearn.feature_extraction import DictVectorizer
from sklearn import svm
from sklearn import metrics
from geniatagger import GeniaTagger

In [19]:
# CONSTANTS
# The directories where the gold standard xml is stored (update as required)
TRAIN_DIR = './Train/'
TEST_DIR = './Test/'
GENIA_TAGGER_PATH = '/Users/macbook13/Desktop/geniatagger-3.0.2/geniatagger'

In [None]:
# PARSING UTILITIES
def is_drug_bank_filename(path):
    return path.count('DrugBank') > 0 and path[-4:] == '.xml'


def is_med_line_filename(path):
    return path.count('MedLine') > 0 and path[-4:] == '.xml'


def sentence_from_doc(doc):
    doc = doc.replace('\n', '').replace('</document>', '')
    sentences = doc.split('<sentence')[1:]
    return ['<sentence' + str for str in sentences]


def text_from_sentence(sentence):
    regex = r'<sentence .*?text="(.*?)".*?>'
    return re.findall(regex, sentence)[0]


def drug_pos_from_sentence(sentence):
    regex = r'<entity.*?id="(.*?)".*?charOffset="(.*?)".*?type="(.*?)".*?text="(.*?)"'
    return re.findall(regex, sentence)


def drug_pairs_from_sentence(sentence):
    regex = r'<pair.*?e1="(.*?)".*?e2="(.*?)".*?(?:ddi=".*?" type="(.*?)"\/>|ddi="(.*?)")'
    return re.findall(regex, sentence)


def replace_drug_name(text_string, drug_pos_string, replace_string):
    replaced_text = []

    for pos_pair in drug_pos_string.split(';'):
        start, end = pos_pair.split('-')
        middle = int((int(end) + int(start))/2)

        for text_idx, text_char in enumerate(list(text_string)):
            
            if text_idx >= int(start) and text_idx <= int(end):
                text_char = ' '
            
            if middle <= text_idx < middle + len(replace_string):
                text_char = replace_string[text_idx - middle]
                
            replaced_text.append(text_char)                            

    return ''.join(replaced_text)


def parse_drug_ddi(xml_dir, tags=None):
    
    for root, dirs, files in os.walk(xml_dir):
        
        for file_name in files:
            path = os.path.join(root, file_name)
            
            if is_drug_bank_filename(path) or is_med_line_filename(path):
                doc = open(path, 'rb').read().decode('utf-8')
                sentences = sentence_from_doc(doc)
                
                for sentence in sentences:
                    text = text_from_sentence(sentence)
                    drug_pos_map = {}
                    
                    # First we replace any drug instances with DGN token 'DGN' has been chosen as they have no conflict with the corpus
                    for drug_id, drug_pos_str, drug_type, drug_text in drug_pos_from_sentence(sentence):
                        drug_pos_map[drug_id] = [drug_pos_str, drug_type, drug_text]
                        text = replace_drug_name(text, drug_pos_str, 'DGN')
                    
                    # Then for each pair we update the token to be either 'DGX' or 'DGY' depending on the pair
                    for e1, e2, ddi_type, ddi_bool in drug_pairs_from_sentence(sentence):                            
                        e1_drug_pos_str, e1_drug_type, e1_drug_text = drug_pos_map[e1]
                        e2_drug_pos_str, e2_drug_type, e2_drug_text = drug_pos_map[e2]
                        text = replace_drug_name(text, e1_drug_pos_str, 'DGX')
                        text = replace_drug_name(text, e2_drug_pos_str, 'DGY')
                        e_meta = [e1_drug_text, e1_drug_type, e2_drug_text, e2_drug_type]
                        yield text, e_meta, str(ddi_bool or ddi_type)

xml_reader = parse_drug_ddi(TRAIN_DIR)
next(xml_reader)

In [21]:
# FEATURE BUILDING FUNCTIONS
tagger = GeniaTagger(GENIA_TAGGER_PATH)
tokenizer = WordPunctTokenizer()


def tokenize(text):
    text = text.replace(',', ' ,').replace('-', ' ').replace('/', ' / ')
    text = re.sub('\d', "num", text)
    text = tokenizer.tokenize(text)
    return ' '.join(text)


def feature_builder(data_reader):    
    for text, e_meta, _ in data_reader:        
        feature_dict = {}
        sent = tokenize(text)
        split_text = text.replace('DGX', ' DGX ').replace('DGY', ' DGY ').split()
        e1_split = split_text.index('DGX')
        e2_split = split_text.index('DGY')
        sent_list, _, pos, chunk, _ = list(zip(*tagger.parse(sent)))
        
        #CF1 : any word between relation arguments
        for k,i in enumerate(range(e1_split+1, e2_split)):
            feature_dict["CF1_"+str(k)] = split_text[i] 

        #CF2 : any pos between relation arguments
        for k,i in enumerate(range(e1_split+1, e2_split)):
            feature_dict["CF2_"+str(k)] = split_text[i] 

        #CF3 : any bigram between relation arguments
        sent_bigrams = list(bigrams(split_text[e1_split+1:e2_split]))
        for k,bigram in enumerate(sent_bigrams):
            feature_dict['CF3_'+str(k)] = '-'.join(bigram)

        #CF4 : word preciding first argument
        if e1_split == 0:
            feature_dict['CF4'] = '<S>'
        else:
            feature_dict['CF4'] = split_text[e1_split - 1]

        #CF5 : word prediding second arguments
        if e2_split == 0:
            feature_dict['CF5'] = '<S>'
        else:
            feature_dict['CF5'] = split_text[e2_split - 1]

        #CF6 : any three words succeeding the first arguments

        if e1_split <= len(sent) - 3 : 
            sent_trigrams = list(trigrams(split_text[e1_split+1:]))
            for k,trigram in enumerate(sent_trigrams):
                feature_dict['CF6_'+str(k)] = '-'.join(trigram)
        else:
            feature_dict['CF6_0'] = '<E>'

        #CF7 : any three succeeding the second arguments
        if e2_split <= len(sent) - 3 : 
            sent_trigrams = list(trigrams(split_text[e2_split+1:]))
            for k,trigram in enumerate(sent_trigrams):
                feature_dict['CF7_'+str(k)] = '-'.join(trigram)
        else:
            feature_dict['CF7_0'] = '<E>'

        #CF8 : sequence of chunk type between relation argumemts
        feature_dict['CF8'] = '-'.join(chunk[e1_split+1:e2_split])

        #CF9 : string of words between relation arguments
        feature_dict['CF9'] = '-'.join(split_text[e1_split+1:e2_split])

        #CF13 : Distance between two arguments
        feature_dict['CF13'] = abs(sent.index('DGX') - sent.index('DGY'))

        #CF14 : Presence of puncuation sign between arguments
        if split_text[e1_split : e2_split] == [','] or ['and'] or ['or'] or ['/'] :
            feature_dict['CF14'] = True
        else:
            feature_dict['CF14'] = False

        yield feature_dict

In [30]:
classes = {'false':0, 'mechanism': 1, 'effect': 2, 'advise': 3, 'int': 4, 'true': 5}
print("PARSING DATA")
train_data = parse_drug_ddi(TRAIN_DIR)
test_data = parse_drug_ddi(TEST_DIR)
print("BUILDING FEATURES")

if os.path.isfile('train_features.pickle'):
    train_features = pickle.load(open('train_features.pickle', 'rb'))
else: 
    print('Building features, may take a while')
    train_features = feature_builder(train_data)

if os.path.isfile('test_features.pickle'):
    test_features = pickle.load(open('test_features.pickle', 'rb'))
else: 
    print('Building features, may take a while')
    test_features = feature_builder(test_data)

print("VECTORIZING FEATURES")
vec = DictVectorizer()
print('X_train')
X_train = vec.fit_transform(train_features)
print('Y_train')
Y_train = [classes[label] for _, _, label in train_data]
print('X_test')
X_test = vec.transform(test_features)
print('Y_test')
Y_test = [classes[label] for _, _, label in test_data]

PARSING DATA


TypeError: 'generator' object is not subscriptable

In [None]:
# EVALUATING THE CLASSIFIER
print("FITTING CLASSIFIER")
clf = svm.SVC(kernel='linear', C=0.1).fit(X_train, Y_train)
a = clf.score(X_test, Y_test)
print("accuracy", a)
y_true = Y_test
y_pred = clf.predict(X_test)
print(metrics.classification_report(y_true, y_pred,[1,2,3,4],digits=4))