#### Importing required lib

In [1]:
import pandas as pd
import sklearn
import numpy as np
from tqdm import tqdm_notebook
import os,time

In [2]:
train_files = os.listdir(path='rel/BioNLP-OST-2019_BB-rel_train')
valid_files = os.listdir(path='rel/BioNLP-OST-2019_BB-rel_dev/')

In [3]:
def get_data_files(files,types = 'train'):
    #creating two dataframes
    a1_df = pd.DataFrame(columns=['Words','sc_ec','Entity'])
    a2_df = pd.DataFrame(columns = ['Relation','word_1','word_2'])

    #reading all files 
    for file in tqdm_notebook(files):
        #look for files ends with .a1
        if file.endswith('.a1'):

            if (os.path.getsize(f'rel/BioNLP-OST-2019_BB-rel_{types}/{file}') > 0 ):
                a1 = pd.read_csv(f'rel/BioNLP-OST-2019_BB-rel_{types}/{file}',sep='\t',header=None)
                spl = a1[1].str.split(' ',n=1,expand=True)
                a1['Words'] = a1[2]
                a1['sc_ec'] = spl[1]
                a1['Entity'] = spl[0]
                a1.drop([1,2],axis=1,inplace=True)
                a1_df = a1_df.append(a1,ignore_index=True,sort=True)

                #comparing with a2 and append it to a2_data frame

                file = file.split(sep='.')

                if(os.path.getsize(f'rel/BioNLP-OST-2019_BB-rel_{types}/{file[0]}.a2') > 0):
                    a2 = pd.read_csv(f'rel/BioNLP-OST-2019_BB-rel_{types}/{file[0]}.a2',sep = '\t',header=None)
                    spl = a2[1].str.split(' ',n = 2,expand= True)
                    spl_1 = spl[1].str.split(':',n = 1,expand=True)
                    spl_2 = spl[2].str.split(':',n = 1,expand=True)

                    a2['Relation'] = spl[0]
                    a2['w_1'] = spl_1[1]
                    a2['w_2'] = spl_2[1]

                    me = pd.merge(a2,a1[[0,'Words']],how='left',
                                     left_on = ['w_1'],
                                     right_on = [0])
                    a2['word_1'] = me['Words']
                    del me
                    me = pd.merge(a2,a1[[0,'Words']],how='left',
                                     left_on = ['w_2'],
                                     right_on = [0])
                    a2['word_2'] = me['Words']
                    del me

                    a2.drop([1,'w_1','w_2'],axis = 1,inplace = True)
                    a2_df = a2_df.append(a2,ignore_index=True,sort=True)
                    del a1,a2
                    
                    a1_df.drop([0],axis=1,inplace = True)
                    
    return a1_df,a2_df


## Training data - Preprocessing

In [4]:
train_a1,train_a2 = get_data_files(train_files,'train')

HBox(children=(IntProgress(value=0, max=375), HTML(value='')))




In [5]:
train_a2.head()

Unnamed: 0,Relation,word_1,word_2,0
0,Lives_In,Non-O1 Vibrio cholerae,patients with cirrhosis,R1
1,Lives_In,non-O1 V. cholerae,blood culture,R2
2,Lives_In,non-O1 V. cholerae,blood,R3
3,Lives_In,non-O1 V. cholerae,Patients with underlying cirrhosis and the afo...,R4
4,Lives_In,non-O1 V. cholerae,Patients with decompensated cirrhosis,R5


In [6]:
print(f'total number of train_a1 samples with title and paragraph : {train_a1.shape}')

total number of train_a1 samples with title and paragraph : (2329, 3)


In [7]:
train_a1.drop(train_a1[train_a1['Entity'] == 'Title'].index,axis=0,inplace=True)
train_a1.drop(train_a1[train_a1['Entity'] == 'Paragraph'].index,axis=0,inplace=True)

In [8]:
print(f'total number of train_a1 samples without title and paragraph : {train_a1.shape}')
print(f'total number of train_a2 samples : {train_a2.shape}')

total number of train_a1 samples without title and paragraph : (2179, 3)
total number of train_a2 samples : (1142, 4)


## validation data - Preprocessing

In [9]:
valid_a1,valid_a2 = get_data_files(valid_files,'dev')

HBox(children=(IntProgress(value=0, max=192), HTML(value='')))




In [10]:
valid_a1.head()

Unnamed: 0,Entity,Words,sc_ec
0,Title,Neutrophils are resistant to Yersinia YopJ/P-i...,0 143
1,Paragraph,The human innate immune system relies on the c...,144 664
2,Paragraph,"In this study, we utilized wild-type and mutan...",665 1672
3,Paragraph,Our findings showed that Yersinia YopJ and/or ...,1673 2071
4,Habitat,Neutrophils,0 11


In [11]:
valid_a2.head()

Unnamed: 0,Relation,word_1,word_2,0
0,Lives_In,Yersinia,humans,R1
1,Exhibits,Yersinia,bacterial pathogens,R2
2,Exhibits,Yersinia,bacterial pathogens,R3
3,Exhibits,Y. pestis,bacterial pathogens,R4
4,Exhibits,Y. pseudotuberculosis,bacterial pathogens,R5


In [12]:
print(f'total number of valid_a1 samples with title and paragraph : {valid_a1.shape}')

total number of valid_a1 samples with title and paragraph : (1354, 3)


In [13]:
valid_a1.drop(valid_a1[valid_a1['Entity'] == 'Title'].index,axis=0,inplace=True)
valid_a1.drop(valid_a1[valid_a1['Entity'] == 'Paragraph'].index,axis=0,inplace=True)

In [14]:
train_a2.drop([0],axis=1,inplace=True)
valid_a2.drop([0],axis=1,inplace=True)

In [15]:
print(f'total number of valid_a1 samples without title and paragraph : {valid_a1.shape}')
print(f'total number of valid_a2 samples : {valid_a2.shape}')

total number of valid_a1 samples without title and paragraph : (1266, 3)
total number of valid_a2 samples : (610, 3)


In [16]:
train_a2.head(10)

Unnamed: 0,Relation,word_1,word_2
0,Lives_In,Non-O1 Vibrio cholerae,patients with cirrhosis
1,Lives_In,non-O1 V. cholerae,blood culture
2,Lives_In,non-O1 V. cholerae,blood
3,Lives_In,non-O1 V. cholerae,Patients with underlying cirrhosis and the afo...
4,Lives_In,non-O1 V. cholerae,Patients with decompensated cirrhosis
5,Lives_In,Kingella kingae,Respiratory
6,Lives_In,Kingella kingae,healthy children
7,Lives_In,Kingella kingae,respiratory tract
8,Lives_In,Kingella kingae,young children
9,Exhibits,Kingella kingae,invasive pathogen of young children


In [17]:
na_inx = train_a2[train_a2['Relation'] == 'Equiv'].index.values

In [18]:
train_a2.drop(na_inx,inplace=True)

In [19]:
na_inx = valid_a2[valid_a2['Relation'] == 'Equiv'].index.values
valid_a2.drop(na_inx,inplace=True)

### Removing stop words from words

In [20]:
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
import re
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/paperspace/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [21]:
#Preprocessing - removing unwanted characters, tokenization, stop-word removal
def clean_data(txts):
    x = re.sub("[^a-zA-Z0-9]", " ",txts) 
    x = x.lower().split()                             
    stops = set(stopwords.words("english"))                  
    words = [w for w in x if not w in stops] 
    
    return( " ".join(words)) 

In [22]:
train_a2['word_1'] = train_a2['word_1'].apply(lambda x: clean_data(x))
print(train_a2['word_1'].head())

0    non o1 vibrio cholerae
1         non o1 v cholerae
2         non o1 v cholerae
3         non o1 v cholerae
4         non o1 v cholerae
Name: word_1, dtype: object


In [23]:
train_a2.head(10)

Unnamed: 0,Relation,word_1,word_2
0,Lives_In,non o1 vibrio cholerae,patients with cirrhosis
1,Lives_In,non o1 v cholerae,blood culture
2,Lives_In,non o1 v cholerae,blood
3,Lives_In,non o1 v cholerae,Patients with underlying cirrhosis and the afo...
4,Lives_In,non o1 v cholerae,Patients with decompensated cirrhosis
5,Lives_In,kingella kingae,Respiratory
6,Lives_In,kingella kingae,healthy children
7,Lives_In,kingella kingae,respiratory tract
8,Lives_In,kingella kingae,young children
9,Exhibits,kingella kingae,invasive pathogen of young children


In [24]:
train_a2['word_2'] = train_a2['word_2'].apply(lambda x: clean_data(x))
print(train_a2['word_2'].head())

0                                   patients cirrhosis
1                                        blood culture
2                                                blood
3    patients underlying cirrhosis aforementioned b...
4                     patients decompensated cirrhosis
Name: word_2, dtype: object


In [25]:
valid_a2['word_1'] = valid_a2['word_1'].apply(lambda x: clean_data(x))
valid_a2['word_2'] = valid_a2['word_2'].apply(lambda x: clean_data(x))


In [26]:
valid_a2.head()

Unnamed: 0,Relation,word_1,word_2
0,Lives_In,yersinia,humans
1,Exhibits,yersinia,bacterial pathogens
2,Exhibits,yersinia,bacterial pathogens
3,Exhibits,pestis,bacterial pathogens
4,Exhibits,pseudotuberculosis,bacterial pathogens


In [27]:
train_a2['combined_word'] = train_a2[['word_1', 'word_2']].apply(lambda x: ' '.join(x), axis=1)

In [28]:
valid_a2['combined_word'] = valid_a2[['word_1', 'word_2']].apply(lambda x: ' '.join(x), axis=1)

In [29]:
train_a2.head()

Unnamed: 0,Relation,word_1,word_2,combined_word
0,Lives_In,non o1 vibrio cholerae,patients cirrhosis,non o1 vibrio cholerae patients cirrhosis
1,Lives_In,non o1 v cholerae,blood culture,non o1 v cholerae blood culture
2,Lives_In,non o1 v cholerae,blood,non o1 v cholerae blood
3,Lives_In,non o1 v cholerae,patients underlying cirrhosis aforementioned b...,non o1 v cholerae patients underlying cirrhosi...
4,Lives_In,non o1 v cholerae,patients decompensated cirrhosis,non o1 v cholerae patients decompensated cirrh...


### Converting the dataframe into list

##### Corpus for combined words

In [30]:
train_words = train_a2['combined_word'].tolist()
train_relation = train_a2['Relation'].tolist()

In [31]:
valid_words = valid_a2['combined_word'].tolist()
valid_relation = valid_a2['Relation'].tolist()

In [32]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/paperspace/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [33]:
corpus = []
for i in range(len(train_words)):
    corpus.append(word_tokenize(train_words[i]))

In [28]:
for i in range(len(valid_words)):
    corpus.append(word_tokenize(valid_words[i]))

In [37]:
corpus

[['non', 'o1', 'vibrio', 'cholerae', 'patients', 'cirrhosis'],
 ['non', 'o1', 'v', 'cholerae', 'blood', 'culture'],
 ['non', 'o1', 'v', 'cholerae', 'blood'],
 ['non',
  'o1',
  'v',
  'cholerae',
  'patients',
  'underlying',
  'cirrhosis',
  'aforementioned',
  'bacteremia'],
 ['non', 'o1', 'v', 'cholerae', 'patients', 'decompensated', 'cirrhosis'],
 ['kingella', 'kingae', 'respiratory'],
 ['kingella', 'kingae', 'healthy', 'children'],
 ['kingella', 'kingae', 'respiratory', 'tract'],
 ['kingella', 'kingae', 'young', 'children'],
 ['kingella', 'kingae', 'invasive', 'pathogen', 'young', 'children'],
 ['k', 'kingae', 'day', 'care', 'center', 'attendees'],
 ['k', 'kingae', 'throat', 'cultures'],
 ['k', 'kingae', 'throat'],
 ['k', 'kingae', 'children'],
 ['k', 'kingae', 'children'],
 ['k', 'kingae', 'surgical', 'patients'],
 ['k', 'kingae', 'pharyngeal'],
 ['campylobacter', 'uk'],
 ['campylobacter', 'uk', 'retail', 'poultry'],
 ['campylobacter', 'antimicrobial', 'resistance'],
 ['campyloba

In [29]:
train=[]
for i in range(len(train_words)):
    train.append(word_tokenize(train_words[i]))

In [30]:
test=[]
for i in range(len(valid_words)):
    test.append(word_tokenize(valid_words[i]))

### Feature extraction using word2vec

In [7]:
from gensim.models import Word2Vec
from sklearn import preprocessing

In [32]:
#Creating word embedding for the words. Embedding dimension = 100
model = Word2Vec(corpus, size=50, window=2, min_count=1)

In [42]:
#Voabulary list
model.wv.vocab

{'non': <gensim.models.keyedvectors.Vocab at 0x7f28a1b2e550>,
 'o1': <gensim.models.keyedvectors.Vocab at 0x7f28a1b2e588>,
 'vibrio': <gensim.models.keyedvectors.Vocab at 0x7f28a1b2e5c0>,
 'cholerae': <gensim.models.keyedvectors.Vocab at 0x7f28a1b2e5f8>,
 'patients': <gensim.models.keyedvectors.Vocab at 0x7f28a1b2e630>,
 'cirrhosis': <gensim.models.keyedvectors.Vocab at 0x7f28a1b2e668>,
 'v': <gensim.models.keyedvectors.Vocab at 0x7f28a1b2e6a0>,
 'blood': <gensim.models.keyedvectors.Vocab at 0x7f28a1b2e6d8>,
 'culture': <gensim.models.keyedvectors.Vocab at 0x7f28a1b2e710>,
 'underlying': <gensim.models.keyedvectors.Vocab at 0x7f28a1b2e748>,
 'aforementioned': <gensim.models.keyedvectors.Vocab at 0x7f28a1b2e780>,
 'bacteremia': <gensim.models.keyedvectors.Vocab at 0x7f28a1b2e7b8>,
 'decompensated': <gensim.models.keyedvectors.Vocab at 0x7f28a1b2e7f0>,
 'kingella': <gensim.models.keyedvectors.Vocab at 0x7f28a1b2e828>,
 'kingae': <gensim.models.keyedvectors.Vocab at 0x7f28a1b2e860>,
 'res

In [33]:
#Creating the input data

#Initializing the X matrix with zeros
X_train= np.zeros((len(train),50)) 

for i in range(len(train)):
    #Create a list of word embeddings of the words in each sentence
    emb = [model.wv[w] for w in train[i]] 
    #Take the mean of the word embeddings of the words in a sentence 
    #because length of the sentences varies and the dimension of the 
    #features will increase with the increase in the number of 
    #words in the sentence
    X_train[i] = np.mean(emb, axis=0) 

In [34]:
#Creating the input data

#Initializing the X matrix with zeros
X_test= np.zeros((len(test),50)) 

for i in range(len(test)):
    #Create a list of word embeddings of the words in each sentence
    emb = [model.wv[w] for w in test[i]] 
    #Take the mean of the word embeddings of the words in a sentence 
    #because length of the sentences varies and the dimension of the 
    #features will increase with the increase in the number of 
    #words in the sentence
    X_test[i] = np.mean(emb, axis=0) 

In [77]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
Y_train = le.fit_transform(train_relation)

In [78]:
Y_test = le.fit_transform(valid_relation)

### Train model

In [8]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

In [40]:
model = RandomForestClassifier(n_estimators=30,criterion='entropy')

In [41]:
model.fit(X_train,Y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=30,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [42]:
pre = model.predict(X_test)

In [43]:
accuracy_score(Y_test,pre)

0.7911184210526315

In [44]:
confusion_matrix(Y_test,pre)

array([[ 59,  95],
       [ 32, 422]])

In [45]:
model = RandomForestClassifier(n_estimators=30,criterion='entropy',class_weight='balanced')

In [46]:
model.fit(X_train,Y_train)
pre = model.predict(X_test)
print(accuracy_score(Y_test,pre))
print(confusion_matrix(Y_test,pre))

0.7648026315789473
[[ 42 112]
 [ 31 423]]


In [48]:
from sklearn.model_selection import KFold, train_test_split
import lightgbm as lgb

In [49]:
n_fold = 10
folds = KFold(n_splits=n_fold,shuffle=True,random_state = 42)

In [50]:
#model Parameters
params = {'num_leaves': 41,
         'min_data_in_leaf': 10, 
         'objective':'binary',
         'max_depth': -1,
         'learning_rate': 0.002,
         "boosting": "gbdt",
         "feature_fraction": 0.91,
         "bagging_freq": 1,
         "bagging_fraction": 0.91,
         "bagging_seed": 42,
         "metric": 'mae',
         "lambda_l1": 0.1,
         "verbosity": -1,
         "nthread": -1,
         "random_state": 42}

In [51]:
oof = np.zeros(len(X_train))
predictions = np.zeros(len(X_test))

In [52]:
#run model
for fold_,(trn_idx,val_idx) in enumerate(folds.split(X_train,Y_train)):
    strLog = "fold {}".format(fold_)
    print(strLog)
    
    X_tr,X_val ,Y_tr,Y_val= train_test_split(X_train,Y_train,test_size=0.1)
    model = lgb.LGBMClassifier(**params,n_estimators = 1000,n_jobs = -1)
    model.fit(X_tr,Y_tr,
             eval_set=[(X_tr,Y_tr),(X_val,Y_val)],
             eval_metric = 'accuracy',
             verbose = 500,
             early_stopping_rounds=100)
                        
    #predictions
    

fold 0
Training until validation scores don't improve for 100 rounds.
[500]	training's l1: 0.210574	valid_1's l1: 0.292979
[1000]	training's l1: 0.115887	valid_1's l1: 0.237917
Did not meet early stopping. Best iteration is:
[1000]	training's l1: 0.115887	valid_1's l1: 0.237917
fold 1
Training until validation scores don't improve for 100 rounds.
[500]	training's l1: 0.208289	valid_1's l1: 0.30717
[1000]	training's l1: 0.114326	valid_1's l1: 0.24843
Did not meet early stopping. Best iteration is:
[1000]	training's l1: 0.114326	valid_1's l1: 0.24843
fold 2
Training until validation scores don't improve for 100 rounds.
[500]	training's l1: 0.214386	valid_1's l1: 0.2619
[1000]	training's l1: 0.117975	valid_1's l1: 0.201465
Did not meet early stopping. Best iteration is:
[1000]	training's l1: 0.117975	valid_1's l1: 0.201465
fold 3
Training until validation scores don't improve for 100 rounds.
[500]	training's l1: 0.209023	valid_1's l1: 0.289516
[1000]	training's l1: 0.115111	valid_1's l1: 

In [53]:
predictions = model.predict(X_test, 
                                 num_iteration=model.best_iteration_)

In [54]:
predictions

array([1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1,
       0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1,
       1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0,
       1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

In [55]:
accuracy_score(Y_test,predictions)

0.7944078947368421

In [56]:
from sklearn.metrics import confusion_matrix
confusion_matrix(Y_test,predictions)

array([[ 46, 108],
       [ 17, 437]])

In [61]:
np.sum(Y_test)

454

###### Corpus for Word1, word2 separately


In [57]:
train_words1 = train_a2['word_1'].tolist()
train_words2 = train_a2['word_2'].tolist()

valid_words1 = valid_a2['word_1'].tolist()
valid_words2 = valid_a2['word_2'].tolist()

In [58]:
corpus = []
for i in range(len(train_words1)):
    corpus.append(word_tokenize(train_words1[i]))
for i in range(len(train_words2)):
    corpus.append(word_tokenize(train_words2[i]))
for i in range(len(valid_words1)):
    corpus.append(word_tokenize(valid_words1[i]))
for i in range(len(valid_words2)):
    corpus.append(word_tokenize(valid_words2[i]))

In [59]:
train=[]
for i in range(len(train_words1)):
    train.append(word_tokenize(train_words1[i]))
for i in range(len(train_words2)):
    train.append(word_tokenize(train_words2[i]))

test = []
for i in range(len(valid_words1)):
    test.append(word_tokenize(valid_words1[i]))
for i in range(len(valid_words2)):
    test.append(word_tokenize(valid_words2[i]))

In [60]:
#Creating word embedding for the words. Embedding dimension = 100
model = Word2Vec(corpus, size=50, window=2, min_count=1)

In [61]:
#Creating the input data

#Initializing the X matrix with zeros
X1_train= np.zeros((len(train_words1),50)) 

for i in range(len(train_words1)):
    words = word_tokenize(train_words1[i])
    emb = [model.wv[w] for w in words] 
    X1_train[i] = np.mean(emb, axis=0) 
    
    
X2_train= np.zeros((len(train_words2),50)) 

for i in range(len(train_words2)):
    words = word_tokenize(train_words2[i])
    emb = [model.wv[w] for w in words] 
    X2_train[i] = np.mean(emb, axis=0) 

In [62]:
X_train = np.concatenate((X1_train,X2_train),axis=1)

In [63]:
X_train.shape

(1127, 100)

In [64]:
X1_test= np.zeros((len(valid_words1),50)) 

for i in range(len(valid_words1)):
    words = word_tokenize(valid_words1[i])
    emb = [model.wv[w] for w in words] 
    X1_test[i] = np.mean(emb, axis=0) 
    
    
X2_test= np.zeros((len(valid_words2),50)) 

for i in range(len(valid_words2)):
    words = word_tokenize(valid_words2[i])
    emb = [model.wv[w] for w in words] 
    X2_test[i] = np.mean(emb, axis=0) 
    
X_test = np.concatenate((X1_test,X2_test),axis=1)

In [65]:
X_test.shape

(608, 100)

In [67]:
model = RandomForestClassifier(n_estimators=30,criterion='entropy',class_weight='balanced')
model.fit(X_train,Y_train)

RandomForestClassifier(bootstrap=True, class_weight='balanced',
                       criterion='entropy', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, min_impurity_decrease=0.0,
                       min_impurity_split=None, min_samples_leaf=1,
                       min_samples_split=2, min_weight_fraction_leaf=0.0,
                       n_estimators=30, n_jobs=None, oob_score=False,
                       random_state=None, verbose=0, warm_start=False)

In [68]:
pre = model.predict(X_test)

In [69]:
accuracy_score(Y_test,pre)

0.8305921052631579

In [70]:
confusion_matrix(Y_test,pre)

array([[ 58,  96],
       [  7, 447]])

In [71]:
n_fold = 10
folds = KFold(n_splits=n_fold,shuffle=True,random_state = 42)

oof = np.zeros(len(X_train))
predictions = np.zeros(len(X_test))

#run model
for fold_,(trn_idx,val_idx) in enumerate(folds.split(X_train,Y_train)):
    strLog = "fold {}".format(fold_)
    print(strLog)
    
    X_tr,X_val ,Y_tr,Y_val= train_test_split(X_train,Y_train,test_size=0.1)
    model = lgb.LGBMClassifier(**params,n_estimators = 1000,n_jobs = -1)
    model.fit(X_tr,Y_tr,
             eval_set=[(X_tr,Y_tr),(X_val,Y_val)],
             eval_metric = 'accuracy',
             verbose = 500,
             early_stopping_rounds=100)
                        
    #predictions
    

fold 0
Training until validation scores don't improve for 100 rounds.
[500]	training's l1: 0.184556	valid_1's l1: 0.240666
[1000]	training's l1: 0.0887449	valid_1's l1: 0.163821
Did not meet early stopping. Best iteration is:
[1000]	training's l1: 0.0887449	valid_1's l1: 0.163821
fold 1
Training until validation scores don't improve for 100 rounds.
[500]	training's l1: 0.185478	valid_1's l1: 0.229211
[1000]	training's l1: 0.0895416	valid_1's l1: 0.15017
Did not meet early stopping. Best iteration is:
[1000]	training's l1: 0.0895416	valid_1's l1: 0.15017
fold 2
Training until validation scores don't improve for 100 rounds.
[500]	training's l1: 0.187687	valid_1's l1: 0.21848
[1000]	training's l1: 0.09082	valid_1's l1: 0.145285
Did not meet early stopping. Best iteration is:
[1000]	training's l1: 0.09082	valid_1's l1: 0.145285
fold 3
Training until validation scores don't improve for 100 rounds.
[500]	training's l1: 0.183653	valid_1's l1: 0.232397
[1000]	training's l1: 0.0891804	valid_1's

In [72]:
predictions = model.predict(X_test, 
                                 num_iteration=model.best_iteration_)

In [73]:
confusion_matrix(Y_test,predictions)

array([[ 70,  84],
       [ 30, 424]])

In [74]:
accuracy_score(Y_test,predictions)

0.8125

### FASTTEXT

In [33]:
from gensim.models.wrappers import FastText

In [125]:
model = FastText.load_fasttext_format('wiki.en')

In [38]:
model.wv.vocab

{',': <gensim.models.deprecated.keyedvectors.Vocab at 0x7f6c6fb198d0>,
 '.': <gensim.models.deprecated.keyedvectors.Vocab at 0x7f6c6fb19240>,
 'the': <gensim.models.deprecated.keyedvectors.Vocab at 0x7f6c6fb19550>,
 '</s>': <gensim.models.deprecated.keyedvectors.Vocab at 0x7f6c6fb19a20>,
 'of': <gensim.models.deprecated.keyedvectors.Vocab at 0x7f6c6fb19ba8>,
 '-': <gensim.models.deprecated.keyedvectors.Vocab at 0x7f6c6fb19b70>,
 'in': <gensim.models.deprecated.keyedvectors.Vocab at 0x7f6c6fb19be0>,
 'and': <gensim.models.deprecated.keyedvectors.Vocab at 0x7f6c6fb19c18>,
 "'": <gensim.models.deprecated.keyedvectors.Vocab at 0x7f6c6fb19c88>,
 ')': <gensim.models.deprecated.keyedvectors.Vocab at 0x7f6c6fb19cc0>,
 '(': <gensim.models.deprecated.keyedvectors.Vocab at 0x7f6c6fb19cf8>,
 'to': <gensim.models.deprecated.keyedvectors.Vocab at 0x7f6c6fb19d68>,
 'a': <gensim.models.deprecated.keyedvectors.Vocab at 0x7f6c6fb19da0>,
 'is': <gensim.models.deprecated.keyedvectors.Vocab at 0x7f6c6fb19e

In [39]:
model.wv['Yersinia'].shape

(300,)

In [40]:
#Preprocessing - removing unwanted characters, tokenization, stop-word removal
def clean_data(txts):
    x = re.sub("[^a-zA-Z]", " ",txts) 
    x = x.lower().split()                             
    stops = set(stopwords.words("english"))                  
    words = [w for w in x if not w in stops] 
    
    return( " ".join(words))

In [41]:
train_a2['word_1'] = train_a2['word_1'].apply(lambda x: clean_data(x))
train_a2['word_2'] = train_a2['word_2'].apply(lambda x: clean_data(x))



In [42]:

valid_a2['word_1'] = valid_a2['word_1'].apply(lambda x: clean_data(x))
valid_a2['word_2'] = valid_a2['word_2'].apply(lambda x: clean_data(x))



In [43]:
train_words1 = train_a2['word_1'].tolist()
train_words2 = train_a2['word_2'].tolist()

valid_words1 = valid_a2['word_1'].tolist()
valid_words2 = valid_a2['word_2'].tolist()

train=[]
for i in range(len(train_words1)):
    train.append(word_tokenize(train_words1[i]))
for i in range(len(train_words2)):
    train.append(word_tokenize(train_words2[i]))

test = []
for i in range(len(valid_words1)):
    test.append(word_tokenize(valid_words1[i]))
for i in range(len(valid_words2)):
    test.append(word_tokenize(valid_words2[i]))

In [44]:
#Creating the input data

#Initializing the X matrix with zeros
X1_train= np.zeros((len(train_words1),300)) 

for i in range(len(train_words1)):
    words = word_tokenize(train_words1[i])
    emb = [model.wv[w] for w in words] 
    X1_train[i] = np.mean(emb, axis=0) 
    
    
X2_train= np.zeros((len(train_words2),300)) 

for i in range(len(train_words2)):
    words = word_tokenize(train_words2[i])
    emb = [model.wv[w] for w in words] 
    X2_train[i] = np.mean(emb, axis=0) 
X_train = np.concatenate((X1_train,X2_train),axis=1)

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


In [45]:
X1_test= np.zeros((len(valid_words1),300)) 

for i in range(len(valid_words1)):
    words = word_tokenize(valid_words1[i])
    emb = [model.wv[w] for w in words] 
    X1_test[i] = np.mean(emb, axis=0) 
    
    
X2_test= np.zeros((len(valid_words2),300)) 

for i in range(len(valid_words2)):
    words = word_tokenize(valid_words2[i])
    emb = [model.wv[w] for w in words] 
    X2_test[i] = np.mean(emb, axis=0) 
    
X_test = np.concatenate((X1_test,X2_test),axis=1)

In [46]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
Y_train = le.fit_transform(train_relation)

In [47]:
Y_test = le.fit_transform(valid_relation)

In [48]:
df_train= pd.DataFrame(X_train)
df_test= pd.DataFrame(X_test)

In [49]:
df_train.fillna(value=0,inplace=True)
df_test.fillna(value=0,inplace=True)

In [83]:
X_valid = np.array(df_test)
X_train = np.array(df_train)
X_test.shape
X_train.shape

(1127, 600)

In [51]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,confusion_matrix

In [52]:
model_class_1 = RandomForestClassifier(n_estimators=30,criterion='entropy',class_weight='balanced')
model_class_1.fit(X_train,Y_train)
pre = model_class_1.predict(X_test)
print(accuracy_score(Y_test,pre))
print(confusion_matrix(Y_test,pre))

0.9391447368421053
[[120  34]
 [  3 451]]


In [54]:
from sklearn.externals import joblib

joblib.dump(model_class_1, 'RF_entropy.sav')
joblib_model= joblib.load('RF_entropy.sav')


In [85]:
pre = joblib_model.predict(X_valid)
print(accuracy_score(Y_test,pre))
print(confusion_matrix(Y_test,pre))

0.9391447368421053
[[120  34]
 [  3 451]]


In [56]:
model_class_2 = RandomForestClassifier(n_estimators=30,criterion='gini',class_weight='balanced')
model_class_2.fit(X_train,Y_train)
pre = model_class_2.predict(X_test)
print(accuracy_score(Y_test,pre))
print(confusion_matrix(Y_test,pre))

0.9226973684210527
[[112  42]
 [  5 449]]


In [57]:

joblib.dump(model_class_1, 'RF_gini.sav')


['RF_gini.sav']

In [164]:
n_fold = 10
folds = KFold(n_splits=n_fold,shuffle=True,random_state = 42)

oof = np.zeros(len(X_train))
predictions = np.zeros(len(X_test))

#run model
for fold_,(trn_idx,val_idx) in enumerate(folds.split(X_train,Y_train)):
    strLog = "fold {}".format(fold_)
    print(strLog)
    
    X_tr,X_val ,Y_tr,Y_val= train_test_split(X_train,Y_train,test_size=0.1)
    model = lgb.LGBMClassifier(**params,n_estimators = 1000,n_jobs = -1)
    model.fit(X_tr,Y_tr,
             eval_set=[(X_tr,Y_tr),(X_val,Y_val)],
             eval_metric = 'accuracy',
             verbose = 500)
                        
    #predictions
    

fold 0
[500]	training's l1: 0.165143	valid_1's l1: 0.197237
[1000]	training's l1: 0.0713101	valid_1's l1: 0.113039
fold 1
[500]	training's l1: 0.165454	valid_1's l1: 0.190115
[1000]	training's l1: 0.0713488	valid_1's l1: 0.0990801
fold 2
[500]	training's l1: 0.16486	valid_1's l1: 0.189613
[1000]	training's l1: 0.0709096	valid_1's l1: 0.0963972
fold 3
[500]	training's l1: 0.164417	valid_1's l1: 0.198467
[1000]	training's l1: 0.0703986	valid_1's l1: 0.112808
fold 4
[500]	training's l1: 0.166074	valid_1's l1: 0.181274
[1000]	training's l1: 0.0711658	valid_1's l1: 0.0970829
fold 5
[500]	training's l1: 0.16676	valid_1's l1: 0.1839
[1000]	training's l1: 0.0722367	valid_1's l1: 0.092742
fold 6
[500]	training's l1: 0.168459	valid_1's l1: 0.166201
[1000]	training's l1: 0.0729964	valid_1's l1: 0.07928
fold 7
[500]	training's l1: 0.163163	valid_1's l1: 0.202905
[1000]	training's l1: 0.0699217	valid_1's l1: 0.112343
fold 8
[500]	training's l1: 0.16686	valid_1's l1: 0.17425
[1000]	training's l1: 0.

In [165]:
predictions = model.predict(X_test, 
                                 num_iteration=model.best_iteration_)
print(accuracy_score(Y_test,predictions))
print(confusion_matrix(Y_test,predictions))

0.9111842105263158
[[104  50]
 [  4 450]]


#### Prediction on test data

In [58]:
def clean_data(txts):
    x = re.sub("[^a-zA-Z]", " ",txts) 
    x = x.lower().split()                             
    stops = set(stopwords.words("english"))                  
    words = [w for w in x if not w in stops] 
    
    return( " ".join(words))

In [126]:
def predict_data(model_class):    
    test_files= os.listdir(path='rel/BioNLP-OST-2019_BB-rel_test/')
    predictions_each=[]
    predictions_all=[]
    for file in tqdm_notebook(test_files):
        if(file.endswith('.a1')):
            if (os.path.getsize(f'rel/BioNLP-OST-2019_BB-rel_test/{file}') > 0 ):
                a1 = pd.read_csv(f'rel/BioNLP-OST-2019_BB-rel_test/{file}',sep='\t',header=None)
                spl = a1[1].str.split(' ',n=1,expand=True)
                a1['Words'] = a1[2]
                scec = spl[1].str.split(' ',n=1,expand=True)
                a1['sc'] = scec[0]
                a1['ec'] = scec[1]
                a1['ec'] = a1['ec'].str.split(';',n=1,expand=True)
                a1['Entity'] = spl[0]
                a1['Words'] = a1['Words'].apply(lambda x: clean_data(x))
                a1.drop([1,2],axis=1,inplace=True)
                a1.drop(a1[a1['Entity'] == 'Title'].index,axis=0,inplace=True)
                a1.drop(a1[a1['Entity'] == 'Paragraph'].index,axis=0,inplace=True)


                fname = file.split('.')
                f = open(f'rel/BioNLP-OST-2019_BB-rel_test/'+fname[0]+'.txt', "r")
                text=f.read()

                text = text.replace("Fig. (","Fig.")
                text = text.replace("Fig. ","Fig.")

                sentences = sent_tokenize(text)

                sc=0

                tag1=[]
                entity1=[]

                tag2=[]
                entity2=[]


                pre=[]
                for sent in sentences:
                    sentence_tags = a1[a1['ec'].astype(int)<=sc+len(sent)] 
                    sentence_tags = sentence_tags[sentence_tags['ec'].astype(int)>=sc]

                    sc=sc+len(sent)
                    a2_part1 =sentence_tags[sentence_tags['Entity']=='Microorganism']
                    a2_part2 =sentence_tags[sentence_tags['Entity']!='Microorganism']

                    n_ent = len(sentence_tags[sentence_tags['Entity']!='Microorganism'])
                    n_mic = len(sentence_tags[sentence_tags['Entity']=='Microorganism'])




                    if(n_ent>0 and n_mic>0):
                        word1=sentence_tags[sentence_tags['Entity']=='Microorganism']['Words'].loc[sentence_tags[sentence_tags['Entity']=='Microorganism']['Words'].index.repeat(n_ent)].tolist()
                        tag1.extend(sentence_tags[sentence_tags['Entity']=='Microorganism'][0].loc[sentence_tags[sentence_tags['Entity']=='Microorganism'][0].index.repeat(n_ent)].values)
                        entity1.extend(sentence_tags[sentence_tags['Entity']=='Microorganism']['Entity'].loc[sentence_tags[sentence_tags['Entity']=='Microorganism']['Entity'].index.repeat(n_ent)].values)

                        word2=pd.concat([sentence_tags[sentence_tags['Entity']!='Microorganism']['Words']]*n_mic, ignore_index=True).tolist()
                        tag2.extend(pd.concat([sentence_tags[sentence_tags['Entity']!='Microorganism'][0]]*n_mic, ignore_index=True).values)
                        entity2.extend(pd.concat([sentence_tags[sentence_tags['Entity']!='Microorganism']['Entity']]*n_mic, ignore_index=True).values)

                        df = pd.DataFrame({'word_1':word1,'word_2':word2})



                        valid_words1 = df['word_1'].tolist()
                        valid_words2 = df['word_2'].tolist()

                        X1_test= np.zeros((len(valid_words1),300)) 

                        for i in range(len(valid_words1)):
                            words = word_tokenize(valid_words1[i])
                            emb = [model.wv[w] for w in words] 
                            X1_test[i] = np.mean(emb, axis=0) 


                        X2_test= np.zeros((len(valid_words2),300)) 

                        for i in range(len(valid_words2)):
                            words = word_tokenize(valid_words2[i])
                            emb = [model.wv[w] for w in words] 
                            X2_test[i] = np.mean(emb, axis=0) 

                        X_test = np.concatenate((X1_test,X2_test),axis=1)
                        X_test[np.isnan(X_test)] = 0

                        pre.extend(le.inverse_transform(model_class.predict(X_test)))


                idx = np.arange(1,len(pre)+1)
                a2_df = pd.DataFrame({'id':idx,'pred':pre,'ent1':entity1,'tag1':tag1,'ent2':entity2,'tag2':tag2})
                a2_df['id'] = 'R'+a2_df['id'].astype(str)
                a2_df['ent2'].replace('Phenotype','Property',inplace=True)
                a2_df['ent2'].replace(['Habitat','Geographical'],'Location',inplace=True)
                a2_df['pre'] = a2_df['pred']+' '+a2_df['ent1']+':'+a2_df['tag1']+' '+a2_df['ent2'] + ':'+a2_df['tag2']

                a2_corrected = pd.DataFrame({'id':a2_df['id'],'pred':a2_df['pre']})


               #             


                predictions_each.append(pre)
                predictions_all.extend(pre) 
    
                a2_corrected.to_csv('a2_files/'+fname[0]+'.a2',index=False,sep='\t',header=False)





In [76]:
len(predictions_all)

1335

In [77]:
print('No.of.Lives_in---predicted using RF_gini',predictions_all.count('Lives_In'))

No.of.Lives_in---predicted using RF_entropy 1114


In [79]:
len(predictions_all)

1335

In [80]:
print('No.of.Lives_in---predicted using RF_entropy',predictions_all.count('Lives_In'))

No.of.Lives_in---predicted using RF_entropy 1122


In [81]:
from sklearn import svm

In [113]:
svm_model = svm.SVC(class_weight='balanced', C=1000,kernel='linear')

In [114]:
svm_model.fit(X_train,Y_train)
pre = svm_model.predict(X_valid)
print(accuracy_score(Y_test,pre))
print(confusion_matrix(Y_test,pre))

0.9506578947368421
[[141  13]
 [ 17 437]]


In [104]:
pre_e, pre_a = predict_data(svm_model)

HBox(children=(IntProgress(value=0, max=186), HTML(value='')))

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)





In [94]:
print(len(pre_a),'Total predictions')

1335 Total predictions


In [97]:
print('No.of.Lives_in---predicted using svm_model_c=1000',pre_a.count('Lives_In'))

No.of.Lives_in---predicted using svm_model_c=1000 1021


In [96]:
joblib.dump(model_class_1, 'svm_model.sav')

['svm_model.sav']

In [105]:
print('No.of.Lives_in---predicted using svm_model_c=10000',pre_a.count('Lives_In'))

No.of.Lives_in---predicted using svm_model_c=10000 1021


In [118]:
from sklearn.neighbors import KNeighborsClassifier

model = KNeighborsClassifier(n_neighbors=10)
model.fit(X_train,Y_train)
pre = model.predict(X_valid)
print(accuracy_score(Y_test,pre))
print(confusion_matrix(Y_test,pre))


0.90625
[[104  50]
 [  7 447]]


In [123]:
svm_model = joblib.load('svm_model.sav')

In [127]:
predict_data(svm_model)

HBox(children=(IntProgress(value=0, max=186), HTML(value='')))

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)



