# Approach Summary:
-  <font color= 'blue'>I used a CRF (Conditional Random Field) classifier to construct the model.</font>
-  <font color= 'blue'>The library used for this purpose was __Stanford CoreNLP__.</font>
-  <font color= 'blue'>This library is written in Java, so training the model and generating predictions were done on the command line.</font>
-  <font color= 'blue'>The screenshots of the above processes are shared below.</font>

### Loading the Dependencies

In [114]:
import pandas as pd
import numpy as np
import nltk
from nltk.tag.stanford import StanfordNERTagger
from tqdm import tqdm_notebook as tqdm
from multiprocessing import Process, current_process
import warnings
warnings.filterwarnings(action= 'ignore')

### Loading Files

In [76]:
# Read the three input tables (training data, test data, sample submission)
# from the working directory in a single unpacking assignment.
df_train, df_test, df_submission = (
    pd.read_csv(csv_path)
    for csv_path in ('./train.csv', './test.csv', './sample_submission.csv')
)

In [6]:
# Frequency of each label in the training data's `tag` column.
df_train['tag'].value_counts()

O                4446206
B-indications      53003
I-indications      44624
Name: tag, dtype: int64

In [7]:
# Preview the first five rows of the training data.
df_train.head(n=5)

Unnamed: 0,id,Doc_ID,Sent_ID,Word,tag
0,1,1,1,Obesity,O
1,2,1,1,in,O
2,3,1,1,Low-,O
3,4,1,1,and,O
4,5,1,1,Middle-Income,O


<font color= 'blue'>For training my model I removed documents which didn't contain the __indication__ tag.</font>

In [8]:
# Number of distinct tag values per document (missing values count as a value).
ls = df_train.groupby('Doc_ID')['tag'].nunique(dropna=False)

# Keep the ids of documents whose tokens carry anything other than exactly
# one distinct tag value.
doc_id_to_use = ls[ls != 1].index.tolist()

In [9]:
# Number of documents selected for training/validation.
len(doc_id_to_use)

14225

### Creating the training file for the model
-  <font color= 'blue'>The training file is a Tab Separated Value (.tsv) file in which the first element of each row is the word and the second element is the tag.</font>
-  <font color= 'blue'>For the purpose of training, I kept just two tags: __INDICATIONS__ for tagging the diseases and __O__ for everything else.</font>

In [10]:
# Export the first 14126 selected documents as a two-column, tab-separated
# file: one "<word>\t<label>" line per token and an empty line after each
# document. Every tag other than 'O' is collapsed to the label INDICATIONS.
train_doc_ids = doc_id_to_use[: 14126]

# Select the needed rows once and group them by document, instead of scanning
# the whole frame with a boolean mask for every single document id.
train_rows = df_train.loc[df_train['Doc_ID'].isin(train_doc_ids), ['Doc_ID', 'Word', 'tag']]
train_groups = train_rows.groupby('Doc_ID', sort=False)

# Mode 'w' (not 'a'): re-running this cell must rewrite the file rather than
# append a second copy of the data. utf-8 matches the encoding that is passed
# to StanfordNERTagger when the trained model is loaded.
with open('stanford_train.txt', 'w', encoding='utf-8') as f:
    for doc_id in train_doc_ids:
        doc_rows = train_groups.get_group(doc_id)

        for word, tag in zip(doc_rows['Word'], doc_rows['tag']):
            label = tag if tag == 'O' else 'INDICATIONS'
            f.write(str(word) + '\t' + label + '\n')

        # Empty line marks the end of the document.
        f.write('\n')

<font color= 'blue'>The below image depicts the training process.</font>

![title](img/Training_Model.png)

<font color= 'blue'>Creating the validation file. The format of this file is the same as that of the training file.</font>

In [11]:
# Export the selected documents from position 14126 onwards as the validation
# file: one "<word>\t<label>" line per token and an empty line after each
# document. Every tag other than 'O' is collapsed to the label INDICATIONS.
val_doc_ids = doc_id_to_use[14126: ]

# Select the needed rows once and group them by document, instead of scanning
# the whole frame with a boolean mask for every single document id.
val_rows = df_train.loc[df_train['Doc_ID'].isin(val_doc_ids), ['Doc_ID', 'Word', 'tag']]
val_groups = val_rows.groupby('Doc_ID', sort=False)

# Mode 'w' (not 'a'): re-running this cell must rewrite the file rather than
# append a second copy of the data. utf-8 matches the encoding that is passed
# to StanfordNERTagger when the trained model is loaded.
with open('stanford_val.txt', 'w', encoding='utf-8') as f:
    for doc_id in val_doc_ids:
        doc_rows = val_groups.get_group(doc_id)

        for word, tag in zip(doc_rows['Word'], doc_rows['tag']):
            label = tag if tag == 'O' else 'INDICATIONS'
            f.write(str(word) + '\t' + label + '\n')

        # Empty line marks the end of the document.
        f.write('\n')

<font color= 'blue'>The below image shows the validation process.</font>

![title](img/Testing_Model.png)

<font color= 'blue'>The below image shows the validation results.</font>

![title](img/Test_Results.png)

<font color= 'blue'>After getting satisfactory results on the validation set, I retrained my model on the complete training data to improve its accuracy, i.e. I used all 14225 doc ids to train the final model.</font>

### Loading the trained model and the jar file containing all the dependencies.

In [12]:
# Absolute, machine-specific locations of the serialized NER model and the
# Stanford NER jar; both are handed to StanfordNERTagger below.
# NOTE(review): these paths must be adjusted to run the notebook elsewhere.
model= '/Users/hu20018391/Practice/Innoplexus/ner-model.ser.gz'
jar= '/Users/hu20018391/Practice/stanford-ner-2018-10-16/stanford-ner-3.9.2.jar'

In [13]:
# Wrap the trained model and the Stanford NER jar in NLTK's tagger interface.
st_2 = StanfordNERTagger(
    model_filename=model,
    path_to_jar=jar,
    encoding='utf-8',
)

The StanfordTokenizer will be deprecated in version 3.2.5.
Please use [91mnltk.tag.corenlp.CoreNLPPOSTagger[0m or [91mnltk.tag.corenlp.CoreNLPNERTagger[0m instead.
  super(StanfordNERTagger, self).__init__(*args, **kwargs)


In [120]:
# Tokens of one example test sentence (Sent_ID 191294) as a plain list.
is_sample_sentence = df_test['Sent_ID'] == 191294
sentence = df_test.loc[is_sample_sentence, 'Word'].values.tolist()

In [121]:
# Tag the example sentence with the loaded model and display the result.
st_2.tag(sentence)

[('Pasteurellosis', 'INDICATIONS'),
 ('in', 'O'),
 ('japanese', 'O'),
 ('quail', 'O'),
 ('(', 'O'),
 ('Coturnix', 'O'),
 ('coturnix', 'O'),
 ('japonica', 'O'),
 (')', 'O'),
 ('caused', 'O'),
 ('by', 'O'),
 ('Pasteurella', 'INDICATIONS'),
 ('multocida', 'INDICATIONS'),
 ('multocida', 'INDICATIONS'),
 ('A:4', 'O'),
 ('.', 'O')]

In [14]:
# Distinct document ids of the test set, in order of first appearance.
test_doc_id = df_test['Doc_ID'].unique().tolist()

In [16]:
# Number of distinct documents in the test set.
len(test_doc_id)

20000

In [78]:
# Compare (rows, columns) of the test data and the sample submission.
(df_test.shape, df_submission.shape)

((2994463, 4), (2994463, 3))

In [79]:
# Distinct sentence ids of the test set, in order of first appearance.
sent_id_test = df_test['Sent_ID'].unique().tolist()

<font color= 'blue'>The above process of making predictions on individual sentences was slow, so I converted the test file into the same format as the validation file and made predictions from the command line.</font>

In [122]:
# Export the test tokens in the same two-column layout as the training and
# validation files: one "<word>\tO" line per token ('O' is only a placeholder
# label) and an empty line after each document.
#
# The previous version looped over Sent_ID values while filtering on Doc_ID,
# so the ids did not refer to the column being filtered. Here the rows are
# streamed once, in df_test order, and a separator is emitted whenever the
# Doc_ID changes; this keeps the written tokens in exactly the same order as
# the rows of df_test, which the later column assignment of the predicted
# tags depends on.
#
# Mode 'w' (not 'a'): re-running this cell must rewrite the file rather than
# append a second copy. utf-8 matches the encoding passed to StanfordNERTagger.
with open('stanford_test.txt', 'w', encoding='utf-8') as f:
    prev_doc_id = None

    for doc_id, word in zip(df_test['Doc_ID'], df_test['Word']):
        if prev_doc_id is not None and doc_id != prev_doc_id:
            # Empty line closes the previous document.
            f.write('\n')

        f.write(str(word) + '\t' + 'O' + '\n')
        prev_doc_id = doc_id

    if prev_doc_id is not None:
        # Close the last document as well.
        f.write('\n')

<font color= 'blue'>The below image shows the process of generating the output.</font>

![title](img/Generating_Output.png)

### Loading the predictions file

In [89]:
import csv  # needed only for the quoting constant used in this cell

# Load the tagger's output as three tab-separated columns.
# NOTE(review): presumably `tag_0` is the placeholder label written to the
# test file and `tag` is the model's prediction -- confirm against the file.
#
# quoting=csv.QUOTE_NONE: the file is raw tab-separated text, so a token that
# starts with a double quote must not open a quoted field; with the default
# quoting such a token can swallow the following lines and shift every later
# row out of alignment with df_test.
# skip_blank_lines=True (the pandas default, stated explicitly): the empty
# document-separator lines are dropped so that only token rows remain.
output_file = pd.read_csv(
    './test_output.txt',
    sep='\t',
    names=['Word', 'tag_0', 'tag'],
    quoting=csv.QUOTE_NONE,
    skip_blank_lines=True,
)

### Converting the INDICATIONS tag into IOB format

In [102]:
def bio_tagger(ne_tagged, label='indications'):
    """Convert a flat sequence of entity tags into B-/I-/O tags.

    Args:
        ne_tagged: Iterable of tags, where "O" marks a token outside any
            entity and any other value marks a token inside an entity.
        label: Suffix used for the emitted tags ("B-<label>" / "I-<label>").
            Defaults to 'indications', the only label used so far.

    Returns:
        A list with exactly one output tag per input tag:
        "O" tags are passed through unchanged, the first token of an entity
        run becomes "B-<label>", and each following token with the same tag
        as its predecessor becomes "I-<label>".
    """
    bio_tagged = []
    prev_tag = "O"

    for tag in ne_tagged:
        if tag == "O":
            # Outside any entity.
            bio_tagged.append(tag)
        elif tag == prev_tag:
            # Same non-"O" tag as the previous token: inside the entity.
            bio_tagged.append("I-" + label)
        else:
            # Either the previous token was "O" or it carried a different
            # entity tag; in both cases a new entity begins here. (The
            # second case used to emit nothing, which made the result
            # shorter than the input.)
            bio_tagged.append("B-" + label)
        prev_tag = tag

    return bio_tagged

In [111]:
# Convert the predicted tags to B-/I-/O form and attach them to the test rows
# by position (one converted tag per row).
df_test['tag'] = bio_tagger(output_file['tag'].tolist())

### Creating the submission file

In [113]:
# Write the submission: the id, Sent_ID and tag columns, without the index.
df_test.to_csv(
    'Submission.csv',
    columns=['id', 'Sent_ID', 'tag'],
    index=False,
)