In [54]:
#Imports
import pandas as pd
import os
import spacy
from spacy.tokens.doc import Doc
from spacy.gold import GoldParse
import plac
from spacy.gold import offsets_from_biluo_tags


import math
import re

# Load the previously trained spaCy pipeline from disk; every helper below that
# calls `nlp(...)` uses this module-level object.
nlp = spacy.load("./model_new")

# Column names of the training CSV (not referenced again in the visible cells).
fields = ['Sentences','Word', 'POS', 'Tag']

# Training data: ';'-separated, latin1-encoded; empty strings are read as NaN.
train = pd.read_csv("./data/TrainNER.csv",sep=';',na_values='', encoding='latin1')
# NOTE(review): the columns are renamed positionally, but unlike test1/test2 the
# original header values are not re-inserted as a data row -- if the CSV has no
# header line, its first record is lost here. Confirm against the file.
train = train.rename(columns={train.columns[0]: 'Sentences', train.columns[1]: 'Word',train.columns[2]:  'POS',train.columns[3]:  'Tag'}) # give the columns their real names
del train["POS"]
train["Predicted"]=""


test1 = pd.read_csv("./data/Test1NER.csv",sep=';',na_values='', encoding='latin1')
test2 = pd.read_csv("./data/Test2NER.csv",sep=';',na_values='', encoding='latin1')

# test1: presumably the CSV has no header line, so read_csv used the first record
# as column labels. Put those labels back as a data row at the top, then assign
# the real column names.
test1.loc[-1] = [test1.columns[0], test1.columns[1], test1.columns[2]]  # re-insert the header values as a row labelled -1
test1.index = test1.index + 1  # shift labels so the re-inserted row becomes 0
test1 = test1.sort_index()  # move the re-inserted row to the top
test1 = test1.rename(columns={test1.columns[0]: 'Sentences', test1.columns[1]: 'Word',test1.columns[2]:  'POS'}) # name the columns
test1["Predicted"]=""

# test2: same header repair as test1, but this file has only two columns.
test2.loc[-1] = [test2.columns[0], test2.columns[1]]  # re-insert the header values as a row labelled -1
test2.index = test2.index + 1  # shift labels so the re-inserted row becomes 0
test2 = test2.sort_index()  # move the re-inserted row to the top
test2 = test2.rename(columns={test2.columns[0]: 'Sentences', test2.columns[1]: 'Word'}) # name the columns
test2["Predicted"]=""

In [55]:
#Model trainer method
#source: https://github.com/explosion/spaCy/blob/master/examples/training/train_ner.py

from __future__ import unicode_literals, print_function

import plac
import random
import warnings
from pathlib import Path
import spacy
from spacy.util import minibatch, compounding

@plac.annotations(
    model=("Model name. Defaults to blank 'en' model.", "option", "m", str),
    output_dir=("Optional output directory", "option", "o", Path),
    n_iter=("Number of training iterations", "option", "n", int),
)
def main(model=None, output_dir=None, n_iter=100):
    """Load the model, set up the pipeline and train the entity recognizer.

    Training examples are read from the module-level ``train_data`` list of
    ``(text, {"entities": [...]})`` pairs; only the slice ``[0:8998]`` is used.

    Args:
        model: Name or path of an existing spaCy model to continue training.
            When ``None`` a blank English pipeline is created and its weights
            are initialised with ``nlp.begin_training()``.
        output_dir: Directory the trained pipeline is saved to. When ``None``
            nothing is written and the reload check is skipped.
        n_iter: Number of passes over the training examples.
    """
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank("en")  # create blank Language class
        print("Created blank 'en' model")

    # create the built-in pipeline components and add them to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if "ner" not in nlp.pipe_names:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner, last=True)
    # otherwise, get it so we can add labels
    else:
        ner = nlp.get_pipe("ner")

    # Slicing creates a new list, so take the slice once and keep a reference:
    # random.shuffle works in place and must be applied to the same list object
    # that is then batched, otherwise the shuffle is silently discarded.
    # NOTE(review): 8998 is a hard-coded cut-off -- presumably the number of
    # usable training sentences; confirm against the data.
    training_examples = train_data[0:8998]

    # add labels
    for _, annotations in training_examples:
        for ent in annotations.get("entities"):
            ner.add_label(ent[2])

    # get names of other pipes to disable them during training
    pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
    # only train NER
    with nlp.disable_pipes(*other_pipes), warnings.catch_warnings():
        # show warnings for misaligned entity spans once
        warnings.filterwarnings("once", category=UserWarning, module='spacy')

        # reset and initialize the weights randomly – but only if we're
        # training a new model
        if model is None:
            nlp.begin_training()
        for itn in range(n_iter):
            random.shuffle(training_examples)
            losses = {}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(training_examples, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(
                    texts,  # batch of texts
                    annotations,  # batch of annotations
                    drop=0.4,  # dropout - make it harder to memorise data
                    losses=losses,
                )
            print("Losses", losses)

    # test the trained model (iterates the unshuffled slice so the printed
    # order follows train_data)
    for text, _ in train_data[0:8998]:
        doc = nlp(text)
        print("Entities", [(ent.text, ent.label_) for ent in doc.ents])
        print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc])

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        for text, _ in train_data[0:8998]:
            doc = nlp2(text)
            print("Entities", [(ent.text, ent.label_) for ent in doc.ents])
            print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc])


In [56]:
#Creates BILOU tagging from BIO tagging for spacy.

def bio_to_bilou(tags):
    """Convert a sequence of BIO tags into the BILOU scheme.

    Each tag is either ``'O'`` or a one-letter prefix followed by a label
    suffix (e.g. ``'B-geo'``, ``'I-geo'``); the suffix is kept unchanged and
    only the prefix is rewritten:

    * ``U`` - entity consisting of a single token
    * ``B`` - first token of a multi-token entity
    * ``I`` - inner token of a multi-token entity
    * ``L`` - last token of a multi-token entity

    An entity continues into the next token only when that token is an ``I``
    tag with the same suffix. An ``I`` tag that follows ``O`` or a different
    label is treated as the start of a new entity.

    Args:
        tags: Sequence of BIO tag strings for one sentence.

    Returns:
        A new list with one BILOU tag per input tag (empty for empty input).
    """
    converted = []
    last_index = len(tags) - 1
    for i, tag in enumerate(tags):
        prefix = tag[0]
        # Outside entities: copied through untouched.
        if prefix == 'O':
            converted.append(tag)
            continue

        label = tag[1:]
        # Sentence boundaries behave like an 'O' neighbour.
        prev_tag = tags[i - 1] if i > 0 else 'O'
        next_tag = tags[i + 1] if i < last_index else 'O'

        starts_entity = (
            prefix != 'I'
            or prev_tag[0] == 'O'
            or prev_tag[1:] != label
        )
        continues = next_tag[0] == 'I' and next_tag[1:] == label

        if starts_entity:
            new_prefix = 'B' if continues else 'U'
        else:
            new_prefix = 'I' if continues else 'L'
        converted.append(new_prefix + label)
    return converted

#Adds BIO tagging to the prediction.
def bio_tagger(ne_tagged):
    """Prefix flat entity labels with BIO markers.

    Args:
        ne_tagged: Iterable of ``(token, label)`` pairs where ``label`` is
            ``"O"`` for non-entities and a bare entity label otherwise.

    Returns:
        List of ``(token, tag)`` pairs. ``"O"`` labels are kept as they are; a
        label equal to the preceding token's label becomes ``"I-<label>"``;
        any other label becomes ``"B-<label>"``.
    """
    labelled = []
    previous = "O"
    for word, label in ne_tagged:
        if label == "O":
            # Outside any entity.
            labelled.append((word, label))
        elif previous == label:
            # Same label as the previous token: continuation of the entity.
            labelled.append((word, "I-" + label))
        else:
            # Either the previous token was "O" or it carried another label.
            labelled.append((word, "B-" + label))
        previous = label
    return labelled

In [57]:
#Helper methods to extract data from the dataframes

def getAllTrainSentences(df):
    """Rebuild the training sentences from a one-word-per-row dataframe.

    A row whose ``'Sentences'`` cell is not NaN starts a new sentence. Words
    are joined with single spaces; words containing ``'É'`` or ``'Ó'`` are
    skipped. Everything collected before the first sentence marker is
    discarded.

    Rows are read in positional order, so the result does not depend on the
    dataframe's index labels.

    Args:
        df: Dataframe with ``'Sentences'`` and ``'Word'`` columns.

    Returns:
        List of sentence strings, one per sentence marker.
    """
    all_sentences = []
    current = ""
    for marker, raw_word in zip(df['Sentences'], df['Word']):
        word = str(raw_word)
        if pd.notna(marker):
            # Close the running sentence (dropping its trailing space).
            all_sentences.append(current[:-1])
            current = ""
        if 'É' not in word and 'Ó' not in word:
            current += word + " "
    # The last sentence loses its final five characters.
    # NOTE(review): presumably this strips a trailing " nan " produced by an
    # empty last row in the training CSV (getAllTags drops the last tag in the
    # same way) -- confirm against the data file.
    all_sentences.append(current[:-5])
    # The first entry holds whatever preceded the first sentence marker.
    all_sentences.pop(0)
    return all_sentences

def getAllTags(df):
    """Collect the gold tags of every sentence in a one-word-per-row dataframe.

    A row whose ``'Sentences'`` cell is not NaN starts a new sentence. The tag
    of a row is skipped when its word contains ``'É'`` or ``'Ó'``, mirroring
    the word filter used when the sentences are rebuilt. Tags collected before
    the first sentence marker are discarded.

    Rows are read in positional order, so the result does not depend on the
    dataframe's index labels.

    Args:
        df: Dataframe with ``'Sentences'``, ``'Word'`` and ``'Tag'`` columns.

    Returns:
        List of tag lists, one per sentence marker.
    """
    all_tags = []
    current = []
    for marker, raw_word, tag in zip(df['Sentences'], df['Word'], df['Tag']):
        word = str(raw_word)
        if pd.notna(marker):
            all_tags.append(current)
            current = []
        if 'É' not in word and 'Ó' not in word:
            current.append(tag)
    # Drop the very last tag.
    # NOTE(review): presumably the training CSV ends with an empty row whose
    # tag is NaN -- confirm against the data file. Guarded so that an empty
    # frame yields [] instead of raising IndexError.
    if current:
        current.pop()
    all_tags.append(current)
    # The first entry holds whatever preceded the first sentence marker.
    all_tags.pop(0)
    return all_tags

def getAllSentences(df):
    """Rebuild sentences from a one-word-per-row dataframe.

    A row whose ``'Sentences'`` cell is not NaN starts a new sentence. Words
    are joined with single spaces; words containing ``'É'`` or ``'Ó'`` are
    skipped. Everything collected before the first sentence marker is
    discarded.

    Rows are read in positional order, so the result does not depend on the
    dataframe's index labels.

    Args:
        df: Dataframe with ``'Sentences'`` and ``'Word'`` columns.

    Returns:
        List of sentence strings, one per sentence marker.
    """
    all_sentences = []
    current = ""
    for marker, raw_word in zip(df['Sentences'], df['Word']):
        word = str(raw_word)
        if pd.notna(marker):
            # Close the running sentence (dropping its trailing space).
            all_sentences.append(current[:-1])
            current = ""
        if 'É' not in word and 'Ó' not in word:
            current += word + " "
    all_sentences.append(current[:-1])
    # The first entry holds whatever preceded the first sentence marker.
    all_sentences.pop(0)
    return all_sentences

def getAllWords(df):
    """Return the ``'Word'`` value of every row, converted to ``str``."""
    return [str(record['Word']) for _, record in df.iterrows()]

def getAllEntities(df):
    """Return the ``'Tag'`` value of every row, converted to ``str``."""
    return [str(record['Tag']) for _, record in df.iterrows()]

#Insert the result of prediction into the given dataframe
def insertPred(df,result):
    """Write predicted entity labels into ``df['Predicted']`` in place.

    ``result`` is consumed left to right while walking the rows of ``df``: a
    row whose word contains the text of the next pending prediction receives
    that prediction's label and the prediction is consumed; every other row
    receives ``'O'``.

    The cursor only advances on a match, so a prediction whose text never
    occurs in a later word leaves all remaining rows labelled ``'O'``.

    Args:
        df: Dataframe with a ``'Word'`` column. Its ``'Predicted'`` column is
            created or overwritten.
        result: Sequence of ``(token_text, label)`` pairs in document order.

    Returns:
        None. ``df`` is modified in place.
    """
    predicted = []
    cursor = 0
    for word in df['Word']:
        matches = (
            cursor < len(result)
            and pd.notna(word)  # missing words can never match
            and result[cursor][0] in str(word)
        )
        if matches:
            predicted.append(result[cursor][1])
            cursor += 1  # each prediction labels at most one row
        else:
            predicted.append('O')
    # Assign the whole column at once: writing to the row objects yielded by
    # iterrows() is not guaranteed to reach the dataframe.
    df['Predicted'] = predicted

#Gets the entities predicted for the test text by the trained model
def get_all_entities(df):
    """Run the module-level ``nlp`` pipeline over every sentence of ``df``.

    Returns:
        One list per sentence, as produced by ``substract_entities`` from the
        document's ``ents``.
    """
    return [
        substract_entities(nlp(sentence).ents)
        for sentence in getAllSentences(df)
    ]

#Tagging all words
def tag_all_words(df):
    """Replace ``df['Predicted']`` with its BIO-prefixed form, in place.

    The ``(Word, Predicted)`` pairs of all rows are passed through
    ``bio_tagger`` and the resulting tags are written back to ``'Predicted'``.
    """
    word_label_pairs = list(zip(df["Word"], df["Predicted"]))
    _, bio_labels = zip(*bio_tagger(word_label_pairs))
    df["Predicted"] = bio_labels
    
#Helper methods for the entity extraction
def substract_list(entities):
    """Flatten a sequence of sequences into a single list, keeping the order."""
    return [
        group[position]
        for group in entities
        for position in range(len(group))
    ]

def substract_entities(doc):
    """Split entity spans into per-word ``(text, label)`` tuples.

    Args:
        doc: Iterable of objects exposing ``text`` and ``label_``.

    Returns:
        List of ``(word, label)`` tuples; an entity whose text contains spaces
        contributes one tuple per space-separated piece, all with its label.
    """
    # Splitting a text without spaces yields that text as the only piece, so a
    # single code path covers both one-word and multi-word entities.
    return [
        (piece, str(entity.label_))
        for entity in doc
        for piece in entity.text.split(" ")
    ]

#More helper methods to preprocess the data so it can be passed to the spaCy training method
def getAllTrainData():
    """Build spaCy training examples from the module-level sentences and tags.

    Reads the globals ``all_sentences`` and ``all_tags`` and tokenises each
    sentence with the module-level ``nlp``.

    Returns:
        List of ``(sentence, {"entities": offsets})`` pairs, where ``offsets``
        comes from ``offsets_from_biluo_tags`` applied to the BILOU-converted
        tags of that sentence.
    """
    examples = []
    for position, sentence in enumerate(all_sentences):
        parsed = nlp(str(sentence))
        bilou_tags = bio_to_bilou(all_tags[position])
        offsets = offsets_from_biluo_tags(parsed, bilou_tags)
        examples.append((sentence, {"entities": offsets}))
    return examples

In [6]:
# Build the training set. Order matters: getAllTrainData() reads the
# module-level `all_sentences` and `all_tags`, so both must be assigned first.
all_sentences = getAllTrainSentences(train)
all_tags = getAllTags(train)
train_data= getAllTrainData()

In [62]:
def get_all_entities_print(df):
    """Print the entities and per-token annotations predicted for ``df``.

    Every sentence of ``df`` is run through the module-level ``nlp`` pipeline;
    for each one an "Entities" line and a "Tokens" line are printed.

    Returns:
        Always an empty list (nothing is collected).
    """
    for sentence in getAllSentences(df):
        parsed = nlp(sentence)
        entity_pairs = [(span.text, span.label_) for span in parsed.ents]
        print("Entities", entity_pairs)
        token_rows = [(tok.text, tok.ent_type_, tok.ent_iob) for tok in parsed]
        print("Tokens", token_rows)
    return []

In [63]:
# Inspect the model's predictions on test1; the function prints per sentence
# and returns an empty list, which is what the cell displays last.
get_all_entities_print(test1)

Entities [('2005', 'time'), ('Zambia', 'geo'), ('Country Initiative', 'org'), ('USD', 'org')]
Tokens [('In', '', 2), ('2005', 'time', 3), (',', '', 2), ('Zambia', 'geo', 3), ('qualified', '', 2), ('for', '', 2), ('debt', '', 2), ('relief', '', 2), ('under', '', 2), ('the', '', 2), ('Highly', '', 2), ('Indebted', '', 2), ('Poor', '', 2), ('Country', 'org', 3), ('Initiative', 'org', 1), (',', '', 2), ('consisting', '', 2), ('of', '', 2), ('approximately', '', 2), ('USD', 'org', 3), ('6', '', 2), ('billion', '', 2), ('in', '', 2), ('debt', '', 2), ('relief', '', 2), ('.', '', 2)]
Entities [('Zambia', 'geo')]
Tokens [('Poverty', '', 2), ('remains', '', 2), ('a', '', 2), ('significant', '', 2), ('problem', '', 2), ('in', '', 2), ('Zambia', 'geo', 3), (',', '', 2), ('despite', '', 2), ('a', '', 2), ('stronger', '', 2), ('economy', '', 2), ('.', '', 2)]
Entities [('Zambia', 'geo'), ('2010', 'time'), ('Zambia', 'geo'), ('2008', 'time')]
Tokens [('Zambia', 'geo', 3), ("'s", '', 2), ('dependency

[]

In [13]:
#Not necessary
# Scratch cell for inspecting intermediate results.
temp = get_all_entities(test1)
# Flatten the per-sentence lists already computed above instead of running the
# model over test1 a second time.
temp_sub = substract_list(temp)


In [61]:
# Display the per-sentence (word, label) lists held in `temp`.
temp

[[('2005', 'time'),
  ('Zambia', 'geo'),
  ('Country', 'org'),
  ('Initiative', 'org'),
  ('USD', 'org')],
 [('Zambia', 'geo')],
 [('Zambia', 'geo'), ('2010', 'time'), ('Zambia', 'geo'), ('2008', 'time')],
 [('/', 'org'), ('AIDS', 'org'), ('Zambia', 'geo')],
 [('Slovakia', 'geo'), ('Great', 'geo'), ('Moravia', 'geo')],
 [('Slovaks', 'gpe'), ('Hungarian', 'gpe'), ('1', 'time')],
 [('monarchy', 'time'),
  ('use', 'gpe'),
  ('Hungarian', 'geo'),
  ('closely', 'gpe'),
  ('by', 'gpe')],
 [('Slovaks', 'gpe'), ('Czechs', 'gpe'), ('Czechoslovakia', 'geo')],
 [('II', 'event'), ('Czechoslovakia', 'geo'), ('-dominated', 'org')],
 [('1989', 'time'), ('Czechoslovakia', 'geo')],
 [('Slovaks', 'gpe'),
  ('Czechs', 'gpe'),
  ('January', 'time'),
  ('1993', 'time')],
 [('Slovakia', 'geo'),
  ('NATO', 'org'),
  ('EU', 'org'),
  ('2004', 'time'),
  ('January', 'time'),
  ('2009', 'time')],
 [('French', 'gpe'),
  ('Afars', 'gpe'),
  ('Issas', 'geo'),
  ('Djibouti', 'geo'),
  ('1977', 'time')],
 [('Gouled'

In [308]:
#Not necessary
def getAllOffsets(all_sentences,all_tags):
    """Compute entity character offsets for every sentence.

    Each sentence is tokenised with the module-level ``nlp``; its tags are
    converted with ``bio_to_bilou`` and passed to ``offsets_from_biluo_tags``.

    Returns:
        One ``offsets_from_biluo_tags`` result per ``(sentence, tags)`` pair.
    """
    return [
        offsets_from_biluo_tags(nlp(str(text)), bio_to_bilou(sentence_tags))
        for text, sentence_tags in zip(all_sentences, all_tags)
    ]

In [58]:
# Predict test1: write the flat entity labels into 'Predicted', then convert
# them to BIO tags. tag_all_words prefixes every non-"O" label, so it must run
# exactly once after each insertPred call.
insertPred(test1,substract_list(get_all_entities(test1)))
tag_all_words(test1)

In [59]:
# Predict test2: write the flat entity labels into 'Predicted', then convert
# them to BIO tags. tag_all_words prefixes every non-"O" label, so it must run
# exactly once after each insertPred call.
insertPred(test2,substract_list(get_all_entities(test2)))
tag_all_words(test2)

In [48]:
# Export test1 (including the 'Predicted' column) as a ';'-separated UTF-8 CSV
# without the index column.
test1.to_csv("Submission_trained_spacy_1.csv",index=False,sep = ';',encoding='utf-8')


In [60]:
# Export test2 (including the 'Predicted' column) as a ';'-separated UTF-8 CSV
# without the index column.
test2.to_csv("Submission_trained_spacy_2.csv",index=False,sep = ';',encoding='utf-8')