# Retrieve all file names

First, I need to gather the whole corpus in one place. Since the files are spread across different folders, we walk through the directories to collect the paths of all documents.

In [1]:
import os

def read_files_path(path: str, extension: str):
    """Recursively collect the paths of files under ``path`` with a given extension.

    Args:
        path: Root directory to walk.
        extension: File-name suffix to match, including the dot
            (for example ``'.txt'``).

    Returns:
        A list of file paths, each joined with the directory it was found
        in, in the order produced by ``os.walk``.
    """
    files_path = []

    for root, _dirs, file_names in os.walk(path):
        for file_name in file_names:
            # Match on the suffix only: a substring test would also accept
            # names such as "notes.txt.bak" when looking for ".txt".
            if file_name.endswith(extension):
                files_path.append(os.path.join(root, file_name))

    return files_path

In [2]:
# List the raw texts (.txt) and the annotation files (.ann) found under the corpus folder.
files_path = read_files_path(path="entval-events/", extension='.txt')
files_path_ann = read_files_path(path="entval-events/", extension='.ann')

In [3]:
# Order both listings by path; document ids are later assigned in list order.
# presumably this is what keeps a text and its annotation file on the same id - TODO confirm
files_path.sort()
files_path_ann.sort()

# Preprocessing of document corpus

Second, I need to split each document into sentences.

Pipeline:
- load all documents into a dictionary keyed by document id
- detect sentences in all documents
- tokenize sentences

## Load all documents into a dictionary keyed by id

In [4]:
def read_documents(files_path, encoding='utf-8'):
    """Read every file in ``files_path`` and index it by a sequential id.

    Ids are assigned in iteration order, starting at 0.

    Args:
        files_path: Iterable of file paths to read.
        encoding: Text encoding used to decode the files. It is passed
            explicitly so the result does not depend on the platform's
            default encoding.

    Returns:
        A tuple ``(documents, map_ids)``:
            documents - dictionary {doc_id: text}
            map_ids   - dictionary {doc_id: path}
    """
    documents = {}
    map_ids = {}

    for doc_id, file_path in enumerate(files_path):
        with open(file_path, 'r', encoding=encoding) as fh:
            documents[doc_id] = fh.read()
        map_ids[doc_id] = file_path

    return documents, map_ids

In [5]:
# Load the texts and the annotation files; each call numbers its files from 0 independently.
documents, map_ids = read_documents(files_path=files_path)
documents_ann, map_ids_ann = read_documents(files_path=files_path_ann)

## Detect sentences

In [6]:
'''
    sentences_index - dictionary {doc_id: [sentence_1, sentence_2, ...]}
'''

import re

from nltk.tokenize import sent_tokenize, word_tokenize

# Split every document into sentences and flatten whitespace inside each one.
sentences_index = {}
for doc_id, text in documents.items():
    # Insert a full stop before a line break that follows a word character
    # and precedes an upper-case Cyrillic letter.
    # presumably so that unpunctuated lines become separate sentences - TODO confirm
    with_stops = re.sub(r"(?<=\w)(\n[А-Я])", r".\1", text)

    flattened = []
    for sent in sent_tokenize(with_stops):
        # non-breaking spaces first, then line breaks, both become plain spaces
        flattened.append(sent.replace(u'\xa0', u' ').replace('\n', ' '))
    sentences_index[doc_id] = flattened

## Tokenize sentences

In [7]:
# documents_output - dictionary {doc_id: cleaned words of all sentences joined into one string}
# index            - dictionary {doc_id: [[word_1, word_2, ...], ...]}, one word list per sentence

documents_output = {}
index = {}
for doc_id, doc_sentences in sentences_index.items():
    joined_text = ''

    for raw_sentence in doc_sentences:
        # replace "ё"/"Ё" with "е" and drop the last character of the sentence
        cleaned = re.sub(r'[ёЁ]', 'е', raw_sentence)[:-1]

        # the remaining substitutions are applied in this exact order
        for pattern, replacement in (
            (r'[@|©] ', ''),
            (r'[,\/:;] ', ' '),
            (r'[»«]', '\"'),
            (r' [\"\/] ', ' '),
            (r' — ', ' '),
        ):
            cleaned = re.sub(pattern, replacement, cleaned)

        tokens = [token for token in cleaned.split(' ') if token != '']

        # sentences are appended back-to-back, with no separator between them
        joined_text += ' '.join(tokens)

        # a document only gets an entry once it has at least one sentence
        index.setdefault(doc_id, []).append(tokens)

    documents_output[doc_id] = joined_text

## Create output format

- T - (entity)
- E - (event)
- type - type of entity/event.
- O - nothing
- B-T/E-type - first token of an annotated span
- I-T/E-type - subsequent token of the same annotated span

In [51]:
import re

from nltk.tokenize import sent_tokenize, word_tokenize

# new_docs - dictionary {doc_id: [sentence_1 + ' ', sentence_2 + ' ', ...]}
#
# The raw document text is tokenized as read: none of the substitutions used
# in the earlier preprocessing cells (full-stop insertion, whitespace,
# "ё" and punctuation replacements) are applied here.
# NOTE(review): presumably this keeps sentence lengths comparable with the
# character offsets in the annotation files - confirm.
new_docs = {}
for doc_id, raw_text in documents.items():
    padded = []
    for sent in sent_tokenize(raw_text):
        # one trailing space stands in for the gap between sentences
        padded.append(sent + ' ')
    new_docs[doc_id] = padded

In [53]:
# sent_lengths - dictionary {doc_id: [cumulative length after sentence_1, after sentence_2, ...]}
sent_lengths = {}
for doc_id, doc_sentences in new_docs.items():
    running_total = 0
    cumulative = []
    for sent in doc_sentences:
        running_total += len(sent)
        cumulative.append(running_total)
    sent_lengths[doc_id] = cumulative

In [54]:
sent_lengths[0]

[74,
 400,
 690,
 737,
 861,
 1008,
 1197,
 1357,
 1429,
 1469,
 1562,
 1623,
 1689,
 1812,
 1995,
 2229]

In [10]:
# ann_index - DataFrame; [doc_id, start_id, end_id, words, type]
#
# The five parallel lists below hold one entry per text-bound annotation
# ("T" line) and are turned into the DataFrame in the next cell.
# NOTE(review): the line layout handled here looks like brat standoff
# ("ID<TAB>type start end<TAB>text") - confirm against the corpus.

doc_ids = []
start_ids = []
end_ids = []
words = []
types = []

for doc, annotation_text in documents_ann.items():
    # annotation id ("T1", ...) -> position in the parallel lists; ids are
    # only unique within one file, so the map is rebuilt per document
    rows_by_id = {}
    # trigger ids referenced by the event lines of this document
    event_triggers = []

    for line in annotation_text.split('\n'):
        ent = line.split('\t')
        ann_id = ent[0]

        if ann_id.startswith('E'):
            # "E1<TAB>Type:T5 Role:T2 ...": the first "Type:Tn" item names
            # the trigger annotation that has to be retagged as an event
            if len(ent) > 1:
                event_triggers.append(ent[1].split(' ')[0].rpartition(':')[2])
            continue

        # Only text-bound annotations carry a span and a text; blank lines,
        # attributes ("A"), notes ("#") and any other line kind are skipped.
        if not ann_id.startswith('T') or len(ent) < 3:
            continue

        info = ent[1].split(' ')

        rows_by_id[ann_id] = len(types)
        # parse type
        types.append('T-' + info[0])
        # add doc id
        doc_ids.append(doc)
        # add words
        words.append(ent[2])

        # parse start end indices; for a discontinuous span
        # ("start end;start end ...") the end of the last fragment is used,
        # which is always the final item
        start_ids.append(int(info[1]))
        end_ids.append(int(info[-1]))

    # change entity tokens to event tokens for the triggers that events
    # of this document actually reference
    for trigger_id in event_triggers:
        row = rows_by_id.get(trigger_id)
        if row is not None:
            types[row] = 'E' + types[row][1:]

In [11]:
import pandas as pd

# One list per column: build the frame row-wise, then flip it so that every
# annotation becomes a row.
ann_columns = ['doc', 'start', 'end', 'word', 'type']
ann_values = [doc_ids, start_ids, end_ids, words, types]
ann_index = pd.DataFrame(ann_values, index=ann_columns).transpose()

In [12]:
# order annotations by document, then by start offset, and renumber the rows
ann_index = ann_index.sort_values(by=['doc', 'start']).reset_index(drop=True)

In [23]:
# dump the sorted annotation table (without the row index) for manual inspection
ann_index.to_csv(path_or_buf='check.csv', index=False)

In [91]:
def add_words(u_w, di, si, tag='O'):
    """Tokenize ``u_w`` and append one row per token to the output lists.

    The text is split on spaces; each piece is then split around the
    punctuation characters of the pattern below, which are kept as separate
    tokens. Tokens that are empty, a non-breaking space or a newline are
    dropped. For every remaining token one entry is appended to each of the
    module-level lists ``doc_ids``, ``sentence_ids``, ``words`` and ``types``.

    Args:
        u_w: Text fragment to tokenize.
        di: Document id recorded for every token.
        si: Sentence id recorded for every token.
        tag: ``'O'`` for untagged text. Any other value is written as
            ``'B-' + tag`` for the first token of the fragment and
            ``'I-' + tag`` for the following ones.
    """
    skipped = ('', '\xa0', '\n')
    # The B-/I- decision is tracked locally per call rather than read from a
    # global loop index, so it reflects the position inside this fragment.
    first_token = True

    # split string on words by space
    for chunk in u_w.split(' '):
        if chunk in skipped:
            continue

        for token in re.split(r"([.\/@|©,\/:;»«\"\/—()\n\xa0])", chunk):
            if token in skipped:
                continue

            doc_ids.append(di)
            sentence_ids.append(si)
            words.append(token)

            if tag == 'O':
                types.append(tag)
            else:
                types.append(('B-' if first_token else 'I-') + tag)
            first_token = False


# Output lists filled by add_words: one entry per token.
doc_ids = []
sentence_ids = []
words = []
types = []


def _flush_rest_of_document(doc_id, sent_id, end):
    """Emit as untagged the text of ``doc_id`` from offset ``end`` of sentence ``sent_id`` to the document's end."""
    add_words(new_docs[doc_id][sent_id][end:], doc_id, sent_id)
    for s_i in range(sent_id + 1, len(new_docs[doc_id])):
        add_words(new_docs[doc_id][s_i], doc_id, s_i)


# Position of the last annotation that was emitted (-1: nothing emitted yet).
last_doc_id = -1
last_end = -1
last_sent_id = -1
for _, row in ann_index.iterrows():
    # get relative index of tagged words in sentence: the first sentence
    # whose cumulative length exceeds the start offset contains the span
    doc_offsets = sent_lengths[row.doc]
    for i, length in enumerate(doc_offsets):
        if row.start < length:
            sent_id = i
            sentence_start = doc_offsets[i - 1] if i != 0 else 0
            relative_start = row.start - sentence_start
            relative_end = row.end - sentence_start
            break
    else:
        # The start offset lies beyond every sentence of the document, so
        # the annotation cannot be placed; skip it instead of reusing the
        # position computed for the previous row.
        continue

    # skip an annotation that starts inside the previously emitted one
    if last_doc_id == row.doc and last_sent_id == sent_id and relative_start < last_end:
        continue

    # get all untagged words
    # if in the same doc
    if last_doc_id == row.doc:
        # if the same sentence untagged words start from ending
        # of last tagged word till start of current one
        if last_sent_id == sent_id:
            add_words(new_docs[row.doc][sent_id][last_end:relative_start],
                      row.doc, sent_id)
        # if different sentence untagged words are from the ending
        # of last tagged word till the end of last sentence,
        # all sentences in between,
        # and from beginning of current sentence till the start
        # of current tagged words
        else:
            add_words(new_docs[row.doc][last_sent_id][last_end:],
                      row.doc, last_sent_id)
            for s_i in range(last_sent_id + 1, sent_id):
                add_words(new_docs[row.doc][s_i], row.doc, s_i)
            add_words(new_docs[row.doc][sent_id][:relative_start],
                      row.doc, sent_id)
    # if not the same doc, untagged words start from the ending of last tagged
    # word in the last sentence till the end of that doc
    # and from the beginning of the new doc till the start of tagged words
    else:
        # if last end is equal to -1, it is the first doc
        if last_end != -1:
            _flush_rest_of_document(last_doc_id, last_sent_id, last_end)

            # documents without any annotation between the two are emitted
            # whole; the sentence count is taken from the document being
            # emitted, not from the previously annotated one
            for d_i in range(last_doc_id + 1, row.doc):
                for s_i in range(len(new_docs[d_i])):
                    add_words(new_docs[d_i][s_i], d_i, s_i)

        for s_i in range(sent_id):
            add_words(new_docs[row.doc][s_i], row.doc, s_i)
        add_words(new_docs[row.doc][sent_id][:relative_start],
                  row.doc, sent_id)

    tagged_words = new_docs[row.doc][sent_id][relative_start:relative_end]
    add_words(tagged_words, row.doc, sent_id, tag=row.type)

    # save sentence id of the last occurence of tagged words
    last_sent_id = sent_id
    last_doc_id = row.doc
    last_end = relative_end

# The loop only emits text that precedes an annotation, so the text that
# follows the very last annotation still has to be written out.
if last_doc_id != -1:
    _flush_rest_of_document(last_doc_id, last_sent_id, last_end)

In [92]:
# One list per column: build the frame row-wise, then flip it so that every
# token becomes a row.
token_columns = ['doc', 'sentence', 'word', 'type']
token_values = [doc_ids, sentence_ids, words, types]
df = pd.DataFrame(token_values, index=token_columns).transpose()

# Save csv format

In [93]:
# write the token table (without the row index) as the final output
df.to_csv(path_or_buf='output.csv', index=False)

In [46]:
df.iloc[441]

doc         1
sentence    6
word        а
type        O
Name: 441, dtype: object