# Retrieve all file names

Firstly, I need to get all corpus in one place. As it all inside different folders, we go through directories to retrieve all documents' names.

In [2]:
import os

def read_files_path(path:str, extension: str):
    files_path = []

    for r, d, f in os.walk(path):
        for file in f:
            if extension in file:
                files_path.append(os.path.join(r, file))
                
    return files_path

In [4]:
files_path = read_files_path("entval-events/", '.txt')
files_path_ann = read_files_path("entval-events/", '.ann')
files_path = sorted(files_path)
files_path_ann = sorted(files_path_ann)

# Preprocessing of document corpus

get all documents and its id in dictionary

In [1]:
'''
    documents - dictionary {doc_id: text}
    map_ids   - dictionary {doc_id: path}
'''

def read_documents(files_path):

    documents = {}
    map_ids = {}

    doc_id = 0
    for file_path in files_path:
        with open(file_path, 'r') as fh:
            map_ids[doc_id] = file_path
            documents[doc_id] = fh.read()
        doc_id += 1
        
    return documents, map_ids

In [5]:
documents, map_ids = read_documents(files_path)
documents_ann, map_ids_ann = read_documents(files_path_ann)

In [6]:
len(documents)

1111

# Explore data

## Text information

In [14]:
# combine all text together
full_text = ''

for doc in documents.values():
    full_text += doc

In [29]:
import nltk
nltk.download('punkt')

# get unique words
all_words = nltk.word_tokenize(full_text)
words = set(all_words)

[nltk_data] Downloading package punkt to /Users/innolina/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


number of unique words, which were used in the documents: 74589

In [13]:
len(words)

74589

In [30]:
len(words), len([w for w in words if w.isalpha()])

(51823, 47953)

In [32]:
[w for w in words if not w.isalpha()]

['5-м',
 'Северо-Западном',
 'text-decoration',
 'постель…',
 '53,4',
 'извлёк…',
 '3,134',
 'дали/',
 '-10C',
 '45—55',
 '18:25',
 'вице-губернатор',
 'информационно-коммуникационные',
 '17-этажных',
 '400-500',
 'Санкт-Морице',
 'шведско-финского',
 '55-й',
 '2007г',
 '23:26',
 '7-го',
 '33-летнюю',
 '128',
 'культурно-историческая',
 '//content.izvestia.ru/media/3/news/2013/05/550673/RIAN_00616996.LR.ru_240x140.jpg',
 'Завод-производитель',
 'масс-медиа',
 '16.30',
 '3:0',
 'интернет-сервисам',
 'АиФ-Камчатка',
 'тыс.—',
 '10,5',
 'Лебедев/',
 '€100',
 'похмелье-то',
 'ТВ-6-Москва',
 'Хиль-Роблеса',
 '6,046',
 'AF1945',
 '3376',
 'жилищно-коммунального',
 '20-летнего',
 'буксирно-моторных',
 '663,1',
 'вице-премьеру',
 'век-другой',
 'Yac/m',
 '2006',
 'ИТАР-',
 '84-летнего',
 '439',
 '1кв.м',
 ':',
 '15-м',
 '30-60',
 'Металлург-Форум',
 '-7C',
 '-14C',
 '1,920',
 'веб-сайтов',
 'Иссык-Куля',
 '10-15',
 '2050',
 '305',
 '№77',
 'черепно-мозговую',
 'Запад-2013',
 'возбуждено…',
 'т

## Target information

In [16]:
types = []

for doc in documents_ann:
    entities = documents_ann[doc].split('\n')
    for entity in entities:
        ent = entity.split('\t')

        if ent == [''] or 'A' in ent[0] or '#' in ent[0]:
            continue

        # change entity token to an event token
        if 'E' in ent[0]:
            continue

        info = ent[1].split(' ')
        # parse type
        types.append(info[0])

### Getting number of unique types

In [20]:
print('Number of unique types:', end=' ')
un_types = set(types)
print(len(un_types))

print(un_types)

Number of unique types: 29
{'End-Position', 'URL', 'Time', 'Phone-Number', 'Facility', 'Person', 'Indict-Sue', 'GPE-Organization', 'GPE-GPE', 'Die', 'Declare-Bankruptcy', 'GPE-Person', 'E-Mail', 'Trial-Hearing', 'Job', 'Transfer-Ownership', 'Location', 'Sentence', 'Transfer-Money', 'Crime', 'Merge-Org', 'Money', 'Injure', 'End-Org', 'Start-Position', 'Fine', 'GPE-Location', 'Organization', 'Start-Org'}


### Amount of each type

In [21]:
from collections import Counter

types_dict = Counter(types)

In [27]:
dict(types_dict)

{'Organization': 12230,
 'GPE-Location': 3355,
 'Die': 482,
 'Person': 20110,
 'Time': 5723,
 'Location': 1086,
 'GPE-Person': 186,
 'GPE-GPE': 4278,
 'GPE-Organization': 790,
 'Money': 873,
 'Transfer-Money': 425,
 'Facility': 2305,
 'Start-Org': 80,
 'Sentence': 227,
 'Indict-Sue': 338,
 'End-Org': 43,
 'End-Position': 123,
 'Start-Position': 172,
 'Trial-Hearing': 109,
 'Merge-Org': 8,
 'Transfer-Ownership': 97,
 'Injure': 239,
 'Fine': 56,
 'Phone-Number': 40,
 'URL': 71,
 'Job': 114,
 'Declare-Bankruptcy': 9,
 'E-Mail': 8,
 'Crime': 45}