# Extracting entities
Using Spacy to extract entities for storing in relational structure

### Imports

In [1]:
import pandas as pd
import spacy
import os
import time
import itertools
from tqdm import tqdm

### Global variables

In [2]:
wiki_folder = 'data'

file_list = os.listdir(os.path.join(wiki_folder))
if '.DS_Store' in file_list: 
    file_list.remove('.DS_Store')
    
main_delimiter = ';'

In [3]:
def ascii_char(character):
    """ Not a perfect function to convert a single
        charachter to ascii representation
        in: character : single characters (string)
        out: string
    """
    assert len(character) == 1, 'The character argument has to be one letter long'
    return '0x' + str(character).encode('ascii').hex()

def read_single_file(file_name, file_path=''):
    """ reads bytes like object from file and
        returns string
        in: file_name : name of the file
            file_path : optional folder where 
        out: string
    """
    with open(os.path.join(file_path, file_name), 'rb') as f:
        file_text = ''
        for line in f:
            try:
                file_text += line.decode("utf-8").strip()
            except:
                print('Line skipped in file {0}'.format(file_name))
    return file_text.replace(main_delimiter, ascii_char(main_delimiter))

def extract_entities(file_text, file_id):
    """ Extracts entities using Spacy which is 
        based on a neural network
        in: file_text : string
            file_id   : id to be mapped on
        out: list of tuple(s)
    """
    nlp = spacy.load("en_core_web_sm")
    text = nlp(file_text)
    return [tuple([file_id, time.strftime('%Y-%m-%d %H:%M:%S')
                   ,ent.text, ent.start_char, ent.end_char, ent.label_])
            for ent in text.ents]

def read_save_all_files(file_list):
    """ Load, tag and save files
    in: file_list : list of file locations
    out: save to a csv file
    """
    pd.DataFrame(itertools.chain.from_iterable(extract_entities(read_single_file(file_list[i], 'data'), i)
                 for i in tqdm(range(len(file_list))))).to_csv('structured_table2.csv', sep=main_delimiter)
#     with tqdm(total=len(my_list)) as pbar:
#     for x in my_list:
#         pbar.update(1)

def structure_source_table(sentence, file_id):
    return [tuple([file_id, time.strftime('%Y-%m-%d %H:%M:%S'), sentence])]
    
def read_save_all_source_texts(file_list):
    pd.DataFrame(itertools.chain.from_iterable(structure_source_table(read_single_file(file_list[i], 'data'), i)
                 for i in tqdm(range(len(file_list))))).to_csv('structured_table_source2.csv', sep=main_delimiter)

In [4]:
# for x in file_list:
#     if ';;;' in read_single_file(x, 'data'):
#         print(x)

In [5]:
read_save_all_files(file_list)

100%|██████████| 2699/2699 [27:30<00:00,  1.16it/s]


In [6]:
read_save_all_source_texts(file_list)

100%|██████████| 2699/2699 [00:01<00:00, 2456.30it/s]


In [9]:
cols_structured = {'Unnamed: 0': 'entity_id'
                   , '0': 'source_id'
                   , '1': 'datetime'
                   , '2': 'entity_text'
                   , '3': 'str_start'
                   , '4': 'str_end'
                   , '5': 'entity_tag'}
df_structured = pd.read_csv('structured_table2.csv', sep=main_delimiter).rename(columns=cols_structured)

In [10]:
print(df_structured.sample(10).to_latex(index=False))

\begin{tabular}{rrllrrl}
\toprule
 entity\_id &  source\_id &             datetime &                   entity\_text &  str\_start &  str\_end & entity\_tag \\
\midrule
     83520 &       1020 &  2020-03-17 17:01:02 &                     Gateshead &        503 &      512 &        ORG \\
    183492 &       2171 &  2020-03-17 17:12:41 &                      Bordeaux &        186 &      194 &        GPE \\
    173449 &       2044 &  2020-03-17 17:11:26 &  Darstellende Kunst Stuttgart &       1310 &     1338 &        ORG \\
     16494 &        194 &  2020-03-17 16:52:50 &                           two &      36392 &    36395 &   CARDINAL \\
     49651 &        566 &  2020-03-17 16:56:34 &                  Dáil Éireann &       1964 &     1976 &     PERSON \\
     89788 &       1129 &  2020-03-17 17:02:05 &                          U.S. &         25 &       29 &        GPE \\
     35993 &        411 &  2020-03-17 16:55:02 &            the spring of 1392 &       3537 &     3555 &       DATE \\