In [5]:
import os
# Repository root, relative to the directory this notebook is run from.
app_root = '../../../../'
# Where the raw dumps are downloaded and extracted. The dumps are very large, so this
# lives outside the repository; set WIKI_DOWNLOAD_DIR to point it at another machine's
# download folder instead of editing the notebook.
wiki_downloaded_dir_path = os.environ.get('WIKI_DOWNLOAD_DIR', '/mnt/c/Users/Marco/Downloads')
# Where the (small) preprocessed sentence files are stored inside the repository.
wiki_save_path = os.path.join(app_root, 'model', 'common', 'wiki_dumps')

In [6]:
import sys, os, wget

def download_wiki(lang = 'en', dir_path = './'):
    ''' Download latest Wikipedia dump in chosen language

    Args:
        lang: Wikipedia language code (case-insensitive), e.g. 'en', 'es', 'fr'.
        dir_path: directory where the dump is stored; created if it does not exist.

    Returns:
        Path of the dump file. If the file is already present nothing is downloaded.
    '''

    def bar_progress(current, total, width=80):
        # `width` is unused but kept because the callback is invoked with three arguments.
        current_mb = current * 1e-6
        if total > 0:
            progress_message = "Downloading: %d%% [%d / %d] MB" % (current / total * 100, current_mb, total * 1e-6)
        else:
            # Total size unknown (reported as <= 0): a percentage would be meaningless.
            progress_message = "Downloading: %d MB" % current_mb
        sys.stdout.write("\r" + progress_message)
        sys.stdout.flush()

    lang = lang.lower()
    WIKI_DUMP_NAME = f'{lang}wiki-latest-pages-articles.xml.bz2'
    WIKI_DUMP_URL = f'https://dumps.wikimedia.org/{lang}wiki/latest/{WIKI_DUMP_NAME}'

    save_path_name = os.path.join(dir_path, WIKI_DUMP_NAME)

    if not os.path.exists(save_path_name):
        # Create the target directory up front so a multi-GB download cannot be lost
        # at the very end because the destination does not exist.
        if dir_path:
            os.makedirs(dir_path, exist_ok=True)
        print(f'Downloading the latest {lang}-language Wikipedia dump from {WIKI_DUMP_URL}...')
        save_path_name = wget.download( WIKI_DUMP_URL, save_path_name, bar=bar_progress )
        # The progress bar rewrites a single line with '\r'; terminate it before the final message.
        print()
        print('Successfully downloaded!')
    else:
        print('Already downloaded!')

    return save_path_name

In [7]:
def extract_wiki(path_to_file, path_dir_out = None):
    ''' Extract plain text from a Wikipedia dump, reusing a previous extraction if one exists

    Args:
        path_to_file: path of the dump file to extract.
        path_dir_out: directory for the extracted '.txt' file; defaults to the directory of the dump.

    Returns:
        Path of the extracted text file.
    '''
    file_name_ext_in = os.path.basename(path_to_file)
    # Keep only the part before the first dot: 'enwiki-latest-pages-articles.xml.bz2' -> 'enwiki-latest-pages-articles.txt'
    file_name_ext_out = file_name_ext_in.split('.')[0] + '.txt'
    path_file_out = os.path.join( os.path.dirname(path_to_file) if path_dir_out is None else path_dir_out , file_name_ext_out )
    if not os.path.exists(path_file_out):
        print(f'Extracting and cleaning {path_to_file} to {path_file_out}...')
        # IPython shell escape (not plain Python): $path_to_file and $path_file_out are filled in from the Python variables.
        # The extractor output is piped through sed (drops blank lines) and two greps (drop lines starting with
        # '<doc id=' and lines ending with '</doc>') before being redirected to the output file.
        # NOTE(review): the redirect creates the output file even when the extractor fails, so a failed run
        # would be reported as 'Already extracted!' next time - delete the file by hand in that case.
        # NOTE(review): '--processes 79' is hard-coded and the interpolated paths are unquoted
        # (a path containing spaces would break the command) - confirm this matches the target machine.
        ! python3 -m wikiextractor.WikiExtractor  $path_to_file --processes 79 -q -o - \
        | sed "/^\s*\$/d" \
        | grep -v "^<doc id=" \
        | grep -v "</doc>\$" \
        > $path_file_out
        print(f'Successfully extracted!')
    else:
        print(f'Already extracted!')
    return path_file_out

In [8]:
import sys, os
from blingfire import text_to_sentences
from random import random

def preprocess_wiki(path_to_file, path_dir_out = None, max_n_sentences = 100000,
                    min_sentence_len = 16, max_sentence_len = 110, sample_rate = 0.01, seed = None):
    ''' Randomly sample sentences of a reasonable length from an extracted Wikipedia text file

    Every input line is split into sentences; a sentence is kept when its whitespace-separated
    word count lies in [min_sentence_len, max_sentence_len] and a random draw is <= sample_rate.
    Sampling stops as soon as max_n_sentences sentences have been written.

    Args:
        path_to_file: extracted text file to read (UTF-8).
        path_dir_out: output directory (created if missing); defaults to the directory of the input.
        max_n_sentences: hard cap on the number of sentences written.
        min_sentence_len: minimum number of words for a sentence to be eligible.
        max_sentence_len: maximum number of words for a sentence to be eligible.
        sample_rate: probability threshold of keeping an eligible sentence.
        seed: if given, sampling uses a private generator seeded with it (reproducible output);
            if None, the module-level random() is used as before.

    Returns:
        Path of the written '<name>_preprocessed.txt' file (always rewritten).
    '''
    from random import Random  # local import: only needed for the seeded generator

    file_name_ext_in = os.path.basename(path_to_file)
    file_name_ext_out = file_name_ext_in.split('.')[0] + '_preprocessed.txt'
    dir_out = os.path.dirname(path_to_file) if path_dir_out is None else path_dir_out
    path_file_out = os.path.join(dir_out, file_name_ext_out)
    if dir_out:
        os.makedirs(dir_out, exist_ok=True)

    draw = random if seed is None else Random(seed).random

    print(f'Preprocessing {path_to_file} to {path_file_out}...')
    counter = 0
    # The input is opened first so that a missing input cannot truncate an existing output file.
    with open(path_to_file, 'r', encoding='utf-8') as in_f, open(path_file_out, 'w', encoding='utf-8') as out_f:
        for line in in_f:
            if counter >= max_n_sentences:
                break
            for sentence in text_to_sentences(line).split('\n'):
                # Re-check the cap per sentence: one input line can hold many sentences, and writing
                # past the cap would also glue them together (no newline is emitted after the last one).
                if counter >= max_n_sentences:
                    break
                n_words = len(sentence.split())
                if min_sentence_len <= n_words <= max_sentence_len and draw() <= sample_rate:
                    counter += 1
                    out_f.write(sentence + ('\n' if counter < max_n_sentences else ''))

    print(f'Successfully preprocessed {counter} lines!')
    return path_file_out

# English Pipeline

In [69]:
# Step 1/3 (English): fetch the dump into the download directory; nothing is downloaded if the file is already there.
saved_path_en = download_wiki(lang = 'en', dir_path = wiki_downloaded_dir_path)

Already downloaded!


In [70]:
# Step 2/3 (English): extract plain text next to the downloaded dump; skipped when the .txt file already exists.
saved_path_extracted_en = extract_wiki(saved_path_en)

Already extracted!


In [71]:
# Step 3/3 (English): sample sentences into wiki_save_path. This step always re-runs and samples randomly.
saved_path_preprocessed_en = preprocess_wiki(saved_path_extracted_en, path_dir_out = wiki_save_path)

Preprocessing /mnt/c/Users/Marco/Downloads/enwiki-latest-pages-articles.txt to ../../../../model/common/wiki_dumps/enwiki-latest-pages-articles_preprocessed.txt...
Successfully preprocessed 100000 lines!


# Spanish Pipeline

In [72]:
# Step 1/3 (Spanish): fetch the dump into the download directory; nothing is downloaded if the file is already there.
saved_path_es = download_wiki(lang = 'es', dir_path = wiki_downloaded_dir_path)

Already downloaded!


In [73]:
# Step 2/3 (Spanish): extract plain text next to the downloaded dump; skipped when the .txt file already exists.
saved_path_extracted_es = extract_wiki(saved_path_es)

Already extracted!


In [74]:
# Step 3/3 (Spanish): sample sentences into wiki_save_path. This step always re-runs and samples randomly.
saved_path_preprocessed_es = preprocess_wiki(saved_path_extracted_es, path_dir_out = wiki_save_path)

Preprocessing /mnt/c/Users/Marco/Downloads/eswiki-latest-pages-articles.txt to ../../../../model/common/wiki_dumps/eswiki-latest-pages-articles_preprocessed.txt...
Successfully preprocessed 100000 lines!


# French Pipeline

In [75]:
# Step 1/3 (French): fetch the dump into the download directory; nothing is downloaded if the file is already there.
saved_path_fr = download_wiki(lang = 'fr', dir_path = wiki_downloaded_dir_path)

Already downloaded!


In [76]:
# Step 2/3 (French): extract plain text next to the downloaded dump; skipped when the .txt file already exists.
saved_path_extracted_fr = extract_wiki(saved_path_fr)

Already extracted!


In [77]:
# Step 3/3 (French): sample sentences into wiki_save_path. This step always re-runs and samples randomly.
saved_path_preprocessed_fr = preprocess_wiki(saved_path_extracted_fr, path_dir_out = wiki_save_path)

Preprocessing /mnt/c/Users/Marco/Downloads/frwiki-latest-pages-articles.txt to ../../../../model/common/wiki_dumps/frwiki-latest-pages-articles_preprocessed.txt...
Successfully preprocessed 100000 lines!


# URLs

In [9]:
# Endpoint that generate_data POSTs sentences to. It points at localhost, so the annotation
# service must already be running on port 3003 of this machine before the cells below are executed.
verbatlas_dependency_url = 'http://127.0.0.1:3003/api/model'

# Generate dataset (Spanish)

In [10]:
import os
from tqdm import tqdm
import requests
import json

def _parse_verbatlas_sentence(r_v_i, null_tag):
    ''' Convert one sentence of the service response into the words / predicates / roles layout '''
    n_tokens = len(r_v_i['tokens'])

    parsed = {'roles': {}}
    # words
    parsed['words'] = [ e['rawText'] for e in r_v_i['tokens'] ]

    # predicates + roles
    parsed['predicates'] = [null_tag] * n_tokens
    for annotation in r_v_i['annotations']:
        frame_name = annotation['verbatlas']['frameName']
        if frame_name == null_tag:
            continue
        token_index = annotation['tokenIndex']
        parsed['predicates'][token_index] = frame_name

        # One role sequence per predicate, keyed by the predicate position (as a string).
        roles = [null_tag] * n_tokens
        for role_for_i in annotation['verbatlas']['roles']:
            # Only the first position of the span receives the (lower-cased) role label.
            roles[role_for_i['span'][0]] = role_for_i['role'].lower()
        parsed['roles'][str(token_index)] = roles

    return parsed


def generate_data(dump_path, verbatlas_link, lang="EN", chunk_dim=1, n_max_sentences = 8_000):
    ''' Annotate the sentences of a text file through the VerbAtlas service

    The file is read line by line (one sentence per line, UTF-8). Sentences are POSTed in
    chunks of `chunk_dim`; a chunk answered with a status other than 200 is skipped and
    does not count towards `n_max_sentences`.

    Args:
        dump_path: text file with one sentence per line.
        verbatlas_link: URL the chunks are POSTed to.
        lang: language code sent with every sentence and used as prefix of the result keys.
        chunk_dim: number of sentences per request.
        n_max_sentences: stop once this many sentences have been annotated successfully.

    Returns:
        dict mapping '<lang>_n_<line index>' to {'roles': {...}, 'words': [...], 'predicates': [...]}.
    '''
    null_tag = '_'
    result = {}

    sentences_skipped = 0
    sentences_total = 0
    sentences_done = 0
    to_send = []

    def annotate_chunk(chunk):
        ''' POST one chunk and merge its annotations into `result`; return False if the service refused it '''
        res_v = requests.post(verbatlas_link, json = chunk)
        if res_v.status_code != 200:
            print(f'verbatlas={res_v.status_code} | skipping chunk!')
            # Do not try to parse the body of a failed request: it is not guaranteed to be JSON.
            return False
        for t_i, r_v_i in zip(chunk, json.loads(res_v.text)):
            result[t_i['key_id']] = _parse_verbatlas_sentence(r_v_i, null_tag)
        return True

    # The sentence files are written as UTF-8, so read them back the same way on every platform.
    with open(dump_path, 'r', encoding='utf-8') as file:
        pbar = tqdm(enumerate(file))
        for data_i, line in pbar:
            if sentences_done >= n_max_sentences:
                break

            data_text = line.rstrip()
            to_send.append({ 'text': data_text, 'lang':lang, 'key_id':lang+'_n_'+str(data_i) })
            sentences_total += 1

            # Send when the chunk is full, or early when the pending sentences are enough to
            # reach the limit (so the last sentences are annotated instead of being dropped).
            chunk_full = len(to_send) >= chunk_dim
            limit_reached = sentences_done + len(to_send) >= n_max_sentences
            if not (chunk_full or limit_reached):
                continue

            pbar.set_description(f'index={data_i}')

            if annotate_chunk(to_send):
                sentences_done += len(to_send)
            else:
                sentences_skipped += len(to_send)
            to_send = []

    # End of file reached with a partially filled chunk: send it as well.
    if to_send:
        if annotate_chunk(to_send):
            sentences_done += len(to_send)
        else:
            sentences_skipped += len(to_send)

    print(f'skipped {sentences_skipped} sentences out of {sentences_total}')
    return result

In [11]:
# Annotate the sampled Spanish sentences, one sentence per request (chunk_dim=1), via the service at verbatlas_dependency_url.
es_wiki_gen = generate_data(os.path.join(wiki_save_path,'eswiki-latest-pages-articles_preprocessed.txt'), verbatlas_dependency_url, lang="ES", chunk_dim=1)

index=1883: : 1881it [01:40, 18.92it/s]

verbatlas=500 | skipping chunk!


index=3778: : 3777it [03:11, 21.94it/s]

verbatlas=500 | skipping chunk!


index=5361: : 5359it [04:24, 22.30it/s]

verbatlas=500 | skipping chunk!


index=5599: : 5598it [04:35, 21.86it/s]

verbatlas=500 | skipping chunk!


index=6448: : 6446it [05:15, 21.94it/s]

verbatlas=500 | skipping chunk!


index=8003: : 8004it [06:29, 20.57it/s]

skipped 5 sentences out of 8005





In [12]:
import json

# Persist the generated Spanish annotations as pretty-printed JSON under <app_root>/data/ES.
with open(os.path.join(app_root, 'data', 'ES', 'train_wiki_es.json'), mode='w') as outfile:
    outfile.write(json.dumps(es_wiki_gen, indent=4))