In [91]:
import pandas as pd
import numpy as np
from nltk import word_tokenize
from typing import List
from datasets import load_dataset
from datasets import Dataset
import pandas as pd
from tqdm import tqdm
from pathlib import Path

from nltk.tokenize import sent_tokenize
from nltk import word_tokenize
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /raid/bsalyp/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [92]:
# Helper functions

def get_token_labels(tokens: List[str], tag: str) -> List[str]:
    seq_len = len(tokens)
    if seq_len == 0:
        return []
    return [tag] + ['O']*(seq_len - 1)

In [93]:
# load data and split by convenient chunks
# take only part of data, 6.5M is too much anyway
data = load_dataset("wikipedia", "20220301.en", 
       split =[f'train[{k}:{k+10000}]' for k in range(0, 10)])

Found cached dataset wikipedia (/raid/bsalyp/.cache/huggingface/datasets/wikipedia/20220301.en/2.0.0/aa542ed919df55cc5d3347f42dd4521d05ca68751f50dbc32bae2a7f1e167559)
100%|██████████| 10/10 [00:00<00:00, 46.63it/s]


In [94]:
print('General info')
data[:3]

General info


[Dataset({
     features: ['id', 'url', 'title', 'text'],
     num_rows: 10000
 }),
 Dataset({
     features: ['id', 'url', 'title', 'text'],
     num_rows: 10000
 }),
 Dataset({
     features: ['id', 'url', 'title', 'text'],
     num_rows: 10000
 })]

In [95]:
def process_chunck(data:Dataset )->pd.DataFrame:
    df = pd.DataFrame(data)

    # Split in subtexts
    df['text'] = df['title'] + '\n' + df['text']
    df['subtexts'] = df['text'].str.split('\n')
    df = df.explode('subtexts')

    # Get text features
    df['sents_amount'] = df['subtexts'].apply(lambda x: len(sent_tokenize(x)))
    df['start_with_space'] = df['subtexts'].apply(lambda x: x[0] == ' ' if x else False)
    df = df[df['sents_amount'] != 0]

    #Detect text parts
    df['type'] = None

    size_one = df['sents_amount'] == 1
    space_start = df['start_with_space'] == True
    is_title = df['subtexts'].isin(df['title'])

    df.loc[size_one & (~space_start), 'type'] = 'H2'
    df.loc[is_title, 'type'] = 'H1'
    df.loc[space_start, 'type'] = 'LIST'
    df['type'] = df['type'].fillna('BLOCK')

    # Tokenize subtexts
    df['tokens'] = df['subtexts'].apply(lambda x: word_tokenize(x))
    # Create labels
    df['token_labels'] = df.apply(lambda x: get_token_labels(x.tokens, x.type), axis=1)

    # Aggregate results
    result_df = df.groupby('url')[['tokens', 'token_labels']].sum()
    result_df = result_df.explode(['tokens', 'token_labels'])
    result_df = result_df[result_df['tokens'] != '']
    result_df = result_df.dropna() #drom NaN's of URL rows

    result_df = result_df.reset_index(drop=True)
    # print(f'Total tokens amount: {len(result_df):,}')

    return result_df

In [None]:
for idx, data_chank in enumerate(tqdm(data)):
    tmp_df = process_chunck(data_chank)
    tmp_df.to_csv(f'data/tmp_results/wiki_{idx}.csv')
    

In [106]:
dfs = []
for path_name in Path('data/tmp_results').iterdir():
    dfs.append(pd.read_csv(path_name, 
                           usecols=['tokens', 'token_labels'], 
                           engine='python'))

In [None]:
full_df = pd.concat(dfs)

In [None]:
full_df.head(3)

Unnamed: 0.1,Unnamed: 0,tokens,token_labels
0,0,``,H1
1,1,Hello,O
2,2,",",O


In [None]:
print('Total tokens amount in dataset')
print(f'{len(full_df):,}')

Total tokens amount in dataset
404,780,955


In [None]:
# 98%, 1%, 1% 
train, validate, test = np.split(full_df, [int(.98*len(full_df)), int(.99*len(full_df))])

In [None]:
print(f'Train size: {len(train):,}')
print(f'Validation size: {len(validate):,}')
print(f'Test size: {len(test):,}')

Train size: 396,685,335
Validation size: 4,047,810
Test size: 4,047,810


In [None]:
train.to_csv(r'data/markdown/train_wiki.txt', header=None, index=None, sep='\t', mode='w')
validate.to_csv(r'data/markdown/validate_wiki.txt', header=None, index=None, sep='\t', mode='w')
test.to_csv(r'data/markdown/test_wiki.txt', header=None, index=None, sep='\t', mode='w')

KeyboardInterrupt: 