In [32]:
import pandas as pd
import numpy as np
from nltk import word_tokenize
from typing import List

In [2]:
# Relative path of the input CSV that is loaded with pd.read_csv below.
DATA_PATH = 'data/markdown/wiki_texts_new.csv'

In [3]:
# Load the raw CSV and report how many rows it contains.
df = pd.read_csv(DATA_PATH)
print('Raw csv length', df.shape[0])

Raw csv length 136500


In [4]:
# Drop rows in which every column is NaN, then report the remaining row count.
df = df.dropna(how='all')
print('Filtered csv length', len(df))

Filtered csv length 136500


In [8]:
# Fill NaNs for correct text aggregation: rows that carry a URL get their text
# blanked out, and every missing text becomes an empty string.
df['full_text'] = df['full_text'].mask(df['url'].notna()).fillna('')

In [9]:
# Renumber the rows 0..n-1 and discard the old index, so that row labels equal
# row positions (the label-building cell below addresses rows through .loc
# using positions returned by nonzero()).
df = df.reset_index(drop=True)

Tokenize the text with the NLTK word tokenizer

In [12]:
%%time
# One token list per row; empty text yields an empty list.
df['tokens'] = df['full_text'].apply(word_tokenize)

CPU times: user 1min 10s, sys: 537 ms, total: 1min 11s
Wall time: 1min 57s


In [13]:
# Preview the first three rows, including the new 'tokens' column.
df.head(3)

Unnamed: 0,url,full_text,h1,h2,h3,block,list,tokens
0,https://en.wikipedia.org/wiki/Federal_enterpri...,,,,,,,[]
1,,Federal enterprise architecture,Federal enterprise architecture,,,,,"[Federal, enterprise, architecture]"
2,,A federal enterprise architecture framework (F...,,,,A federal enterprise architecture framework (F...,,"[A, federal, enterprise, architecture, framewo..."



Create classifier labels

In [14]:
# Tag columns that are turned into classifier labels; the position of a column
# in this list is its numeric tag id.
# NOTE(review): the frame also has an 'h3' column that is not listed here, so
# rows tagged only as h3 receive no label. Confirm this is intended.
columns = ['h1', 'h2', 'block', 'list']
# Lookup table: numeric tag id -> column name.
clmn_idx2clmn_name = dict(enumerate(columns))

In [24]:
# Boolean frame: True where a row has a value in the given tag column.
df_labels = df[columns].notna()
# Row and column positions of every True cell; the column position is the tag id.
index, value = df_labels.to_numpy().nonzero()
# Rows without any tag keep None.
df_labels['tag_idx'] = None
# NOTE: row positions are used as .loc labels here, which relies on the frame
# having a default 0..n-1 index.
df_labels.loc[index, 'tag_idx'] = value

In [25]:
# Translate the numeric tag id into its column name and upper-case it
# (e.g. 0 -> 'h1' -> 'H1'); rows without a tag stay NaN.
df_labels['tag_label'] = df_labels['tag_idx'].map(clmn_idx2clmn_name).str.upper()

In [26]:
# Preview the per-column flags together with the derived tag_idx and tag_label.
df_labels.head(3)

Unnamed: 0,h1,h2,block,list,tag_idx,tag_label
0,False,False,False,False,,
1,True,False,False,False,0.0,H1
2,False,False,True,False,2.0,BLOCK


In [29]:
# Keep only the columns needed from here on and attach the label column,
# aligned on the row index.
df = df[['url', 'full_text', 'tokens']].join(df_labels['tag_label'])

In [30]:
# Preview the trimmed frame with the joined tag_label column.
df.head(3)

Unnamed: 0,url,full_text,tokens,tag_label
0,https://en.wikipedia.org/wiki/Federal_enterpri...,,[],
1,,Federal enterprise architecture,"[Federal, enterprise, architecture]",H1
2,,A federal enterprise architecture framework (F...,"[A, federal, enterprise, architecture, framewo...",BLOCK


Align tokens with labels

In [31]:
def get_token_labels(tokens: List[str], tag: str) -> List[str]:
    """Build one label per token for a single text block.

    The first token receives ``tag``; every following token receives ``'O'``.

    Args:
        tokens: Tokens of the block, in order.
        tag: Label placed on the first token. It is inserted as-is, without
            any validation.

    Returns:
        A list with the same length as ``tokens``; an empty list when
        ``tokens`` is empty.
    """
    return [tag if position == 0 else 'O' for position in range(len(tokens))]

In [34]:
# Build the label list row by row from each row's tokens and tag label.
df['token_labels'] = df.apply(
    lambda row: get_token_labels(row['tokens'], row['tag_label']),
    axis=1,
)

In [35]:
# Preview the token_labels column next to tokens and tag_label.
df.head(3)

Unnamed: 0,url,full_text,tokens,tag_label,token_labels
0,https://en.wikipedia.org/wiki/Federal_enterpri...,,[],,[]
1,,Federal enterprise architecture,"[Federal, enterprise, architecture]",H1,"[H1, O, O]"
2,,A federal enterprise architecture framework (F...,"[A, federal, enterprise, architecture, framewo...",BLOCK,"[BLOCK, O, O, O, O, O, O, O, O, O, O, O, O, O,..."


Group texts by URL

In [37]:
# Propagate each URL down to the rows that follow it (forward fill), so that
# every row can be grouped by the URL it belongs to.
# `Series.ffill()` is used instead of `fillna(method='ffill')`, whose `method`
# argument is deprecated since pandas 2.1.
df['url'] = df['url'].ffill()

In [38]:
# Collapse all rows that share a URL into a single row. Summing columns of
# Python lists concatenates them, which gives one flat token list and one flat
# label list per URL.
# NOTE(review): groupby sorts the result by URL by default, so pages no longer
# follow their order in the input file. Confirm this is acceptable.
result_df = df.groupby('url')[['tokens', 'token_labels']].sum()

In [40]:
# Preview the per-URL aggregated token and label lists.
result_df.head(3)

Unnamed: 0_level_0,tokens,token_labels
url,Unnamed: 1_level_1,Unnamed: 2_level_1
https://en.wikipedia.org//wiki/%C3%85land,"[Åland, Åland, (, Finnish, :, Ahvenanmaa, :, [...","[H1, BLOCK, O, O, O, O, O, O, O, O, O, O, O, O..."
https://en.wikipedia.org//wiki/108th_United_States_Congress,"[108th, United, States, Congress, The, 108th, ...","[H1, O, O, O, BLOCK, O, O, O, O, O, O, O, O, O..."
https://en.wikipedia.org//wiki/1988_Brazilian_Constitution,"[Constitution, of, Brazil, The, Constitution, ...","[H1, O, O, BLOCK, O, O, O, O, O, O, O, O, O, O..."


Save results

In [45]:
# Unnest the per-URL lists into one row per (token, label) pair. Both columns
# are exploded together, so their lists must have the same length in every row.
# The URL remains as the (now repeated) index.
data = result_df.explode(['tokens', 'token_labels'])

In [80]:
# Discard empty-string tokens, then drop every row that has a NaN in either
# column (for example rows produced from URL rows, which have no tokens).
data = data.loc[data['tokens'].ne('')].dropna()

In [81]:
# Report how many (token, label) rows remain after filtering.
print(f'Total tokens amount: {len(data):,}')

Total tokens amount: 7,853,561


In [82]:
# Write every (token, label) pair as one tab-separated line, without a header
# row and without the URL index; mode='w' overwrites an existing file.
data.to_csv(r'data/markdown/all_data.txt', header=None, index=None, sep='\t', mode='w')

In [83]:
# Sequential 90% / 5% / 5% split into train / validation / test.
# Positional slicing replaces `np.split(data, ...)`: NumPy splits a DataFrame
# through `DataFrame.swapaxes`, which pandas deprecated in 2.1. The cut points
# are the same as before, so the split sizes do not change.
# NOTE(review): rows are individual tokens in their existing order, so the
# split is not shuffled and a cut can fall in the middle of a page. Confirm
# that this is intended.
n_rows = len(data)
train_end, validate_end = int(.9 * n_rows), int(.95 * n_rows)
train = data.iloc[:train_end]
validate = data.iloc[train_end:validate_end]
test = data.iloc[validate_end:]

In [84]:
# Report the number of (token, label) rows in each split.
print(f'Train size: {len(train):,}')
print(f'Validation size: {len(validate):,}')
print(f'Test size: {len(test):,}')

Train size: 7,068,204
Validation size: 392,678
Test size: 392,679


In [85]:
# Write each split with the same layout: tab-separated, no header row, no index
# column, overwriting any existing file.
for out_path, split_frame in (
    (r'data/markdown/train.txt', train),
    (r'data/markdown/validate.txt', validate),
    (r'data/markdown/test.txt', test),
):
    split_frame.to_csv(out_path, header=None, index=None, sep='\t', mode='w')