In [87]:
import os
import numpy as np
import nltk
import re
import pickle

In [88]:
# Fetch NLTK's 'punkt' package into the local nltk_data directory; nothing is
# re-downloaded when the package is already up to date.
# NOTE(review): no cell visible in this notebook calls an nltk tokenizer - confirm
# that this download is still needed.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/itukh/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [89]:
# Directory that holds the source text; the per-chunk files and the pickled chunk
# positions produced further down are written under it as well.
RESOURCES_PATH = './resources'

In [90]:
def load_text(text_path, encoding='utf-8'):
    """Return the full contents of the text file at ``text_path``.

    Args:
        text_path: Path of the file to read.
        encoding: Codec used to decode the file (UTF-8 unless overridden).

    Returns:
        The decoded file contents as a single string.
    """
    with open(text_path, mode='r', encoding=encoding) as source:
        contents = source.read()
    return contents

In [91]:
def save_chunks(output_path, chunks, encoding='utf-8'):
    """Write every chunk to its own ``text_<index>.txt`` file inside ``output_path``.

    Args:
        output_path: Directory that receives the files. It is created (including
            missing parents) when it does not exist yet.
        chunks: Iterable of strings; the position of a chunk in the iterable
            becomes the index in its file name.
        encoding: Codec used to encode the files. Defaults to UTF-8 so the output
            does not depend on the platform's locale encoding.

    Files that already exist under the same names are overwritten; other files in
    the directory are left untouched.
    """
    os.makedirs(output_path, exist_ok=True)
    for ind, chunk in enumerate(chunks):
        chunk_path = os.path.join(output_path, f'text_{ind}.txt')
        with open(chunk_path, 'w', encoding=encoding) as output_file:
            output_file.write(chunk)

In [129]:
def split_text(text, chunks_number=400, bias=0):
    """Split ``text`` into line-aligned chunks of roughly equal size.

    Lines are appended to the current chunk until its length reaches
    ``len(text) / chunks_number + bias`` characters. A line that is exactly
    ``'Глава <k>'`` (with ``k`` counted up from 1 in order of appearance) always
    starts a new chunk, so no chunk contains text from two chapters.

    The whole text is always consumed: ``''.join(chunks) == text``. Because chunks
    are cut at line and chapter boundaries, the number of returned chunks is only
    approximately ``chunks_number``.

    Args:
        text: Text to split; lines are separated by ``'\\n'``.
        chunks_number: Target number of chunks, used to derive the expected chunk
            length. Must be at least 1.
        bias: Number of characters added to the expected chunk length before a
            chunk is closed.

    Returns:
        A ``(chunks, rmse, avg_size)`` tuple: the list of chunk strings, the root
        of the mean squared deviation of the chunk lengths from ``avg_size``, and
        the expected chunk length ``len(text) / chunks_number``.

    Raises:
        ValueError: If ``chunks_number`` is smaller than 1.

    Side effects:
        Prints the expected, minimum and maximum chunk length, the deviation and
        the average number of words per chunk. These statistics count the newline
        that terminates each chunk.
    """
    if chunks_number < 1:
        raise ValueError(f'chunks_number must be at least 1, got {chunks_number}')

    # Text read in Python's text mode uses '\n' on every platform, so splitting on
    # os.linesep would not find any line break on Windows.
    newline = '\n'
    lines = text.split(newline)
    n = len(lines)

    avg_size = 1. * len(text) / chunks_number
    limit = avg_size + bias

    chunks = []
    sizes = []
    deltas = []
    n_words = []

    cur_ind = 0
    chapter_ind = 1
    # Keep producing chunks until every line is consumed; a fixed number of
    # iterations could run out early (chapter breaks yield short chunks) and
    # silently drop the tail of the text.
    while cur_ind < n:
        chunk_lines = []
        cur_len = 0
        # `not chunk_lines` guarantees progress even when `limit` is not positive.
        while cur_ind < n and (not chunk_lines or cur_len < limit):
            line = lines[cur_ind]
            if line == f'Глава {chapter_ind}':
                chapter_ind += 1
                if chunk_lines:
                    # Close the current chunk; the heading opens the next one.
                    break
            chunk_lines.append(line)
            cur_len += len(line) + 1  # +1 for the newline that followed the line
            cur_ind += 1

        chunk = newline.join(chunk_lines) + newline
        chunks.append(chunk)
        sizes.append(len(chunk))
        deltas.append((len(chunk) - avg_size) ** 2)
        n_words.append(len(re.findall(r'\w+', chunk)))

    # The last line of the text has no newline after it, so remove the one that
    # the join above appended to the final chunk.
    chunks[-1] = chunks[-1][:-1]
    if not chunks[-1]:
        # Only the empty "line" after a trailing newline was left over.
        chunks.pop()

    # Root of the mean squared deviation; the printed label is kept as it was.
    rmse = np.sqrt(np.mean(deltas))
    print(f'Expected chunk len: {avg_size}')
    print(f'Min len: {min(sizes)}')
    print(f'Max len: {max(sizes)}')
    print(f'Mean square error: {rmse}')
    print(f'Average words number : {np.mean(n_words)}')
    return chunks, rmse, avg_size

In [130]:
# Read the whole source text into memory as one string (load_text decodes it as
# UTF-8 by default).
text = load_text(os.path.join(RESOURCES_PATH, '215591.txt'))

In [131]:
# Split with the default settings (chunks_number=400, bias=0). The middle return
# value is the deviation metric that split_text also prints; it is not used later.
chunks, _, avg_size = split_text(text)

Expected chunk len: 2328.545
Min len: 5
Max len: 3741
Mean square error: 375.6759329442437
Average words number : 367.2817258883249


In [132]:
# Sanity check: the chunk lengths should add up to the length of the source text.
print(sum(map(len, chunks)), len(text))

931418 931418


In [133]:
def join_chunks(chunks, avg_size, tiny_chunk_len=100):
    """Merge undersized chunks with their neighbours.

    The chunks are scanned in order:

    * A chunk shorter than ``tiny_chunk_len`` characters is appended to the
      previously emitted chunk (unless nothing has been emitted yet).
    * Otherwise a chunk is emitted on its own when it is longer than
      ``avg_size / 2``, when it is the last chunk, or when the next chunk contains
      the substring ``'Глава'``.
    * Any other chunk is concatenated with the chunk that follows it, and both are
      consumed.

    Args:
        chunks: Sequence of chunk strings in text order.
        avg_size: Expected chunk length; half of it is the threshold below which a
            chunk is merged with its successor.
        tiny_chunk_len: Chunks shorter than this are glued onto the previous
            emitted chunk.

    Returns:
        A new list of merged chunks. Concatenating it yields the same text as
        concatenating ``chunks``.

    Side effects:
        Prints the minimum and maximum length of the merged chunks (nothing is
        printed for empty input).
    """
    final_chunks = []
    cur_ind = 0
    last_ind = len(chunks) - 1

    while cur_ind <= last_ind:
        chunk = chunks[cur_ind]

        if len(chunk) < tiny_chunk_len and final_chunks:
            final_chunks[-1] += chunk
            cur_ind += 1
            continue

        keep_alone = (
            len(chunk) > avg_size / 2
            or cur_ind == last_ind
            or 'Глава' in chunks[cur_ind + 1]
        )
        if keep_alone:
            final_chunks.append(chunk)
            cur_ind += 1
        else:
            final_chunks.append(chunk + chunks[cur_ind + 1])
            cur_ind += 2

    if final_chunks:
        # Measure after all merging is done: a chunk may have grown after it was
        # first emitted, so lengths recorded at emission time would be stale.
        lens = [len(chunk) for chunk in final_chunks]
        print(f'Min len: {min(lens)}')
        print(f'Max len: {max(lens)}')

    return final_chunks

In [135]:
# Merge undersized chunks into their neighbours.
# NOTE: this rebinds `chunks`, so the cell is not idempotent - running it again
# merges the already merged list. Re-run the split_text cell first when repeating it.
chunks = join_chunks(chunks, avg_size)

Min len: 223
Max lenL 3741


In [136]:
# Map every chunk to the half-open interval [start; end) of the text it covers,
# expressed as fractions of the total text length.
# The running total is kept as an integer character count and divided once per
# chunk: adding up the per-chunk fractions instead would accumulate floating-point
# rounding error, and the last interval would not be guaranteed to end at 1.0.
chars_seen = 0

cur_len = 0.0
next_len = 0.0

pos = []

for chunk in chunks:
    chars_seen += len(chunk)
    cur_len, next_len = next_len, chars_seen / len(text)
    pos.append((cur_len, next_len))
    print(f'[{cur_len}; {next_len})')

[0; 0.0028504924749145925)
[0.0028504924749145925; 0.004016456628495477)
[0.004016456628495477; 0.0065663321945678525)
[0.0065663321945678525; 0.009136606765168807)
[0.009136606765168807; 0.011925902226497663)
[0.011925902226497663; 0.014542342965242244)
[0.014542342965242244; 0.017067524999516866)
[0.017067524999516866; 0.019577676188349376)
[0.019577676188349376; 0.02241099055418727)
[0.02241099055418727; 0.024912552688481433)
[0.024912552688481433; 0.02747745909999592)
[0.02747745909999592; 0.02998761028882843)
[0.02998761028882843; 0.032686720677504624)
[0.032686720677504624; 0.03532248678896049)
[0.03532248678896049; 0.037911013100455436)
[0.037911013100455436; 0.04008726479410963)
[0.04008726479410963; 0.042672570210152694)
[0.042672570210152694; 0.04544254029877026)
[0.04544254029877026; 0.047953765119420066)
[0.047953765119420066; 0.05057986854452029)
[0.05057986854452029; 0.05324999087412955)
[0.05324999087412955; 0.055794498281115455)
[0.055794498281115455; 0.0582960604154096

In [137]:
# Sanity check: merging must not change the total number of characters.
print(sum(map(len, chunks)))

931418


In [138]:
# Persist the (start, end) pairs collected in `pos`, in chunk order, as a pickle
# file inside RESOURCES_PATH.
with open(os.path.join(RESOURCES_PATH, 'chunks_poses.pkl'), 'wb') as chunks_file:
    pickle.dump(pos, chunks_file)

In [141]:
# Write each chunk to its own text_<index>.txt file under RESOURCES_PATH/215591;
# the index is the chunk's position in `chunks`.
save_chunks(os.path.join(RESOURCES_PATH, '215591'), chunks)

In [142]:
# List the indices of the chunks that contain the substring 'Глава' and report
# how many such chunks there are.
count = 0

for ind, chunk in enumerate(chunks):
    if 'Глава' not in chunk:
        continue
    print(ind)
    count += 1

print(f'Total {count} fragments with \'Глава\'')

2
16
27
42
56
77
87
108
128
141
156
171
185
200
211
223
237
249
263
275
286
299
311
325
341
362
391
Total 27 fragments with 'Глава'
