In [1]:
# Read the training corpus as a list of lines. readlines() keeps each line's
# trailing newline, so downstream len() checks include it.
# The context manager closes the file handle as soon as the read finishes.
with open("../dataset/train.txt", "r", encoding="utf-8") as train_file:
    data = train_file.readlines()

In [2]:
# Sanity check: number of lines read from the training file.
print(len(data))

50000


In [3]:
import re


def split_and_recombine(sequence, max_len, split_char=".", split_regex=r"\."):
    """Split ``sequence`` on ``split_regex`` and greedily re-pack the pieces.

    Consecutive pieces are glued back together with ``split_char`` for as long
    as the running chunk stays within ``max_len`` characters; a piece that
    would overflow the running chunk starts a new one. The separator that sat
    on a chunk boundary is not re-inserted.

    A single piece that is itself longer than ``max_len`` is returned as its
    own oversized chunk, so callers must still check chunk lengths.

    Args:
        sequence: Text to split.
        max_len: Maximum length a chunk may reach when pieces are merged.
        split_char: String used to re-join pieces inside a chunk.
        split_regex: Regular expression the text is split on.

    Returns:
        A list of non-empty chunks, or ``[sequence]`` unchanged when
        ``split_regex`` does not match anywhere in ``sequence``.
    """
    pieces = re.split(split_regex, sequence)
    if len(pieces) <= 1:
        return [sequence]

    chunks = []
    current = ''
    for piece in pieces:
        if not current:
            # Nothing accumulated yet: the piece always opens the chunk, even
            # if it is already too long. Previously this case flushed the
            # empty accumulator and emitted a spurious '' chunk.
            current = piece
        elif len(current) + len(split_char) + len(piece) <= max_len:
            # Account for the real separator length rather than assuming 1.
            current += split_char + piece
        else:
            chunks.append(current)
            current = piece
    if current:
        chunks.append(current)
    return chunks

def split_training_sequences(data, max_len):
    """Break over-long sequences into chunks of at most ``max_len`` characters.

    Sequences already within the limit are kept untouched. Longer ones are
    split on '.' first; any resulting chunk that is still too long is split
    again on commas ('،' or ','). Chunks that remain too long after both
    passes are dropped, and chunks containing only whitespace are skipped.

    A summary of the counters is printed before returning. Note that the
    "Discarded" and "Accepted Split" figures count chunks, not input
    sequences.

    Args:
        data: Iterable of strings to process.
        max_len: Maximum allowed length (``len()``) of each returned string.

    Returns:
        List of kept strings, in input order.
    """
    kept = []
    n_within_limit = 0
    n_discarded = 0
    n_split_kept = 0

    for sequence in data:
        if len(sequence) <= max_len:
            n_within_limit += 1
            kept.append(sequence)
            continue

        sentence_chunks = split_and_recombine(sequence, max_len, split_char='.', split_regex=r"\.")
        for sentence_chunk in sentence_chunks:
            if len(sentence_chunk) <= max_len:
                # Short enough already: treat it as its own single candidate.
                candidates = [sentence_chunk]
            else:
                # Second attempt: split the oversized chunk on commas.
                candidates = split_and_recombine(sentence_chunk, max_len, split_char='،', split_regex=r"[،,]")

            for candidate in candidates:
                if len(candidate) > max_len:
                    # Both splitting attempts failed; drop this chunk.
                    n_discarded += 1
                elif candidate.strip():
                    # Keep only chunks that are not empty / whitespace-only.
                    n_split_kept += 1
                    kept.append(candidate)

    print('Total samples: ', len(data))
    print('Below max length: ', n_within_limit)
    print('Discarded samples: ', n_discarded)
    print('Accepted Split samples: ', n_split_kept)
    return kept

# Run the splitter over the loaded corpus. The cap of 600 is compared against
# len() of each string, i.e. it is measured in characters.
max_length = 600
result = split_training_sequences(data, max_length)


Total samples:  50000
Below max length:  40740
Discarded samples:  3663
Accepted Split samples:  17199


In [4]:
# Length statistics of the split result.
# Compute the per-item lengths once instead of rebuilding the list for each
# of min / max / sum.
result_lengths = [len(x) for x in result]
print('Result length: ', len(result))
print('Result length statistics: ')
print('Min: ', min(result_lengths))
print('Max: ', max(result_lengths))
print('Avg: ', sum(result_lengths) / len(result_lengths))

Result length:  57939
Result length statistics: 
Min:  3
Max:  600
Avg:  259.4917585736723


In [5]:
# Persist the split corpus, one item per line. Each item is stripped of
# surrounding whitespace (including its original newline) before a single
# '\n' is appended.
with open('../dataset/train_split.txt', 'w', encoding='utf-8') as f:
    f.writelines(res.strip() + '\n' for res in result)
    