In [9]:
import sentencepiece as spm
import pandas as pd 

In [7]:
# Load the normalized corpus from the data directory into a DataFrame.
df = pd.read_csv('../data/normalized.csv')

In [8]:
# Summarize the frame: columns, dtypes and non-null counts. Comparing the
# non-null count with the number of entries shows whether 'text' has missing rows.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 341400 entries, 0 to 341399
Data columns (total 1 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   text    340935 non-null  object
dtypes: object(1)
memory usage: 2.6+ MB


In [None]:
# Step 1: Drop rows with missing text, then convert the remaining entries to strings.
# astype(str) on its own would turn every missing value (NaN) into the literal
# string 'nan', which would then be written out and used as training text.
df = df.dropna(subset=['text']).reset_index(drop=True)
df['text'] = df['text'].astype(str)

# Step 2: Define a function to split long sentences
def split_long_sentences(texts, max_length=100000):
    """Break every text longer than ``max_length`` characters into shorter pieces.

    Each over-long text is cut at the last space found within its first
    ``max_length`` characters; if that window holds no space, it is cut hard at
    ``max_length``. The remainder is stripped of surrounding whitespace and
    processed again until it fits. Lengths are measured in characters
    (``len(text)``), not bytes.

    Args:
        texts: Iterable of strings.
        max_length: Maximum number of characters per returned piece. Must be >= 1.

    Returns:
        A flat list of strings in input order. Texts that already fit are
        returned unchanged (including empty strings); pieces produced by a
        split are never empty.

    Raises:
        ValueError: If ``max_length`` is smaller than 1 (a split could never
            make progress).
    """
    if max_length < 1:
        raise ValueError(f"max_length must be a positive integer, got {max_length!r}")

    new_texts = []
    for text in texts:
        was_split = False
        while len(text) > max_length:
            split_point = text.rfind(' ', 0, max_length)
            if split_point == -1:
                # No space in the window: fall back to a hard cut.
                split_point = max_length
            piece = text[:split_point]
            # split_point is 0 when the only space in the window is the leading
            # one; skip the empty piece and let strip() below drop that space.
            if piece:
                new_texts.append(piece)
            text = text[split_point:].strip()
            was_split = True
        # Keep short inputs as they are, but do not emit an empty remainder
        # left over after splitting.
        if text or not was_split:
            new_texts.append(text)
    return new_texts

# Step 3: Process data in chunks
chunk_size = 1000  # Adjust chunk size based on memory constraints
output_files = []

# The trainer's max_sentence_length (100000 in the Train call) is a limit in
# bytes, whereas split_long_sentences counts characters. A UTF-8 character
# occupies at most 4 bytes, so capping lines at a quarter of the byte limit
# guarantees no line is too long for the trainer.
max_chars_per_line = 100000 // 4

for i in range(0, len(df), chunk_size):
    # Positional slice of the next chunk_size rows.
    chunk = df['text'].iloc[i:i + chunk_size]
    preprocessed_chunk = split_long_sentences(chunk, max_length=max_chars_per_line)

    output_file = f'preprocessed_chunk_{i // chunk_size}.txt'
    # Write UTF-8 explicitly so the result does not depend on the platform's
    # locale encoding (which may be unable to represent the text).
    with open(output_file, 'w', encoding='utf-8') as f:
        f.writelines(line + '\n' for line in preprocessed_chunk)
    output_files.append(output_file)

# Step 4: Create a combined iterator from all preprocessed files
def combined_iterator(files):
    """Lazily yield the lines of several text files as one continuous stream.

    Files are read one after another in the order given, so only one file is
    open at a time. Each yielded line has leading and trailing whitespace
    (including the newline) removed; blank lines are yielded as ''.

    Args:
        files: Iterable of paths to UTF-8 encoded text files.

    Yields:
        str: The next stripped line.
    """
    for file in files:
        # Decode explicitly as UTF-8 rather than the platform's locale
        # encoding, so non-ASCII text is read the same way everywhere.
        with open(file, 'r', encoding='utf-8') as f:
            for line in f:
                yield line.strip()

# Step 5: Train the SentencePiece model on the lines streamed from the chunk files
trainer_options = dict(
    model_prefix='amh_tokenizer_model',
    vocab_size=1000,
    # Ids reserved for the special tokens: padding, unknown, begin/end of sentence.
    pad_id=0,
    unk_id=1,
    bos_id=2,
    eos_id=3,
    # Per the SentencePiece training options this limit is measured in bytes.
    max_sentence_length=100000,
)
spm.SentencePieceTrainer.Train(
    sentence_iterator=combined_iterator(output_files),
    **trainer_options,
)


In [14]:
# Load a tokenizer from 'amh_tokenizer_model.model' — presumably the file the
# training step wrote for model_prefix='amh_tokenizer_model'.
sp = spm.SentencePieceProcessor(model_file='amh_tokenizer_model.model')

In [24]:
# Encode a sample sentence and display the resulting list of token ids.
sp.encode('የአዲስ ዘመን ጋዜጣ ቀደምት ዘገባዎች በእጅጉ ተነባቢ ዛሬም ላገኛቸው')

[7,
 217,
 501,
 903,
 583,
 6,
 617,
 66,
 11,
 138,
 106,
 104,
 42,
 30,
 45,
 111,
 389,
 8,
 4,
 34,
 18,
 148,
 91]

In [25]:
# Keep the token ids of the sample sentence so they can be decoded again.
encoded = sp.encode('የአዲስ ዘመን ጋዜጣ ቀደምት ዘገባዎች በእጅጉ ተነባቢ ዛሬም ላገኛቸው')

In [26]:
# Round-trip check: decoding the ids should give back the sentence that was encoded.
sp.decode(encoded)

'የአዲስ ዘመን ጋዜጣ ቀደምት ዘገባዎች በእጅጉ ተነባቢ ዛሬም ላገኛቸው'