In [1]:
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import regex as re
import multiprocessing
from gensim.models.callbacks import CallbackAny2Vec

In [2]:
# Read Shona text data from a file
with open('shona_corpus.txt', 'r', encoding='utf-8') as file:
    lines = file.readlines()

In [3]:
# Define the GPT-4 tokenization pattern
GPT4_SPLIT_PATTERN = r"""'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]++[\r\n]*|\s*[\r\n]|\s+(?!\S)|\s+"""

def shona_tokenize(text):
    # Use regex to tokenize Shona text
    tokens = re.findall(GPT4_SPLIT_PATTERN, text)
    return tokens

In [4]:
# Tokenize Shona text
word_tokens = [shona_tokenize(line) for line in lines]



In [5]:
len(word_tokens)

1232529

In [6]:
# Configure Word2Vec parameters
vector_size = 100  # Dimensionality of word vectors
window = 5  # Maximum distance between the current and predicted word within a sentence
min_count = 1  # Ignores all words with a total frequency lower than this
epochs = 200 # Number of training epochs


In [9]:
# Train Word2Vec model for multiple epochs
model = Word2Vec(vector_size=vector_size,sentences=word_tokens,min_count=min_count, epochs=epochs)



In [10]:
# Save the trained model
model.save("shona_word2vec_100v.bin")


In [11]:
# Example usage of word vectors
print(model.wv['mwari'])

[ 2.4800901  -2.4462605  -0.5620928  -4.0146427  -5.931522   -2.5026622
 -6.7239256   0.84800726  3.5956128   1.3697836   2.5355656   3.0576656
  1.1785955  -1.2922791  -1.1786398   4.184401    0.08780147 -0.3715545
 -0.8753085   1.2553934  -1.8640904   5.280042    1.2764175  -1.1019186
  0.27422938  0.58728844  0.16113763  0.15737386  0.3983518   8.344677
  1.0794947  -0.32144943 -6.830682    2.3552854   2.6356559   1.1357268
 -1.6503016  -2.6065474  -2.4918022  -0.49499315  0.14990868  0.24240625
 -4.1462307  -0.5355225   2.9920757   5.9263325   4.932553   -2.9086018
 -7.3028603  -3.0883338  -0.95288455  0.2485519   4.6712794   4.390667
  0.49413794  2.315377   -3.0482666   1.5172665  -7.3865848  -1.4324028
  2.341667   -4.2083187   1.7560204   4.463658    1.0819036   2.28142
 -0.8031522   2.2467227  -5.75244    -2.209407    3.570298   -0.9275468
 -5.4589114  -0.4458149   2.2662156   2.7048895   0.3646489  -3.4331353
 -1.3079004   1.6441617  -0.9167239   2.2608063  -2.0626817  -0.391

In [53]:
# Load model
model = Word2Vec.load('shona_word2vec_300v.bin')
print(model)

Word2Vec(vocab=57115, vector_size=300, alpha=0.025)


In [22]:
from gensim.models import Word2Vec


# Example of similarity operation
result = model.wv.most_similar(positive=['musikana', 'baba'], negative=['mukomana'], topn=3)
print(result)


[('munonyatsa', 0.38684016466140747), ('kwechinguva', 0.386617511510849), ('ndaizodzoka', 0.3853031396865845)]


In [23]:
# Test solving word analogies
word1 = 'mambo'
word2 = 'murume'
word3 = 'mukadzi'
analogy_result = model.wv.most_similar(positive=[word1, word3], negative=[word2], topn=3)
print(f"Word analogy: '{word1}' is to '{word2}' as '{word3}' is to: {analogy_result}")


Word analogy: 'mambo' is to 'murume' as 'mukadzi' is to: [('charakupa', 0.39902248978614807), ('dzinomisawo', 0.3958755135536194), ('pamusuo', 0.3640866279602051)]


In [24]:
len(model.wv['mambo'])

100

In [26]:
from gensim.models import Word2Vec


# Test finding most similar words
word = 'amai'
most_similar_words = model.wv.most_similar(word)
print(f"Most similar words to '{word}': {most_similar_words}")



Most similar words to 'amai': [('kunotinzi', 0.43546196818351746), ('murimuka', 0.36470407247543335), ('achazarura', 0.3626631200313568), ('zvinonzizve', 0.36215704679489136), ('kungonokanda', 0.3575122654438019), ('musazoita', 0.3566194176673889), ('chigidi', 0.3547627031803131), ('angazofara', 0.3530261218547821), ('tinetanga', 0.35171061754226685), ('ngatirovei', 0.3508082628250122)]


In [27]:
def predict_next_word(sentence, model, topn=5):
    # Tokenize the input sentence
    tokens = sentence.split()
    
    # Get the vector representation of the last word in the sentence
    if tokens[-1] in model.wv:
        last_word_vector = model.wv[tokens[-1]]
    else:
        print(f"Word '{tokens[-1]}' not found in the vocabulary.")
        return None
    
    # Find the most similar words to the vector of the last word
    most_similar_words = model.wv.similar_by_vector(last_word_vector, topn=topn)
    
    return most_similar_words

# Example usage
sentence = "mavambo kusikwa kwezvinhu zvose pakutanga mwari "
next_word_predictions = predict_next_word(sentence, new_model)
print(f"Predictions for the next word in '{sentence}': {next_word_predictions}")


Predictions for the next word in 'mavambo kusikwa kwezvinhu zvose pakutanga mwari ': [('mwari', 1.0), ('wamwewo', 0.3974516689777374), ('ndakazoombekwa', 0.3922135829925537), ('kutoseka', 0.3852808177471161), ('ndoopakaperera', 0.36315077543258667)]
