In [1]:
from gensim.models import FastText
import regex as re
import time
import os
from gensim.utils import simple_preprocess
from gensim.models import FastText
import re

In [2]:

def preprocess_text(text):
    """Turn one raw line of text into a list of tokens.

    The text is lowercased, every character that is neither a word
    character nor whitespace is removed, and the result is handed to
    gensim's ``simple_preprocess`` for tokenization.

    Args:
        text: A string (typically one line of the corpus).

    Returns:
        Whatever ``simple_preprocess`` returns for the cleaned string
        (a list of token strings; see the gensim docs for its filtering rules).
    """
    # Punctuation is deleted rather than replaced by a space, so the
    # characters on either side of it are joined into a single token.
    without_punctuation = re.sub(r'[^\w\s]', '', text.lower())
    return simple_preprocess(without_punctuation)

def read_corpus(file_path):
    """Lazily yield the preprocessed tokens of each line in a UTF-8 text file.

    Args:
        file_path: Path of the corpus file; it is read line by line.

    Yields:
        The result of ``preprocess_text`` for each line, in file order.
    """
    # The file stays open only while the generator is being consumed.
    with open(file_path, mode='r', encoding='utf-8') as corpus:
        yield from map(preprocess_text, corpus)

In [3]:
corpus_file_path = 'shona_corpus_E.txt'
# Read and preprocess the corpus
# read_corpus is a generator; it is materialized into a list here so the
# result can be sliced for inspection and passed to FastText for training.
# Note that this keeps every tokenized line of the corpus in memory.
sentences = list(read_corpus(corpus_file_path))


In [4]:
# Peek at the first three tokenized lines to sanity-check the preprocessing.
sentences[:3]

[['mavambo',
  'kusikwa',
  'kwezvinhu',
  'zvose',
  'pakutanga',
  'mwari',
  'akasika',
  'denga',
  'nepasi'],
 ['zvino',
  'rakanga',
  'risina',
  'chiumbo',
  'risina',
  'uye',
  'rakanga',
  'riri',
  'pamusoro',
  'pehwenje'],
 ['mweya', 'wamwari', 'wakanga', 'uchidzengerera', 'pamusoro', 'pemvura']]

In [5]:
start_time = time.time()

# Train FastText model on the tokenized corpus.
# Parameter meanings follow the gensim FastText documentation.
model = FastText(
    sentences,
    vector_size=50,  # Embedding dimensionality (the saved files are tagged "50d")
    window=5,  # Context window size
    min_count=5,  # Minimum word frequency for a word to be kept
    workers=4,  # Number of training threads
    sg=1,  # Skip-gram model
    epochs=100,  # Number of passes over the corpus
    bucket=2000000,  # Number of hash buckets for character n-grams
    min_n=3,  # Minimum length of char n-grams
    max_n=6   # Maximum length of char n-grams
)
end_time = time.time()
# time.time() returns seconds since the epoch, so this difference is in
# seconds (the message previously mislabelled it as minutes).
elapsed_time = end_time - start_time
print("Time taken:", elapsed_time, "seconds")


Time taken: 3669.8039407730103 minutes


In [6]:
# Save the model
# Two artifacts are written to the working directory: the full FastText model
# object, and separately only its word vectors (model.wv). The "50d" in the
# file names corresponds to vector_size=50 used for training.
model.save("shona_fasttext_50d.model")
model.wv.save("shona_fasttext_vectors_50d.kv")

In [7]:
# Print the model's one-line summary (vocabulary size, vector size, alpha).
print(model)

FastText(vocab=107228, vector_size=50, alpha=0.025)


In [None]:
# Load the model
# load_fasttext_model is otherwise only imported from `blessmore` in a later
# cell, so on a fresh top-to-bottom run this cell would raise NameError.
# Import it here so the cell is self-sufficient.
from blessmore import load_fasttext_model

# NOTE(review): the argument 50 presumably selects the 50-dimensional model;
# confirm against blessmore. This rebinds `model`.
model = load_fasttext_model(50)

In [8]:
def evaluate_similarity(model, word_pairs):
    """Print the model's similarity next to a reference score for each word pair.

    Args:
        model: Object exposing ``model.wv.similarity(word1, word2)``.
        word_pairs: Iterable of ``(word1, word2, score)`` triples, where
            ``score`` is the reference value to print beside the model's.

    Returns:
        None. The report is written to stdout.
    """
    # Score every pair before printing anything, so a failing lookup
    # does not leave a half-printed report.
    scored_pairs = [
        (first, second, reference, model.wv.similarity(first, second))
        for first, second, reference in word_pairs
    ]

    print("Similarity task evaluation:")
    for first, second, reference, predicted in scored_pairs:
        print(f"{first}-{second}: Human score = {reference}, Model score = {predicted}")

# Example (word1, word2, reference score) triples for the similarity check.
# NOTE(review): the reference scores look hand-picked; confirm their source.
similarity_word_pairs = [
    ("murume", "mukadzi", 0.8),
    ("mwana", "mukomana", 0.6),
]
evaluate_similarity(model, similarity_word_pairs)


Similarity task evaluation:
murume-mukadzi: Human score = 0.8, Model score = 0.8765901327133179
mwana-mukomana: Human score = 0.6, Model score = 0.7090246081352234


In [16]:
def perform_analogical_reasoning(model, a, b, c, topn=5):
    """Answer the analogy "a is to b as c is to ?" using vector arithmetic.

    The query vector is ``wv[b] - wv[a] + wv[c]``; its nearest neighbours
    are fetched with ``model.wv.similar_by_vector``.

    Args:
        model: Object exposing ``model.wv[word]`` and
            ``model.wv.similar_by_vector(vector, topn=...)``.
        a, b, c: The three words of the analogy.
        topn: Maximum number of candidate words to return.

    Returns:
        Up to ``topn`` neighbour words, in the order returned by
        ``similar_by_vector``, with ``a``, ``b`` and ``c`` removed.
    """
    vectors = model.wv
    target = vectors[b] - vectors[a] + vectors[c]
    # Over-fetch by three: at most the three input words are filtered out below.
    neighbours = vectors.similar_by_vector(target, topn=topn + 3)
    excluded = (a, b, c)
    candidates = [word for word, _score in neighbours if word not in excluded]
    return candidates[:topn]

# Example usage: "a is to b as c is to ?"
# English glosses are approximate.
a = "mukomana"  # boy
b = "amai"      # mother
c = "musikana"  # girl

predicted_words = perform_analogical_reasoning(model, a, b, c)
# An empty result list falls through to the fallback message.
print(
    f"{a} is to {b} as {c} is to: {', '.join(predicted_words)}"
    if predicted_words
    else "No suitable words found."
)


mukomana is to amai as musikana is to: naamai, vevana, vamargaret, vaninina, namai


In [34]:
# Perform Analogical Reasoning
def perform_analogical_reasoning(model, a, b, c, topn=5):
    """Complete the analogy "a is to b as c is to ?" with ``b - a + c``.

    NOTE(review): this redefines the function of the same name from an
    earlier cell and shadows it once run. The visible difference is the
    over-fetch margin (``topn + 4`` here, ``topn + 3`` there).

    Args:
        model: Object exposing ``model.wv[word]`` and
            ``model.wv.similar_by_vector(vector, topn=...)``.
        a, b, c: The three words of the analogy.
        topn: Maximum number of words to return.

    Returns:
        At most ``topn`` neighbour words, in the order returned by
        ``similar_by_vector``, excluding ``a``, ``b`` and ``c``.
    """
    embeddings = model.wv
    query = embeddings[b] - embeddings[a] + embeddings[c]

    # Fetch a few extra neighbours, since the input words are dropped below.
    neighbours = embeddings.similar_by_vector(query, topn=topn + 4)

    kept = []
    for candidate, _similarity in neighbours:
        if candidate in (a, b, c):
            continue
        kept.append(candidate)

    # Trim to the requested number of words.
    return kept[:topn]

# Example usage: "a is to b as c is to ?"
# English glosses are approximate.
a = "mukadzi"  # woman / wife
b = "ambuya"   # grandmother
c = "murume"   # man / husband

predicted_words = perform_analogical_reasoning(model, a, b, c)
# An empty result list falls through to the fallback message.
print(
    f"{a} is to {b} as {c} is to: {', '.join(predicted_words)}"
    if predicted_words
    else "No suitable words found."
)

mukadzi is to ambuya as murume is to: vokwanhingi, mhamha, amainini, hamenowo, mbuya


In [31]:
# Test similarity
# Fetch the 10 nearest neighbours of "kutonga"; the printed result is a list
# of (word, similarity) pairs.
similar_words = model.wv.most_similar("kutonga", topn=10)
print(similar_words)

[('kutongwa', 0.7632266879081726), ('nokutonga', 0.7399135828018188), ('kwamwari', 0.7255278825759888), ('kupona', 0.7216368317604065), ('kugumisa', 0.7109279632568359), ('kurwira', 0.7098906636238098), ('musimba', 0.7028589248657227), ('huchakurumidza', 0.6985704302787781), ('kururama', 0.6965538263320923), ('kushora', 0.6942458152770996)]


In [7]:
from blessmore import load_fasttext_model,clean_text_from_file,train_fasttext_model

In [8]:
# Load the model
# NOTE(review): load_fasttext_model comes from the local `blessmore` module;
# the argument 50 presumably selects the 50-dimensional model; confirm there.
# This rebinds `model`.
model = load_fasttext_model(50)

In [4]:
# Test the model
# Look up the embedding for "mwari" and print it; the recorded output shows
# a 50-value vector.
word_vector = model.wv['mwari']
print(word_vector)

[ 5.67073941e-01 -1.40327856e-01  2.46757850e-01 -1.19709358e-01
  3.33165154e-02 -1.27217919e-01 -2.70771205e-01 -2.94649298e-03
 -4.37316597e-01  3.24908376e-01  1.84016023e-02 -4.91404563e-01
  6.82000518e-01 -9.34935957e-02 -3.72670740e-01  2.63547480e-01
 -1.18853301e-01 -7.24551618e-01  2.77097493e-01 -2.68609971e-01
 -3.52165788e-01 -4.62470114e-01  1.93486243e-01 -4.70782608e-01
  1.01021312e-01  5.08193195e-01  2.36431822e-01 -3.45228672e-01
  8.06312859e-02  2.12296277e-01 -3.76502760e-02  2.87234217e-01
  3.94492626e-01  2.65002131e-01  4.81813252e-01 -3.98817480e-01
 -2.38584206e-01 -4.00748253e-01  2.77814418e-01 -2.83835769e-01
  1.32060057e-04  1.09159574e-01  5.09238124e-01 -1.78759381e-01
  3.66199493e-01 -7.16258585e-01 -1.49325162e-01 -1.98036402e-01
  1.75683662e-01  2.44939819e-01]


In [5]:
# Train a model via the blessmore helper on a different corpus file.
# NOTE(review): this rebinds both `corpus_file_path` and `model`, replacing
# the values earlier cells assigned, so any cell run afterwards sees this
# 100-dimensional model instead.
corpus_file_path = 'D.txt'
vector_size = 100  # Specify the dimension you want to train
# The recorded output shows the helper reporting its cleaning and training times.
model = train_fasttext_model(corpus_file_path, vector_size)

Time taken for cleaning: 0.015781641006469727 seconds
Time taken for training: 1.3132991790771484 seconds


In [6]:
# Run the blessmore cleaning helper on the raw corpus file.
# NOTE(review): presumably reads `input_file` and writes the cleaned text to
# `output_file`; confirm against blessmore. The recorded output shows it
# reporting the time taken.
input_file = 'D.txt'
output_file = 'cleaned_shona_corpus.txt'
clean_text_from_file(input_file, output_file)

Time taken: 0.012693405151367188 seconds
