In [11]:
import os
from os.path import join

import fasttext as ft
import gensim

# Params

In [12]:
dim = 256
stem = False

In [13]:
os.makedirs("embeddings", exist_ok=True)
data_folder = join("data", "SMADC")

In [14]:
if stem:
    from farasa.stemmer import FarasaStemmer

    files = [
        "GLF.txt",
        "EGY.txt",
        "IRQ.txt",
        "NOR.txt",
        "LEV.txt"
    ]

    texts = []

    for file in files:
        with open(join(data_folder, file), encoding="utf8") as file:
            texts.append(file.read())

    stemmer = FarasaStemmer()
    stemmed_texts = [stemmer.stem(text) for text in texts]
    
    embedding_GLF = ft.train_unsupervised(stemmed_texts[0], dim=dim)
    embedding_EGY = ft.train_unsupervised(stemmed_texts[1], dim=dim)
    embedding_IRQ = ft.train_unsupervised(stemmed_texts[2], dim=dim)
    embedding_NOR = ft.train_unsupervised(stemmed_texts[3], dim=dim)
    embedding_LEV = ft.train_unsupervised(stemmed_texts[4], dim=dim)
else:
    embedding_GLF = ft.train_unsupervised(join(data_folder, "GLF.txt"), dim=dim)
    embedding_EGY = ft.train_unsupervised(join(data_folder, "EGY.txt"), dim=dim)
    embedding_IRQ = ft.train_unsupervised(join(data_folder, "IRQ.txt"), dim=dim)
    embedding_NOR = ft.train_unsupervised(join(data_folder, "NOR.txt"), dim=dim)
    embedding_LEV = ft.train_unsupervised(join(data_folder, "Lev.txt"), dim=dim)


In [None]:
embedding_GLF.save_model("embeddings/embedding_GLF.bin")
embedding_EGY.save_model("embeddings/embedding_EGY.bin")
embedding_IRQ.save_model("embeddings/embedding_IRQ.bin")
embedding_NOR.save_model("embeddings/embedding_NOR.bin")
embedding_LEV.save_model("embeddings/embedding_LEV.bin")

In [None]:
gensim.models.FastText.load_fasttext_format('embeddings/embedding_GLF.bin').wv.save_word2vec_format("embeddings/glf.vec", binary=False)
gensim.models.FastText.load_fasttext_format('embeddings/embedding_EGY.bin').wv.save_word2vec_format("embeddings/egy.vec", binary=False)
gensim.models.FastText.load_fasttext_format('embeddings/embedding_IRQ.bin').wv.save_word2vec_format("embeddings/irq.vec", binary=False)
gensim.models.FastText.load_fasttext_format('embeddings/embedding_NOR.bin').wv.save_word2vec_format("embeddings/nor.vec", binary=False)
gensim.models.FastText.load_fasttext_format('embeddings/embedding_LEV.bin').wv.save_word2vec_format("embeddings/lev.vec", binary=False)

  gensim.models.FastText.load_fasttext_format('embeddings/embedding_GLF.bin').wv.save_word2vec_format("embeddings/glf.vec", binary=False)
  gensim.models.FastText.load_fasttext_format('embeddings/embedding_EGY.bin').wv.save_word2vec_format("embeddings/egy.vec", binary=False)
  gensim.models.FastText.load_fasttext_format('embeddings/embedding_IRQ.bin').wv.save_word2vec_format("embeddings/irq.vec", binary=False)
  gensim.models.FastText.load_fasttext_format('embeddings/embedding_NOR.bin').wv.save_word2vec_format("embeddings/nor.vec", binary=False)
  gensim.models.FastText.load_fasttext_format('embeddings/embedding_LEV.bin').wv.save_word2vec_format("embeddings/lev.vec", binary=False)
