<h1 style="color:red;"><b>Data Set</b></h1>
<p>Corpus: MultiUN</p>
<p>Content: The MultiUN parallel corpus is extracted from the United Nations Website</p>
<p>Sentences: 20.3M</p>
<p>Hugging Face link: <a href="https://huggingface.co/datasets/Helsinki-NLP/un_pc/viewer/ar-fr">Link to the data set</a></p>



<h1 style="color:red;"><b>Mount Google Drive</b></h1>


In [None]:
# Mount Google Drive at /content/drive (Colab-only helper) so files under
# that path are reachable from this runtime.
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


<h1 style="color:red"><b>Imports</b></h1>

In [None]:
!pip install datasets gensim langid nltk




In [None]:
from datasets import load_dataset



# NLTK is a leading platform for building Python programs to work with human language data.
import nltk


#Library detecting the language used
import langid

# For calculating the duration of training
import time

# For stop word removal
nltk.download('stopwords')
from nltk.corpus import stopwords

# Convert a document into a list of tokens.
from gensim.utils import simple_preprocess

# To shuffle the list of words randomly
from random import shuffle

# Word2Vec
from gensim.models import Word2Vec



import re


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


<h1 style="color:red"><b>Data set preparation</b></h1>

In [None]:
def data_set_preperations():
    """Open the Arabic-French part of the UN parallel corpus as a streamed dataset.

    Returns:
        Whatever ``load_dataset`` yields for the ``train`` split of
        ``Helsinki-NLP/un_pc`` (``ar-fr`` data directory) with streaming enabled.
    """
    return load_dataset(
        "Helsinki-NLP/un_pc",
        data_dir="ar-fr",
        split='train',
        streaming=True,
    )




<h1 style="color:red"><b>Identifying ideal sentences to keep the model as clean as possible</b></h1>


In [None]:
def identify_ideal_sentence(ar,fr):
    """Tell whether a sentence pair is in the expected languages.

    Args:
        ar: text expected to be Arabic.
        fr: text expected to be French.

    Returns:
        True only when langid labels ``ar`` as 'ar' and ``fr`` as 'fr'.
    """
    detected_ar = langid.classify(ar)[0]
    if detected_ar != 'ar':
        # The French side is not classified once the Arabic side has failed.
        return False

    detected_fr = langid.classify(fr)[0]
    return detected_fr == 'fr'

<h1 style="color:red"><b>Resume logic</b></h1>
<p>When the Colab runtime unexpectedly disconnects, the last sentence index can be read from the cell output
and passed as the stopped_count argument, so that training restarts where it left off (simply by skipping the pairs that were already used for training).
</p>

In [None]:
def Resume_logic(stopped_count,dataset):
    """Return an iterator over ``dataset`` positioned after the first ``stopped_count`` items.

    Args:
        stopped_count: number of leading items to skip; zero or a negative
            value skips nothing.
        dataset: any iterable.

    Returns:
        The iterator obtained from ``iter(dataset)`` after the skipped items
        have been consumed. When ``dataset`` holds fewer than ``stopped_count``
        items the returned iterator is simply exhausted instead of a bare
        StopIteration escaping from this helper.
    """
    from itertools import islice

    # generator version of dataset
    g = iter(dataset)
    # islice pulls at most `skip` items from g and stops quietly if g runs out.
    skip = max(0, stopped_count)
    for _ in islice(g, skip):
        pass
    return g


<h1 style="color:red"><b>Stop word removal</b></h1>


In [None]:
def stopWordsRemover(ar_list,fr_list):
  """Drop stop words from an Arabic and a French list of tokens.

  Args:
    ar_list: Arabic tokens.
    fr_list: French tokens.

  Returns:
    A dict with keys "ar" and "fr"; each value is the matching input list
    without the tokens found in NLTK's 'arabic' / 'french' stop word lists.
    Order and duplicates of the remaining tokens are preserved.
  """
  # Sets make each membership test a hash lookup instead of a scan over the
  # whole stop word list for every token.
  ar_stopwords = frozenset(stopwords.words('arabic'))
  fr_stopwords = frozenset(stopwords.words('french'))

  return {
    "ar": [word for word in ar_list if word not in ar_stopwords],
    "fr": [word for word in fr_list if word not in fr_stopwords],
  }

<h1 style="color:red"><b>Arabic preprocessing</b></h1>


In [None]:
def arabic_preprocesser(line):
  """Clean and normalise one line of Arabic text.

  Steps, in order: strip '.' and the Arabic comma, strip diacritics
  (U+064B..U+0652), blank out URLs, strip the elongation character (U+0640),
  blank out digit runs, normalise letter variants, then keep only the
  whitespace-separated words that are longer than one character and made
  exclusively of Arabic letters.

  Args:
    line: the raw text.

  Returns:
    The kept words joined by single spaces ('' when nothing is kept).
  """
  # remove commas and points
  line = line.replace(u'.', '').replace(u'،', '')

  # remove_diacritics
  line = re.sub(r'[\u064B-\u0652]', '', line)

  # remove_urls (runs before elongation removal so the match boundaries stay
  # exactly where they were)
  line = re.sub(r"(http|https|ftp)://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+", ' ', line)

  # remove elongation
  line = re.sub(r'\u0640', '', line)

  # remove_numbers
  line = re.sub(r"(\d|[\u0660\u0661\u0662\u0663\u0664\u0665\u0666\u0667\u0668\u0669])+", ' ', line)

  # normalize: each pattern is rewritten to its canonical letter, in this order
  # (alef variants first, then runs of alef collapsed to a single one).
  normalisation_steps = (
    (r'[إأٱآا]', 'ا'),
    (r'ا+', 'ا'),
    (r'[ي]', 'ى'),
    (r'[ئ]', 'ء'),
    (r'[ؤ]', 'و'),
    (r'[ة]', 'ه'),
  )
  for pattern, replacement in normalisation_steps:
    line = re.sub(pattern, replacement, line)

  # Any character that is neither whitespace nor in the accepted Arabic ranges
  # disqualifies the whole word.
  non_arabic = re.compile(r'[^\s\u0621-\u063A\u0640-\u064A]')

  # remove one_character words: filtering on length also catches one-character
  # words at the start/end of the line and in consecutive runs, which the
  # former r'\s.\s' substitution left behind.
  kept_words = [
    word for word in line.split()
    if len(word) > 1 and not non_arabic.search(word)
  ]

  return ' '.join(kept_words)


<h1 style="color:red"><b>Random shuffle</b></h1>


In [None]:
def random_shuffle(ar, fr):
    """Turn a sentence pair into one randomly ordered list of tokens.

    The Arabic sentence goes through arabic_preprocesser() before being
    tokenised; the French sentence is tokenised as is. Stop words are removed
    from both token lists, the lists are concatenated (Arabic first) and the
    result is shuffled in place.

    Args:
        ar: the Arabic sentence.
        fr: the French sentence.

    Returns:
        The shuffled list holding the remaining Arabic and French tokens.
    """
    arabic_tokens = simple_preprocess(arabic_preprocesser(ar))
    french_tokens = simple_preprocess(fr)

    filtered = stopWordsRemover(arabic_tokens, french_tokens)

    mixed_tokens = [*filtered['ar'], *filtered['fr']]
    shuffle(mixed_tokens)
    return mixed_tokens


<h1 style="color:red"><b>Training</b></h1>


In [None]:
def trainer(modelLocation,re_train,stopped_count =0):
    """Train a Word2Vec model batch by batch on shuffled Arabic/French sentence pairs.

    Rows are read from the dataset returned by data_set_preperations() after
    skipping the first ``stopped_count`` rows. Pairs rejected by
    identify_ideal_sentence() are ignored; accepted pairs are converted to one
    mixed token list by random_shuffle(). The model is written to
    ``modelLocation`` after every training step and progress is printed.

    Args:
        modelLocation: path the model is saved to (and loaded from when
            ``re_train`` is 1).
        re_train: 0 to create a new model from the first batch, 1 to load the
            model stored at ``modelLocation`` and keep training it.
        stopped_count: number of leading dataset rows to skip.

    Raises:
        ValueError: if ``re_train`` is neither 0 nor 1.
    """
    if re_train not in (0, 1):
        # Any other value used to skip both branches below and only fail much
        # later with a NameError on `model`.
        raise ValueError("re_train must be 0 (create a new model) or 1 (load an existing model), got {!r}".format(re_train))

    ds = data_set_preperations()
    #number of rows
    #pairsNumber = 20281645 in the original dataset
    pairsNumber = 400000
    # Rows read for the first batch, and accepted documents per later batch.
    batch_size = 33000
    # Skip the rows a previous run already went through. (This used to build
    # the tuple (stopped_count, ds), which next() cannot iterate.)
    g = Resume_logic(stopped_count, ds)

    documents = []
    start = time.time()
    # Rows pulled from g so far; drives the "sentence N" progress messages.
    pairs_read = 0

    for _ in range(batch_size):
        _append_if_ideal(next(g), documents)
        pairs_read += 1

    if re_train == 0:
        print("creating model")
        # No epochs argument here, so the library default applies to this first
        # pass, while every later update trains with epochs=10.
        model = Word2Vec(documents, vector_size = 300, window = 5, min_count = 10, workers = 8, sg = 1)
        model.save(modelLocation)
        print("sentence {}: model initialized and trained on the suitable part of first 33000 sentence pairs, vocab now holds {} words".format(pairs_read + stopped_count, len(model.wv)))
    else:
        print("loading model")
        model = Word2Vec.load(modelLocation)
        _update_and_save(model, documents, modelLocation)
        print("sentence {}: model loaded and trained on the suitable part of the other 33000 sentence pairs, vocab now holds {} words".format(pairs_read + stopped_count,len(model.wv)))

    documents = []
    for _ in range(pairsNumber - batch_size - stopped_count):
        accepted = _append_if_ideal(next(g), documents)
        pairs_read += 1
        if accepted and len(documents) == batch_size:
            _update_and_save(model, documents, modelLocation)
            print("sentence {}: model loaded and trained on the suitable part of the other 33000 sentence pairs, vocab now holds {} words".format(pairs_read + stopped_count,len(model.wv)))
            documents = []

    # Train on the leftover partial batch; nothing to do when the last full
    # batch ended exactly on the final row (the model is already saved then).
    if documents:
        _update_and_save(model, documents, modelLocation)
    print("sentence {}: model trained on the remaining suitable sentence pairs, vocab now holds {} words".format(pairs_read + stopped_count, len(model.wv)))
    end = time.time()

    print("DONE :)")
    print("time spent in traning (in seconds): {}".format(end-start))


def _append_if_ideal(row, documents):
    """Append the shuffled tokens of ``row`` to ``documents`` if its pair passes the language check; return whether it did."""
    pair = row['translation']
    ar = pair['ar']
    fr = pair['fr']
    if not identify_ideal_sentence(ar, fr):
        return False
    documents.append(random_shuffle(ar, fr))
    return True


def _update_and_save(model, documents, modelLocation):
    """Extend the model vocabulary with ``documents``, train on them for 10 epochs and save the model."""
    model.build_vocab(corpus_iterable = documents, update = True)
    model.train(documents,total_examples=len(documents),epochs=10)
    model.save(modelLocation)


<h1 style="color:red"><b>Test</b></h1>


In [None]:
# Fresh run: re_train = 0 takes the "creating model" branch of trainer() and
# stopped_count = 0 skips no rows. To resume after a disconnect, pass
# re_train = 1 together with the last "sentence N" value from the output.
trainer("/content/drive/Shared drives/CS476/randomshuffle_5window_skipgram_300size.model",re_train = 0,stopped_count =0)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/11.9k [00:00<?, ?B/s]

Resolving data files:   0%|          | 0/18 [00:00<?, ?it/s]

creating model
sentence 33000: model initialized and trained on the suitable part of first 33000 sentence pairs, vocab now holds 9779 words




sentence 71544: model loaded and trained on the suitable part of the other 33000 sentence pairs, vocab now holds 14417 words




sentence 111404: model loaded and trained on the suitable part of the other 33000 sentence pairs, vocab now holds 16253 words




sentence 147925: model loaded and trained on the suitable part of the other 33000 sentence pairs, vocab now holds 18464 words




sentence 183395: model loaded and trained on the suitable part of the other 33000 sentence pairs, vocab now holds 19720 words




sentence 200000: model trained on the remaining suitable sentence pairs, vocab now holds 19948 words
DONE :)
time spent in traning (in seconds): 1626.3873028755188
