In [None]:
# Install the required libraries
!pip install fasttext umap-learn pandas

# Import necessary libraries
import fasttext
import umap
import pandas as pd



PAPER REPLICATION

In [None]:
import numpy as np
from scipy.spatial import procrustes
from sklearn import metrics

In [None]:
def create_set(dataset_path):
    """Convert a whitespace-separated word-pair file into a two-column CSV.

    Each input line is split on whitespace. The first two tokens are written
    as one ``Word,English`` row; lines with fewer than two tokens are skipped
    and tokens beyond the second are ignored.

    Args:
        dataset_path: Path of the source text file. The output path is built
            by dropping its last four characters and appending ``.csv``, so
            the name is expected to end in a three-letter extension such as
            ``.txt`` (callers derive the CSV name the same way).

    Returns:
        None. The CSV is written next to the source file.
    """
    csv_path = dataset_path[:-4] + '.csv'

    # 'w' (not 'a') keeps the function idempotent: re-running a cell must not
    # append a second header and duplicate rows to an existing CSV.
    # UTF-8 is explicit so non-Latin scripts survive on every platform.
    # The context manager closes both handles even if a read or write fails.
    with open(dataset_path, 'r', encoding='utf-8') as source, \
            open(csv_path, 'w', encoding='utf-8') as target:
        target.write('Word,English\n')
        for line in source:
            tokens = line.split()
            if len(tokens) >= 2:
                target.write(tokens[0] + ',' + tokens[1] + '\n')

In [None]:
def model_training(dataset_path, eng_mod_name, new_mod_name, lang):
    """Train two fastText skip-gram models on a word-pair CSV and score a Procrustes fit.

    The CSV is expected to have the new-language word in its first column and
    the English word in its second column. All words are written, one per
    line, to ``temp.txt`` in the working directory; both models are trained on
    that same file and saved to the given paths.

    NOTE(review): both models are trained on the identical combined corpus;
    confirm this is intended rather than one corpus per language.

    Args:
        dataset_path: Path of the two-column CSV.
        eng_mod_name: File name under which the "English" model is saved.
        new_mod_name: File name under which the "new language" model is saved.
        lang: Language code, used to label the printed Procrustes result.

    Returns:
        A tuple ``(english_word_vectors, new_language_word_vectors, df)``:
        two dicts mapping each distinct word to its vector, and the loaded
        DataFrame.
    """
    # Read every cell as plain text. With pandas' default NA handling, words
    # such as "null", "nan" or "NA" become float NaN, which is not a valid
    # word for the corpus file or for get_word_vector.
    df = pd.read_csv(dataset_path, dtype=str, keep_default_na=False)
    column_names = list(df.columns.values)

    # Pull the two word columns out once instead of on every loop iteration.
    new_language_words = df[column_names[0]].tolist()
    english_words = df[column_names[1]].tolist()

    # Training corpus: one word per line, English words first.
    # (Joining str(list) would put a newline between every *character* of the
    # list's repr, so the models would be trained on single characters.)
    all_words = english_words + new_language_words
    with open('temp.txt', 'w', encoding='utf-8') as file:
        file.write('\n'.join(all_words))

    # minCount=1: fastText's default minCount of 5 would discard words that
    # occur fewer than five times, and dictionary entries are mostly unique.
    english_model = fasttext.train_unsupervised('temp.txt', model='skipgram', minCount=1)
    english_model.save_model(eng_mod_name)

    new_language_model = fasttext.train_unsupervised('temp.txt', model='skipgram', minCount=1)
    new_language_model.save_model(new_mod_name)

    # Row-aligned vectors: entry i of each list belongs to row i of df.
    eng_vec = [english_model.get_word_vector(word) for word in english_words]
    lang_vec = [new_language_model.get_word_vector(word) for word in new_language_words]

    # Word -> vector lookups; a repeated word keeps a single entry.
    english_word_vectors = dict(zip(english_words, eng_vec))
    new_language_word_vectors = dict(zip(new_language_words, lang_vec))

    # Procrustes analysis on the row-aligned matrices, then the mean squared
    # error between the two standardised matrices it returns.
    mtx1, mtx2, disparity = procrustes(eng_vec, lang_vec)
    score1 = metrics.mean_squared_error(mtx1, mtx2)
    # Report the result; the return value is fixed by existing callers, so the
    # score would otherwise be lost.
    print('[{}] Procrustes disparity: {:.6f}, MSE: {:.6f}'.format(lang, disparity, score1))

    return english_word_vectors, new_language_word_vectors, df

In [None]:
#UMAP ALIGNMENT

def umap_alignment(eng_vec, new_vec, lang, df, random_state=None):
    """Project both vector sets to 2-D with one UMAP fit and score the word pairs.

    All distinct English and new-language vectors are embedded together. The
    resulting coordinates are then looked up per row of ``df`` so that each
    row holds the coordinates of its own English word and its own
    new-language word. The table is saved to ``aligned_vectors_<lang>.csv``
    and the mean squared error between the two coordinate pairs is reported.

    Args:
        eng_vec: Dict mapping each English word to its vector.
        new_vec: Dict mapping each new-language word to its vector.
        lang: Language code used in the output file name and printed label.
        df: Word-pair DataFrame with the new-language word in its first
            column and the English word in its second column.
        random_state: Optional seed forwarded to ``umap.UMAP``. The default
            ``None`` leaves UMAP unseeded.

    Returns:
        The mean squared error between English and new-language coordinates
        over the rows that have both.
    """
    english_columns = ['Aligned_English_1', 'Aligned_English_2']
    new_language_columns = ['Aligned_New_Language_1', 'Aligned_New_Language_2']

    # The dicts hold one entry per distinct word, so keep the key order to
    # map embedded rows back to words.
    english_words = list(eng_vec)
    new_language_words = list(new_vec)
    all_vectors = list(eng_vec.values()) + list(new_vec.values())

    mapper = umap.UMAP(random_state=random_state)
    aligned_vectors = mapper.fit_transform(all_vectors)

    # Word-indexed coordinate tables for each language.
    english_coords = pd.DataFrame(
        aligned_vectors[:len(english_words)], index=english_words, columns=english_columns)
    new_language_coords = pd.DataFrame(
        aligned_vectors[len(english_words):], index=new_language_words, columns=new_language_columns)

    # Look the coordinates up row by row. Concatenating the de-duplicated
    # tables directly would shift rows as soon as a word repeats, pairing
    # words that are not translations of each other.
    column_names = list(df.columns.values)
    aligned_english = english_coords.reindex(df[column_names[1]].tolist())
    aligned_english.index = df.index
    aligned_new_language = new_language_coords.reindex(df[column_names[0]].tolist())
    aligned_new_language.index = df.index

    aligned_df = pd.concat([df, aligned_english, aligned_new_language], axis=1)

    # dropna returns a new frame, so the result has to be kept. Only rows
    # missing a coordinate (a word absent from the dicts) are removed.
    aligned_df = aligned_df.dropna(axis=0, subset=english_columns + new_language_columns)
    aligned_df.to_csv('aligned_vectors_' + lang + '.csv', index=False)

    # Score straight from the frame rather than re-parsing the CSV text,
    # which breaks on any field that contains a comma.
    score = metrics.mean_squared_error(
        aligned_df[english_columns].to_numpy(),
        aligned_df[new_language_columns].to_numpy())
    print('[{}] UMAP alignment MSE: {:.6f}'.format(lang, score))
    return score

In [None]:
#COMPILED

files = ['af-en.txt', 'als-en.txt', 'az-en.txt', 'jv-en.txt', 'kn-en.txt', 'sco-en.txt', 'tg-en.txt']
# Derive each language code from its file name ('<code>-en.txt') so the two
# lists always line up position by position. A hand-maintained list with an
# extra code (such as 'sh' with no 'sh-en.txt') shifts the index-based pairing
# and labels 'tg-en.txt' as 'sh'. To process 'sh', add 'sh-en.txt' to `files`.
languages = [name.split('-')[0] for name in files]

def combine_run(file_path, lang):
    """Run the whole pipeline for one language pair.

    Builds the CSV from ``file_path``, trains the models on it, then passes
    the resulting vectors and DataFrame to the UMAP alignment step.
    """
    create_set(file_path)

    # create_set writes its CSV under the same name with the extension swapped.
    csv_path = file_path[:-4] + '.csv'
    english_model_file = 'en-' + lang + '-model.bin'
    language_model_file = lang + '-eng-model.bin'

    vectors_en, vectors_lang, pairs_df = model_training(
        csv_path, english_model_file, language_model_file, lang)
    umap_alignment(vectors_en, vectors_lang, lang, pairs_df)

# Process every dataset with the language code stored at the same position.
for position, dataset_file in enumerate(files):
    combine_run(dataset_file, languages[position])

In [None]:
#COMPILED

# Dataset for this run and, at the same position, its language code.
files = ["bodo-en.txt"]
languages = ["bodo"]

def combine_run(file_path, lang):
    """Run the whole pipeline for one language pair.

    Builds the CSV from ``file_path``, trains the models on it, then passes
    the resulting vectors and DataFrame to the UMAP alignment step.
    """
    create_set(file_path)

    # create_set writes its CSV under the same name with the extension swapped.
    csv_path = file_path[:-4] + '.csv'
    english_model_file = 'en-' + lang + '-model.bin'
    language_model_file = lang + '-eng-model.bin'

    vectors_en, vectors_lang, pairs_df = model_training(
        csv_path, english_model_file, language_model_file, lang)
    umap_alignment(vectors_en, vectors_lang, lang, pairs_df)

# Process every dataset with the language code stored at the same position,
# printing a blank separator after each run.
for position, dataset_file in enumerate(files):
    combine_run(dataset_file, languages[position])
    print("\n")