In [10]:
# Install the required libraries
!pip install fasttext umap-learn pandas

# Import necessary libraries
import fasttext
import umap
import pandas as pd

# Read the word-pair table from disk.
# Point `dataset_path` at your own CSV if it is stored somewhere else.
dataset_path = 'data.csv'
df = pd.read_csv(filepath_or_buffer=dataset_path)

# Preview the top five rows to sanity-check the load
df.head(5)




Unnamed: 0,New_Language,English
0,reg,right
1,reg,correct
2,reg,real
3,vlak,flat
4,vlak,level


In [14]:
# Collect the column labels of `df` as a plain list; later cells pick the
# language columns from it by position.
column_names = [label for label in df.columns.values]
print(column_names)

['New_Language', ' English']


In [20]:
# Build one training corpus from both vocabularies, train FastText on it and
# look up a vector for every distinct word of each language.
english_column = column_names[1]
new_language_column = column_names[0]


def _clean_words(column):
    """Return the non-missing entries of df[column] as whitespace-stripped strings."""
    return df[column].dropna().astype(str).str.strip().tolist()


def _word_vectors(model, column):
    """Map every distinct non-missing value of df[column] to its vector from `model`.

    Keys are the raw cell values, so later cells can look rows of `df` up
    directly. The string handed to FastText is stripped first.
    NOTE(review): the ' English' header printed above suggests the CSV fields
    carry leading whitespace, in which case an unstripped lookup such as
    ' right' would never match the token stored by FastText -- confirm.
    """
    return {
        word: model.get_word_vector(str(word).strip())
        for word in df[column].dropna().unique()
    }


# Combine English and New Language words into a single list.
# Missing cells are skipped and every entry is cast to str, so the join below
# cannot fail on NaN or numeric values.
all_words = _clean_words(english_column) + _clean_words(new_language_column)

# Save the combined words to a text file for training FastText (one word per line).
# NOTE(review): with a single word per line the skip-gram objective presumably
# sees no neighbouring words; consider writing each translation pair on one
# line if cross-lingual context is wanted -- TODO confirm intent.
with open('combined_words.txt', 'w', encoding='utf-8') as file:
    file.write('\n'.join(all_words) + '\n')

# Train a single FastText model on the combined corpus.
# Training a second model on the same file would only yield another, unrelated
# vector space whose coordinates cannot be compared with the first one.
# minCount=1 keeps every word: the unsupervised default discards words seen
# fewer than five times, which would likely exclude most entries of a word list.
english_model_path = 'english_fasttext_model.bin'
english_model = fasttext.train_unsupervised('combined_words.txt', model='skipgram', minCount=1)

# Save the English model
english_model.save_model(english_model_path)

# The New Language shares the model (and therefore the vector space) with
# English; the second file is still written for anything that loads it by path.
new_language_model_path = 'new_language_fasttext_model.bin'
new_language_model = english_model
new_language_model.save_model(new_language_model_path)

# Get English word vectors
english_word_vectors = _word_vectors(english_model, english_column)

# Get New Language word vectors
new_language_word_vectors = _word_vectors(new_language_model, new_language_column)

In [22]:
# Project the English and New Language word vectors into a shared 2-D space
# with UMAP and attach the coordinates to the matching rows of `df`.

# The vector dictionaries hold one entry per *distinct* word, so they can be
# shorter than `df`. Keep the key order so each projected row can be traced
# back to its word.
english_words = list(english_word_vectors)
new_language_words = list(new_language_word_vectors)

# Concatenate English and New Language vectors for the joint projection
all_vectors = (
    [english_word_vectors[word] for word in english_words]
    + [new_language_word_vectors[word] for word in new_language_words]
)

# Fit UMAP once on all vectors; a fixed random_state makes the layout repeatable
mapper = umap.UMAP(n_components=2, random_state=42)
aligned_vectors = mapper.fit_transform(all_vectors)

# Split the projected vectors back into English and New Language
aligned_english_vectors = aligned_vectors[:len(english_words)]
aligned_new_language_vectors = aligned_vectors[len(english_words):]

# Index the coordinates by word ...
english_lookup = pd.DataFrame(
    aligned_english_vectors,
    index=english_words,
    columns=['Aligned_English_1', 'Aligned_English_2'],
)
new_language_lookup = pd.DataFrame(
    aligned_new_language_vectors,
    index=new_language_words,
    columns=['Aligned_New_Language_1', 'Aligned_New_Language_2'],
)

# ... and expand them to one row per row of `df`. Concatenating the per-word
# tables positionally would pair rows with the coordinates of unrelated words
# whenever a word occurs more than once. Words without a vector get NaN.
rows = df.reset_index(drop=True)
aligned_english_vectors_df = (
    english_lookup.reindex(rows[column_names[1]].tolist()).reset_index(drop=True)
)
aligned_new_language_vectors_df = (
    new_language_lookup.reindex(rows[column_names[0]].tolist()).reset_index(drop=True)
)

aligned_df = pd.concat([rows, aligned_english_vectors_df, aligned_new_language_vectors_df], axis=1)

# Save the aligned vectors to a new CSV file
aligned_df.to_csv('aligned_vectors.csv', index=False)

In [18]:
# Disabled experiment (every line below is commented out, so this cell does nothing):
# it writes each word to 'combined_words.txt' prefixed with a language tag and
# trains a single skip-gram model on that file.
# NOTE(review): the '__label__' prefix is fastText's convention for supervised
# labels; confirm it has the intended effect with train_unsupervised before
# re-enabling this cell.
# NOTE(review): re-enabling this cell would overwrite the 'combined_words.txt'
# written by the training cell above.

# # Preprocess the data (assuming you've already preprocessed it)
# # Combine English and New Language words into a single list
# all_words = df[column_names[1]].tolist() + df[column_names[0]].tolist()

# # Save the combined words to a text file for training FastText
# with open('combined_words.txt', 'w', encoding='utf-8') as file:
#     # Prefix each English word with the language code '__label__en'
#     for word in df[column_names[1]]:
#         file.write(f'__label__en {word}\n')

#     # Prefix each New Language word with the language code '__label__nl'
#     for word in df[column_names[0]]:
#         file.write(f'__label__nl {word}\n')

# # Train FastText model
# model_path = 'fasttext_model.bin'
# model = fasttext.train_unsupervised('combined_words.txt', model='skipgram')

# # Save the model
# model.save_model(model_path)