In [None]:
!pip install numpy tqdm gensim scikit-learn #importing essential libraries





[notice] A new release of pip is available: 24.1.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip





In [None]:
!pip install wget #ignore on a Linux system




[notice] A new release of pip is available: 24.1.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
!python -m wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.vec.gz # downloads pre-trained FastText Embeddings for English and Hindi respectively
!python -m wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.hi.300.vec.gz



Saved under cc.en.300.vec.gz

Saved under cc.hi.300.vec.gz


In [None]:
!python -m wget https://dl.fbaipublicfiles.com/arrival/dictionaries/en-hi.txt #downloads the MUSE bilingual dictionary



Saved under en-hi.txt


### Reads and processes the dictionary into a list of word pairs:

In [5]:
eng_hind_pair = {}
with open('en-hi.txt', 'r', encoding='utf-8') as f:
    for line in f:
        eng, hin = line.strip().split()
        eng_hind_pair[eng] = hin


In [6]:
print("Total number of English-Hindi word pairs are" , len(eng_hind_pair))

Total number of English-Hindi word pairs are 31719


### Loads embeddings

In [7]:
from gensim.models import KeyedVectors

eng_embeddings = KeyedVectors.load_word2vec_format('cc.en.300.vec.gz')
hin_embeddings = KeyedVectors.load_word2vec_format('cc.hi.300.vec.gz')

### Extracts aligned word vectors

In [8]:
import numpy as np
# Extract the embeddings for the bilingual lexicon pairs
# Extract aligned word vectors
common_words = [word for word in eng_hind_pair if word in eng_embeddings and eng_hind_pair[word] in hin_embeddings]

# Extract aligned word vectors with same shape
X = np.array([eng_embeddings[word] for word in common_words])  # Shape: (N, 300)
Y = np.array([hin_embeddings[eng_hind_pair[word]] for word in common_words])  # Shape: (N, 300)


### Procrustes alignment algorithm

In [9]:
def procrustes_algo(X, Y):
    A, _, Bt = np.linalg.svd(np.dot(X.T, Y))
    U = np.dot(A, Bt)
    return U

### Computes Procrustes alignment and transforms english embeddings

In [10]:
# Compute the optimal orthogonal matrix using the bilingual lexicon
U = procrustes_algo(X, Y)

# Align the English embeddings with the learned transformation matrix
aligned_eng_emb = np.dot(eng_embeddings.vectors, U)

### Word translation using nearest neighbours

In [11]:
from sklearn.neighbors import NearestNeighbors

# Fit nearest neighbor search on the aligned Hindi embeddings
hin_vectors = np.array([hin_embeddings[word] for word in hin_embeddings.key_to_index])  # Hindi embedding matrix
hin_words = list(hin_embeddings.key_to_index.keys())  # Hindi word list

# Use Nearest Neighbors to find closest Hindi words
def translate_word(word, top_k=5):
    if word not in eng_embeddings:
        return []
    
    # Align the English word vector
    eng_vector = np.dot(eng_embeddings[word], U).reshape(1, -1)

    # Find nearest Hindi words
    nn_model = NearestNeighbors(n_neighbors=top_k, metric='cosine').fit(hin_vectors)
    distances, indices = nn_model.kneighbors(eng_vector)

    return [hin_words[idx] for idx in indices[0]]  # Return top-k translated words


In [12]:
# Test an example English word
example_word = "life"
top_k_predictions = translate_word(example_word, top_k=5)

print(f"Top 5 Hindi translations for '{example_word}': {top_k_predictions}")


Top 5 Hindi translations for 'life': ['जीवन', 'जिंदगी', 'जिनगीभर', 'जिन्दगी', 'संघर्षमयी']


### Evaluating translations

In [13]:
from sklearn.metrics.pairwise import cosine_similarity

def evaluate_translation(test_dict, aligned_eng_emb, eng_embeddings, hin_embeddings, k=5):
    correct_at_1 = 0
    correct_at_5 = 0
    total = len(test_dict)
    
    for eng_word, hin_word in test_dict.items():  # Use .items() since test_dict is a dictionary
        if eng_word not in eng_embeddings.key_to_index or hin_word not in hin_embeddings.key_to_index:
            continue  # Skip words not in vocabulary
        
        # Get aligned English vector
        eng_vector = aligned_eng_emb[eng_embeddings.key_to_index[eng_word]]

        # Compute cosine similarity with all Hindi embeddings
        similarities = cosine_similarity(eng_vector.reshape(1, -1), hin_embeddings.vectors)

        # Get top-k closest words
        top_k_indices = similarities[0].argsort()[-k:][::-1]
        top_k_words = [hin_embeddings.index_to_key[i] for i in top_k_indices]

        # Compute accuracy
        if hin_word == top_k_words[0]:
            correct_at_1 += 1
        if hin_word in top_k_words:
            correct_at_5 += 1

    precision_at_1 = correct_at_1 / total
    precision_at_5 = correct_at_5 / total
    return precision_at_1, precision_at_5


In [None]:
# Take a subset of 2000 word pairs from MUSE dictionary
muse_test_dict = dict(list(eng_hind_pair.items())[:2000])  # Convert to dict for easy lookup


### Precision@1 and Precision@5 results

In [15]:
precision_at_1, precision_at_5 = evaluate_translation(muse_test_dict, aligned_eng_emb, eng_embeddings, hin_embeddings)

print(f'Precision@1: {precision_at_1:.4f}')
print(f'Precision@5: {precision_at_5:.4f}')


Precision@1: 0.3510
Precision@5: 0.5650


### Cosine Similarity Analysis

In [16]:
def compute_cosine_similarity(test_dict, aligned_eng_emb, eng_embeddings, hin_embeddings):
    cosine_similarities = {}

    for eng_word, hin_word in test_dict.items():
        if eng_word not in eng_embeddings.key_to_index or hin_word not in hin_embeddings.key_to_index:
            continue  # Skip words not in vocab
        
        # Get vectors
        eng_vector = aligned_eng_emb[eng_embeddings.key_to_index[eng_word]]
        hin_vector = hin_embeddings[hin_embeddings.key_to_index[hin_word]]

        # Compute cosine similarity
        similarity = np.dot(eng_vector, hin_vector) / (np.linalg.norm(eng_vector) * np.linalg.norm(hin_vector))

        cosine_similarities[eng_word] = similarity

    return cosine_similarities


In [17]:
cosine_sims = compute_cosine_similarity(muse_test_dict, aligned_eng_emb, eng_embeddings, hin_embeddings)

# Print a few example similarities
for word, sim in list(cosine_sims.items())[:10]:  # Show top 10 examples
    print(f"Cosine similarity for '{word}': {sim:.4f}")


Cosine similarity for 'and': 0.4399
Cosine similarity for 'was': 0.4678
Cosine similarity for 'for': 0.4909
Cosine similarity for 'that': 0.4552
Cosine similarity for 'with': 0.4831
Cosine similarity for 'from': 0.3387
Cosine similarity for 'this': 0.4511
Cosine similarity for 'utc': 0.3704
Cosine similarity for 'his': 0.4868
Cosine similarity for 'not': 0.6378


### Ablation studies

In [18]:
def run_ablation_study(eng_hind_pair, eng_embeddings, hin_embeddings, size):
    results = {}

    subset_dict = dict(list(eng_hind_pair.items())[:size])

    # Extract aligned word vectors
    common_words = [word for word in subset_dict if word in eng_embeddings and subset_dict[word] in hin_embeddings]
    X_subset = np.array([eng_embeddings[word] for word in common_words])
    Y_subset = np.array([hin_embeddings[subset_dict[word]] for word in common_words])

    # Compute Procrustes transformation
    U_subset = procrustes_algo(X_subset, Y_subset)
    aligned_eng_emb_subset = np.dot(eng_embeddings.vectors, U_subset)

    # Evaluate performance
    precision_at_1, precision_at_5 = evaluate_translation(subset_dict, aligned_eng_emb_subset, eng_embeddings, hin_embeddings)

    # Store results
    results[size] = (precision_at_1, precision_at_5)
    print(f"Dictionary Size: {size} → Precision@1: {precision_at_1:.4f}, Precision@5: {precision_at_5:.4f}")

    return results


In [20]:
ablation_results_5000 = run_ablation_study(eng_hind_pair, eng_embeddings, hin_embeddings, size=5000)


Dictionary Size: 5000 → Precision@1: 0.4296, Precision@5: 0.6404


In [22]:
ablation_results_10000 = run_ablation_study(eng_hind_pair, eng_embeddings, hin_embeddings, size=10000)

Dictionary Size: 10000 → Precision@1: 0.3093, Precision@5: 0.4870


In [24]:
ablation_results_20000 = run_ablation_study(eng_hind_pair, eng_embeddings, hin_embeddings, size=20000)

Dictionary Size: 20000 → Precision@1: 0.1950, Precision@5: 0.3257
