In [1]:
# Imports
import re
import string
import json
from datetime import datetime
from collections import defaultdict, Counter

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm

import torch
import torch.nn as nn
from torch.nn import Module
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

from nltk.corpus import stopwords

device = 'cpu'

import random

random.seed(26)
np.random.seed(62)
torch.manual_seed(2021)

<torch._C.Generator at 0x7f678e4adfb0>

In [2]:
embedding_size = 300

In [3]:
# load HINDI mapping {word -> id} and {id -> word}
with open('../Task_1/save/word_to_int_dict.json') as f:
    hindi_word_to_int = json.load(f)
with open('../Task_1/save/int_to_word_dict.json') as f:
    hindi_int_to_word = json.load(f)

# get vocab_size
hindi_vocab_size = len(hindi_word_to_int)
print(f'hindi vocab_size: {hindi_vocab_size}')

# load embedding
hindi_embedding_path = '../Task_1/save/embedding_weights.pt'
hindi_embed_layer = nn.Embedding(hindi_vocab_size, embedding_size)
hindi_embed_layer.load_state_dict(torch.load(hindi_embedding_path, map_location=torch.device(device)))
hindi_embed_matrix = hindi_embed_layer.weight.data

hindi vocab_size: 20402


In [4]:
# load BENGALI mapping {word -> id} and {id -> word}
with open('hindi_bengali/save/word_to_int_dict.json') as f:
    bengali_word_to_int = json.load(f)
with open('hindi_bengali/save/int_to_word_dict.json') as f:
    bengali_int_to_word = json.load(f)

# get vocab_size
bengali_vocab_size = len(bengali_word_to_int)
print(f'bengali vocab_size: {bengali_vocab_size}')

# load embedding
bengali_embedding_path = '../Task_2/hindi_bengali/save/embedding_weights.pt'
bengali_embed_layer = nn.Embedding(bengali_vocab_size, embedding_size)
bengali_embed_layer.load_state_dict(torch.load(bengali_embedding_path, map_location=torch.device(device)))
bengali_embed_matrix = bengali_embed_layer.weight.data

bengali vocab_size: 16005


In [5]:
def standardize(v):
    return v.sub_(v.mean(dim=1)[:, None]).div_((v.var(dim=1)**0.5)[:, None])

hindi_embed_matrix = standardize(hindi_embed_matrix)
bengali_embed_matrix = standardize(bengali_embed_matrix)

In [6]:
# normalize embedding matrices?

In [7]:
examples = [ #(hindi_word, bengali_word, english word)
    ('गाड़ी', 'গাড়ি', 'car'),
    ('विश्व', 'বিশ্ব', 'world'),
    ('शब्द', 'শব্দ', 'word'),
    ('संगणक', 'কম্পিউটার', 'computer'),
    ('लोग', 'মানুষ', 'people'),
    ('युद्ध', 'যুদ্ধ', 'war'),
    ('फ़ोन', 'ফোন', 'phone'),
    ('होशियार', 'স্মার্ট', 'smart'),
    ('सुंदर', 'সুন্দর', 'beautiful'),
    ('तेज', 'দ্রুত', 'fast'),
    #('', ''), # 
]

In [8]:
for hindi_word, bengali_word, english_word in examples:
    if hindi_word in hindi_word_to_int and bengali_word in bengali_word_to_int:
        print(hindi_word, bengali_word, english_word)
        hindi_id = hindi_word_to_int[hindi_word]
        bengali_id = bengali_word_to_int[bengali_word]
        
        ben_ranking = torch.argsort(bengali_embed_matrix@hindi_embed_matrix[hindi_id], descending=True)
        hin_ranking = torch.argsort(hindi_embed_matrix@bengali_embed_matrix[bengali_id], descending=True)
        
        print(f'{ben_ranking[bengali_id]} {hin_ranking[hindi_id]}')

गाड़ी গাড়ি car
1401 12996
विश्व বিশ্ব world
12982 8447
शब्द শব্দ word
8654 14377
लोग মানুষ people
12398 19582
युद्ध যুদ্ধ war
12632 11059
फ़ोन ফোন phone
895 6872
सुंदर সুন্দর beautiful
6050 9795
तेज দ্রুত fast
12512 5323
