In [1]:
# Imports
import re
import string
import json
from datetime import datetime
from collections import defaultdict, Counter

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm

import torch
import torch.nn as nn
from torch.nn import Module
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

from nltk.corpus import stopwords

device = 'cpu'

import random

random.seed(26)
np.random.seed(62)
torch.manual_seed(2021)

<torch._C.Generator at 0x7f4011bcaf90>

In [2]:
embedding_size = 300

In [3]:
# load HINDI mapping {word -> id} and {id -> word}
with open('hindi_hindi/save/hindi_word_to_int_dict.json', 'r') as f:
    hindi_word_to_int = json.load(f)
with open('hindi_hindi/save/hindi_int_to_word_dict.json', 'r') as f:
    hindi_int_to_word = json.load(f)
with open('hindi_hindi/save/hindi_word_counter.json') as f:
    hindi_word_counter = json.load(f)

# get vocab_size
hindi_vocab_size = len(hindi_word_to_int)
print(f'hindi vocab_size: {hindi_vocab_size}')

# get total words
hindi_total_words = np.sum(list(hindi_word_counter.values()))
print(f'hindi total words: {hindi_total_words}')

# load embedding
hindi_embedding_path = 'hindi_hindi/save/hindi_embedding_weights_5_wsize_10_negfac.pt'
hindi_embed_layer = nn.Embedding(hindi_vocab_size, embedding_size)
hindi_embed_layer.load_state_dict(torch.load(hindi_embedding_path, map_location=torch.device(device)))
hindi_embed_matrix = hindi_embed_layer.weight.data

hindi vocab_size: 19379
hindi total words: 84861


In [4]:
# load BENGALI mapping {word -> id} and {id -> word}
with open('hindi_bengali/save/bengali_word_to_int_dict.json') as f:
    bengali_word_to_int = json.load(f)
with open('hindi_bengali/save/bengali_int_to_word_dict.json') as f:
    bengali_int_to_word = json.load(f)
with open('hindi_bengali/save/bengali_word_counter.json') as f:
    bengali_word_counter = json.load(f)

# get vocab_size
bengali_vocab_size = len(bengali_word_to_int)
print(f'bengali vocab_size: {bengali_vocab_size}')

# get total words
bengali_total_words = np.sum(list(bengali_word_counter.values()))
print(f'bengali total words: {bengali_total_words}')

# load embedding
bengali_embedding_path = 'hindi_bengali/save/bengali_embedding_weights.pt'
bengali_embed_layer = nn.Embedding(bengali_vocab_size, embedding_size)
bengali_embed_layer.load_state_dict(torch.load(bengali_embedding_path, map_location=torch.device(device)))
bengali_embed_matrix = bengali_embed_layer.weight.data

bengali vocab_size: 15190
bengali total words: 47848


In [5]:
# make all embeddings 0-mean and unit variance
def standardize(v):
    return v.sub_(v.mean(dim=1)[:, None]).div_((v.var(dim=1)**0.5)[:, None])

hindi_embed_matrix = standardize(hindi_embed_matrix)
bengali_embed_matrix = standardize(bengali_embed_matrix)

In [6]:
examples = [ #format: (hindi_word, bengali_word, english word)
    ('गाड़ी', 'গাড়ি', 'car'),
    ('विश्व', 'বিশ্ব', 'world'),
    ('शब्द', 'শব্দ', 'word'),
    ('पुरुष', 'পুরুষ', 'male'),
    ('लोग', 'মানুষ', 'people'),
    ('युद्ध', 'যুদ্ধ', 'war'),
    ('फ़ोन', 'ফোন', 'phone'),
    ('बड़े', 'বিশাল', 'big'),
    ('सुंदर', 'সুন্দর', 'beautiful'),
    ('तेज', 'দ্রুত', 'fast'),
    #('', ''), # 
]

In [8]:
# find similarity ranking of equivalent word pairs in Hindi and Bengali

for hindi_word, bengali_word, english_word in examples:
    if hindi_word in hindi_word_to_int and bengali_word in bengali_word_to_int:
        print(hindi_word, bengali_word, english_word)
        
        print(f'{hindi_word} occurrences: {hindi_word_counter[hindi_word]/hindi_total_words:%}')
        print(f'{bengali_word} occurrences: {bengali_word_counter[bengali_word]/bengali_total_words:%}')
        
        
        hindi_id = hindi_word_to_int[hindi_word]
        bengali_id = bengali_word_to_int[bengali_word]
        
        hin_ranking = torch.argsort(hindi_embed_matrix@bengali_embed_matrix[bengali_id], descending=True)
        ben_ranking = torch.argsort(bengali_embed_matrix@hindi_embed_matrix[hindi_id], descending=True)
        
        print(f'{hin_ranking[hindi_id]}/{len(hindi_embed_matrix)} {ben_ranking[bengali_id]}/{len(bengali_embed_matrix)}')
        print('===================')

गाड़ी গাড়ি car
गाड़ी occurrences: 0.009427%
গাড়ি occurrences: 0.002090%
5290/19379 7817/15190
विश्व বিশ্ব world
विश्व occurrences: 0.016498%
বিশ্ব occurrences: 0.008360%
19032/19379 5688/15190
शब्द শব্দ word
शब्द occurrences: 0.159084%
শব্দ occurrences: 0.016720%
13014/19379 2704/15190
पुरुष পুরুষ male
पुरुष occurrences: 0.001178%
পুরুষ occurrences: 0.022989%
17856/19379 13067/15190
लोग মানুষ people
लोग occurrences: 0.262783%
মানুষ occurrences: 0.440980%
13469/19379 13425/15190
युद्ध যুদ্ধ war
युद्ध occurrences: 0.004714%
যুদ্ধ occurrences: 0.012540%
12893/19379 1531/15190
फ़ोन ফোন phone
फ़ोन occurrences: 0.001178%
ফোন occurrences: 0.016720%
13929/19379 14651/15190
बड़े বিশাল big
बड़े occurrences: 0.050671%
বিশাল occurrences: 0.006270%
3183/19379 10061/15190
सुंदर সুন্দর beautiful
सुंदर occurrences: 0.004714%
সুন্দর occurrences: 0.108677%
13003/19379 5675/15190
तेज দ্রুত fast
तेज occurrences: 0.002357%
দ্রুত occurrences: 0.029259%
8391/19379 14739/15190
