In [44]:
# Imports
import re
import string
import json
from datetime import datetime
from collections import defaultdict, Counter

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm

import torch
import torch.nn as nn
from torch.nn import Module
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

device = 'cpu'

import random

random.seed(26)
np.random.seed(62)
torch.manual_seed(2021)

<torch._C.Generator at 0x7f9649fca530>

In [45]:
embedding_size = 300
embedding_path = f'save/embedding_weights_100_epoch_{embedding_size}_dim_5_wsize_10_negfac.pt'
train_path = 'save/bengali_train_preprocessed.csv'

In [46]:
# train data
train_df = pd.read_csv(train_path)
train_sentences = [[int(s) for s in text.split()] for text in train_df['sentence']]
train_labels = train_df['hate'].to_numpy()

# word <-> convertion
with open('save/word_to_int_dict.json', 'r') as f:
    word_to_int = json.load(f)
with open('save/int_to_word_dict.json', 'r') as f:
    int_to_word = json.load(f)
    int_to_word = {int(k) : v for k, v in int_to_word.items()}

# word-counter
with open('save/word_counter.json', 'r') as f:
    word_counter = Counter(json.load(f))
    
vocab_size = len(word_to_int)
print('vocab size:', vocab_size)

vocab size: 15983


In [21]:
embed_layer = nn.Embedding(vocab_size, embedding_size)
embed_layer.load_state_dict(torch.load(embedding_path, map_location=torch.device(device)))
embed_matrix = embed_layer.weight.data

In [43]:
for word, cnt in word_counter.most_common(10):
    iword = word_to_int[word]
    word_embed = embed_matrix[iword].unsqueeze(1)
    dot_prod = (embed_matrix@word_embed).squeeze(1)
    dot_prod[iword] = float('-Inf')
    _, most_similar_iwords = torch.topk(dot_prod, 5)
    most_similar_words = [int_to_word[iw.item()] for iw in most_similar_iwords]
    print('{}...\n {}'.format(word, '.\n '.join(most_similar_words)))

একটা...
 ব্যপারটাতে.
 সূলভ.
 লেখা।.
 সাজিয়ে.
 গাছের
কথা...
 মনেরাখবি.
 পরিদর্শকের.
 be.
 জীবিত.
 চাল্লি
তুই...
 ফিলিম.
 কোতাকানের.
 পোন্দানো.
 আলামত।.
 জমাট
বাচ্চা...
 জিবণ.
 জানাইলেই.
 গেছেখানকি.
 অমানুষের.
 লক্ষন।
তোর...
 টেংরি.
 বাইঞ্চুদ.
 বিগগাপন.
 খানিক.
 ধনের
ভাই...
 খেললে.
 মর্জাদা.
 বুঝজ.
 জয়ের.
 দিতো
মানুষ...
 ১০০কোটি.
 অতুষ্ট.
 হারাইলে.
 অসহায়।.
 পায়।ঠিক
সাথে...
 সাহশের.
 প্রতারক.
 ভঙ্গিমায়.
 নেন।এবং.
 একথা
ভালো...
 চালাতো.
 মেয়েদেরকেও.
 চ্যনেলটা.
 সবচেয়ে.
 চামচিকার
।...
 ব্যবিচারী.
 রেজা.
 দে‌খি.
 চট্টগ্রামের.
 বিছনায়


In [38]:
torch.topk(dot_prod, 5)

torch.return_types.topk(
values=tensor([22.4407, 21.9294, 21.5347, 21.4544, 21.0807]),
indices=tensor([ 9767,  9416, 14145,  2751, 10973]))

In [49]:
word_to_int['রাজা']
word_to_int['মানুষ']
word_to_int['মহিলা']
word_to_int['রাণী']

KeyError: 'রাণী'