In [1]:
import os
import sys

import numpy as np
import pandas as pd
import sklearn
import sklearn.decomposition
import torch

from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.metrics import adjusted_rand_score
from transformers import AutoTokenizer, AutoModel, BatchEncoding

In [2]:
# Add ../utils to the import search path (presumably where the local helper
# modules imported in the next cell live). The path is relative, so it is
# resolved against the kernel's working directory; expanduser() only expands a
# leading "~" and therefore leaves this path exactly as written.
sys.path.append(os.path.expanduser('../utils'))

In [3]:
from df_utils import df_preparation, get_word_location
from visualization_utils import make_plot
from clustering_utils import clustering

## Данные

In [None]:
# Dataset the notebook runs on; must be one of the keys of `number_of_clusters`.
task = 'wiki-wiki'

# Number of clusters to request for each dataset.
number_of_clusters = {
    'wiki-wiki': 2,
    'bts-rnc': 3,
    'active-dict': 3,
}

In [None]:
# Read the tab-separated training split of the selected dataset.
train_path = f'../russe-wsi-kit/data/main/{task}/train.csv'
train_df = pd.read_csv(train_path, delimiter='\t')

In [None]:
# Project-specific preprocessing from df_utils (its implementation is not shown here).
# NOTE(review): the result overwrites `train_df`, so re-running this cell feeds
# already prepared data back into df_preparation — confirm that is harmless, or
# re-run the loading cell first.
train_df = df_preparation(train_df, task)

In [None]:
train_df

## BERT

In [None]:
def get_embedding(tokens, location):
    """Return one vector for the target word: the sum of its word-piece embeddings.

    Relies on the notebook-level ``tokenizer`` and ``model`` objects.

    Args:
        tokens: tokens of a single context. They are converted to ids as given;
            no special tokens are added here.
        location: list of indices into ``tokens`` selecting the pieces of the
            target word.

    Returns:
        A 1-D ``numpy.ndarray``: the sum, over the selected positions, of the
        rows of the model's ``last_hidden_state``.

    Raises:
        ValueError: if ``location`` is a string (e.g. the ``'not found'``
            sentinel) instead of a list of indices.
    """
    if isinstance(location, str):
        raise ValueError(f'target word was not located in the context: {location!r}')

    model.eval()
    with torch.no_grad():
        ids = tokenizer.convert_tokens_to_ids(tokens)
        ids = torch.tensor(ids).reshape((1, len(tokens)))
        # Drop only the batch axis. A bare squeeze() would also drop the
        # sequence axis of a one-token context and break the 2-D indexing below.
        embeds = model(ids)['last_hidden_state'].squeeze(0)
        word_embeds = embeds[location, :]

        return word_embeds.sum(dim=0).numpy()

In [None]:
# Tokenizer and encoder are loaded from the same checkpoint.
_CHECKPOINT = 'DeepPavlov/rubert-base-cased'
tokenizer = AutoTokenizer.from_pretrained(_CHECKPOINT)
model = AutoModel.from_pretrained(_CHECKPOINT)

In [None]:
# Tokenize every context with the loaded tokenizer; the result is stored as a
# list of tokens per row.
train_df['tokens'] = train_df['context'].apply(tokenizer.tokenize)

In [None]:
# For each row, look up which token positions make up `word_form`.
# NOTE(review): going by the get_word_location definition visible later in this
# notebook, rows where the word cannot be matched get the string 'not found'
# instead of a list; the version imported from df_utils is presumably the same — confirm.
train_df['word_location'] = train_df.apply(lambda x: get_word_location(x['word_form'], x['tokens']), axis=1)

In [None]:
# Number of distinct words with at least one context where the word form was not located.
not_found_rows = train_df['word_location'] == 'not found'
len(set(train_df[not_found_rows]['word']))

In [None]:
%%time
# Compute the target-word embedding for every row (one model forward pass per row).
# NOTE(review): get_embedding indexes the hidden states with `word_location`, so rows
# holding the 'not found' sentinel presumably have to be filtered out or fixed first.
train_df['embedding'] = train_df.apply(lambda x: get_embedding(x['tokens'], x['word_location']), axis=1)

In [None]:
train_df

## K-means and Agglomerative clustering

In [None]:
# Run the project's clustering helper with K-means (random initialisation, up to 1000 iterations).
# NOTE(review): no random seed is passed, so scores may differ between runs. `kwargs` is
# presumably forwarded to the KMeans constructor — confirm in clustering_utils before
# adding a 'random_state' entry.
words_info, total, ari_sum = clustering(train_df, KMeans, kwargs={'init' : 'random', 'max_iter' : 1000}, task=task)

In [None]:
words_info

In [None]:
ari_sum/total

In [None]:
# Same helper, this time with agglomerative clustering and no extra keyword arguments.
# Results go to separate *_ac names so the K-means results stay available.
words_info_ac, total_ac, ari_sum_ac = clustering(train_df, AgglomerativeClustering, task=task)

In [None]:
words_info_ac

In [None]:
ari_sum_ac/total_ac

## Черновик

In [None]:
# Scratch experiment: for a single word, cluster its contexts using only two
# adjacent embedding dimensions (i, i+1) at a time and record the ARI of each pair.
# NOTE: this rebinds `words_info`, replacing whatever an earlier cell stored there.
words_info = {}

# Everything below is independent of `i`, so it is prepared once, outside the loop.
word = 'декабрист'
df = train_df[train_df['word']==word]
assert not df.empty, f'no contexts for {word!r} in the loaded data (task={task!r})'
n_clusters = number_of_clusters[task]
n_contexts = df.shape[0]
labels_true = df['gold_sense_id'].to_numpy()
X = np.stack(df['embedding'].to_numpy())

for i in range(X.shape[1] - 1):
    clusterizator = KMeans(n_clusters=n_clusters, init='random', max_iter=1000)
    labels_pred = clusterizator.fit_predict(X[:, i:i+2])
    ari = adjusted_rand_score(labels_true, labels_pred)
    words_info[f'{word} {i}'] = {'ari' : ari, 'count' : n_contexts}

In [None]:
# Report the entry of `words_info` with the highest ARI.
# max() replaces a manual scan that started from 0: that scan left `max_key`
# unassigned (or stale from an earlier run) whenever no ARI was above 0.
# Ties resolve to the first entry, as before; an empty dict prints "None None".
max_key = max(words_info, key=lambda key: words_info[key]['ari'], default=None)
max_ari = None if max_key is None else words_info[max_key]['ari']
print(max_ari, max_key)

In [None]:
# Scratch experiment: cluster the contexts of one word using a single embedding
# column and store the resulting ARI.
# NOTE(review): 221 is a hard-coded column index and the entry is labelled 'last' —
# confirm which column was intended.
words_info = {}

word = 'декабрист'
df = train_df[train_df['word'] == word]
n_clusters = number_of_clusters[task]
n_contexts = df.shape[0]
labels_true = df['gold_sense_id'].to_numpy()
X = np.stack(df['embedding'].to_numpy())

clusterizator = KMeans(n_clusters=n_clusters, init='random', max_iter=1000)
labels_pred = clusterizator.fit_predict(X[:, 221:222])
ari = adjusted_rand_score(labels_true, labels_pred)
words_info[f'{word} last'] = {'ari': ari, 'count': n_contexts}

In [None]:
words_info

In [None]:
# Integer id for each target word.
ids = {
    'суда': 0,
    'лук': 1,
    'замок': 2,
    'бор': 3,
}

# Plain dict indexing: a word missing from `ids` raises KeyError.
train_df['word_id'] = train_df['word'].apply(ids.__getitem__)

In [None]:
labels_pred

In [None]:
labels_true

In [None]:
# Cluster the contexts of all words together into 4 groups and score the
# clusters against the word ids.
df = train_df
n_clusters = 4
n_contexts = df.shape[0]
labels_true = df['word_id'].to_numpy()
X = np.stack(df['embedding'].to_numpy())

kmeans = KMeans(init='random', max_iter=1000, n_clusters=n_clusters)
labels_pred = kmeans.fit_predict(X)
ari = adjusted_rand_score(labels_true, labels_pred)
print(ari)

In [None]:
labels_pred

In [None]:
labels_true

In [None]:
df['embedding']

In [None]:
# Three components, matching the x/y/z axes of the 3-D scatter plot drawn further down.
pca = sklearn.decomposition.PCA(n_components=3)

In [None]:
# Stack the per-row embedding vectors into one 2-D array, then fit the PCA and project.
d3 = pca.fit_transform(np.stack(df['embedding'].to_numpy()))

In [None]:
# Wrap the projection in a DataFrame; its default integer column labels (0, 1, 2)
# are what the scatter-plot call refers to.
d3 = pd.DataFrame(d3)

In [None]:
d3

In [None]:
# 3-D scatter of the PCA projection, coloured by `labels_true`.
# NOTE(review): `px` is never imported in the visible part of this notebook, so this
# raises NameError on a fresh kernel — presumably `import plotly.express as px` is
# missing from the imports cell; confirm and add it.
px.scatter_3d(d3, x=0, y=1, z=2, color=labels_true)

In [None]:
def get_word_location(target, tokens):
    """Find the token positions that spell out ``target``.

    Tokens starting with ``'##'`` are treated as continuations of the word
    started by the preceding token. A word is compared with ``target`` only
    once it is complete, i.e. when the next token is not a continuation or
    there is no next token.

    Note: defining this function here rebinds the name imported from
    ``df_utils`` at the top of the notebook.

    Args:
        target: the word to look for, compared by exact string equality.
        tokens: sequence of string tokens.

    Returns:
        The list of indices of the first complete word equal to ``target``.
        If there is none, ``target`` and ``tokens`` are printed and the string
        ``'not found'`` is returned.
    """
    last_index = len(tokens) - 1
    word = ''
    piece_indices = []
    for position, piece in enumerate(tokens):
        if piece.startswith('##'):
            # Continuation piece: extend the word being assembled.
            word += piece[2:]
            piece_indices.append(position)
        else:
            # A new word starts here.
            word, piece_indices = piece, [position]

        word_is_complete = (
            position == last_index
            or not tokens[position + 1].startswith('##')
        )
        if word_is_complete and word == target:
            return piece_indices

    print(target, tokens)
    return 'not found'

In [None]:
# Run the encoder on a raw sentence that contains a [MASK] token.
# The model's forward pass takes tensors of token ids, not text, so the string
# is first encoded by the tokenizer and the resulting tensors are passed on.
masked_inputs = tokenizer('иван борисович тугой лук иван борисович тугой [MASK] ( ок . — ) \xa0— суздальско - нижегородский княжич , младший из двух сыновей бориса константиновича , младший брат даниила борисовича иван борисович родился в нижнем новгороде в', return_tensors='pt')
with torch.no_grad():
    masked_outputs = model(**masked_inputs)
masked_outputs