In [31]:
import unicodedata

import numpy as np
import pandas as pd
import plotly.express as px
import sklearn
import torch

from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.decomposition import PCA
from sklearn.metrics.cluster import adjusted_rand_score
from transformers import AutoTokenizer, AutoModel, BatchEncoding

## Данные

In [2]:
def get_word_form(context, position, task):
    """Cut the target word out of `context` and normalise it for token matching.

    Args:
        context: Text in which the target word occurs.
        position: Pair ``(start, end)`` of character offsets. Each item is
            passed through ``int()``, so numeric strings are accepted.
        task: Dataset name. For ``'bts-rnc'`` the end offset is treated as
            inclusive; for any other value it is exclusive.

    Returns:
        The lower-cased slice with all non-alphabetic characters removed and
        diacritics stripped (e.g. 'й' -> 'и', 'ё' -> 'е').
    """
    start = int(position[0])
    end = int(position[1]) + (1 if task == 'bts-rnc' else 0)
    # NFD separates a base letter from its combining marks; the marks are not
    # alphabetic, so the isalpha() filter below discards them.
    # NOTE(review): the token lists printed in this notebook show 'й' as 'и'
    # (e.g. '##ки', '##и'), which suggests the tokenizer strips diacritics;
    # presumably the same happens to 'ё' — confirm against the tokenizer config.
    decomposed = unicodedata.normalize('NFD', context[start:end].lower())
    return ''.join(letter for letter in decomposed if letter.isalpha())

In [3]:
task = 'wiki-wiki' # dataset whose train.csv is loaded below; listed alternatives: 'active-dict', 'bts-rnc'

In [4]:
# Tab-separated training contexts of the selected task.
train_path = f'../russe-wsi-kit/data/main/{task}/train.csv'
train_df = pd.read_csv(train_path, sep='\t')

In [5]:
# Keep only the rows that have a target-word position. dropna(subset=...)
# selects rows by label, so the result is also correct when the index is not
# a plain 0..n-1 range (for example when this cell is re-run after rows were
# already removed).
train_df = train_df.dropna(subset=['positions'])

In [6]:
# Split the comma-separated positions string into a list of span strings.
train_df['positions'] = train_df['positions'].map(lambda spans: spans.split(','))

In [7]:
# Keep only the first span and split it on '-' into [start, end] (still strings).
train_df['positions'] = train_df['positions'].map(lambda spans: spans[0].split('-'))

In [8]:
# Normalised surface form of the target word, one per context row.
train_df['word_form'] = [
    get_word_form(context, position, task)
    for context, position in zip(train_df['context'], train_df['positions'])
]

In [9]:
train_df

Unnamed: 0,context_id,word,gold_sense_id,predict_sense_id,positions,context,word_form
0,1,замок,1,,"[0, 5]",замок владимира мономаха в любече . многочисле...,замок
1,2,замок,1,,"[11, 16]","шильонский замок замок шильйон ( ) , известный...",замок
2,3,замок,1,,"[299, 304]",проведения архитектурно - археологических рабо...,замок
3,4,замок,1,,"[111, 116]","топи с . , л . белокуров легенда о завещании м...",замок
4,5,замок,1,,"[134, 139]",великий князь литовский гедимин после успешной...,замок
...,...,...,...,...,...,...,...
434,435,бор,2,,"[10, 13]",ленточный бор ле́нточные бо́ры — сосновые трав...,бор
435,436,бор,2,,"[101, 104]","в окрестностях барнаула , составляет — км . н...",бор
436,437,бор,2,,"[17, 20]",также в сосновом бору открыта секция биатлона ...,бор
437,438,бор,2,,"[183, 186]","экспресс банк , мособлбанк , внешпромбанк , ба...",бор


## BERT

In [10]:
def get_word_location(target, tokens):
    """Locate the word pieces that spell `target` inside `tokens`.

    Pieces starting with '##' are appended to the word being assembled; any
    other piece starts a new word. The comparison runs after every piece, so
    the first word (or leading part of a word) equal to `target` wins.

    Args:
        target: Word to look for.
        tokens: Sequence of word-piece strings.

    Returns:
        List of token indices forming the match, or the string 'not found'
        (after printing `target` and `tokens`) when nothing matches.
    """
    assembled, span = '', []
    for idx, piece in enumerate(tokens):
        if piece.startswith('##'):
            # Continuation piece: extend the current word.
            assembled = assembled + piece[2:]
            span.append(idx)
        else:
            # A piece without the '##' marker begins a new word.
            assembled, span = piece, [idx]
        if assembled == target:
            return span
    print(target, tokens)
    return 'not found'

In [11]:
def get_embedding(tokens, location):
    """Return the summed last-layer hidden states of the tokens at `location`.

    Relies on the notebook-level `tokenizer` and `model` objects.

    Args:
        tokens: Word-piece strings of one context.
        location: Indices into `tokens` whose vectors are summed.

    Returns:
        1-D numpy array: the sum over the selected token vectors.
    """
    model.eval()
    with torch.no_grad():
        ids = tokenizer.convert_tokens_to_ids(tokens)
        # NOTE(review): the ids are fed to the model as-is; this function adds
        # no special tokens — confirm that is intended.
        batch = torch.tensor([ids])
        # Drop only the batch axis. A bare squeeze() would also remove the
        # sequence axis for a single-token input and break the 2-D indexing.
        hidden = model(batch)['last_hidden_state'][0]
        word_embeds = hidden[location, :]

        return word_embeds.sum(dim=0).numpy()

In [12]:
# Ad-hoc example inputs for get_word_location; no other cell visible here reads them.
# NOTE(review): `target` ends with a comma, so it cannot equal the joined tokens 'дарами'.
target = 'дарами,'  
tokens = ['покупать', 'преданность', 'дара', '##ми']

In [13]:
# One constant for the checkpoint so the tokenizer and the model cannot drift apart.
MODEL_NAME = 'DeepPavlov/rubert-base-cased'
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

Some weights of the model checkpoint at DeepPavlov/rubert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [14]:
# Word-piece tokens of every context.
train_df['tokens'] = train_df['context'].map(tokenizer.tokenize)

In [15]:
# Token indices of the target word in each context ('not found' when there is no match).
train_df['word_location'] = [
    get_word_location(word_form, tokens)
    for word_form, tokens in zip(train_df['word_form'], train_df['tokens'])
]

In [16]:
# Number of distinct target words that have at least one context where the word was not located.
not_found_mask = train_df['word_location'] == 'not found'
len(set(train_df.loc[not_found_mask, 'word']))

0

In [17]:
%%time
train_df['embedding'] = train_df.apply(lambda x: get_embedding(x['tokens'], x['word_location']), axis=1)

CPU times: user 1min 31s, sys: 7.49 s, total: 1min 39s
Wall time: 1min 37s


In [18]:
train_df

Unnamed: 0,context_id,word,gold_sense_id,predict_sense_id,positions,context,word_form,tokens,word_location,embedding
0,1,замок,1,,"[0, 5]",замок владимира мономаха в любече . многочисле...,замок,"[замок, владимир, ##а, моном, ##аха, в, люб, #...",[0],"[0.42294908, -0.683447, 0.85798216, -0.7495152..."
1,2,замок,1,,"[11, 16]","шильонский замок замок шильйон ( ) , известный...",замок,"[ши, ##ль, ##онс, ##ки, ##и, замок, замок, ши,...",[5],"[-0.23945573, -0.5021833, 1.1147155, -0.563345..."
2,3,замок,1,,"[299, 304]",проведения архитектурно - археологических рабо...,замок,"[проведения, архитектурно, -, археологических,...",[54],"[0.1208969, -0.7678792, 0.12769553, 0.11809321..."
3,4,замок,1,,"[111, 116]","топи с . , л . белокуров легенда о завещании м...",замок,"[топ, ##и, с, ., ,, л, ., белок, ##уров, леген...",[36],"[-0.39998552, -0.55716443, 0.7601832, -0.24298..."
4,5,замок,1,,"[134, 139]",великий князь литовский гедимин после успешной...,замок,"[велики, ##и, князь, литов, ##ски, ##и, ге, ##...",[28],"[-0.065683216, -1.2504479, -0.14175451, -0.541..."
...,...,...,...,...,...,...,...,...,...,...
434,435,бор,2,,"[10, 13]",ленточный бор ле́нточные бо́ры — сосновые трав...,бор,"[ленточ, ##ны, ##и, бор, ленточ, ##ные, бор, #...",[3],"[-0.05079735, -0.36621428, -0.62873137, -0.705..."
435,436,бор,2,,"[101, 104]","в окрестностях барнаула , составляет — км . н...",бор,"[в, окрестностях, бар, ##нау, ##ла, ,, составл...",[51],"[0.18823141, -0.0059549212, 0.30406272, -0.867..."
436,437,бор,2,,"[17, 20]",также в сосновом бору открыта секция биатлона ...,бор,"[также, в, сос, ##новом, бор, ##у, открыта, се...",[4],"[0.72732365, 0.36458915, 0.6134486, -0.8007269..."
437,438,бор,2,,"[183, 186]","экспресс банк , мособлбанк , внешпромбанк , ба...",бор,"[экспресс, банк, ,, мо, ##со, ##бл, ##банк, ,,...",[51],"[0.3698071, -0.5387132, -0.78578746, -0.876430..."


## K-means and Agglomerative clustering

In [19]:
def make_plot(df, score):
    """Show a 3-D PCA scatter of the embeddings in `df`, coloured by gold sense id.

    Args:
        df: Frame with 'word', 'gold_sense_id' and 'embedding' columns. The
            title uses the 'word' value of the first row.
        score: Value appended to the plot title after the word.

    Returns:
        None. The figure is displayed via ``fig.show()``.
    """
    title = df['word'].iloc[0] + f', {score}'
    labels_true = df['gold_sense_id'].to_numpy()
    # PCA is imported explicitly in the import cell: `import sklearn` alone
    # does not guarantee that the `decomposition` submodule is loaded.
    pca = PCA(n_components=3)
    d3 = pca.fit_transform(np.stack(df['embedding'].to_numpy()))

    fig = px.scatter_3d(d3, x=0, y=1, z=2, color=labels_true, title=title)
    fig.show()

In [34]:
def clustering(train_df, clusterizator_class, kwargs=None):
    """Cluster the contexts of every word separately and score them with ARI.

    For each distinct value of ``train_df['word']`` the embeddings are
    clustered into as many clusters as there are distinct gold senses, the
    adjusted Rand index against the gold senses is computed, and a plot is
    shown through ``make_plot``.

    Args:
        train_df: Frame with 'word', 'gold_sense_id' and 'embedding' columns.
        clusterizator_class: Class constructed as
            ``clusterizator_class(n_clusters=..., **kwargs)`` that provides
            ``fit_predict``.
        kwargs: Optional dict of extra constructor arguments.

    Returns:
        Tuple ``(words_info, total, ari_sum)``: a dict mapping each word to
        ``{'ari', 'count'}``, the total number of contexts, and the sum of
        ``ari * count`` over all words.
    """
    extra_kwargs = kwargs or {}
    words_info = {}
    total = 0
    ari_sum = 0
    # dict.fromkeys de-duplicates while keeping first-appearance order, so the
    # order of the plots and of `words_info` is the same on every run (a set
    # of strings iterates in an order that can change between kernel sessions).
    for word in dict.fromkeys(train_df['word']):
        df = train_df[train_df['word'] == word]
        n_clusters = len(set(df['gold_sense_id']))
        n_contexts = df.shape[0]
        labels_true = df['gold_sense_id'].to_numpy()
        X = np.stack(df['embedding'].to_numpy())
        clusterizator = clusterizator_class(n_clusters=n_clusters, **extra_kwargs)
        labels_pred = clusterizator.fit_predict(X)
        ari = adjusted_rand_score(labels_true, labels_pred)
        words_info[word] = {'ari': ari, 'count': n_contexts}
        # Accumulate a count-weighted sum so the caller can form a weighted mean.
        ari_sum += ari * n_contexts
        total += n_contexts

        make_plot(df, ari)

    return words_info, total, ari_sum

In [35]:
# Fixed random_state: with random initialisation the clusters (and therefore
# the ARI values) would otherwise differ from run to run.
words_info, total, ari_sum = clustering(
    train_df,
    KMeans,
    kwargs={'init' : 'random', 'max_iter' : 1000, 'random_state' : 42},
)

In [36]:
words_info

{'замок': {'ari': -0.06918226650023966, 'count': 138},
 'лук': {'ari': 0.19284262114092604, 'count': 110},
 'бор': {'ari': 0.7186424003935071, 'count': 56},
 'суда': {'ari': 1.0, 'count': 135}}

In [37]:
# Mean ARI over words, weighted by each word's number of contexts.
ari_sum/total

0.4257619817095791

In [38]:
# Same evaluation with AgglomerativeClustering; no extra constructor arguments are passed.
words_info_ac, total_ac, ari_sum_ac = clustering(train_df, AgglomerativeClustering)

In [40]:
words_info_ac

{'замок': {'ari': -0.07179781270596061, 'count': 138},
 'лук': {'ari': 0.19284262114092604, 'count': 110},
 'бор': {'ari': 0.7864938108840548, 'count': 56},
 'суда': {'ari': 0.9375737040349529, 'count': 135}}

In [41]:
# Count-weighted mean ARI for the agglomerative run.
ari_sum_ac/total_ac

0.41439793536743735

## Черновик

In [None]:
# Integer id per target word, stored in a new 'word_id' column.
ids = {'суда': 0, 'лук': 1, 'замок': 2, 'бор': 3}

train_df['word_id'] = train_df['word'].map(lambda word: ids[word])

In [None]:
# NOTE(review): above this cell `labels_pred` is only a local variable inside
# clustering(); the notebook-level name is first assigned in a later draft
# cell, so this raises NameError on a fresh top-to-bottom run.
labels_pred

In [None]:
# NOTE(review): above this cell `labels_true` is only a local variable inside
# functions; the notebook-level name is first assigned in a later draft cell,
# so this raises NameError on a fresh top-to-bottom run.
labels_true

In [None]:
# Draft experiment: cluster the embeddings of ALL contexts at once into four
# clusters and compare the clusters with the word identity ('word_id'), not
# with the sense labels. The names assigned here (df, labels_true,
# labels_pred) are read by the cells that follow.
# df = train_df[train_df['word']=='лук']
df = train_df
n_clusters = 4
# n_contexts is assigned but not used anywhere else in this cell.
n_contexts = df.shape[0]
labels_true = df['word_id'].to_numpy()
X = df['embedding'].to_numpy()
# Turn the column of per-row vectors into one 2-D array.
X = np.stack(X)
kmeans = KMeans(n_clusters=n_clusters, init='random', max_iter=1000)
labels_pred = kmeans.fit_predict(X)#+1
ari = adjusted_rand_score(labels_true, labels_pred)
print(ari)

In [None]:
labels_pred

In [None]:
labels_true

In [None]:
df['embedding']

In [None]:
# Three components so the projection can be drawn as a 3-D scatter below.
pca = sklearn.decomposition.PCA(n_components=3)

In [None]:
# Project the stacked embeddings of `df` onto the three principal components.
d3 = pca.fit_transform(np.stack(df['embedding'].to_numpy()))

In [None]:
# Wrap the projection in a DataFrame; its columns get the default integer labels 0, 1, 2.
d3 = pd.DataFrame(d3)

In [None]:
d3

In [None]:
# 3-D scatter of the projection, coloured by `labels_true` (the word ids set in the draft KMeans cell).
px.scatter_3d(d3, x=0, y=1, z=2, color=labels_true)

In [None]:
def get_word_location(target, tokens):
    """Locate a complete word equal to `target` inside `tokens`.

    Draft variant with the same name as the function defined earlier in the
    notebook; running this cell rebinds that name. Unlike the earlier version,
    a match is accepted only at the end of a word, i.e. when the next token
    does not start with '##' or there is no next token.

    Args:
        target: Word to look for.
        tokens: Sequence of word-piece strings.

    Returns:
        List of token indices forming the match, or the string 'not found'
        (after printing `target` and `tokens`) when nothing matches.
    """
    last = len(tokens) - 1
    assembled, span = '', []
    for idx, piece in enumerate(tokens):
        if piece.startswith('##'):
            assembled += piece[2:]
            span.append(idx)
        else:
            assembled, span = piece, [idx]
        # The word is complete when no continuation piece follows it.
        word_complete = idx == last or not tokens[idx + 1].startswith('##')
        if word_complete and assembled == target:
            return span
    print(target, tokens)
    return 'not found'