In [1]:
!git clone https://github.com/LizaDerb/probing_compositionality/

Cloning into 'probing_compositionality'...
remote: Enumerating objects: 455, done.[K
remote: Counting objects: 100% (230/230), done.[K
remote: Compressing objects: 100% (220/220), done.[K
remote: Total 455 (delta 69), reused 8 (delta 8), pack-reused 225[K
Receiving objects: 100% (455/455), 156.39 MiB | 24.80 MiB/s, done.
Resolving deltas: 100% (97/97), done.
Updating files: 100% (87/87), done.


# Подготовка данных

In [2]:
import pandas as pd

In [3]:
corpus = pd.read_csv('/content/probing_compositionality/data/rus/corpus.csv', sep=';')
description = pd.read_csv('/content/probing_compositionality/data/rus/data_description.csv', sep=';')

In [4]:
corpus['mwe'].unique()

array(['себе на голову', 'против течения', 'осиное гнездо',
       'не на своем месте', 'второй дом', 'по барабану', 'жирно будет',
       'гладить по голове', 'за пояс заткнут', 'каши не свариш',
       'с пеной у рта', 'через голову', 'под каблуком', 'удар ниже пояса',
       'поезд ушел', 'смотреть в лицо', 'куда ветер дует',
       'поставить точку', 'на дне', 'смотреть в глаза', 'с блеском',
       'балансировать на грани', 'на мою голову', 'наш брат',
       'голова болит', 'не за горами', 'посмотреть в лицо',
       'косой взгляд', 'на свою голову'], dtype=object)

In [5]:
types = description[['russian', 'type']].set_index('russian').to_dict()['type']

In [6]:
types_col = []
for index, row in corpus.iterrows():
    try:
        types_col.append(types[row['mwe']])
    except:
        if row['mwe'] == 'не на своем месте':
            types_col.append('PP')
        elif row['mwe'] == 'за пояс заткнут':
            types_col.append('VP')
        elif row['mwe'] == 'каши не свариш':
            types_col.append('VP')
        elif row['mwe'] == 'поезд ушел':
            types_col.append('CLAUSE')
        else:
            types_col.append(None)

In [7]:
corpus['type'] = types_col

In [8]:
corpus = corpus[['mwe', 'label', 'sentence', 'type']]
corpus.head()

Unnamed: 0,mwe,label,sentence,type
0,себе на голову,I,-- Сейчас покажу. Вчера получила -- вчера и п...,OTHER
1,себе на голову,I,"отока радости, текущего вином в дженнете*!"" --...",OTHER
2,себе на голову,I,-- Сейчас покажу. Вчера получила -- вчера и п...,OTHER
3,себе на голову,I,"отока радости, текущего вином в дженнете*!"" --...",OTHER
4,себе на голову,L,"Я почувствовал, что дальше не выдержу. Какие-...",OTHER


In [9]:
corpus['mwe'].value_counts()

mwe
на дне                    1253
через голову               334
смотреть в глаза           231
с блеском                  218
против течения             141
с пеной у рта               86
поставить точку             83
себе на голову              68
по барабану                 57
второй дом                  47
не на своем месте           45
смотреть в лицо             38
осиное гнездо               37
под каблуком                35
поезд ушел                  34
на свою голову              34
куда ветер дует             30
гладить по голове           30
удар ниже пояса             22
жирно будет                 12
наш брат                    10
за пояс заткнут              9
каши не свариш               6
голова болит                 6
посмотреть в лицо            6
на мою голову                5
не за горами                 5
косой взгляд                 4
балансировать на грани       2
Name: count, dtype: int64

In [10]:
corpus = corpus[corpus['mwe'] != 'на дне']

In [11]:
corpus['type'].value_counts()

type
PP        955
VP        417
NP        120
OTHER      73
CLAUSE     70
Name: count, dtype: int64

In [12]:
corpus.groupby('type')['mwe'].value_counts()

type    mwe                   
CLAUSE  поезд ушел                 34
        куда ветер дует            30
        голова болит                6
NP      второй дом                 47
        осиное гнездо              37
        удар ниже пояса            22
        наш брат                   10
        косой взгляд                4
OTHER   себе на голову             68
        не за горами                5
PP      через голову              334
        с блеском                 218
        против течения            141
        с пеной у рта              86
        по барабану                57
        не на своем месте          45
        под каблуком               35
        на свою голову             34
        на мою голову               5
VP      смотреть в глаза          231
        поставить точку            83
        смотреть в лицо            38
        гладить по голове          30
        жирно будет                12
        за пояс заткнут             9
        каши не сва

In [13]:
idiom_for_test = ['голова болит', 'удар ниже пояса', 'наш брат', 'косой взгляд', 'не за горами',
                  'не на своем месте', 'под каблуком', 'на свою голову', 'на мою голову',
                  'жирно будет', 'за пояс заткнут', 'каши не свариш', 'гладить по голове',
                  'балансировать на грани', 'против течения']

In [14]:
corpus_train = corpus[corpus['mwe'].isin(idiom_for_test) == False]
corpus_test = corpus[corpus['mwe'].isin(idiom_for_test)]

In [15]:
print('train: ', len(corpus_train))
print('test: ', len(corpus_test))

train:  1269
test:  366


In [16]:
stat_train_corpus = corpus_train['label'].value_counts().rename('num').to_frame()
stat_train_corpus['percentage'] = (stat_train_corpus.num * 100 / stat_train_corpus.num.sum()).round(1).astype(str) + '%'
stat_train_corpus

Unnamed: 0_level_0,num,percentage
label,Unnamed: 1_level_1,Unnamed: 2_level_1
I,651,51.3%
L,618,48.7%


In [17]:
stat_test_corpus = corpus_test['label'].value_counts().rename('num').to_frame()
stat_test_corpus['percentage'] = (stat_test_corpus.num * 100 / stat_test_corpus.num.sum()).round(1).astype(str) + '%'
stat_test_corpus

Unnamed: 0_level_0,num,percentage
label,Unnamed: 1_level_1,Unnamed: 2_level_1
L,195,53.3%
I,171,46.7%


#RuBERT

In [None]:
!pip install datasets

In [19]:
from datasets import Dataset
from sklearn.preprocessing import LabelEncoder

In [20]:
import seaborn as sns
from tqdm.notebook import tqdm
import numpy as np
import warnings
import random
import pickle

import torch
from torch.utils.data import DataLoader

from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score, classification_report
from sklearn import preprocessing
from transformers import AutoTokenizer, AutoModel

In [21]:
train = Dataset.from_pandas(corpus_train)
test = Dataset.from_pandas(corpus_test)

In [22]:
le = LabelEncoder()
le.fit(train['label'])

In [23]:
train_labels = le.transform(train['label'])
test_labels = le.transform(test['label'])

In [24]:
train

Dataset({
    features: ['mwe', 'label', 'sentence', 'type', '__index_level_0__'],
    num_rows: 1269
})

In [25]:
test

Dataset({
    features: ['mwe', 'label', 'sentence', 'type', '__index_level_0__'],
    num_rows: 366
})

In [26]:
rubert_tokenizer = AutoTokenizer.from_pretrained('DeepPavlov/rubert-base-cased')
rubert = AutoModel.from_pretrained('DeepPavlov/rubert-base-cased')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/24.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/642 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/1.65M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of the model checkpoint at DeepPavlov/rubert-base-cased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [27]:
torch.cuda.is_available()

True

In [28]:
device_cuda = torch.device("cuda")
device_cpu = torch.device("cpu")

In [29]:
def vectorize_dataset(data, tokenizer, model):
    model = model.to(device_cuda)
    res_cls = []
    res_mean = []
    for batch in tqdm(data):
        toks = tokenizer(batch['sentence'], padding='max_length', truncation=True, return_tensors='pt', max_length=300)
        with torch.no_grad():
            model_output = model(**{k: v.to(model.device) for k, v in toks.items()})
        res_cls.append(model_output.last_hidden_state[:,0,:]) #достаём вектора для CLS токенов
        res_mean.append(torch.mean(model_output.last_hidden_state, dim=1)) #среднее
    res_cls = torch.vstack(res_cls)
    res_mean = torch.vstack(res_mean)
    return res_cls.detach().cpu().numpy(), res_mean.detach().cpu().numpy()

In [30]:
dl_train = DataLoader(train, batch_size=256, shuffle=False,
                drop_last=False)
dl_test = DataLoader(test, batch_size=256, shuffle=False,
                drop_last=False)
vecs_train_cls, vecs_train_mean = vectorize_dataset(dl_train, rubert_tokenizer, rubert)
vecs_test_cls, vecs_test_mean = vectorize_dataset(dl_test, rubert_tokenizer, rubert)

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

In [31]:
vectors = {'vecs_train_cls' : vecs_train_cls, 'vecs_train_mean' : vecs_train_mean,
           'vecs_test_cls' : vecs_test_cls, 'vecs_test_mean' : vecs_test_mean}

for name in vectors.keys():
    with open(f'{name}.pkl', 'wb') as file:
         pickle.dump(vectors[name], file)

In [32]:
vector_names = ['vecs_train_cls', 'vecs_train_mean', 'vecs_test_cls', 'vecs_test_mean']
vecs = []

for name in vector_names:
    with open(f'{name}.pkl', "rb") as fIn:
        vecs.append(pickle.load(fIn))

train_cls = vecs[0]
train_mean = vecs[1]
test_cls = vecs[2]
test_mean = vecs[3]

In [33]:
train_cls.shape

(1269, 768)

In [34]:
test_mean.shape

(366, 768)

In [35]:
def calculate_metrics(true_labels, predictions):
    matrix = confusion_matrix(true_labels, predictions)
    acc = accuracy_score(true_labels, predictions)
    prec = precision_score(true_labels, predictions)
    rec = recall_score(true_labels, predictions)
    f1 = f1_score(true_labels, predictions)
    roc_auc = roc_auc_score(true_labels, predictions)
    return matrix, acc, prec, rec, f1, roc_auc

In [36]:
def baseline_rand_vec(vecs):
    size = vecs.shape[1]
    num_vecs = vecs.shape[0]
    rand_vecs = []
    for _ in range(num_vecs):
        rand_vecs.append(np.random.uniform(-5.0826163,1.5603778,size))
    return rand_vecs

def baseline_rand_pred(vecs, labels):
    num_vecs = vecs.shape[0]
    rand_vecs = []
    rand_pred = np.array(range(num_vecs), dtype=int)
    for i in range(0, num_vecs):
        rand_pred[i] = random.choice(labels)
    return rand_pred

In [37]:
def rand_pred_func(vecs, labels, true_labels, n=50):
    results_rand_pred = []
    for _ in range(n):
        pred_labels = baseline_rand_pred(vecs, labels)
        results_rand_pred.append(calculate_metrics(true_labels, pred_labels))
    res_rand_pred = pd.DataFrame(results_rand_pred, columns=['matrix', 'accuracy', 'precision', 'recall', 'f1', 'roc_auc'])
    res_rand_pred.to_csv('res_rand_pred.csv')

In [38]:
def get_results(train_data, train_labels, test_data, test_labels):
    results_train = []
    results_test = []
    for n in tqdm(range(50)):
        clf = MLPClassifier(random_state=n, max_iter=300, verbose=1)
        clf.fit(train_data, train_labels)
        train_pred = clf.predict(train_data)
        test_pred = clf.predict(test_data)
        results_train.append(calculate_metrics(train_labels, train_pred))
        results_test.append(calculate_metrics(test_labels, test_pred))
    return results_train, results_test, test_pred

In [39]:
labels = [0, 1]

In [40]:
rand_pred_func(test_cls, labels, test_labels)

In [None]:
rand_vecs = baseline_rand_vec(train_cls)
results_train_rand, results_test_rand, test_pred_rand = get_results(rand_vecs, train_labels, test_cls, test_labels)
res_test_rand = pd.DataFrame(results_test_rand, columns=['matrix', 'accuracy', 'precision', 'recall', 'f1', 'roc_auc'])
res_train_rand = pd.DataFrame(results_train_rand, columns=['matrix', 'accuracy', 'precision', 'recall', 'f1', 'roc_auc'])
res_test_rand.to_csv('res_test_rand_vec.csv')
res_train_rand.to_csv('res_train_rand_vec.csv')

In [None]:
results_train_mean, results_test_mean, test_pred_mean = get_results(train_mean, train_labels, test_mean, test_labels)
res_test_mean = pd.DataFrame(results_test_mean, columns=['matrix', 'accuracy', 'precision', 'recall', 'f1', 'roc_auc'])
res_train_mean = pd.DataFrame(results_train_mean, columns=['matrix', 'accuracy', 'precision', 'recall', 'f1', 'roc_auc'])
res_test_mean.to_csv('res_test_mean.csv')
res_train_mean.to_csv('res_train_mean.csv')

In [43]:
print(classification_report(test_labels, test_pred_mean, target_names=le.classes_))

              precision    recall  f1-score   support

           I       0.66      0.76      0.70       171
           L       0.76      0.65      0.70       195

    accuracy                           0.70       366
   macro avg       0.71      0.71      0.70       366
weighted avg       0.71      0.70      0.70       366



In [None]:
results_train_cls, results_test_cls, test_pred_cls = get_results(train_cls, train_labels, test_cls, test_labels)
res_test_cls = pd.DataFrame(results_test_cls, columns=['matrix', 'accuracy', 'precision', 'recall', 'f1', 'roc_auc'])
res_train_cls = pd.DataFrame(results_train_cls, columns=['matrix', 'accuracy', 'precision', 'recall', 'f1', 'roc_auc'])
res_test_cls.to_csv('res_test_cls.csv')
res_train_cls.to_csv('res_train_cls.csv')

In [45]:
print(classification_report(test_labels, test_pred_cls, target_names=le.classes_))

              precision    recall  f1-score   support

           I       0.65      0.66      0.66       171
           L       0.70      0.69      0.69       195

    accuracy                           0.67       366
   macro avg       0.67      0.67      0.67       366
weighted avg       0.68      0.67      0.68       366



In [None]:
results = corpus_test

results['pred_mean'] = test_pred_mean
results['pred_cls'] = test_pred_cls
results['new_label'] = test_labels

In [53]:
results_PP = results[results['type'] == 'PP']
results_VP = results[results['type'] == 'VP']
results_NP = results[results['type'] == 'NP']

In [56]:
metrics = [calculate_metrics(results_PP['new_label'], results_PP['pred_mean']),
           calculate_metrics(results_PP['new_label'], results_PP['pred_cls']),
           calculate_metrics(results_VP['new_label'], results_VP['pred_mean']),
           calculate_metrics(results_VP['new_label'], results_VP['pred_cls']),
           calculate_metrics(results_NP['new_label'], results_NP['pred_mean']),
           calculate_metrics(results_NP['new_label'], results_NP['pred_cls'])]
groups = ['PP_mean', 'PP_cls', 'VP_mean', 'VP_cls', 'NP_mean', 'NP_cls']

In [58]:
all_metrics = pd.DataFrame(metrics, columns=['matrix', 'accuracy', 'precision', 'recall', 'f1', 'roc_auc'])
all_metrics['group'] = groups
all_metrics = all_metrics[['group', 'accuracy', 'precision', 'recall', 'f1', 'roc_auc']]
all_metrics

Unnamed: 0,group,accuracy,precision,recall,f1,roc_auc
0,PP_mean,0.711538,0.85124,0.64375,0.733096,0.731875
1,PP_cls,0.703846,0.79021,0.70625,0.745875,0.703125
2,VP_mean,0.661017,0.575758,0.76,0.655172,0.674118
3,VP_cls,0.677966,0.6,0.72,0.654545,0.683529
4,NP_mean,0.694444,0.25,0.6,0.352941,0.654839
5,NP_cls,0.472222,0.111111,0.4,0.173913,0.441935


In [59]:
all_metrics.to_csv('group_metrics.csv')

In [62]:
!zip -r /content/file.zip /content/russian_all

updating: content/russian_all/ (stored 0%)
updating: content/russian_all/vecs_train_mean.pkl (deflated 15%)
updating: content/russian_all/vecs_train_cls.pkl (deflated 15%)
updating: content/russian_all/res_test_rand_vec.csv (deflated 71%)
updating: content/russian_all/res_train_mean.csv (deflated 94%)
updating: content/russian_all/vecs_test_mean.pkl (deflated 17%)
updating: content/russian_all/res_test_mean.csv (deflated 71%)
updating: content/russian_all/res_rand_pred.csv (deflated 66%)
updating: content/russian_all/res_train_rand_vec.csv (deflated 92%)
updating: content/russian_all/vecs_test_cls.pkl (deflated 16%)
updating: content/russian_all/res_train_cls.csv (deflated 94%)
updating: content/russian_all/group_metrics.csv (deflated 53%)
updating: content/russian_all/res_test_cls.csv (deflated 70%)


In [66]:
all_test = [res_test_cls.iloc[:, 2:].mean(), res_test_mean.iloc[:, 2:].mean()]
all_train = [res_train_cls.iloc[:, 2:].mean(), res_train_mean.iloc[:, 2:].mean()]

In [67]:
pd.concat(all_test, axis=1).rename(columns={0:'test_cls', 1:'test_mean'})

Unnamed: 0,test_cls,test_mean
precision,0.691919,0.707425
recall,0.706051,0.669949
f1,0.698646,0.687707
roc_auc,0.673318,0.676495


In [68]:
pd.concat(all_train, axis=1).rename(columns={0: 'train_cls', 1:'train_mean'})

Unnamed: 0,train_cls,train_mean
precision,0.999257,0.999095
recall,0.999126,0.999288
f1,0.999191,0.999191
roc_auc,0.99921,0.999214
