In [1]:
import numpy as np
import pandas as pd
import torch
import transformers as ppb
import tensorflow as tf
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_curve, RocCurveDisplay, auc, mean_squared_error
from sklearn.model_selection import train_test_split
from json import load


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
not_discriptors = list()
with open('../../Data/Words/word_pairs_not_disc.txt', 'r', encoding='utf-8') as txt:
    for i in txt:
        not_discriptors.append(i.replace('\n', ''))

with open('../../Data/Events/type_of_events.json', 'r', encoding='utf-8') as js:
    events = load(js)

with open('../../Data/Datasets/data.json', 'r', encoding='utf-8') as js:
    discriptors = load(js)


In [3]:
dict_of_disc = {'expession': [], 'is_discr': []}
for subject in discriptors.values():
    for d in sum(subject.values(), []):
        dict_of_disc['expession'].append(d)
        dict_of_disc['is_discr'].append(True)

for subject in not_discriptors:
    dict_of_disc['expession'].append(subject)
    dict_of_disc['is_discr'].append(False)



In [4]:
df = pd.DataFrame(dict_of_disc)
df

Unnamed: 0,expession,is_discr
0,основные положения теории истории,True
1,основные положения методологии истории,True
2,место истории в системе гуманитарного знания,True
3,закономерности исторического процесса,True
4,этапы исторического процесса,True
...,...,...
1051,Дать практические простые инструменты или рути...,False
1052,Эмбодимент (embodiment)– переводится как вопло...,False
1053,"То есть это выражение качеств, в том числе лид...",False
1054,Римская пословица Mens sana in corpore sano(В ...,False


In [5]:
# model_class, tokenizer_class, pretrained_weights = (
#     ppb.DistilBertModel, ppb.DistilBertTokenizer, 'distilbert-base-uncased')

# Хотите BERT вместо distilBERT? Раскомментируйте следующую строку:
model_class, tokenizer_class, pretrained_weights = (ppb.BertModel, ppb.BertTokenizer, 'bert-base-uncased')

# Загрузка предобученной модели/токенизатора
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [6]:
tokenized = df['expession'].apply((lambda x: tokenizer.encode(x, add_special_tokens=True)))
padded = [tokenizer.encode(i, add_special_tokens=True) for i in df['expession'].to_list()]
ml = len(max(padded, key=len))
for i in padded:
    for _ in range(ml-len(i)):
        i.append(0)

padded = np.array(padded)

In [7]:
input_ids = torch.tensor(padded)

with torch.no_grad():
    
    last_hidden_states = model(input_ids)


In [8]:
features = last_hidden_states[0][:,0,:].numpy()


In [9]:
labels = df['is_discr']
X_train, X_test, y_train, y_test = train_test_split(features, labels, random_state=71)


In [22]:
import tensorflow as tf

model_class = tf.keras.models.Sequential()
model_class.add(tf.keras.layers.Dense(int(X_train[0].shape[0]*1.5), activation='softmax',
                                      input_shape=(X_train[0].shape[0],), name='input'))
model_class.add(tf.keras.layers.Dense(
    int(X_train[0].shape[0]*2), activation='relu', name='hidden_layer_2'))
model_class.add(tf.keras.layers.Dense(
    int(X_train[0].shape[0]*1.5), activation='relu', name='hidden_layer_3'))
model_class.add(tf.keras.layers.Dense(
    int(X_train[0].shape[0]*0.5), activation='relu', name='hidden_layer_4'))
model_class.add(tf.keras.layers.Dense(1, name='output'))
model_class.summary()


Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input (Dense)               (None, 1152)              885888    
                                                                 
 hidden_layer_2 (Dense)      (None, 1536)              1771008   
                                                                 
 hidden_layer_3 (Dense)      (None, 1152)              1770624   
                                                                 
 hidden_layer_4 (Dense)      (None, 384)               442752    
                                                                 
 output (Dense)              (None, 1)                 385       
                                                                 
Total params: 4,870,657
Trainable params: 4,870,657
Non-trainable params: 0
_________________________________________________________________


In [23]:
model_class.compile(
    loss='mse',  # Функция потерь
    optimizer='Adam',  # Оптимизатор
    metrics=[  # Метрики
        'mse',  # Если у объекта назначено имя, то можно вызвать объект с его помощью
        # tf.keras.metrics.Precision()
    ]
)


In [24]:
model_class.fit(
    X_train,  # Набор входных данных
    y_train.to_numpy().astype(int),  # Набор правильных ответов
    validation_split=0.4,  # Этот параметр автоматически выделит часть обучающего набора на валидационные данные. В данном случа 20%
    epochs=40,  # Процесс обучения завершится после 10 эпох
    # Набор данных будет разбит на пакеты (батчи) по 8 элементов набора в каждом.
    batch_size=2
)


Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


<keras.callbacks.History at 0x26447e6f370>

In [19]:
predict = model_class.predict(X_test)




In [20]:
mean_squared_error(predict, y_test)


0.15923554

In [21]:
predictA = model_class.predict(features)
print('Accuracy')
print(accuracy_score(labels, predictA > 0.5))
print('Confusion matrix')
print(confusion_matrix(labels, predictA > 0.5))
print('Precision, Recall, F\n', classification_report(
    labels, predictA > 0.5))


Accuracy
0.8106060606060606
Confusion matrix
[[157 192]
 [  8 699]]
Precision, Recall, F
               precision    recall  f1-score   support

       False       0.95      0.45      0.61       349
        True       0.78      0.99      0.87       707

    accuracy                           0.81      1056
   macro avg       0.87      0.72      0.74      1056
weighted avg       0.84      0.81      0.79      1056



In [16]:
print('Accuracy')
print(accuracy_score(y_test, predict > 0.5))
print('Confusion matrix')
print(confusion_matrix(y_test, predict > 0.5))
print('Precision, Recall, F\n', classification_report(
    y_test, predict > 0.5))


Accuracy
0.7840909090909091
Confusion matrix
[[ 41  57]
 [  0 166]]
Precision, Recall, F
               precision    recall  f1-score   support

       False       1.00      0.42      0.59        98
        True       0.74      1.00      0.85       166

    accuracy                           0.78       264
   macro avg       0.87      0.71      0.72       264
weighted avg       0.84      0.78      0.76       264



In [17]:
model_class.save('../../PipeLines/Classifications/checker_is_discriptor')




INFO:tensorflow:Assets written to: ../../PipeLines/Classifications/checker_is_discriptor\assets


INFO:tensorflow:Assets written to: ../../PipeLines/Classifications/checker_is_discriptor\assets
