In [1]:
import numpy as np
import pandas as pd
import torch
import transformers as ppb
import tensorflow as tf
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_curve, RocCurveDisplay, auc, mean_squared_error
from sklearn.model_selection import train_test_split
from json import load


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
not_discriptors = list()
with open('../../Data/Words/word_pairs_not_disc.txt', 'r', encoding='utf-8') as txt:
    for i in txt:
        not_discriptors.append(i.replace('\n', ''))

with open('../../Data/Events/type_of_events.json', 'r', encoding='utf-8') as js:
    events = load(js)

with open('../../Data/Datasets/data.json', 'r', encoding='utf-8') as js:
    discriptors = load(js)


In [3]:
dict_of_disc = {'expession': [], 'is_discr': []}
for subject in discriptors.values():
    for d in sum(subject.values(), []):
        dict_of_disc['expession'].append(d)
        dict_of_disc['is_discr'].append(True)

for subject in not_discriptors:
    dict_of_disc['expession'].append(subject)
    dict_of_disc['is_discr'].append(False)



In [4]:
df = pd.DataFrame(dict_of_disc)
df

Unnamed: 0,expession,is_discr
0,основные положения теории истории,True
1,основные положения методологии истории,True
2,место истории в системе гуманитарного знания,True
3,закономерности исторического процесса,True
4,этапы исторического процесса,True
...,...,...
1060,международные обмены,False
1061,международных стажировок,False
1062,темпа жизни,False
1063,увеличение потока,False


In [5]:
# model_class, tokenizer_class, pretrained_weights = (
#     ppb.DistilBertModel, ppb.DistilBertTokenizer, 'distilbert-base-uncased')

# Хотите BERT вместо distilBERT? Раскомментируйте следующую строку:
model_class, tokenizer_class, pretrained_weights = (ppb.BertModel, ppb.BertTokenizer, 'bert-base-uncased')

# Загрузка предобученной модели/токенизатора
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [6]:
tokenized = df['expession'].apply((lambda x: tokenizer.encode(x, add_special_tokens=True)))
padded = [tokenizer.encode(i, add_special_tokens=True) for i in df['expession'].to_list()]
ml = len(max(padded, key=len))
for i in padded:
    for _ in range(ml-len(i)):
        i.append(0)

padded = np.array(padded)

In [7]:
input_ids = torch.tensor(padded)

with torch.no_grad():
    
    last_hidden_states = model(input_ids)


In [8]:
features = last_hidden_states[0][:,0,:].numpy()


In [9]:
labels = df['is_discr']
X_train, X_test, y_train, y_test = train_test_split(features, labels, random_state=71)


In [10]:
import tensorflow as tf

model_class = tf.keras.models.Sequential()
model_class.add(tf.keras.layers.Dense(int(X_train[0].shape[0]*1.5), activation='softmax',
                                      input_shape=(X_train[0].shape[0],), name='input'))
model_class.add(tf.keras.layers.Dense(
    int(X_train[0].shape[0]*2), activation='relu', name='hidden_layer_2'))
model_class.add(tf.keras.layers.Dense(
    int(X_train[0].shape[0]*1.5), activation='relu', name='hidden_layer_3'))
model_class.add(tf.keras.layers.Dense(
    int(X_train[0].shape[0]*0.5), activation='relu', name='hidden_layer_4'))
model_class.add(tf.keras.layers.Dense(1, name='output', activation='sigmoid'))
model_class.summary()


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input (Dense)               (None, 1152)              885888    
                                                                 
 hidden_layer_2 (Dense)      (None, 1536)              1771008   
                                                                 
 hidden_layer_3 (Dense)      (None, 1152)              1770624   
                                                                 
 hidden_layer_4 (Dense)      (None, 384)               442752    
                                                                 
 output (Dense)              (None, 1)                 385       
                                                                 
Total params: 4,870,657
Trainable params: 4,870,657
Non-trainable params: 0
_________________________________________________________________


In [11]:
model_class.compile(
    loss=tf.keras.metrics.binary_crossentropy, #  'mse',  # Функция потерь
    optimizer='Adam',  # Оптимизатор
    metrics=[  # Метрики
        'mse', 
        # tf.keras.metrics.Precision()
        # tf.keras.metrics.FalsePositives
    ]
)


In [13]:
model_class.fit(
    X_train,  # Набор входных данных
    y_train.to_numpy().astype(int),  # Набор правильных ответов
    validation_split=0.4,  # Этот параметр автоматически выделит часть обучающего набора на валидационные данные. В данном случа 20%
    epochs=10,  # Процесс обучения завершится после 10 эпох
    # Набор данных будет разбит на пакеты (батчи) по 8 элементов набора в каждом.
    batch_size=2
)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1ba1cfd02b0>

In [14]:
predict = model_class.predict(X_test)




In [15]:
mean_squared_error(predict, y_test)


0.15022235

In [26]:
t = 0.65
predictA = model_class.predict(features)
print('Accuracy')
print(accuracy_score(labels, predictA > t))
print('Confusion matrix')
print(confusion_matrix(labels, predictA > t))
print('Precision, Recall, F\n', classification_report(
    labels, predictA > t))


Accuracy
0.815962441314554
Confusion matrix
[[221 137]
 [ 59 648]]
Precision, Recall, F
               precision    recall  f1-score   support

       False       0.79      0.62      0.69       358
        True       0.83      0.92      0.87       707

    accuracy                           0.82      1065
   macro avg       0.81      0.77      0.78      1065
weighted avg       0.81      0.82      0.81      1065



In [None]:
r = np.append(X_train, X_test, axis=0)
rr = np.append(y_train, y_test)

In [17]:
print('Accuracy')
print(accuracy_score(rr, r > 0.5))
print('Confusion matrix')
print(confusion_matrix(rr, r > 0.5))
print('Precision, Recall, F\n', classification_report(
    rr, r > 0.5))


Accuracy
0.7827715355805244
Confusion matrix
[[ 43  53]
 [  5 166]]
Precision, Recall, F
               precision    recall  f1-score   support

       False       0.90      0.45      0.60        96
        True       0.76      0.97      0.85       171

    accuracy                           0.78       267
   macro avg       0.83      0.71      0.72       267
weighted avg       0.81      0.78      0.76       267



In [None]:
# model_class.save('../../PipeLines/Classifications/checker_is_discriptor')




INFO:tensorflow:Assets written to: ../../PipeLines/Classifications/checker_is_discriptor\assets


INFO:tensorflow:Assets written to: ../../PipeLines/Classifications/checker_is_discriptor\assets
