In [1]:
import numpy as np
import pandas as pd
import spacy
# Load the spaCy pipeline "ru_core_news_sm" once; later cells call
# nlp(text).vector to turn each phrase into a fixed-size feature vector.
nlp = spacy.load("ru_core_news_sm")
import tensorflow as tf
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_curve, RocCurveDisplay, auc, mean_squared_error
from sklearn.model_selection import train_test_split
from json import load


In [2]:
# Negative examples: one phrase per line of a plain-text file.
with open('../../Data/Words/word_pairs_not_disc.txt', 'r', encoding='utf-8') as words_file:
    # Drop the newline characters so only the phrase text is kept.
    not_discriptors = [row.replace('\n', '') for row in words_file]

# Event types parsed from JSON (not referenced again in the visible cells).
with open('../../Data/Events/type_of_events.json', 'r', encoding='utf-8') as events_file:
    events = load(events_file)

# Positive examples parsed from JSON; the following cell iterates this as a
# mapping whose values are themselves mappings of phrase lists.
with open('../../Data/Datasets/data.json', 'r', encoding='utf-8') as dataset_file:
    discriptors = load(dataset_file)


In [3]:
# Build the training table: one spaCy document vector per phrase, plus a
# boolean flag telling whether the phrase is a descriptor.
# NOTE: the key 'expession' (sic) is read by the next cell, so its spelling
# is intentionally left unchanged here.
dict_of_disc = {'expession': [], 'is_discr': []}

# Positive class: every phrase found in the nested descriptor mapping.
# The groups are walked directly instead of being flattened with
# sum(..., []), which re-copies the accumulated list on every addition.
# Assumes each inner value is a list of phrase strings -- TODO confirm
# against the JSON schema.
for subject in discriptors.values():
    for phrase_group in subject.values():
        for phrase in phrase_group:
            dict_of_disc['expession'].append(nlp(phrase).vector)
            dict_of_disc['is_discr'].append(True)

# Negative class: phrases from the "not a descriptor" word list.
for phrase in not_discriptors:
    dict_of_disc['expession'].append(nlp(phrase).vector)
    dict_of_disc['is_discr'].append(False)



In [4]:
# Feature matrix: one row per phrase, built from the collected spaCy vectors.
features = np.array(dict_of_disc['expession'])
# Integer targets: True -> 1 (descriptor), False -> 0 (not a descriptor).
labels = np.array([int(flag) for flag in dict_of_disc['is_discr']])


In [5]:
# Hold out a test set. No test_size is passed, so scikit-learn's default
# proportion applies (267 of 1065 samples in the recorded run).
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    random_state=71,  # fixed seed so the split itself is reproducible
)


In [6]:
# Fully connected network that scores a phrase vector with a single number.
# `tf` is already imported in the first cell, so it is not re-imported here.

# Every layer width is a multiple of the input dimensionality.
n_features = X_train.shape[1]

# NOTE(review): the first layer uses a softmax activation, which is unusual
# for a hidden layer (its outputs are normalised to sum to 1); 'relu' may
# have been intended -- confirm before changing, since it alters the model.
# NOTE(review): the output layer has no activation, so predictions are
# unbounded real values rather than probabilities; the thresholds used in
# later cells are applied to these raw scores.
model_class = tf.keras.models.Sequential([
    tf.keras.layers.Dense(
        int(n_features * 1.5),
        activation='softmax',
        input_shape=(n_features,),
        name='input',
    ),
    # hidden_layer_2 .. hidden_layer_6: widen to 3x the input size, then
    # narrow down to half of it.
    *(
        tf.keras.layers.Dense(
            int(n_features * factor),
            activation='relu',
            name=f'hidden_layer_{position}',
        )
        for position, factor in enumerate((2, 3, 2, 1.5, 0.5), start=2)
    ),
    tf.keras.layers.Dense(1, name='output'),
])
model_class.summary()


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input (Dense)               (None, 144)               13968     
                                                                 
 hidden_layer_2 (Dense)      (None, 192)               27840     
                                                                 
 hidden_layer_3 (Dense)      (None, 288)               55584     
                                                                 
 hidden_layer_4 (Dense)      (None, 192)               55488     
                                                                 
 hidden_layer_5 (Dense)      (None, 144)               27792     
                                                                 
 hidden_layer_6 (Dense)      (None, 48)                6960      
                                                                 
 output (Dense)              (None, 1)                 4

In [7]:
# Configure training.
# NOTE(review): the target is a 0/1 label but the loss is mean squared
# error, i.e. the network is fitted as a regressor on the labels.
model_class.compile(
    optimizer='Adam',  # optimizer, referenced by its registered name
    loss='mse',  # loss function
    metrics=[
        # An object that has a registered name can be referenced by it.
        'mse',
        # tf.keras.metrics.Precision()
    ],
)


In [8]:
# Train the network. The History object returned by fit() is the cell's
# displayed result.
model_class.fit(
    X_train,  # training inputs
    y_train,  # expected labels for the training inputs
    validation_split=0.4,  # hold out 40% of the training data for validation
    epochs=40,  # training stops after 40 epochs
    batch_size=2,  # the data is fed in mini-batches of 2 samples
)


Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


<keras.callbacks.History at 0x1493dca1ca0>

In [9]:
# Raw model scores for the held-out test set; later cells threshold these
# values to obtain class labels.
predict = model_class.predict(X_test)




In [10]:
# Test-set MSE of the raw scores. scikit-learn expects (y_true, y_pred);
# the value is the same either way for unweighted MSE, but the documented
# order keeps the call correct if weights or other metrics are added.
mean_squared_error(y_test, predict)


0.04082511977680413

In [11]:
# Score every sample and report classification metrics at a 0.5 cut-off.
# NOTE(review): `features` contains the rows the model was trained on, so
# these numbers are optimistic; the held-out evaluation is in the next cell.
predictA = model_class.predict(features)
predictA_hard = predictA > 0.5  # boolean class decisions

for title, metric in (('Accuracy', accuracy_score),
                      ('Confusion matrix', confusion_matrix)):
    print(title)
    print(metric(labels, predictA_hard))

report_all = classification_report(labels, predictA_hard)
print('Precision, Recall, F\n', report_all)


Accuracy
0.9690140845070423
Confusion matrix
[[338  20]
 [ 13 694]]
Precision, Recall, F
               precision    recall  f1-score   support

           0       0.96      0.94      0.95       358
           1       0.97      0.98      0.98       707

    accuracy                           0.97      1065
   macro avg       0.97      0.96      0.97      1065
weighted avg       0.97      0.97      0.97      1065



In [12]:
# Held-out evaluation with a strict 0.98 cut-off on the raw scores.
predict_hard = predict > 0.98  # boolean class decisions for the test set

for title, metric in (('Accuracy', accuracy_score),
                      ('Confusion matrix', confusion_matrix)):
    print(title)
    print(metric(y_test, predict_hard))

report_test = classification_report(y_test, predict_hard)
print('Precision, Recall, F\n', report_test)


Accuracy
0.9438202247191011
Confusion matrix
[[ 92   4]
 [ 11 160]]
Precision, Recall, F
               precision    recall  f1-score   support

           0       0.89      0.96      0.92        96
           1       0.98      0.94      0.96       171

    accuracy                           0.94       267
   macro avg       0.93      0.95      0.94       267
weighted avg       0.95      0.94      0.94       267



In [13]:
# Confusion-matrix cells for the test set at a 0.9 cut-off, flattened
# row by row: true negatives, false positives, false negatives, true positives.
cm_at_09 = confusion_matrix(y_test, predict > 0.9)
tn, fp, fn, tp = cm_at_09.ravel()
(tn, fp, fn, tp)


(92, 4, 8, 163)

In [14]:
# Spot check: score one hand-written phrase (a batch holding a single vector).
phrase_vector = nlp('рекомендации, чтобы поддерживать внутреннюю').vector
model_class.predict(np.array([phrase_vector]))




array([[0.9099933]], dtype=float32)

In [15]:
# Spot check: the same phrase without its first word.
phrase_vector = nlp('чтобы поддерживать внутреннюю').vector
model_class.predict(np.array([phrase_vector]))




array([[0.4815378]], dtype=float32)

In [16]:
# Spot check: a variant of the phrase that ends with a noun.
phrase_vector = nlp('поддерживать внутреннюю структуру').vector
model_class.predict(np.array([phrase_vector]))




array([[0.99752]], dtype=float32)

In [18]:
# Persist the trained classifier to an .h5 file under the pipelines directory.
model_path = '../../PipeLines/Classifications/checker_is_discriptor_spacy_vectorize.h5'
model_class.save(model_path)
