Заглушка

Тема “Свёртки”

Берем отызывы за лето (из архива с материалами или предыдущего занятия)
1. Учим conv сеть для классификации
2. Рассмотреть 2-а варианта сеточек
3. Инициализировать tf.keras.layers.Embedding предобученными векторами взять к примеру с https://rusvectores.org/ru/
4. Инициализировать слой tf.keras.layers.Embedding по умолчанию (ну то есть вам ничего не делать с весами)
Сравнить две архитектуры с предобученными весами и когда tf.keras.layers.Embedding обучается сразу со всей сеточкой, что получилось лучше



In [8]:
import pandas as pd

df = pd.read_excel("sample_data/отзывы_за_лето.xls")

In [9]:
df.iloc[:5]

Unnamed: 0,Rating,Content,Date
0,5,It just works!,2017-08-14
1,4,В целом удобноное приложение...из минусов хотя...,2017-08-14
2,5,Отлично все,2017-08-14
3,5,Стал зависать на 1% работы антивируса. Дальше ...,2017-08-14
4,5,"Очень удобно, работает быстро.",2017-08-14


In [10]:
! pip install pymorphy2 stop_words

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [11]:
from string import punctuation
from stop_words import get_stop_words
from pymorphy2 import MorphAnalyzer
from functools import lru_cache
from multiprocessing import Pool
import numpy as np
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm
import re

sw = set(get_stop_words("ru"))
exclude = set(punctuation)
morpher = MorphAnalyzer()

regex = re.compile("[А-Яа-я0-1:=!\)\()A-z\_\%/|]+")

def words_only(text, regex=regex):
    try:
        return regex.findall(text)
    except:
        return []

def lemmatize(text, pymorphy=morpher):
    try:
        return " ".join([pymorphy.parse(word)[0].normal_form for word in text if word not in sw])
    except:
        return " "  

def clean_text(text):
    return lemmatize(words_only(text))

In [12]:
with Pool(8) as p:
    lemmas = list(tqdm(p.imap(clean_text, df['Content']), total=len(df)))

    
df['lemmas'] = lemmas
df.head()

  0%|          | 0/20659 [00:00<?, ?it/s]

Unnamed: 0,Rating,Content,Date,lemmas
0,5,It just works!,2017-08-14,it just works!
1,4,В целом удобноное приложение...из минусов хотя...,2017-08-14,в целое удобноной приложение минус хотеть боль...
2,5,Отлично все,2017-08-14,отлично
3,5,Стал зависать на 1% работы антивируса. Дальше ...,2017-08-14,стать зависать 1% работа антивирус далёкий ран...
4,5,"Очень удобно, работает быстро.",2017-08-14,очень удобно работать быстро


In [13]:
X = df.lemmas.tolist()
y = df.Rating.tolist()

X, y = np.array(X), np.array(y)

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3)
print ("total train examples %s" % len(y_train))
print ("total test examples %s" % len(y_test))

total train examples 14461
total test examples 6198


In [14]:
train_corpus = " ".join(X_train)

In [15]:
import nltk
from nltk.tokenize import word_tokenize
nltk.download("punkt")

tokens = word_tokenize(train_corpus)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [16]:
max_words = 100000
max_len = 300
num_classes = 1

# Training
epochs = 20
batch_size = 512
print_batch_n = 100

In [17]:
tokens_filtered = [word for word in tokens if word.isalnum()]

In [18]:
from nltk.probability import FreqDist
dist = FreqDist(tokens_filtered)
tokens_filtered_top = [pair[0] for pair in dist.most_common(max_words-1)]

In [19]:
tokens_filtered_top[:10]

['приложение',
 'удобно',
 'очень',
 'работать',
 'удобный',
 'всё',
 'вс',
 'отлично',
 'я',
 'спасибо']

In [20]:
vocabulary = {v: k for k, v in dict(enumerate(tokens_filtered_top, 1)).items()}

In [21]:
import numpy as np

def text_to_sequence(text, maxlen):
    result = []
    tokens = word_tokenize(text.lower())
    tokens_filtered = [word for word in tokens if word.isalnum()]
    for word in tokens_filtered:
        if word in vocabulary:
            result.append(vocabulary[word])
    padding = [0]*(maxlen-len(result))
    return padding + result[-maxlen:]

In [22]:
X_train = np.asarray([text_to_sequence(text, max_len) for text in X_train], dtype=np.int32)
X_test = np.asarray([text_to_sequence(text, max_len) for text in X_test], dtype=np.int32)

In [23]:
X_train.shape, X_test.shape

((14461, 300), (6198, 300))

In [24]:
import numpy as np
import tensorflow as tf
import keras
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Activation, Input, Embedding, Conv1D, GlobalMaxPool1D, Flatten
from keras.callbacks import TensorBoard 
from keras.losses import CategoricalCrossentropy
from keras.callbacks import EarlyStopping  

In [25]:
num_classes = 6
y_train = keras.utils.to_categorical(y_train, num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes)

In [26]:
model = Sequential()
model.add(Embedding(input_dim=max_words, output_dim=max_len, input_length=max_len))
model.add(Conv1D(max_len, 3))
model.add(Activation("relu"))
model.add(GlobalMaxPool1D())
model.add(Flatten())
model.add(Dense(10))
model.add(Activation("relu"))
model.add(Flatten())
model.add(Dense(num_classes))
model.add(Activation('softmax'))

In [27]:
model.compile(loss='CategoricalCrossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [28]:
tensorboard=TensorBoard(log_dir='./logs', write_graph=True, write_images=True)
early_stopping=EarlyStopping(monitor='val_loss')  


history = model.fit(X_train, y_train,
                    batch_size=batch_size,
                    epochs=epochs,
                    verbose=1,
                    validation_split=0.1,
                    callbacks=[tensorboard, early_stopping])

Epoch 1/20
Epoch 2/20


In [29]:
score = model.evaluate(X_test, y_test, batch_size=batch_size, verbose=1)
print('\n')
print('Test score:', score[0])
print('Test accuracy:', score[1])



Test score: 0.7390893697738647
Test accuracy: 0.7683123350143433


In [35]:
!pip install wget
!pip install gensim --upgrade

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [37]:
!wget http://vectors.nlpl.eu/repository/11/180.zip
!unzip 180.zip

--2023-01-13 14:58:36--  http://vectors.nlpl.eu/repository/11/180.zip
Resolving vectors.nlpl.eu (vectors.nlpl.eu)... 129.240.189.181
Connecting to vectors.nlpl.eu (vectors.nlpl.eu)|129.240.189.181|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 484452285 (462M) [application/zip]
Saving to: ‘180.zip.1’


2023-01-13 14:58:40 (99.8 MB/s) - ‘180.zip.1’ saved [484452285/484452285]

Archive:  180.zip
  inflating: README                  
  inflating: meta.json               
  inflating: model.bin               
  inflating: model.txt               


In [38]:
model = gensim.models.KeyedVectors.load_word2vec_format('model.bin', binary=True)

In [39]:
for n in model.most_similar(positive=[u'тушение_NOUN']):
    print (n[0], n[1])

тушений_NOUN 0.7801563143730164
противопожарный_ADJ 0.635464608669281
лесопожарный_ADJ 0.6165294647216797
пожаротушение_NOUN 0.6065576672554016
возгорание_NOUN 0.6054503917694092
пожар_NOUN 0.5795326232910156
задымление_NOUN 0.568307101726532
пожарный_NOUN 0.5492812395095825
загорание_NOUN 0.5449301600456238
пожароопасный_ADJ 0.5265752077102661


In [40]:
n = model.vectors

In [41]:
n.shape

(189193, 300)

In [42]:
model = Sequential()
model.add(Embedding(input_dim=189193, output_dim=300, input_length=max_len, weights=[n]))
model.add(Conv1D(300, 3))
model.add(Activation("relu"))
model.add(GlobalMaxPool1D())
model.add(Flatten())
model.add(Dense(10))
model.add(Activation("relu"))
model.add(Flatten())
model.add(Dense(num_classes))
model.add(Activation('softmax'))

In [43]:
model.compile(loss='CategoricalCrossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [44]:
tensorboard=TensorBoard(log_dir='./logs', write_graph=True, write_images=True)
early_stopping=EarlyStopping(monitor='val_loss')  


history = model.fit(X_train, y_train,
                    batch_size=batch_size,
                    epochs=epochs,
                    verbose=1,
                    validation_split=0.1,
                    callbacks=[tensorboard, early_stopping])

Epoch 1/20
Epoch 2/20


In [45]:
score = model.evaluate(X_test, y_test, batch_size=batch_size, verbose=1)
print('\n')
print('Test score:', score[0])
print('Test accuracy:', score[1])



Test score: 0.7144147157669067
Test accuracy: 0.7592771649360657


Предобученная модель показала результат ниже