### Урок 7. Сверточные нейронные сети для анализа текста
Задание
Берем отызывы за лето (из архива с материалами или предыдущего занятия)
1. Учим conv сеть для классификации
2. Рассмотреть 2-а варианта сеточек
2.1 Инициализировать tf.keras.layers.Embedding предобученными векторами взять к примеру с https://rusvectores.org/ru/
2.2 Инициализировать слой tf.keras.layers.Embedding по умолчанию (ну то есть вам ничего не делать с весами)

Сравнить две архитектуры с предобученными весами и когда tf.keras.layers.Embedding обучается сразу со всей сеточкой, что получилось лучше

In [1]:
max_words = 1000
max_len = 300
num_classes = 5

# Training
epochs = 20
batch_size = 512
print_batch_n = 100

#### Imports, data load

In [2]:
!pip install stop-words -q
!pip install pymorphy2 -q

In [3]:
#!pip install --upgrade xlrd

In [4]:
!pip show xlrd

Name: xlrd
Version: 2.0.1
Summary: Library for developers to extract data from Microsoft Excel (tm) .xls spreadsheet files
Home-page: http://www.python-excel.org/
Author: Chris Withers
Author-email: chris@withers.org
License: BSD
Location: /usr/local/lib/python3.7/dist-packages
Requires: 
Required-by: 


In [5]:
import pandas as pd
import numpy as np
from string import punctuation
from stop_words import get_stop_words
from pymorphy2 import MorphAnalyzer
import re
from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Dropout, Activation, Input, Embedding, Conv1D, GlobalMaxPool1D
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import TensorBoard 
from tensorflow.keras.metrics import categorical_crossentropy
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score 

In [6]:
df = pd.pandas.read_excel("отзывы за лето.xls")

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20659 entries, 0 to 20658
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Rating   20659 non-null  int64 
 1   Content  20656 non-null  object
 2   Date     20659 non-null  object
dtypes: int64(1), object(2)
memory usage: 484.3+ KB


In [8]:
df = df.dropna()

In [9]:
df.Rating.unique()

array([5, 4, 2, 3, 1])

In [10]:
df_train_temp, df_test = train_test_split(df, test_size=0.2, random_state=12)
df_train, df_val = train_test_split(df_train_temp, test_size=0.2, random_state=12)
df_train_temp = None

### Preprocessing

In [11]:
sw = set(get_stop_words("ru"))
exclude = set(punctuation)
morpher = MorphAnalyzer()

def preprocess_text(txt):
    txt = str(txt)
    txt = "".join(c for c in txt if c not in exclude)
    txt = txt.lower()
    txt = re.sub("\sне", "не", txt)
    txt = [morpher.parse(word)[0].normal_form for word in txt.split() if word not in sw]
    return " ".join(txt)

df_train['Content'] = df_train['Content'].apply(preprocess_text)
df_val['Content'] = df_val['Content'].apply(preprocess_text)
df_test['Content'] = df_test['Content'].apply(preprocess_text)

In [12]:
# Rating preprocessing 1 - class 0, 5 - class 4
df_train['Rating'] = df_train['Rating'] - 1
df_val['Rating'] = df_val['Rating'] - 1
df_test['Rating'] = df_test['Rating'] - 1

In [13]:
train_corpus = " ".join(df_train["Content"])
train_corpus = train_corpus.lower()

In [14]:
import nltk
from nltk.tokenize import word_tokenize
nltk.download("punkt")

tokens = word_tokenize(train_corpus)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [15]:
tokens_filtered = [word for word in tokens if word.isalnum()]

In [16]:
from nltk.probability import FreqDist
dist = FreqDist(tokens_filtered)
tokens_filtered_top = [pair[0] for pair in dist.most_common(max_words-1)]

In [17]:
vocabulary = {v: k for k, v in dict(enumerate(tokens_filtered_top, 1)).items()}

In [18]:
def text_to_sequence(text, maxlen):
    result = []
    tokens = word_tokenize(text.lower())
    tokens_filtered = [word for word in tokens if word.isalnum()]
    for word in tokens_filtered:
        if word in vocabulary:
            result.append(vocabulary[word])
    padding = [0]*(maxlen-len(result))
    return padding + result[-maxlen:]

In [19]:
x_train = np.asarray([text_to_sequence(text, max_len) for text in df_train["Content"]], dtype=np.int32)
x_test = np.asarray([text_to_sequence(text, max_len) for text in df_test["Content"]], dtype=np.int32)
x_val = np.asarray([text_to_sequence(text, max_len) for text in df_val["Content"]], dtype=np.int32)

In [20]:
x_train.shape, x_val.shape, x_test.shape, 

((13219, 300), (3305, 300), (4132, 300))

### Net

#### Model

In [21]:
num_classes = 5
y_train = keras.utils.to_categorical(df_train["Rating"], num_classes)
y_val = keras.utils.to_categorical(df_val["Rating"], num_classes)
y_test = keras.utils.to_categorical(df_test["Rating"], num_classes)

In [22]:
model = Sequential()
model.add(Embedding(input_dim=max_words, output_dim=300, input_length=max_len))
model.add(Conv1D(128, 3))
model.add(Activation("relu"))
model.add(GlobalMaxPool1D())
model.add(Dense(10))
model.add(Activation("relu"))
model.add(Dense(num_classes))
model.add(Activation('softmax'))

In [23]:
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

#### Model fit

In [24]:
tensorboard=TensorBoard(log_dir='./logs', write_graph=True, write_images=True)
early_stopping=EarlyStopping(monitor='val_loss')  


history = model.fit(x_train, y_train,
                    batch_size=batch_size,
                    epochs=epochs,
                    verbose=1,
                    validation_data=(x_val, y_val),
                    callbacks=[tensorboard, early_stopping])

Epoch 1/20
Epoch 2/20


#### Model evaluate

In [25]:
score = model.evaluate(x_val, y_val, batch_size=batch_size, verbose=1)
print('\n')
print('Validation score:', score[0])
print('Validation accuracy:', score[1])



Validation score: 0.7213513851165771
Validation accuracy: 0.7576399445533752


In [26]:
score = model.evaluate(x_test, y_test, batch_size=batch_size, verbose=1)
print('\n')
print('Test score:', score[0])
print('Test accuracy:', score[1])



Test score: 0.7316172122955322
Test accuracy: 0.7575024366378784


#### Accuracy on test data

In [27]:
results = model.predict(x_test, batch_size=batch_size, verbose=1)



In [28]:
result = [np.argmax(item) for item in results]

In [29]:
accuracy_score(df_test['Rating'].to_numpy(), result)

0.7575024201355276

### RusVectors net

#### Data downloading & processing

https://blog.keras.io/using-pre-trained-word-embeddings-in-a-keras-model.html

In [30]:
!wget 'http://vectors.nlpl.eu/repository/20/220.zip' -q -N

In [31]:
!unzip -qq -o 220.zip 

In [32]:
def line_conversion(line):
  data_list = line.split()
  data = []
  word = None
  for field in data_list:
    try:
      data.append(float(field))
    except ValueError:
      word = field.split('_')[0]
  return word, data

In [33]:
#data_matrix = pd.read_csv('model.txt', sep=' ')

In [34]:
with open('model.txt', 'r') as f:
  embedding_matrix = None
  word_dict = []
  for line in f:
    word, embedding_vector = line_conversion(line)
    if word in vocabulary.keys():
      word_dict.append(word)
      if embedding_matrix is None:
        embedding_matrix = np.array(embedding_vector)
      else:
        embedding_matrix = np.concatenate((embedding_matrix, embedding_vector))
    if len(word_dict) == max_words:
      break

In [35]:
embedding_matrix=np.reshape(embedding_matrix,(max_words, max_len))

In [36]:
#word_dic = pd.read_csv('word.dic', names=['word', 'position'], sep='\t' , error_bad_lines=False)

#### Model

In [37]:
model = Sequential()
model.add(Embedding(input_dim=embedding_matrix.shape[0], 
                    output_dim=embedding_matrix.shape[1],
                    weights=[embedding_matrix],
                    input_length=max_len,
                    trainable=False))
model.add(Conv1D(128, 3))
model.add(Activation("relu"))
model.add(GlobalMaxPool1D())
model.add(Dense(10))
model.add(Activation("relu"))
model.add(Dense(num_classes))
model.add(Activation('softmax'))

In [38]:
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

#### Model fit

In [39]:
tensorboard=TensorBoard(log_dir='./logs', write_graph=True, write_images=True)
early_stopping=EarlyStopping(monitor='val_loss')  


history = model.fit(x_train, y_train,
                    batch_size=batch_size,
                    epochs=epochs,
                    verbose=1,
                    validation_data=(x_val, y_val),
                    callbacks=[tensorboard, early_stopping])

Epoch 1/20
Epoch 2/20


#### Model evaluate

In [40]:
score = model.evaluate(x_val, y_val, batch_size=batch_size, verbose=1)
print('\n')
print('Validation score:', score[0])
print('Validation accuracy:', score[1])



Validation score: 0.7593154311180115
Validation accuracy: 0.7164901494979858


In [41]:
score = model.evaluate(x_test, y_test, batch_size=batch_size, verbose=1)
print('\n')
print('Test score:', score[0])
print('Test accuracy:', score[1])



Test score: 0.7747810482978821
Test accuracy: 0.7212004065513611
