# Введение в искусственные нейронные сети
# Урок 5. Рекуррентные нейронные сети

## Практическое задание

<ol>
    Попробуйте обучить нейронную сеть LSTM на любом другом датасете (любимый временной ряд, текст на русском (другом языке) как генератор или классификатор, или прилагаемый набор airline-passengers - пасажиропоток для авиалиний). Опишите, какой результата вы получили? Что помогло вам улучшить ее точность?

</ol>

Данные возьмем с соревнования: https://www.kaggle.com/competitions/nbme-score-clinical-patient-notes

In [1]:
import pandas as pd
import numpy as np
import itertools
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings("ignore")

In [2]:
import tensorflow as tf
from keras.preprocessing import sequence, text
from keras.models import Sequential
from keras.layers import Dense, Embedding, Dropout, LSTM
from keras.callbacks import EarlyStopping
from keras import regularizers

In [3]:
df_train = pd.read_csv('train.csv')

In [4]:
df_train.head()

Unnamed: 0,id,case_num,pn_num,feature_num,annotation,location
0,00016_000,0,16,0,['dad with recent heart attcak'],['696 724']
1,00016_001,0,16,1,"['mom with ""thyroid disease']",['668 693']
2,00016_002,0,16,2,['chest pressure'],['203 217']
3,00016_003,0,16,3,"['intermittent episodes', 'episode']","['70 91', '176 183']"
4,00016_004,0,16,4,['felt as if he were going to pass out'],['222 258']


In [5]:
list_annotation = []
for index, row in df_train.iterrows():
    list_annotation.append(row['annotation'][2:-2])

In [6]:
df_train['annotation'] = list_annotation

In [7]:
df_train.case_num.value_counts()

5    1800
8    1800
2    1700
9    1700
3    1600
0    1300
1    1300
6    1200
4    1000
7     900
Name: case_num, dtype: int64

In [8]:
MAX_NB_WORDS = 100000
MAX_SEQUENCE_LENGTH = 250
tokenizer = text.Tokenizer(num_words=MAX_NB_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True)
tokenizer.fit_on_texts(df_train['annotation'].values)
word_index = tokenizer.word_index

In [9]:
X = tokenizer.texts_to_sequences(df_train['annotation'].values)
X = sequence.pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH)
X.shape

(14300, 250)

In [10]:
y = pd.get_dummies(df_train['case_num']).values
y.shape

(14300, 10)

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.10, random_state = 42)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(12870, 250) (12870, 10)
(1430, 250) (1430, 10)


In [12]:
callback = EarlyStopping( monitor='val_accuracy', patience=10) 

In [13]:
total_words = len(tokenizer.word_index) + 1

In [14]:
column_name_result = ['index', 'iteration', 'loss_name', 'optimizer_name', 
                      'valid_accuracy', 'test_accuracy']
df_result = pd.DataFrame(columns=column_name_result)

In [15]:
def creation_model():
    model = Sequential()
    model.add(Embedding(total_words, 1024, input_length=X.shape[1]))  
    model.add(LSTM(128))
    model.add(Dropout(0.2))
    model.add(Dense(total_words / 2, activation='relu', 
                    kernel_regularizer=regularizers.l2(0.1)))
    model.add(Dropout(0.2))
    model.add(Dense(total_words / 4, activation='relu', 
                    kernel_regularizer=regularizers.l2(0.01)))
    model.add(Dropout(0.2))
    model.add(Dense(total_words / 10, activation='relu', 
                    kernel_regularizer=regularizers.l2(0.1)))
    model.add(Dropout(0.2))
    model.add(Dense(total_words / 25, activation='relu',
                    kernel_regularizer=regularizers.l2(0.01)))
    model.add(Dense(10, activation='softmax'))
    return model

In [16]:
def modeling(loss_func, optimizer_func, epoch, batch, train_features,
             train_labels, df_result, count, test_features, test_labels):
  for iter in [1, 2, 3]:
    model = creation_model()
    model.compile(optimizer=optimizer_func,
                  loss=loss_func,
                  metrics=['accuracy'])
  
    with tf.device("GPU:0"):
      history = model.fit(train_features, 
                    train_labels, 
                    epochs=epoch, 
                    batch_size=batch,
                    validation_split=0.2, 
                    verbose=2)
    
    predictions = model.predict(X_test)
    preds = np.argmax(predictions, axis=1)
    test_real = np.argmax(y_test, axis=1)
    test_accuracy = accuracy_score(test_real, preds)

    dict_result = dict()
    dict_result['index'] = count
    dict_result['iteration'] = int(iter)
    dict_result['loss_name'] = loss_func
    dict_result['optimizer_name'] = optimizer_func
    dict_result['valid_accuracy'] = history.history['val_accuracy'][-1]
    dict_result['test_accuracy'] = test_accuracy
    print(dict_result)
    df_result = pd.concat([df_result, pd.DataFrame(dict_result, index=range(count, count+1))])
    count += 1
  return df_result, count

In [17]:
%%time
epoch = 30
batch = 128
count = 0

optimizer = ['adam', 'nadam', 'sgd', 'rmsprop']  
loss = ['categorical_crossentropy', 'kullback_leibler_divergence']

for loss, optimize in itertools.product(loss, optimizer):
    df_result, count = modeling(loss, optimize, epoch, batch, 
                              X_train, y_train, df_result, count, 
                              X_test, y_test)

Epoch 1/30
81/81 - 13s - loss: 21.0248 - accuracy: 0.1197 - val_loss: 2.8537 - val_accuracy: 0.1309 - 13s/epoch - 156ms/step
Epoch 2/30
81/81 - 9s - loss: 2.3941 - accuracy: 0.1246 - val_loss: 2.2824 - val_accuracy: 0.1309 - 9s/epoch - 106ms/step
Epoch 3/30
81/81 - 9s - loss: 2.2802 - accuracy: 0.1223 - val_loss: 2.2760 - val_accuracy: 0.1282 - 9s/epoch - 107ms/step
Epoch 4/30
81/81 - 9s - loss: 2.2783 - accuracy: 0.1219 - val_loss: 2.2751 - val_accuracy: 0.1309 - 9s/epoch - 107ms/step
Epoch 5/30
81/81 - 9s - loss: 2.2783 - accuracy: 0.1222 - val_loss: 2.2753 - val_accuracy: 0.1282 - 9s/epoch - 107ms/step
Epoch 6/30
81/81 - 9s - loss: 2.2781 - accuracy: 0.1223 - val_loss: 2.2752 - val_accuracy: 0.1309 - 9s/epoch - 107ms/step
Epoch 7/30
81/81 - 9s - loss: 2.2780 - accuracy: 0.1246 - val_loss: 2.2751 - val_accuracy: 0.1309 - 9s/epoch - 107ms/step
Epoch 8/30
81/81 - 9s - loss: 2.2779 - accuracy: 0.1214 - val_loss: 2.2752 - val_accuracy: 0.1309 - 9s/epoch - 108ms/step
Epoch 9/30
81/81 - 9s

In [18]:
df_result

Unnamed: 0,index,iteration,loss_name,optimizer_name,valid_accuracy,test_accuracy
0,0,1,categorical_crossentropy,adam,0.130925,0.125874
1,1,2,categorical_crossentropy,adam,0.130925,0.125874
2,2,3,categorical_crossentropy,adam,0.128205,0.130769
3,3,1,categorical_crossentropy,nadam,0.309246,0.300699
4,4,2,categorical_crossentropy,nadam,0.321678,0.309091
5,5,3,categorical_crossentropy,nadam,0.278166,0.284615
6,6,1,categorical_crossentropy,sgd,0.130925,0.125874
7,7,2,categorical_crossentropy,sgd,0.130925,0.125874
8,8,3,categorical_crossentropy,sgd,0.130925,0.125874
9,9,1,categorical_crossentropy,rmsprop,0.327117,0.311888


Выводы:
1. Наилучший результат по значению accuracy на тесте показала комбинация функции потерь "categorical_crossentropy" и оптимизатора "rmsprop"
2. Наиболее высокие результаты показывают модели, в которых в качестве оптимизатора взяты "rmsprop" или "nadam"
3. Выбор функции потерь между "categorical_crossentropy" и "kullback_leibler_divergence" не оказывает сильного влияния на результат для данной задачи, т.к. результаты для обеих функций потерь схожи