In [30]:
import tensorflow as tf
import pandas as pd
import numpy as np
from IPython.display import display, clear_output
from sklearn.model_selection import train_test_split
from sklearn.metrics import balanced_accuracy_score, accuracy_score
from utils import load_vectors, ae_data_generator, rnd_labeled_data_generator

Загружаем векторы

In [2]:
vectors = {
    "mus": "../data/mus_vectors.csv",
    "auto": "../data/auto_vectors.csv"
}

Отзывы об автомобилях примем за исходный домен, а отзывы о музыкальных инструментах - за целевой 

In [3]:
mus_df = load_vectors(vectors["mus"], manually=False)

In [4]:
# Кернель умирает при загрузке большого файла,
# Поэтому грузим вручную
auto_df = load_vectors(vectors["auto"], manually=True)

'Reading line 20473'

Преобразуем векторы из строк в списки. По какой-то причине Series.apply наглухо стопорит кернель

In [5]:
for i in range(mus_df.shape[0]):
    clear_output(True)
    display("{} / {}".format(i+1, mus_df.shape[0]))
    mus_df.at[i, "vectors"] = eval(mus_df.loc[i, "vectors"])

'10261 / 10261'

Переходим к бинарной классификации и балансируем датасеты

In [6]:
def balanced(df):
    ind_to_drop = np.random.choice(df[df["target_bin"] == 1].index, 
                                   size=(df.shape[0] - 2*df[df["target_bin"] == 0].shape[0]), 
                                   replace=False)
    return df.drop(ind_to_drop, axis=0)

In [7]:
mus_df["target_bin"] = (mus_df["overall"] > 3).astype(int)
mus_df.head()

Unnamed: 0,overall,vectors,target_bin
0,5.0,"[[0.26492563, -0.072336048, 1.0834889, -0.1779...",1
1,5.0,"[[1.9994644, 0.73515296, -0.78534812, -1.10929...",1
2,5.0,"[[-0.37103215, 0.10426682, -1.7503167, -0.0827...",1
3,5.0,"[[-2.6337843, 1.8761243, 0.19478494, -0.978528...",1
4,5.0,"[[-1.039957, -1.9383856, 0.789069, -4.2109776,...",1


In [8]:
mus_df.groupby("target_bin").count()

Unnamed: 0_level_0,overall,vectors
target_bin,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1239,1239
1,9022,9022


In [9]:
balanced_mus_df = balanced(mus_df)
balanced_mus_df.groupby("target_bin").count()

Unnamed: 0_level_0,overall,vectors
target_bin,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1239,1239
1,1239,1239


In [10]:
auto_df["target_bin"] = (auto_df["overall"] > 3).astype(int)
auto_df.head()

Unnamed: 0,overall,vectors,target_bin
0,5.0,"[[-0.67167366, 1.4340448, 0.70772099, 0.267104...",1
1,4.0,"[[-3.4646065, 0.90924335, 0.051792312, -0.0250...",1
2,5.0,"[[-0.34402627, -1.2019368, 0.49981338, -1.2384...",1
3,5.0,"[[-1.187706, 0.5451839, 1.5461149, -0.60059845...",1
4,5.0,"[[-0.78727818, 1.2093724, -2.3347111, -2.42030...",1


In [11]:
balanced_auto_df = balanced(auto_df)
balanced_auto_df.groupby("target_bin").count()

Unnamed: 0_level_0,overall,vectors
target_bin,Unnamed: 1_level_1,Unnamed: 2_level_1
0,2578,2578
1,2578,2578


Обучаем LSTM-автоэнкодер на данных из двух доменов

In [12]:
X_train = pd.concat([auto_df["vectors"], mus_df["vectors"]]).values

In [13]:
X_train.shape

(30734,)

In [14]:
# Converting to ndarray
for i in range(X_train.shape[0]):
    clear_output(True)
    display("{} / {}".format(i+1, X_train.shape[0]))
    X_train[i] = np.array([np.array(vec) for vec in X_train[i]])

'30734 / 30734'

In [15]:
np.random.shuffle(X_train)

In [16]:
train_percent = 0.7

Используем pretraining.
Из-за того, что размер последовательностей не фиксирован, а батч для обучения должен быть тензором, приходится обучать модель по одному сэмплу за раз. Zero-padding приводит к MemoryError даже на маленьком сбалансированном датасете.

In [17]:
from keras.models import Sequential
from keras.layers import LSTM, Dense
import numpy as np

data_dim = 128
num_classes = 2
latent_space_dim = 32


# expected input data shape: (batch_size, timesteps, data_dim)
model = Sequential()
model.add(LSTM(latent_space_dim, return_sequences=True,
               input_shape=(None, data_dim)))
model.add(LSTM(128, return_sequences=True))
# model.add(Dense(num_classes, activation='softmax'))

model.compile(loss='mean_squared_error',
              optimizer='adagrad')
model.fit_generator(ae_data_generator(X_train), steps_per_epoch=3000, epochs=10, verbose=1)

Using TensorFlow backend.


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x25cc4555908>

На всякий случай сохраняем модель

In [18]:
# model.save("./lstm_v1.hdf5")

Удаляем последний слой

In [19]:
model.layers.pop()
model.layers

[<keras.layers.recurrent.LSTM at 0x25c57cdbda0>,
 <keras.layers.recurrent.LSTM at 0x25cc3b1d630>]

In [22]:
X_train, X_test, y_train, y_test = train_test_split(balanced_auto_df["vectors"].values, 
                                                    balanced_auto_df["target_bin"].values, test_size=0.3)

Добавляем классификатор со структурой, полученной в LSTM_classifier.ipynb

In [23]:
hidden_size1 = 32
hidden_size2 = 150

model.add(LSTM(hidden_size1, return_sequences=True, input_shape=(None, 128)))
model.add(Dense(1, activation='hard_sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer='adagrad', metrics=["accuracy"])

model.fit_generator(rnd_labeled_data_generator(X_train, y_train), 
                    validation_data=rnd_labeled_data_generator(X_test, y_test), 
                    steps_per_epoch=1500, epochs=2, verbose=1, validation_steps=100)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x25d102b7278>

In [34]:
def predict(X, model):
    predictions = []
    for i, sample in enumerate(X):
        clear_output(True)
        display("Retrieving {}/{}".format(i + 1, X.shape[0]))
        predictions.append(model.predict(np.reshape(sample, [1, *np.array(sample).shape]), steps=1).reshape(-1))
    return predictions

def res_to_bin(y):
    return np.array(list(map(lambda a: np.round(np.mean(a)).astype(int), y)))

def accuracy(bin_res, y_test):
    print("Balanced accuracy:\t\t{}".format(balanced_accuracy_score(y_test, bin_res, adjusted=False)))
    print("Balanced and adjusted accuracy:\t{}".format(balanced_accuracy_score(y_test, bin_res, adjusted=True)))
    print("Unbalanced accuracy:\t\t{}".format(accuracy_score(y_test, bin_res)))
    
def validate(X_test, y_test, model):
    res = predict(X_test, model)
    bin_res = res_to_bin(res)
    accuracy(bin_res, y_test)

Проверка точности на сбалансированных датасетах

In [36]:
print("Source domain test accuracy:")
validate(X_test, y_test, model)


'Retrieving 1547/1547'

Balanced accuracy:		0.7252423093131057
Balanced and adjusted accuracy:	0.4504846186262115
Unbalanced accuracy:		0.7265675500969618


<table style="width:100%">
  <tr>
    <td>Balanced accuracy</td>
    <td>0.7252423093131057</td> 
  </tr>
  <tr>
    <td>Balanced and adjusted accuracy</td>
    <td>0.4504846186262115</td> 
  </tr>
  <tr>
    <td>Unbalanced accuracy</td>
    <td>0.7265675500969618</td> 
  </tr>
</table>

In [37]:
print("\nTarget domain test accuracy:")
validate(balanced_mus_df["vectors"].values, balanced_mus_df["target_bin"].values, model)

'Retrieving 2478/2478'

Balanced accuracy:		0.6872477804681194
Balanced and adjusted accuracy:	0.3744955609362388
Unbalanced accuracy:		0.6872477804681194


<table style="width:100%">
  <tr>
    <td>Balanced accuracy</td>
    <td>0.6872477804681194</td> 
  </tr>
  <tr>
    <td>Balanced and adjusted accuracy</td>
    <td>0.3744955609362388</td> 
  </tr>
  <tr>
    <td>Unbalanced accuracy</td>
    <td>0.6872477804681194</td> 
  </tr>
</table>

Проверка точности на полном датасете целевого домена

In [38]:
print("\nTarget domain test accuracy (full dataset):")
validate(mus_df["vectors"].values, mus_df["target_bin"].values, model)

'Retrieving 10261/10261'

Balanced accuracy:		0.6946184727530891
Balanced and adjusted accuracy:	0.38923694550617816
Unbalanced accuracy:		0.7635708020660754


<table style="width:100%">
  <tr>
    <td>Balanced accuracy</td>
    <td>0.6946184727530891</td> 
  </tr>
  <tr>
    <td>Balanced and adjusted accuracy</td>
    <td>0.38923694550617816</td> 
  </tr>
  <tr>
    <td>Unbalanced accuracy</td>
    <td>0.7635708020660754</td> 
  </tr>
</table>

Сохраняем модель

In [None]:
# model.save("./lstm_dense_v2.hdf5")

Точность достигла хороших значений для обоих доменов.
> Source domain test accuracy: 0.8351154837766662<br>
> Target domain test accuracy: 0.8674490487938746 
