In [1]:
import string

import nltk
import numpy as np
import pandas as pd
import tensorflow as tf
import tqdm
from matplotlib import pyplot as plt
from sklearn.model_selection import StratifiedShuffleSplit

# Fetch the NLTK resources 'punkt' and 'stopwords' (the notebook later calls
# nltk.word_tokenize and nltk.corpus.stopwords), then report the TF version.
for nltk_resource in ('punkt', 'stopwords'):
    nltk.download(nltk_resource)
print(tf.__version__)

## Loading data, EDA, data preparation.

In [2]:
# Training data: fields are separated by ":::" and the file has no header row,
# so the column names are assigned after loading.
train_path = '../input/genre-classification-dataset-imdb/Genre Classification Dataset/train_data.txt'
train_columns = ['id', 'title', 'genre', 'description']
train_df = pd.read_csv(train_path, sep=":::", header=None, engine='python')
train_df.columns = train_columns
train_df.head()

In [3]:
# Held-out data with labels: same ":::"-separated, header-less layout as the
# training file, so the same four column names are assigned after loading.
test_path = '../input/genre-classification-dataset-imdb/Genre Classification Dataset/test_data_solution.txt'
test_columns = ['id', 'title', 'genre', 'description']
test_df = pd.read_csv(test_path, sep=":::", header=None, engine='python')
test_df.columns = test_columns
test_df.head()

In [4]:
data_type = {0: 'train_data', 1: 'test_data'}

# Text summary per dataset: column/null overview and number of distinct genres.
for i, df in enumerate([train_df, test_df]):
    print(f'Dataset "{data_type[i]}" info:')
    # DataFrame.info() writes its report itself and returns None, so wrapping
    # it in print() only appended a stray "None" line to the output.
    df.info()
    print('Number of unique genres: ', df['genre'].nunique())
    print('=====================================\n')

# Side-by-side bar charts: number of rows per genre in each dataset.
plt.figure(figsize=(15,5))
for i, df in enumerate([train_df, test_df]):
    data = df.groupby('genre').count()
    plt.subplot(1, 2, i + 1)
    plt.bar(data.index, data['id'])
    plt.xlabel('Genre')
    plt.ylabel('Number of entries')
    plt.title(f'Genres distribution {data_type[i]}')
    plt.xticks(rotation=90)
plt.show()
del data_type
# Number of distinct genre labels in the training data (used as the size of
# the model's output layer).
NUMBER_OF_GENRES = train_df['genre'].nunique()

In [5]:
# English stop words from the NLTK corpus, held in a set for fast membership tests.
stop_words = set(nltk.corpus.stopwords.words('english'))

def tokenize_text(raw_text: str):
    """Split raw text into lowercase tokens without stop words or punctuation.

    Args:
        raw_text: The text to tokenize.

    Returns:
        A list of lowercase string tokens in their original order, with stop
        words and punctuation-only tokens removed.
    """
    tokens = []
    for token in nltk.word_tokenize(raw_text):
        # Lowercase BEFORE the stop-word lookup: the lookup is case-sensitive,
        # so testing the raw token let capitalized stop words ("The", "And")
        # slip through and end up in the vocabulary as "the", "and".
        token = token.lower()
        if token in stop_words:
            continue
        # Drop tokens made up solely of punctuation characters. A plain
        # `token in string.punctuation` is a substring test, which misses
        # multi-character tokens such as "..." or "--".
        if all(char in string.punctuation for char in token):
            continue
        tokens.append(token)
    return tokens

# Add a 'tokenized' column (list of tokens per description) to both frames.
for frame in (train_df, test_df):
    frame['tokenized'] = frame['description'].apply(tokenize_text)

# Longest training description, measured in tokens.
longest_description = train_df['tokenized'].apply(len).max()
print('Max description length in tokens: ', longest_description)

In [6]:
# Histogram (log-scaled counts) of training description lengths in tokens.
token_counts = train_df['tokenized'].apply(len)
plt.hist(token_counts, bins=30, log=True)
plt.xlabel('Length of sequence')
plt.title('Distribution of the number of tokens in sequences')
plt.show()

In [7]:
# Token count of every training description.
temp = train_df['tokenized'].apply(len)
# Disabled check: how many descriptions are longer than 512 tokens.
# print('Number of entries with description length > 512: ', temp[temp > 512].count())

In [23]:
# Token -> integer id vocabulary, built from the training descriptions only.
# Ids 0 and 1 are reserved for the padding and unknown-token markers.
words_dict = {'<PAD>': 0, '<UNK>': 1}

for seq in tqdm.tqdm(train_df['tokenized']):
    for token in seq:
        # setdefault inserts only unseen tokens; ids are handed out
        # consecutively from 0, so len(words_dict) is always the next free id.
        words_dict.setdefault(token, len(words_dict))

# Next free id == vocabulary size (kept under the original counter name).
index = len(words_dict)
print('\nVocabulary length: ', index)
# Preview a few entries instead of rendering the whole vocabulary as output.
list(words_dict.items())[:10]

In [9]:
# Reverse lookup (integer id -> token) for turning encoded sequences back into text.
inverse_words_dict = dict(zip(words_dict.values(), words_dict.keys()))

In [10]:
# Sanity check of two vocabulary lookups, shown together as a tuple: a cell
# only displays its last expression, so a bare first lookup on its own line
# would be evaluated and silently discarded.
words_dict["<UNK>"], words_dict.get("student")

In [11]:
def decode_text(text):
    """Turn a sequence of token ids back into a space-separated string.

    Ids that are missing from ``inverse_words_dict`` are rendered as '?'.
    """
    decoded_words = (inverse_words_dict.get(token_id, '?') for token_id in text)
    return ' '.join(decoded_words)


def encode_text(text):
    """Tokenize raw text and map every token to its vocabulary id.

    Tokens that are not keys of ``words_dict`` are mapped to the id of the
    '<UNK>' entry.
    """
    return [
        words_dict.get(token, words_dict['<UNK>'])
        for token in tokenize_text(text)
    ]

In [12]:
# Round-trip check on one training description: raw text -> ids -> decoded text.
sample_text = train_df['description'][4]
sample_ids = encode_text(sample_text)
print(sample_text, '\n')
print(sample_ids, '\n')
print(decode_text(sample_ids))
del sample_text, sample_ids

In [13]:
# Give every genre an integer class id, in order of first appearance in the
# training data.
genres_dict = {gen: code for code, gen in enumerate(train_df.genre.unique())}
reversed_genres_dict = {code: gen for gen, code in genres_dict.items()} # for later use during model evaluation

# Integer class ids for both datasets, looked up through the training mapping.
y_train = train_df['genre'].map(genres_dict).values
y_test = test_df['genre'].map(genres_dict).values

### Build the training and test sets

In [14]:
def _tokens_to_ids(tokens):
    """Map a token list to vocabulary ids; unseen tokens get the '<UNK>' id."""
    return [words_dict.get(token, words_dict['<UNK>']) for token in tokens]


x_train = train_df['tokenized'].apply(_tokens_to_ids).values
x_test = test_df['tokenized'].apply(_tokens_to_ids).values

MAX_SEQ_LEN = 256
VOCAB_SIZE = len(words_dict)

# Same padding settings for both sets: fill with the '<PAD>' id at the end of
# each sequence, up to MAX_SEQ_LEN entries.
pad_options = dict(value=words_dict["<PAD>"], padding='post', maxlen=MAX_SEQ_LEN)

x_train = tf.keras.preprocessing.sequence.pad_sequences(x_train, **pad_options)
x_test = tf.keras.preprocessing.sequence.pad_sequences(x_test, **pad_options)

In [15]:
# Carve a stratified 20% validation set out of the held-out data; the
# remaining 80% stays as the test set. StratifiedShuffleSplit is imported
# with the other dependencies in the notebook's import cell.
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
# split() yields (train, test) index arrays: here the "train" part is the 80%
# that remains the test set and the "test" part is the 20% validation set.
train_indices, val_indices = next(sss.split(x_test, y_test))
x_val, y_val = x_test[val_indices], y_test[val_indices]
# NOTE(review): x_test / y_test are overwritten in place, so re-running this
# cell without re-running the padding cell shrinks the test set again.
x_test, y_test = x_test[train_indices], y_test[train_indices]

print('TRAIN DATA SHAPE: ', x_train.shape, y_train.shape)
print('VALIDATION DATA SHAPE: ', x_val.shape, y_val.shape)
print('TEST DATA SHAPE: ', x_test.shape, y_test.shape)

In [16]:
data_type = {0: 'train_data', 1: 'test_data', 2: 'val_data'}
# One bin per integer genre code, centred on the code (edges at -0.5, 0.5, ...).
# The default of 10 equal-width bins does not line up with integer codes and
# merges several genres into one bar when there are more than 10 genres.
genre_bins = [code - 0.5 for code in range(NUMBER_OF_GENRES + 1)]

# Label distribution of the three splits, side by side.
plt.figure(figsize=(18,5))
for i, data in enumerate([y_train, y_test, y_val]):
    plt.subplot(1, 3, i + 1)
    plt.hist(data, bins=genre_bins)
    plt.xlabel('Genre code')
    plt.ylabel('Number of entries')
    plt.title(f'Genres distribution {data_type[i]}')
    plt.xticks(rotation=90)
plt.show()

### Build and train the model

In [17]:
# Width of the token embeddings; also reused as the LSTM unit count.
EMB_SIZE = 16

# Embedding -> two stacked bidirectional LSTMs -> dense head with one output
# per genre.
model = tf.keras.models.Sequential([
    # mask_zero=True makes downstream layers skip id 0, the '<PAD>' id.
    tf.keras.layers.Embedding(VOCAB_SIZE, EMB_SIZE, input_length=MAX_SEQ_LEN, mask_zero=True),
    tf.keras.layers.Bidirectional(
        tf.keras.layers.LSTM(EMB_SIZE, return_sequences=True, dropout=0.5, recurrent_dropout=0.5),
    ),
    tf.keras.layers.Bidirectional(
        tf.keras.layers.LSTM(EMB_SIZE, return_sequences=False, dropout=0.5, recurrent_dropout=0.5)
    ),
    tf.keras.layers.Dense(EMB_SIZE*2, activation='relu'),
    # Softmax, not sigmoid: each sample has exactly one genre id and
    # SparseCategoricalCrossentropy (from_logits=False) expects the outputs to
    # form a probability distribution over the classes.
    tf.keras.layers.Dense(NUMBER_OF_GENRES, activation='softmax'),
])

model.summary()

loss = tf.losses.SparseCategoricalCrossentropy()
optimizer = tf.optimizers.Adam(learning_rate=0.001)
# Writes training logs (with per-epoch histograms) to the "logs" directory.
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir="logs", histogram_freq=1)

model.compile(optimizer=optimizer, loss=loss, metrics=['sparse_categorical_accuracy'])

In [18]:
epochs_counter = 0

!rm -r logs

In [19]:
EPOCHS = 20
BATCH_SIZE = 512

# Train for EPOCHS more epochs, starting from epochs_counter, so re-running
# this cell continues the epoch numbering instead of starting over at 0.
last_epoch = epochs_counter + EPOCHS
model.fit(
    x_train, y_train,
    validation_data=(x_val, y_val),
    batch_size=BATCH_SIZE,
    initial_epoch=epochs_counter,
    epochs=last_epoch,
    callbacks=[tensorboard_callback],
    verbose=1,
)
epochs_counter = last_epoch