In [None]:
import tensorflow as tf
from tokenizers import Tokenizer, models, pre_tokenizers, trainers, Regex
import tokenizers
import pandas as pd
import gensim
import ast

In [None]:
# Label stored in the `model` column of the results table written at the end of the notebook.
model_name = 'kim_cnn_more_layers_word_tokenizer+w2v_embeddings'

# Embeddings

In [None]:
# Load pretrained word vectors from a word2vec-format file; the path is relative to the
# notebook's working directory. The object's `vector_size` later sets the width of the
# embedding matrix built by gather_embeddings.
embeddings = gensim.models.KeyedVectors.load_word2vec_format('ubercorpus.lowercased.tokenized.word2vec.300d')

# Load data

In [None]:
# Read only the four columns this notebook uses from the processed reviews table.
df = pd.read_csv(
    '../analysis/processed_data.csv',
    usecols=[
        'review_translate_sentences_tokens',
        'dataset_name',
        'rating',
        'translated',
    ],
)

In [None]:
def str_to_list(x):
    """Parse the text form of a Python literal into a list.

    Parameters
    ----------
    x : str
        Text such as ``"['a', 'b']"``.

    Returns
    -------
    list or None
        ``list(...)`` of the evaluated literal, or ``None`` when ``x`` is not a
        valid literal or the literal cannot be converted to a list.
    """
    try:
        return list(ast.literal_eval(x))
    # Only parse/conversion failures mean "not a list". A bare ``except`` would also
    # swallow KeyboardInterrupt / SystemExit and make a long apply() uninterruptible.
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        return None


def string_to_list_dataframe(df):
    """Convert columns that hold stringified lists into real Python lists.

    A column is treated as list-like when more than 80% of its values, cast to
    ``str``, start with ``[`` and end with ``]``. Every such column is parsed
    with ``str_to_list``; rows whose value could not be parsed are dropped.

    Parameters
    ----------
    df : pandas.DataFrame

    Returns
    -------
    pandas.DataFrame
        A new frame. Surviving rows keep their original index labels (the
        index is not reset). The frame passed in is not modified.
    """
    # Work on a copy: assigning parsed columns directly would rewrite the caller's
    # frame, and after the first filter would assign into a slice of it.
    df = df.copy()

    # Detect candidate columns up front, before any value is parsed.
    columns_w_lists = []
    for column in df.columns.tolist():
        as_text = df[column].astype(str)
        looks_like_list = as_text.str.startswith('[') & as_text.str.endswith(']')
        if looks_like_list.astype(int).mean() > 0.8:
            columns_w_lists.append(column)

    for column in columns_w_lists:
        df[column] = df[column].apply(str_to_list)
        # .copy() keeps the next assignment off a view of the previous frame.
        df = df[df[column].notna()].copy()
    return df

In [None]:
# Parse stringified list columns; rows that fail to parse are dropped and `df` is rebound to the result.
df = string_to_list_dataframe(df)

In [None]:
# Table assigning rows to splits; later cells read its `index` and `split` columns.
subsets = pd.read_csv('../analysis/train_val_test_indices.csv')

In [None]:
# Preview the first rows of the split table.
subsets.head()

In [None]:
# Attach dataset_name / translated to each split row by matching subsets['index']
# against df's index (inner join: split rows without a matching df row are dropped).
# Columns left by a previous run of this cell are removed first, so re-running it
# does not produce dataset_name_x / dataset_name_y duplicates.
subsets = (
    subsets
    .drop(columns=['dataset_name', 'translated'], errors='ignore')
    .merge(df[['dataset_name', 'translated']], left_on='index', right_index=True)
)

# Encode text

In [None]:
import seaborn as sns
import numpy as np
from itertools import chain

In [None]:
# Apply seaborn's default styling to the charts drawn later in the notebook.
sns.set()

In [None]:
# For every review, the token count of each of its sentences (list of lists of ints).
sentence_len = [
    [len(sentence) for sentence in review]
    for review in df['review_translate_sentences_tokens']
]

In [None]:
# Flatten the per-review lists into one list of sentence lengths.
sentence_len = list(chain.from_iterable(sentence_len))

In [None]:
# Average number of tokens per sentence.
np.mean(sentence_len)

In [None]:
# Concatenate the sentences of each review into a single flat token list.
df['tokens'] = df['review_translate_sentences_tokens'].apply(
    lambda sentences: [token for sentence in sentences for token in sentence]
)

In [None]:
# Lower-case every token of every review.
df['tokens'] = df['tokens'].apply(
    lambda tokens: [token.lower() for token in tokens]
)

In [None]:
# 99th percentile of review length in tokens.
np.percentile(df['tokens'].apply(len), 99)

In [None]:
# id -> token table, ids starting at 1 (0 is left free for the padding entry).
# The unique tokens are sorted before numbering: iterating a set of strings can give
# a different order on every kernel start, which would change every token id (and the
# rows of the embedding matrix) between runs.
vocab = sorted(set(chain.from_iterable(df['tokens'])))
vocab = {token_id: token for token_id, token in enumerate(vocab, start=1)}

In [None]:
# Reserve id 0 for padding (pad_sequences fills with 0 by default, and the Embedding layer uses mask_zero=True).
vocab[0] = 'PAD'

In [None]:
# token -> id, the reverse of `vocab`.
inverse_vocab = {token: token_id for token_id, token in vocab.items()}

In [None]:
# Vocabulary size, including the padding entry.
len(vocab)

In [None]:
# Replace every token by its integer id.
df['encoded'] = df['tokens'].apply(
    lambda tokens: list(map(inverse_vocab.__getitem__, tokens))
)

In [None]:
# Array of per-review id lists, in df row order.
encoded_tokens = df['encoded'].values

In [None]:
from itertools import chain

In [None]:
# Bring every review to exactly 300 ids: shorter ones are padded at the end.
# NOTE(review): `truncating` is left at its default, so how longer reviews are cut
# depends on the Keras default - confirm that is the intended side.
padded_tokens = tf.keras.preprocessing.sequence.pad_sequences(
    encoded_tokens,
    maxlen=300,
    padding="post",
)


In [None]:
# Shape check of the padded id matrix.
padded_tokens.shape

In [None]:
# Probe one token in the pretrained vectors. The membership test makes an
# out-of-vocabulary token display None instead of raising KeyError and stopping
# "Restart & Run All"; the stray positional `[]` argument is dropped.
embeddings.get_vector('фучся') if 'фучся' in embeddings else None

# Gather embeddings 

In [None]:
def gather_embeddings(vocab, embeddings):
    """Build an embedding matrix whose rows are aligned with ``vocab``.

    Parameters
    ----------
    vocab : dict
        Maps integer row index -> token.
    embeddings : object
        Must expose ``vector_size`` and token lookup through
        ``embeddings[token]``, raising ``KeyError`` for unknown tokens.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(vocab), embeddings.vector_size)``. Rows of tokens
        found in ``embeddings`` hold the pretrained vector; all other rows keep
        their ``np.random.uniform`` initial values.
    """
    # Every row starts random (drawn from NumPy's global RNG at call time).
    new_embeddings = np.random.uniform(size=(len(vocab), embeddings.vector_size))
    for index, token in vocab.items():
        try:
            new_embeddings[index] = embeddings[token]
        except KeyError:
            # Token missing from the pretrained vectors: keep the random row.
            # Anything else (bad row index, wrong vector width, ...) is a real
            # error and is allowed to surface instead of being silently ignored.
            continue
    return new_embeddings

In [None]:
# Seed NumPy first: gather_embeddings draws the rows of out-of-vocabulary tokens from
# np.random, and the only other seeding in this notebook happens later, in the model cell.
np.random.seed(0)
# From here on `embeddings` is the plain weight matrix, no longer the pretrained-vector object.
embeddings = gather_embeddings(vocab, embeddings)

In [None]:
# Shape check: the first dimension should equal len(vocab).
embeddings.shape

In [None]:
# Vocabulary size, for comparison with the embedding matrix shape.
len(vocab)

# Get labels and split data

In [None]:
# rating value -> class id, numbered in order of first appearance in df['rating'].
mapping = {
    rating: class_id
    for class_id, rating in enumerate(df['rating'].unique())
}

In [None]:
# Class id of every review, in df row order.
y = df['rating'].map(mapping).values

In [None]:
# Number of distinct class ids; used as the width of the model's output layer.
num_classes = len(set(y))

In [None]:
# Row positions (into y / padded_tokens, which follow df row order) of each split.
# subsets['index'] holds df index labels - it is the key used to merge df's columns
# into `subsets` - so those labels are translated to df row positions. Using subsets'
# own row labels gives the same result only when they happen to coincide with
# df's row positions. Row order inside each split is unchanged.
train_indices, val_indices, test_indices = (
    df.index.get_indexer(subsets.loc[subsets['split'] == split_name, 'index']).tolist()
    for split_name in ('train', 'val', 'test')
)


In [None]:
# Slice the label vector once per split, in train / val / test order.
train_y, val_y, test_y = (
    y[rows] for rows in (train_indices, val_indices, test_indices)
)

In [None]:
# Slice the padded id matrix once per split, in train / val / test order.
train_x, val_x, test_x = (
    padded_tokens[rows] for rows in (train_indices, val_indices, test_indices)
)

In [None]:
# Shape check of the training input matrix.
train_x.shape

# Create model

In [None]:
# Hyper-parameters of the convolutional branches.
# Pooling window used by every MaxPool1D layer.
pool_window = 3
# Kernel sizes; the model cell builds one Conv1D branch per entry.
n_grams_num = [3, 4, 5, 7, 9, 11]
# NOTE(review): the model cell passes a literal 300 to the Embedding layer rather than this name.
output_dim = 300

In [None]:
# Build the multi-branch 1D-CNN classifier.
# Reset Keras global state and seed NumPy / TensorFlow before any layer is created.
tf.keras.backend.clear_session()
np.random.seed(0)
tf.random.set_seed(0)
# Fixed-length input of 300 token ids (same value as the maxlen given to pad_sequences).
input_layer = tf.keras.layers.Input(shape=(300,), name='input')
# Embedding table initialised from the `embeddings` matrix and fine-tuned during training.
# NOTE(review): mask_zero=True marks id 0 as padding; confirm the Conv1D branches below
# actually honour that mask.
word_embedding = tf.keras.layers.Embedding(input_dim=len(vocab),
                                                   output_dim=300,
                                                   trainable=True,
                                           name='embedding',
                                           mask_zero=True,
                                           weights=[embeddings]
                                                   )
spat_drop = tf.keras.layers.SpatialDropout1D(0.1, name='spatial_dropout')
# A single ReLU layer instance is shared by all branches.
relu = tf.keras.layers.ReLU(name='relu')
concat = []
embedded = spat_drop(word_embedding(input_layer))
# One branch per kernel size: Conv1D (32 filters, no activation) -> ReLU -> max-pool (stride 1) -> dropout.
for c,i in enumerate(n_grams_num):
    conv1d = tf.keras.layers.Conv1D(filters=32, kernel_size=i, activation=None,
                                   name=f'conv_ngram_{i}')
    max_pooling = tf.keras.layers.MaxPool1D(pool_size=pool_window, strides=1,
                                           padding='valid')
    dropout = tf.keras.layers.Dropout(0.1, name=f'dropout_cnn_{c}')
    concat.append(dropout(max_pooling(relu(conv1d(embedded)))))

# Join the branch outputs along axis 1, flatten, then apply the dense head and softmax output.
x = tf.keras.layers.concatenate(concat, axis=1, name='concat')
x = tf.keras.layers.Flatten(name='flatten')(x)
x = tf.keras.layers.Dense(512, activation='relu', name='dense_512')(x)
x = tf.keras.layers.Dropout(0.3, name='dropout')(x)
output = tf.keras.layers.Dense(num_classes, activation='softmax', name='output')(x)
model = tf.keras.Model(input_layer, output)

# Compile model

In [None]:
# Targets are integer class ids, hence the sparse categorical cross-entropy loss;
# the optimizer is Adam with its default settings.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=tf.keras.optimizers.Adam(),
    metrics=['acc'],
)

In [None]:
# Print the layer-by-layer summary of the model.
model.summary()

# Early stopping

In [None]:
import operator
class EarlyStopping:
    """Track a monitored value across calls and flag when it stops improving.

    Call the instance once per epoch with the monitored value and the model.
    A copy of the model is kept every time the value strictly improves; after
    ``tolerance`` consecutive calls without improvement ``early_stop`` is set.

    Attributes
    ----------
    mode : callable
        ``operator.lt`` for ``mode='min'``, ``operator.gt`` for ``mode='max'``.
    tolerance : int
        Non-improving calls in a row that trigger the stop flag.
    counter : int
        Current run of non-improving calls.
    early_stop : bool
    extremum_value
        Best value seen so far (``None`` before the first call).
    best_model
        Copy of the model taken when ``extremum_value`` was recorded.
    """

    def __init__(self, tolerance=5, mode='min'):
        assert mode in ['min','max'], 'Mode should be min or max'
        # Strict comparison: a value equal to the best one is not an improvement.
        self.mode = operator.gt if mode != 'min' else operator.lt
        self.tolerance = tolerance
        self.extremum_value = None
        self.best_model = None
        self.counter = 0
        self.early_stop = False

    @staticmethod
    def copy_model(model):
        """Return a structural clone of ``model`` carrying the same weights."""
        snapshot = tf.keras.models.clone_model(model)
        snapshot.set_weights(model.get_weights())
        return snapshot

    def __call__(self, val, model):
        """Record ``val`` for ``model`` and update the stop flag."""
        first_call = self.extremum_value is None
        if first_call or self.mode(val, self.extremum_value):
            # New best value: remember it together with a copy of the model.
            self.extremum_value = val
            self.best_model = self.copy_model(model)
            if not first_call:
                self.counter = 0
        else:
            self.counter += 1

        if self.counter == self.tolerance:
            self.early_stop = True

# Train model

In [None]:
from sklearn.metrics import f1_score

In [None]:
def evaluate_on_datasets(y_true, y_pred, split='val'):
    """Print and collect micro-averaged F1 per dataset and per translation flag.

    Reads the module-level ``subsets`` frame (columns ``split``,
    ``dataset_name`` and ``translated``). ``y_true`` and ``y_pred`` are indexed
    by position, where position ``j`` is taken to be the ``j``-th row of
    ``subsets`` that belongs to ``split``.

    Parameters
    ----------
    y_true, y_pred : array-like supporting integer-list indexing
        True and predicted class ids for the chosen split.
    split : str
        Value of ``subsets['split']`` to evaluate.

    Returns
    -------
    dict
        ``'{split}_f1_{dataset_name}'`` for every value of
        ``subsets['dataset_name']`` and ``'{split}_f1_translated=={flag}'`` for
        ``True`` and ``False``, each mapped to its F1 score.
    """
    # Filter the split once (it does not change between iterations) and renumber
    # the rows so the index is the position inside y_true / y_pred.
    split_rows = subsets[subsets['split'] == split].reset_index(drop=True)

    def _micro_f1(row_mask):
        # Micro-averaged F1 restricted to the positions selected by the boolean mask.
        idx = split_rows.index[row_mask.values].tolist()
        return f1_score(y_true=y_true[idx], y_pred=y_pred[idx],
                        average='micro')

    d = {}
    for dataset_name in subsets['dataset_name'].unique():
        score = _micro_f1(split_rows['dataset_name'] == dataset_name)
        print(f'{split} f1 score for dataset {dataset_name} : {score}')
        d[f'{split}_f1_{dataset_name}'] = score

    for flag in [True, False]:
        score = _micro_f1(split_rows['translated'] == flag)
        print(f'{split} f1 score for translated=={flag} : {score}')
        d[f'{split}_f1_translated=={flag}'] = score
    return d

In [None]:
def update_history(history, d):
    """Append every value of ``d`` to the list kept under the same key in ``history``.

    Keys not yet present in ``history`` start a new list. ``history`` is
    modified in place; nothing is returned.
    """
    for name in d:
        history.setdefault(name, []).append(d[name])

In [None]:
# Module-level stopper used inside training_loop: mode='max' because it is fed the
# validation F1, and training stops after 4 epochs in a row without improvement.
early_stopping = EarlyStopping(mode='max', tolerance=4)

In [None]:
def training_loop(model, epochs=10, batch_size=128):
    """Train ``model`` one epoch at a time and record per-epoch metrics.

    Uses the module-level arrays ``train_x`` / ``train_y`` / ``val_x`` /
    ``val_y`` and the module-level ``early_stopping`` object, which receives
    the overall validation F1 after every epoch.

    Parameters
    ----------
    model
        Compiled Keras model; ``fit`` and ``predict`` are called on it.
    epochs : int
        Maximum number of epochs.
    batch_size : int
        Batch size passed to ``fit``.

    Returns
    -------
    (dict, model)
        The dict maps metric name -> list with one value per completed epoch.
        The model is the best copy kept by ``early_stopping`` (made by
        ``EarlyStopping.copy_model``, which does not compile it), whether
        training stopped early or ran for all epochs. If no copy is available
        the model passed in is returned.
    """
    dict_history = {}
    best_model = model
    for i in range(epochs):

        # train for exactly one epoch
        history = model.fit(train_x, train_y, validation_data=(val_x, val_y),
                            epochs=1, batch_size=batch_size,
                            verbose=0)
        train_loss, val_loss = history.history['loss'][-1], history.history['val_loss'][-1]

        # evaluate on the full train and validation sets
        train_prediction = np.argmax(model.predict(train_x), axis=-1)
        val_prediction = np.argmax(model.predict(val_x), axis=-1)
        train_f1 = f1_score(y_true=train_y, y_pred=train_prediction,
                            average='micro')
        val_f1 = f1_score(y_true=val_y, y_pred=val_prediction,
                          average='micro')

        # report
        print(f'Epoch {i}')
        print(f'Overall train f1 : {train_f1}, overall val f1: {val_f1}')
        print(f'Train loss : {train_loss}, val loss: {val_loss}')
        d_train = evaluate_on_datasets(y_true=train_y, y_pred=train_prediction, split='train')
        d_val = evaluate_on_datasets(y_true=val_y, y_pred=val_prediction, split='val')

        if i!=epochs-1:
            print('-'*30)

        # save history
        update_history(dict_history, d_train)
        update_history(dict_history, d_val)
        update_history(dict_history, {'train_f1': train_f1})
        update_history(dict_history, {'val_f1': val_f1})
        update_history(dict_history, {'train_loss': train_loss})
        update_history(dict_history, {'val_loss': val_loss})

        # early stopping
        early_stopping(val_f1, model)
        # Remember the best copy after every epoch, so it is also returned when the
        # loop runs through all epochs without the stop flag being raised.
        if early_stopping.best_model is not None:
            best_model = early_stopping.best_model
        if early_stopping.early_stop:
            print('Stopping early')
            break

    return dict_history, best_model

In [None]:
# Run training for at most 20 epochs; `model` is rebound to the model returned by the loop.
dict_history, model = training_loop(model, epochs=20, batch_size=128)

In [None]:
# Display the collected per-epoch metrics.
dict_history

# Show charts

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
def plot_history(dict_history, columns):
    """Plot the requested metrics against the epoch number on one figure.

    Parameters
    ----------
    dict_history : dict
        Maps metric name -> sequence of per-epoch values.
    columns : sequence of str
        Keys of ``dict_history`` to draw; also used as legend labels.
    """
    plt.figure(figsize=(12,8))
    n_epochs = 0
    for name in columns:
        values = dict_history[name]
        plt.plot(range(len(values)), values, 'o-')
        n_epochs = max(n_epochs, len(values))
    # Ticks span the longest series. Taking them from the loop variable after the
    # loop used only the last series and failed with NameError for empty `columns`.
    plt.xticks(range(n_epochs), range(n_epochs))
    plt.xlabel('Epochs')
    plt.legend(columns)

In [None]:
# Validation vs. training loss per epoch.
plot_history(dict_history, ['val_loss', 'train_loss'])

In [None]:
# Validation vs. training micro-F1 per epoch.
plot_history(dict_history, ['val_f1', 'train_f1'])

# Evaluate model

In [None]:
# Predicted class id per test row: argmax over the model's softmax output.
test_predictions = np.argmax(model.predict(test_x), axis=-1)

In [None]:
# Micro-averaged F1 over the whole test split.
test_f1 = f1_score(test_y, test_predictions, average='micro')
print(f'Overall test f1-score : {test_f1}')

In [None]:
# Test F1 broken down per dataset and per translation flag.
test_results = evaluate_on_datasets(y_true=test_y, y_pred=test_predictions,split='test')
                     

In [None]:
# Display the per-dataset / per-flag test scores.
test_results

# Save data

In [None]:
# One row per epoch; every test score is added as a column holding the same value in each row.
history = pd.DataFrame(dict_history).assign(**test_results)

In [None]:
# Tag every row with the name of this model configuration.
history['model'] = model_name

In [None]:
# Append this run's rows to training_results.csv without a header row or index column.
# NOTE(review): with no header, rows can only be read back by column position, and that
# position comes from the key order of dict_history / test_results - confirm that every
# run appending to this file produces the same columns in the same order.
history.to_csv("training_results.csv", mode='a', header=None, index=False)