In [1]:
!pip install tokenizers
import tensorflow as tf
from tokenizers import Tokenizer, models, pre_tokenizers, trainers, Regex
import tokenizers
import pandas as pd

Collecting tokenizers
  Downloading tokenizers-0.13.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: tokenizers
Successfully installed tokenizers-0.13.3
[0m

2023-05-09 16:27:18.448096: I tensorflow/core/platform/cpu_feature_guard.cc:194] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE3 SSE4.1 SSE4.2 AVX
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
model_name = 'kim_cnn_more_layers_spatial_drop'

# Load data

In [3]:
# Load only the columns used below: review text, source dataset, star rating
# and the flag telling whether the review was machine-translated.
df = pd.read_csv(
    '/home/user/files_for_research_Vova/processed_data.csv',
    usecols=['review_translate', 'dataset_name', 'rating', 'translated'],
)

In [4]:
df.head()

Unnamed: 0,review_translate,rating,dataset_name,translated
0,"Якісна пластмаса , переворот 360 градусвв",5.0,rozetka,False
1,За такі гроші це самий топ,5.0,rozetka,False
2,За такі гроші - просто супер ! Рекомендую .,5.0,rozetka,False
3,Виконує свою роботу не погано . але що ви хоті...,4.0,rozetka,False
4,Купляв на стару сістему як основний під віндов...,4.0,rozetka,False


In [5]:
subsets = pd.read_csv('/home/user/files_for_research_Vova/train_val_test_indices.csv')

In [6]:
subsets.head()

Unnamed: 0,index,split
0,0,train
1,1,test
2,2,train
3,3,train
4,4,train


In [7]:
subsets = subsets.merge(df[['dataset_name', 'translated']], left_on='index', right_index=True)

In [8]:
# Collapse the numeric star rating into three sentiment labels:
# 1-2 -> negative, 3 -> neutral, 4-5 -> positive.
df['rating'] = df['rating'].astype(int).map({
    1: 'negative',
    2: 'negative',
    3: 'neutral',
    4: 'positive',
    5: 'positive',
})

# Filter data

In [9]:
bad_indices = pd.read_csv('/home/user/files_for_research_Vova/files_to_check.csv')

In [10]:
subsets = subsets[~subsets.index.isin(bad_indices['id'].values)]

In [11]:
df = df[~df.index.isin(bad_indices['id'].values)]

In [12]:
# Renumber both frames 0..n-1 after the filtering above so that row positions
# in `df` and `subsets` stay aligned.
# `drop=True` discards the old index instead of turning it into a column.
# `subsets` already carries an `index` column (the merge key), so a plain
# reset_index() would store the old index as `level_0` and a following
# drop(columns='index') would remove the original `index` column instead.
df, subsets = df.reset_index(drop=True), subsets.reset_index(drop=True)

# Load tokenizer

In [13]:
tokenizer = Tokenizer.from_file("/home/user/files_for_research_Vova/tokenizer_30k.json")


# Encode text

In [14]:
import seaborn as sns
import numpy as np

In [15]:
sns.set()

In [None]:
df['review_translate'] = df['review_translate'].str.lower()

In [None]:
df['encoded'] = tokenizer.encode_batch(df['review_translate'].values)

In [None]:
df['encoded'] = df['encoded'].apply(lambda x: x.ids)

In [None]:
# Distribution of log10(number of token ids) per review.
# `sns.distplot` is deprecated in seaborn; a density-normalised histogram with
# a KDE overlay via `histplot` is the documented replacement.
sns.histplot(np.log10(df['encoded'].apply(len)), kde=True, stat='density')

In [None]:
np.percentile(df['encoded'].apply(len), 99)

In [None]:
encoded_tokens = df['encoded'].values

In [None]:
from itertools import chain

In [None]:
# Bring every encoded review to a fixed length of 300 token ids;
# padding is appended after the tokens (padding="post").
padded_tokens = tf.keras.preprocessing.sequence.pad_sequences(
    encoded_tokens,
    maxlen=300,
    padding="post",
)


In [None]:
padded_tokens.shape

# Get labels and split data

In [None]:
# Give each sentiment label an integer class id, numbered in the order the
# labels are returned by df['rating'].unique().
mapping = {label: class_id for class_id, label in enumerate(df['rating'].unique())}

In [None]:
y = df['rating'].map(mapping).values

In [None]:
num_classes = len(set(y))

In [None]:
# Index labels of the rows belonging to each split, as plain Python lists.
train_indices, val_indices, test_indices = (
    subsets.index[subsets['split'] == split_name].tolist()
    for split_name in ('train', 'val', 'test')
)


In [None]:
train_y, val_y, test_y = y[train_indices], y[val_indices], y[test_indices]

In [None]:
train_x, val_x, test_x = padded_tokens[train_indices], padded_tokens[val_indices],\
padded_tokens[test_indices]

In [None]:
train_x.shape

# Create model

In [None]:
# Hyper-parameters for the CNN text classifier built in the next cell.
n_grams_max = 8  # NOTE(review): not referenced anywhere else in the visible notebook - confirm it is still needed
n_grams_min = 2  # NOTE(review): not referenced anywhere else in the visible notebook - confirm it is still needed
pool_window = 3  # pool_size of the MaxPool1D layer in every convolutional branch
n_grams_num = [3, 4, 5, 7, 9]  # Conv1D kernel sizes; one convolutional branch is built per entry
output_dim = 300  # presumably the embedding width (equals the Embedding layer's output_dim of 300) - confirm

In [None]:
# Reset Keras' global state and fix the NumPy / TensorFlow seeds before the
# layers are created, so re-running this cell starts from the same state.
tf.keras.backend.clear_session()
np.random.seed(0)
tf.random.set_seed(0)

# The sequence length comes from the padded data and the embedding width from
# the config cell, instead of repeating the literal 300 in both places.
input_layer = tf.keras.layers.Input(shape=(padded_tokens.shape[1],), name='input')
# NOTE(review): mask_zero=True assumes token id 0 is the padding id, and the
# Conv1D layers below may not consume the mask - confirm it has the intended effect.
word_embedding = tf.keras.layers.Embedding(input_dim=tokenizer.get_vocab_size(),
                                           output_dim=output_dim,
                                           trainable=True,
                                           name='embedding',
                                           mask_zero=True)
spat_drop = tf.keras.layers.SpatialDropout1D(0.1, name='spatial_dropout')
# A single ReLU layer instance is shared by all convolutional branches.
relu = tf.keras.layers.ReLU(name='relu')
concat = []
embedded = spat_drop(word_embedding(input_layer))

# One branch per kernel size: Conv1D -> ReLU -> MaxPool1D -> Dropout.
for c, i in enumerate(n_grams_num):
    conv1d = tf.keras.layers.Conv1D(filters=32, kernel_size=i, activation=None,
                                    name=f'conv_ngram_{i}')
    max_pooling = tf.keras.layers.MaxPool1D(pool_size=pool_window, strides=1,
                                            padding='valid')
    dropout = tf.keras.layers.Dropout(0.1, name=f'dropout_cnn_{c}')
    concat.append(dropout(max_pooling(relu(conv1d(embedded)))))

# Branch outputs are joined along axis 1, flattened and fed to the dense head.
x = tf.keras.layers.concatenate(concat, axis=1, name='concat')
x = tf.keras.layers.Flatten(name='flatten')(x)
x = tf.keras.layers.Dense(512, activation='relu', name='dense_512')(x)
x = tf.keras.layers.Dropout(0.3, name='dropout')(x)
output = tf.keras.layers.Dense(num_classes, activation='softmax', name='output')(x)
model = tf.keras.Model(input_layer, output)

# Compile model

In [None]:
# Integer class ids are used as targets, hence the sparse cross-entropy loss.
model.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss='sparse_categorical_crossentropy',
    metrics=['acc'],
)

In [None]:
model.summary()

# Early stopping

In [None]:
import operator
class EarlyStopping:
    """Track the best value of a monitored metric and signal when to stop.

    Call the instance once per epoch with the metric value and the model.
    Whenever the value improves on the best one seen so far, a copy of the
    model is stored in ``best_model`` and the patience counter is reset;
    otherwise the counter is incremented. ``early_stop`` becomes True once
    the counter equals ``tolerance``.

    Args:
        tolerance: number of consecutive non-improving calls that triggers
            ``early_stop``.
        mode: 'min' if smaller values are better, 'max' if larger are better.
    """

    def __init__(self, tolerance=5, mode='min'):
        assert mode in ('min', 'max'), 'Mode should be min or max'
        # `self.mode` holds the comparison meaning "new value beats the best".
        self.mode = operator.gt if mode == 'max' else operator.lt
        self.tolerance = tolerance
        self.counter = 0
        self.early_stop = False
        self.extremum_value = None
        self.best_model = None

    @staticmethod
    def copy_model(model):
        """Return a clone of `model` that carries the same weights."""
        snapshot = tf.keras.models.clone_model(model)
        snapshot.set_weights(model.get_weights())
        return snapshot

    def __call__(self, val, model):
        """Record `val` for the current epoch and update the stopping state."""
        is_first_value = self.extremum_value is None
        if is_first_value or self.mode(val, self.extremum_value):
            self.extremum_value = val
            self.best_model = self.copy_model(model)
            # The very first observation leaves the counter untouched.
            if not is_first_value:
                self.counter = 0
        else:
            self.counter += 1

        if self.counter == self.tolerance:
            self.early_stop = True

# Train model

In [None]:
from sklearn.metrics import f1_score

In [None]:
def evaluate_on_datasets(y_true, y_pred, split='val'):
    """Print and return macro-F1 per source dataset and per translation flag.

    Uses the notebook-level ``subsets`` frame to find which rows of the given
    split belong to each ``dataset_name`` and to each ``translated`` value.

    Args:
        y_true: true labels for the rows of `split`; assumed to be a NumPy
            array ordered like the matching rows of ``subsets`` - confirm.
        y_pred: predicted labels, aligned with `y_true`.
        split: value of ``subsets['split']`` to evaluate.

    Returns:
        dict with keys ``'{split}_f1_{dataset_name}'`` and
        ``'{split}_f1_translated=={flag}'`` mapped to the macro-F1 scores.
    """
    # Select the split's rows once rather than on every loop iteration.
    # After reset_index(drop=True) a row's index label equals its position
    # within the split, which is how y_true / y_pred are addressed.
    split_rows = subsets.loc[subsets['split'] == split,
                             ['dataset_name', 'translated']].reset_index(drop=True)

    def _macro_f1(column, value):
        # Macro-F1 over the split rows whose `column` equals `value`.
        positions = split_rows.index[split_rows[column] == value].tolist()
        return f1_score(y_true=y_true[positions], y_pred=y_pred[positions],
                        average='macro')

    d = {}
    for dataset_name in subsets['dataset_name'].unique():
        score = _macro_f1('dataset_name', dataset_name)
        print(f'{split} f1 score for dataset {dataset_name} : {score}')
        d[f'{split}_f1_{dataset_name}'] = score

    for flag in [True, False]:
        score = _macro_f1('translated', flag)
        print(f'{split} f1 score for translated=={flag} : {score}')
        d[f'{split}_f1_translated=={flag}'] = score
    return d

In [None]:
def update_history(history, d):
    """Append each value of `d` to the list stored under the same key in `history`.

    Keys missing from `history` start a new list. `history` is modified in
    place and nothing is returned.
    """
    for name in d:
        history.setdefault(name, []).append(d[name])

In [None]:
early_stopping = EarlyStopping(mode='max', tolerance=4)

In [None]:
def training_loop(model, epochs=10, batch_size=128):
    """Train `model` one epoch at a time, logging macro-F1 and loss per epoch.

    Reads the notebook-level arrays ``train_x`` / ``train_y`` / ``val_x`` /
    ``val_y`` and the notebook-level ``early_stopping`` object, which is
    called with the overall validation F1 after every epoch.

    Args:
        model: compiled Keras model to train.
        epochs: maximum number of epochs.
        batch_size: batch size used for both fitting and prediction.

    Returns:
        (dict_history, model): per-epoch metric lists keyed by metric name,
        and the best snapshot recorded by ``early_stopping``. The snapshot is
        returned both when training stops early and when the epoch budget is
        exhausted. If no epoch ran, the input model is returned.

    NOTE(review): ``early_stopping`` keeps its state between calls; create a
    fresh instance before each run. The returned snapshot comes from
    ``EarlyStopping.copy_model`` (clone_model + set_weights), so it probably
    needs compiling before further training - confirm.
    """
    dict_history = {}
    best_model = model
    for i in range(epochs):

        #train model
        history = model.fit(train_x, train_y, validation_data=(val_x, val_y),
                            epochs=1, batch_size=batch_size,
                            verbose=0)
        train_loss, val_loss = history.history['loss'][-1], history.history['val_loss'][-1]

        #evaluate model
        train_prediction = np.argmax(model.predict(train_x, batch_size=batch_size), axis=-1)
        val_prediction = np.argmax(model.predict(val_x, batch_size=batch_size), axis=-1)
        train_f1 = f1_score(y_true=train_y, y_pred=train_prediction,
                            average='macro')
        val_f1 = f1_score(y_true=val_y, y_pred=val_prediction,
                          average='macro')

        #printing evaluation
        print(f'Epoch {i}')
        print(f'Overall train f1 : {train_f1}, overall val f1: {val_f1}')
        print(f'Train loss : {train_loss}, val loss: {val_loss}')
        d_train = evaluate_on_datasets(y_true=train_y, y_pred=train_prediction, split='train')
        d_val = evaluate_on_datasets(y_true=val_y, y_pred=val_prediction, split='val')

        if i!=epochs-1:
            print('-'*30)

        #save history
        update_history(dict_history, d_train)
        update_history(dict_history, d_val)
        update_history(dict_history, {'train_f1': train_f1})
        update_history(dict_history, {'val_f1': val_f1})
        update_history(dict_history, {'train_loss': train_loss})
        update_history(dict_history, {'val_loss': val_loss})

        #early stopping
        early_stopping(val_f1, model)
        # Remember the best snapshot after every epoch, so it is also what is
        # returned when the loop runs out of epochs without stopping early.
        best_model = early_stopping.best_model
        if early_stopping.early_stop:
            print('Stopping early')
            break

    return dict_history, best_model

In [None]:
dict_history, model = \
training_loop(model, epochs=20, batch_size=2048)

In [None]:
dict_history

# Show charts

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
def plot_history(dict_history, columns):
    """Plot the per-epoch series named in `columns` on a single figure.

    Args:
        dict_history: mapping from metric name to a list with one value per epoch.
        columns: metric names to draw; also used as legend labels.
    """
    plt.figure(figsize=(12,8))
    # Track the longest series so the x ticks cover every plotted point, and
    # so an empty `columns` no longer fails on an unassigned loop variable.
    n_epochs = 0
    for i in columns:
        to_plot = dict_history[i]
        plt.plot(range(len(to_plot)), to_plot, 'o-')
        n_epochs = max(n_epochs, len(to_plot))
    plt.xticks(range(n_epochs), range(n_epochs))
    plt.xlabel('Epochs')
    plt.legend(columns)

In [None]:
plot_history(dict_history, ['val_loss', 'train_loss'])

In [None]:
plot_history(dict_history, ['val_f1', 'train_f1'])

# Evaluate model

In [None]:
# Predicted class id for every test row: index of the largest model output.
test_predictions = np.argmax(
    model.predict(test_x, batch_size=2048),
    axis=-1,
)

In [None]:
test_f1 = f1_score(y_true=test_y, y_pred=test_predictions,
                         average='macro')
print(f'Overall test f1-score : {test_f1}')

In [None]:
test_results = evaluate_on_datasets(y_true=test_y, y_pred=test_predictions,split='test')
                     

# Save history results

In [None]:
history = pd.DataFrame(dict_history)
for k,v in test_results.items():
    history[k] = v

In [None]:
history['model'] = model_name

In [None]:
# Append this run's rows to the shared results file, without header or index.
# NOTE(review): because no header is written, rows are matched to columns by
# position only; a run whose `history` has different columns (or a different
# column order) would be misaligned in the CSV - confirm all runs share one layout.
history.to_csv("/home/user/jupyter_notebooks/Ukranian-SA/notebooks/training/training_results_sentiment.csv", mode='a', header=None, index=None)

In [54]:
history

Unnamed: 0,train_f1_rozetka,train_f1_tripadvisor_hotels_ukraine,train_f1_tripadvisor_restaurants_ukraine,train_f1_translated==True,train_f1_translated==False,val_f1_rozetka,val_f1_tripadvisor_hotels_ukraine,val_f1_tripadvisor_restaurants_ukraine,val_f1_translated==True,val_f1_translated==False,train_f1,val_f1,train_loss,val_loss,test_f1_rozetka,test_f1_tripadvisor_hotels_ukraine,test_f1_tripadvisor_restaurants_ukraine,test_f1_translated==True,test_f1_translated==False,model
0,0.620909,0.673141,0.702421,0.670601,0.625474,0.588159,0.651386,0.669087,0.640026,0.58029,0.666444,0.633725,0.360949,0.257246,0.657726,0.709472,0.734743,0.705009,0.650857,kim_cnn_more_layers_spatial_drop
1,0.779995,0.811669,0.825774,0.80735,0.794767,0.660475,0.720402,0.726186,0.703494,0.66043,0.806522,0.699413,0.21769,0.250176,0.657726,0.709472,0.734743,0.705009,0.650857,kim_cnn_more_layers_spatial_drop
2,0.862509,0.868174,0.879375,0.870932,0.873722,0.667788,0.697079,0.707785,0.691944,0.672837,0.871544,0.69008,0.159239,0.295959,0.657726,0.709472,0.734743,0.705009,0.650857,kim_cnn_more_layers_spatial_drop
3,0.922871,0.915577,0.929241,0.924043,0.931112,0.686898,0.693732,0.708957,0.698397,0.69017,0.925093,0.69778,0.111425,0.334698,0.657726,0.709472,0.734743,0.705009,0.650857,kim_cnn_more_layers_spatial_drop
4,0.962456,0.96331,0.969231,0.96564,0.964673,0.684853,0.696731,0.708673,0.698321,0.683864,0.965606,0.696969,0.086718,0.415266,0.657726,0.709472,0.734743,0.705009,0.650857,kim_cnn_more_layers_spatial_drop
5,0.95097,0.951253,0.953063,0.951441,0.955102,0.675255,0.680993,0.697199,0.687012,0.672573,0.951997,0.685508,0.054253,0.480626,0.657726,0.709472,0.734743,0.705009,0.650857,kim_cnn_more_layers_spatial_drop
