In [1]:
!pip install tokenizers
import tensorflow as tf
from tokenizers import Tokenizer, models, pre_tokenizers, trainers, Regex
import tokenizers
import pandas as pd

Collecting tokenizers
  Downloading tokenizers-0.13.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: tokenizers
Successfully installed tokenizers-0.13.3
[0m

2023-04-18 08:27:19.547674: I tensorflow/core/platform/cpu_feature_guard.cc:194] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE3 SSE4.1 SSE4.2 AVX
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


# Load data

In [2]:
df = pd.read_csv('/home/user/files_for_research_Vova/processed_data.csv', usecols=['review_translate',
                                                            'dataset_name',
                                                            'rating',
                                                           'translated'])

# Load tokenizer and encode text

In [3]:
tokenizer = Tokenizer.from_file("/home/user/files_for_research_Vova/tokenizer_30k.json")

In [4]:
df['review_translate'] = df['review_translate'].str.lower()
df['encoded'] = tokenizer.encode_batch(df['review_translate'].values)
df['encoded'] = df['encoded'].apply(lambda x: x.ids)
encoded_tokens = df['encoded'].values
padded_tokens = tf.keras.preprocessing.sequence\
.pad_sequences(encoded_tokens, maxlen=300, padding="post")


In [5]:
mapping = dict([(i,c) for c,i in enumerate(df['rating'].unique())])

In [6]:
y = df['rating'].map(mapping).values

# Model creation

In [14]:
class Attention(tf.keras.layers.Layer):
    def __init__(self,  
                 units=128, **kwargs):
        super(Attention,self).__init__(**kwargs)
        self.units = units
    
    def build(self, input_shape):
        self.W1=self.add_weight(name='attention_weights_1', shape=(input_shape[-1], self.units), 
                               initializer='glorot_uniform', trainable=True)
        
        self.W2=self.add_weight(name='attention_weights_2', shape=(1, self.units), 
                               initializer='glorot_uniform', trainable=True) 
        
        super(Attention, self).build(input_shape)
        
    def call(self, x):
        x = tf.transpose(x, perm=[0, 2, 1])
        attention = tf.nn.softmax(tf.matmul(self.W2, tf.nn.tanh(tf.matmul(self.W1, x))))
        weighted_context = tf.reduce_sum(x * attention, axis=-1)
        return weighted_context, attention
    
    def get_config(self):
        config = super().get_config().copy()
        config.update({
            'units': self.units
        })
        return config


In [16]:
def create_model(pool_window = 2,
output_dim = 300, 
                num_classes=5):
    tf.keras.backend.clear_session()
    np.random.seed(0)
    tf.random.set_seed(0)
    # define layers
    attention = Attention(units=128, name='attention')
    input_layer = tf.keras.layers.Input(shape=(300,), name='input')
    word_embedding = tf.keras.layers.Embedding(input_dim=tokenizer.get_vocab_size(),
                                                       output_dim=300,
                                                       trainable=True,
                                               name='embedding',
                                               mask_zero=True
                                                       )
    spatial_dropout = tf.keras.layers.SpatialDropout1D(0.3, name='spatial_dropout')
    lstm = tf.keras.layers.LSTM(128, name='lstm',
                                return_sequences=True, return_state=True)
    dense1 = tf.keras.layers.Dense(128, activation='relu', name='dense')
    dropout = tf.keras.layers.Dropout(0.5, name='dropout')
    logits_layer = tf.keras.layers.Dense(num_classes, activation='softmax', name='output')

    #actual flow
    embedded = spatial_dropout(word_embedding(input_layer))
    context_vector, state_h, _ = lstm(embedded)
    weighted_context, attention_scores = attention(context_vector)
    final_attn_output = tf.concat([state_h, weighted_context], axis=1)
    x = dense1(final_attn_output)
    x = dropout(x)
    x = logits_layer(x)
    model = tf.keras.Model(input_layer, x)
    model.compile(loss='sparse_categorical_crossentropy', \
              optimizer=tf.keras.optimizers.Adam(),
             metrics=['acc'])
    return model

# Early stopping

In [8]:
import operator
class EarlyStopping:
    def __init__(self, tolerance=5, mode='min'):
        assert mode in ['min','max'], 'Mode should be min or max'
        self.mode = operator.lt if mode=='min' else operator.gt 
        self.tolerance = tolerance
        self.counter = 0
        self.early_stop = False
        self.extremum_value = None
        self.best_model = None
    
    @staticmethod
    def copy_model(model):
        copied_model = tf.keras.models.clone_model(model)
        copied_model.set_weights(model.get_weights())
        return copied_model
        
    def __call__(self, val, model):
        if self.extremum_value is None:
            self.extremum_value = val
            self.best_model = self.copy_model(model)
        else:
            if not self.mode(val, self.extremum_value):
                self.counter+=1
            else:
                self.extremum_value = val
                self.best_model = self.copy_model(model)
                self.counter = 0
        
        if self.counter==self.tolerance:
            self.early_stop=True

# Filtering

In [9]:
import gc
from sklearn.model_selection import StratifiedKFold
import time
import numpy as np
from sklearn.metrics import f1_score

In [10]:
def filtering(X, y, ids, path, n_splits=5, epochs=10, batch_size=2024):
    
    kfold = StratifiedKFold(n_splits=n_splits)
    early_stopping = EarlyStopping(4, 'max')
    f1_scores = []
    
    c = 1
    start_time = time.time()
    for train_idx, val_idx in kfold.split(range(len(X)), y):
         # data split
        X_train = X[train_idx]
        X_val = X[val_idx]
        
        y_train = y[train_idx]
        y_val = y[val_idx]
        
        val_ids = ids[val_idx]
        
        #create model
        model = create_model(num_classes=5)
        
        #train model
        for epoch in range(epochs):
            
            history = model.fit(X_train, y_train, validation_data=(X_val, y_val), 
              epochs=1, batch_size=batch_size,
                               verbose=0)
        
            #evaluate model
            val_prediction = np.argmax(model.predict(X_val), axis=-1)
            val_f1 = f1_score(y_true=y_val, y_pred=val_prediction,
                         average='micro')
            
            f1_scores.append(val_f1)
        
            #early stopping
            early_stopping(val_f1, model)
            if early_stopping.early_stop:
                model = early_stopping.best_model
                break
                
        #save predictions
        predicted_y_val = model.predict(X_val)
        df = pd.DataFrame()
        df['id'] = val_ids
        df['y_predicted_proba'] = predicted_y_val.tolist()
        df['y_true'] = y_val
        df.to_csv(path, mode='a', index=False)

        print(f'Done with {c} fold') 
        c+=1 
        
        del model, df, predicted_y_val;
        gc.collect();
        
    end_time = time.time()
    print(f'It took : {(end_time-start_time)/60} minutes')
    
    return f1_scores

In [11]:
ids = np.array(list(range(len(padded_tokens))))

In [12]:
len(set(y))

5

In [17]:
f1_scores = filtering(padded_tokens, y, ids,\
                      '/home/user/files_for_research_Vova/filtering_lstm_attention.csv')

2023-04-18 08:34:54.346775: W tensorflow/core/grappler/costs/op_level_cost_estimator.cc:690] Error in PredictCost() for the op: op: "Softmax" attr { key: "T" value { type: DT_FLOAT } } inputs { dtype: DT_FLOAT shape { unknown_rank: true } } device { type: "GPU" vendor: "NVIDIA" model: "NVIDIA RTX A4000" frequency: 1560 num_cores: 48 environment { key: "architecture" value: "8.6" } environment { key: "cuda" value: "12000" } environment { key: "cudnn" value: "8700" } num_registers: 65536 l1_cache_size: 24576 l2_cache_size: 4194304 shared_memory_size_per_multiprocessor: 102400 memory_size: 14835253248 bandwidth: 448064000 } outputs { dtype: DT_FLOAT shape { unknown_rank: true } }
2023-04-18 08:34:55.116053: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:428] Loaded cuDNN version 8700
2023-04-18 08:34:55.225997: I tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:648] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.
2023-04-18 08



2023-04-18 08:45:47.502862: W tensorflow/core/grappler/costs/op_level_cost_estimator.cc:690] Error in PredictCost() for the op: op: "Softmax" attr { key: "T" value { type: DT_FLOAT } } inputs { dtype: DT_FLOAT shape { unknown_rank: true } } device { type: "GPU" vendor: "NVIDIA" model: "NVIDIA RTX A4000" frequency: 1560 num_cores: 48 environment { key: "architecture" value: "8.6" } environment { key: "cuda" value: "12000" } environment { key: "cudnn" value: "8700" } num_registers: 65536 l1_cache_size: 24576 l2_cache_size: 4194304 shared_memory_size_per_multiprocessor: 102400 memory_size: 14835253248 bandwidth: 448064000 } outputs { dtype: DT_FLOAT shape { unknown_rank: true } }


Done with 1 fold


2023-04-18 08:46:47.499461: W tensorflow/core/grappler/costs/op_level_cost_estimator.cc:690] Error in PredictCost() for the op: op: "Softmax" attr { key: "T" value { type: DT_FLOAT } } inputs { dtype: DT_FLOAT shape { unknown_rank: true } } device { type: "GPU" vendor: "NVIDIA" model: "NVIDIA RTX A4000" frequency: 1560 num_cores: 48 environment { key: "architecture" value: "8.6" } environment { key: "cuda" value: "12000" } environment { key: "cudnn" value: "8700" } num_registers: 65536 l1_cache_size: 24576 l2_cache_size: 4194304 shared_memory_size_per_multiprocessor: 102400 memory_size: 14835253248 bandwidth: 448064000 } outputs { dtype: DT_FLOAT shape { unknown_rank: true } }
2023-04-18 08:48:03.094605: W tensorflow/core/grappler/costs/op_level_cost_estimator.cc:690] Error in PredictCost() for the op: op: "Softmax" attr { key: "T" value { type: DT_FLOAT } } inputs { dtype: DT_FLOAT shape { unknown_rank: true } } device { type: "GPU" vendor: "NVIDIA" model: "NVIDIA RTX A4000" frequency



2023-04-18 08:49:01.436395: W tensorflow/core/grappler/costs/op_level_cost_estimator.cc:690] Error in PredictCost() for the op: op: "Softmax" attr { key: "T" value { type: DT_FLOAT } } inputs { dtype: DT_FLOAT shape { unknown_rank: true } } device { type: "GPU" vendor: "NVIDIA" model: "NVIDIA RTX A4000" frequency: 1560 num_cores: 48 environment { key: "architecture" value: "8.6" } environment { key: "cuda" value: "12000" } environment { key: "cudnn" value: "8700" } num_registers: 65536 l1_cache_size: 24576 l2_cache_size: 4194304 shared_memory_size_per_multiprocessor: 102400 memory_size: 14835253248 bandwidth: 448064000 } outputs { dtype: DT_FLOAT shape { unknown_rank: true } }


Done with 2 fold


2023-04-18 08:50:02.929149: W tensorflow/core/grappler/costs/op_level_cost_estimator.cc:690] Error in PredictCost() for the op: op: "Softmax" attr { key: "T" value { type: DT_FLOAT } } inputs { dtype: DT_FLOAT shape { unknown_rank: true } } device { type: "GPU" vendor: "NVIDIA" model: "NVIDIA RTX A4000" frequency: 1560 num_cores: 48 environment { key: "architecture" value: "8.6" } environment { key: "cuda" value: "12000" } environment { key: "cudnn" value: "8700" } num_registers: 65536 l1_cache_size: 24576 l2_cache_size: 4194304 shared_memory_size_per_multiprocessor: 102400 memory_size: 14835253248 bandwidth: 448064000 } outputs { dtype: DT_FLOAT shape { unknown_rank: true } }
2023-04-18 08:51:21.811745: W tensorflow/core/grappler/costs/op_level_cost_estimator.cc:690] Error in PredictCost() for the op: op: "Softmax" attr { key: "T" value { type: DT_FLOAT } } inputs { dtype: DT_FLOAT shape { unknown_rank: true } } device { type: "GPU" vendor: "NVIDIA" model: "NVIDIA RTX A4000" frequency



2023-04-18 08:52:22.188484: W tensorflow/core/grappler/costs/op_level_cost_estimator.cc:690] Error in PredictCost() for the op: op: "Softmax" attr { key: "T" value { type: DT_FLOAT } } inputs { dtype: DT_FLOAT shape { unknown_rank: true } } device { type: "GPU" vendor: "NVIDIA" model: "NVIDIA RTX A4000" frequency: 1560 num_cores: 48 environment { key: "architecture" value: "8.6" } environment { key: "cuda" value: "12000" } environment { key: "cudnn" value: "8700" } num_registers: 65536 l1_cache_size: 24576 l2_cache_size: 4194304 shared_memory_size_per_multiprocessor: 102400 memory_size: 14835253248 bandwidth: 448064000 } outputs { dtype: DT_FLOAT shape { unknown_rank: true } }


Done with 3 fold


2023-04-18 08:53:24.503695: W tensorflow/core/grappler/costs/op_level_cost_estimator.cc:690] Error in PredictCost() for the op: op: "Softmax" attr { key: "T" value { type: DT_FLOAT } } inputs { dtype: DT_FLOAT shape { unknown_rank: true } } device { type: "GPU" vendor: "NVIDIA" model: "NVIDIA RTX A4000" frequency: 1560 num_cores: 48 environment { key: "architecture" value: "8.6" } environment { key: "cuda" value: "12000" } environment { key: "cudnn" value: "8700" } num_registers: 65536 l1_cache_size: 24576 l2_cache_size: 4194304 shared_memory_size_per_multiprocessor: 102400 memory_size: 14835253248 bandwidth: 448064000 } outputs { dtype: DT_FLOAT shape { unknown_rank: true } }
2023-04-18 08:54:48.325785: W tensorflow/core/grappler/costs/op_level_cost_estimator.cc:690] Error in PredictCost() for the op: op: "Softmax" attr { key: "T" value { type: DT_FLOAT } } inputs { dtype: DT_FLOAT shape { unknown_rank: true } } device { type: "GPU" vendor: "NVIDIA" model: "NVIDIA RTX A4000" frequency



2023-04-18 08:55:53.748878: W tensorflow/core/grappler/costs/op_level_cost_estimator.cc:690] Error in PredictCost() for the op: op: "Softmax" attr { key: "T" value { type: DT_FLOAT } } inputs { dtype: DT_FLOAT shape { unknown_rank: true } } device { type: "GPU" vendor: "NVIDIA" model: "NVIDIA RTX A4000" frequency: 1560 num_cores: 48 environment { key: "architecture" value: "8.6" } environment { key: "cuda" value: "12000" } environment { key: "cudnn" value: "8700" } num_registers: 65536 l1_cache_size: 24576 l2_cache_size: 4194304 shared_memory_size_per_multiprocessor: 102400 memory_size: 14835253248 bandwidth: 448064000 } outputs { dtype: DT_FLOAT shape { unknown_rank: true } }


Done with 4 fold


2023-04-18 08:57:01.954648: W tensorflow/core/grappler/costs/op_level_cost_estimator.cc:690] Error in PredictCost() for the op: op: "Softmax" attr { key: "T" value { type: DT_FLOAT } } inputs { dtype: DT_FLOAT shape { unknown_rank: true } } device { type: "GPU" vendor: "NVIDIA" model: "NVIDIA RTX A4000" frequency: 1560 num_cores: 48 environment { key: "architecture" value: "8.6" } environment { key: "cuda" value: "12000" } environment { key: "cudnn" value: "8700" } num_registers: 65536 l1_cache_size: 24576 l2_cache_size: 4194304 shared_memory_size_per_multiprocessor: 102400 memory_size: 14835253248 bandwidth: 448064000 } outputs { dtype: DT_FLOAT shape { unknown_rank: true } }
2023-04-18 08:58:19.335794: W tensorflow/core/grappler/costs/op_level_cost_estimator.cc:690] Error in PredictCost() for the op: op: "Softmax" attr { key: "T" value { type: DT_FLOAT } } inputs { dtype: DT_FLOAT shape { unknown_rank: true } } device { type: "GPU" vendor: "NVIDIA" model: "NVIDIA RTX A4000" frequency



2023-04-18 08:59:28.515648: W tensorflow/core/grappler/costs/op_level_cost_estimator.cc:690] Error in PredictCost() for the op: op: "Softmax" attr { key: "T" value { type: DT_FLOAT } } inputs { dtype: DT_FLOAT shape { unknown_rank: true } } device { type: "GPU" vendor: "NVIDIA" model: "NVIDIA RTX A4000" frequency: 1560 num_cores: 48 environment { key: "architecture" value: "8.6" } environment { key: "cuda" value: "12000" } environment { key: "cudnn" value: "8700" } num_registers: 65536 l1_cache_size: 24576 l2_cache_size: 4194304 shared_memory_size_per_multiprocessor: 102400 memory_size: 14835253248 bandwidth: 448064000 } outputs { dtype: DT_FLOAT shape { unknown_rank: true } }


Done with 5 fold
It took : 25.678119452794395 minutes


In [18]:
f1_scores

[0.7041378165965214,
 0.7087085728077719,
 0.7090630704017136,
 0.7070039673560513,
 0.7084747552458103,
 0.6968894721757102,
 0.6928768611123682,
 0.704786471768415,
 0.7063606399106961,
 0.5995353783724666,
 0.7300216471440101]