In [1]:
!pip install tokenizers
import tensorflow as tf
from tokenizers import Tokenizer, models, pre_tokenizers, trainers, Regex
import tokenizers
import pandas as pd

Collecting tokenizers
  Downloading tokenizers-0.13.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: tokenizers
Successfully installed tokenizers-0.13.3
[0m

2023-04-18 09:50:34.342919: I tensorflow/core/platform/cpu_feature_guard.cc:194] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE3 SSE4.1 SSE4.2 AVX
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


# Load data

In [3]:
df = pd.read_csv('/home/user/files_for_research_Vova/processed_data.csv', usecols=['review_translate',
                                                            'dataset_name',
                                                            'rating',
                                                           'translated'])

# Load tokenizer and encode text

In [4]:
tokenizer = Tokenizer.from_file("/home/user/files_for_research_Vova/tokenizer_30k.json")

In [5]:
df['review_translate'] = df['review_translate'].str.lower()
df['encoded'] = tokenizer.encode_batch(df['review_translate'].values)
df['encoded'] = df['encoded'].apply(lambda x: x.ids)
encoded_tokens = df['encoded'].values
padded_tokens = tf.keras.preprocessing.sequence\
.pad_sequences(encoded_tokens, maxlen=300, padding="post")


In [6]:
mapping = dict([(i,c) for c,i in enumerate(df['rating'].unique())])

In [7]:
y = df['rating'].map(mapping).values

# Model creation

In [8]:
def create_model(pool_window = 2,
output_dim = 300, 
                num_classes=5):
    tf.keras.backend.clear_session()
    np.random.seed(0)
    tf.random.set_seed(0)
    input_layer = tf.keras.layers.Input(shape=(300,), name='input')
    word_embedding = tf.keras.layers.Embedding(input_dim=tokenizer.get_vocab_size(),
                                                       output_dim=300,
                                                       trainable=True,
                                               name='embedding',
                                               mask_zero=True
                                                       )
    dropout = tf.keras.layers.SpatialDropout1D(0.3)
    lstm = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128, name='lstm'))
    embedded = dropout(word_embedding(input_layer))
    hidden = lstm(embedded)
    x = tf.keras.layers.Dense(128, activation='relu', name='dense')(hidden)
    x = tf.keras.layers.Dropout(0.5, name='dropout')(x)
    output = tf.keras.layers.Dense(num_classes, activation='softmax', name='output')(x)
    model = tf.keras.Model(input_layer, output)
    model.compile(loss='sparse_categorical_crossentropy', \
              optimizer=tf.keras.optimizers.Adam(),
             metrics=['acc'])
    return model

# Early stopping

In [9]:
import operator
class EarlyStopping:
    def __init__(self, tolerance=5, mode='min'):
        assert mode in ['min','max'], 'Mode should be min or max'
        self.mode = operator.lt if mode=='min' else operator.gt 
        self.tolerance = tolerance
        self.counter = 0
        self.early_stop = False
        self.extremum_value = None
        self.best_model = None
    
    @staticmethod
    def copy_model(model):
        copied_model = tf.keras.models.clone_model(model)
        copied_model.set_weights(model.get_weights())
        return copied_model
        
    def __call__(self, val, model):
        if self.extremum_value is None:
            self.extremum_value = val
            self.best_model = self.copy_model(model)
        else:
            if not self.mode(val, self.extremum_value):
                self.counter+=1
            else:
                self.extremum_value = val
                self.best_model = self.copy_model(model)
                self.counter = 0
        
        if self.counter==self.tolerance:
            self.early_stop=True

# Filtering

In [10]:
import gc
from sklearn.model_selection import StratifiedKFold
import time
import numpy as np
from sklearn.metrics import f1_score

In [11]:
def filtering(X, y, ids, path, n_splits=5, epochs=10, batch_size=2024):
    
    kfold = StratifiedKFold(n_splits=n_splits)
    early_stopping = EarlyStopping(4, 'max')
    f1_scores = []
    
    c = 1
    start_time = time.time()
    for train_idx, val_idx in kfold.split(range(len(X)), y):
         # data split
        X_train = X[train_idx]
        X_val = X[val_idx]
        
        y_train = y[train_idx]
        y_val = y[val_idx]
        
        val_ids = ids[val_idx]
        
        #create model
        model = create_model(num_classes=5)
        
        #train model
        for epoch in range(epochs):
            
            history = model.fit(X_train, y_train, validation_data=(X_val, y_val), 
              epochs=1, batch_size=batch_size,
                               verbose=0)
        
            #evaluate model
            val_prediction = np.argmax(model.predict(X_val), axis=-1)
            val_f1 = f1_score(y_true=y_val, y_pred=val_prediction,
                         average='micro')
            
            f1_scores.append(val_f1)
        
            #early stopping
            early_stopping(val_f1, model)
            if early_stopping.early_stop:
                model = early_stopping.best_model
                break
                
        #save predictions
        predicted_y_val = model.predict(X_val)
        df = pd.DataFrame()
        df['id'] = val_ids
        df['y_predicted_proba'] = predicted_y_val.tolist()
        df['y_true'] = y_val
        df.to_csv(path, mode='a', index=False)

        print(f'Done with {c} fold') 
        c+=1 
        
        del model, df, predicted_y_val;
        gc.collect();
        
    end_time = time.time()
    print(f'It took : {(end_time-start_time)/60} minutes')
    
    return f1_scores

In [12]:
ids = np.array(list(range(len(padded_tokens))))

In [13]:
len(set(y))

5

In [14]:
f1_scores = filtering(padded_tokens, y, ids,\
                      '/home/user/files_for_research_Vova/filtering_bilstm.csv')

2023-04-18 09:51:40.090095: I tensorflow/core/platform/cpu_feature_guard.cc:194] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE3 SSE4.1 SSE4.2 AVX
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-04-18 09:51:40.249536: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1621] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 14148 MB memory:  -> device: 0, name: NVIDIA RTX A4000, pci bus id: 0000:03:00.0, compute capability: 8.6
2023-04-18 09:51:51.366682: W tensorflow/core/common_runtime/type_inference.cc:339] Type inference failed. This indicates an invalid graph that escaped type checking. Error message: INVALID_ARGUMENT: expected compatible input types, but input 1:
type_id: TFT_OPTIONAL
args {
  type_id: TFT_PRODUCT
  args {
    type_id: TFT_TENSOR
    args {
      type_id: TFT_INT32
    }
  }
}
 is neith

Done with 1 fold
Done with 2 fold
Done with 3 fold
Done with 4 fold
Done with 5 fold
It took : 28.653895366191865 minutes


In [15]:
f1_scores

[0.7072000724080192,
 0.7109411533994056,
 0.7096287580516208,
 0.7015658234149432,
 0.7042434116245041,
 0.7000648655171894,
 0.70768279253594,
 0.707273289536208,
 0.5592656564666129,
 0.7339965756782646]