In [1]:
import os

import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
from kaggle_datasets import KaggleDatasets
import transformers
from transformers import TFAutoModel, AutoTokenizer
from tqdm.notebook import tqdm
from tokenizers import Tokenizer, models, pre_tokenizers, decoders, processors



## Helper Functions

In [3]:
def regular_encode(texts, tokenizer, maxlen=512):
    """Encode a batch of texts into a NumPy array of token ids.

    Args:
        texts: Batch of input strings, forwarded unchanged to the tokenizer.
        tokenizer: Object exposing ``batch_encode_plus`` (e.g. a HuggingFace
            tokenizer).
        maxlen: Value passed to the tokenizer as ``max_length``.

    Returns:
        ``np.ndarray`` built from the ``'input_ids'`` entry of the tokenizer
        output.
    """
    # Only the token ids are needed downstream, so the attention mask and
    # token-type ids are switched off; padding to ``max_length`` is requested.
    encode_options = dict(
        return_attention_masks=False,
        return_token_type_ids=False,
        pad_to_max_length=True,
        max_length=maxlen,
    )
    encoded = tokenizer.batch_encode_plus(texts, **encode_options)
    token_ids = encoded['input_ids']
    return np.array(token_ids)

In [4]:
def build_model(transformer, max_len=512, num_classes=5, learning_rate=1e-5):
    """Build and compile a softmax classifier on top of a transformer.

    Token ids are passed through ``transformer``; the first element of its
    output is sliced at sequence position 0 and fed to a dense softmax layer.

    Args:
        transformer: Callable Keras-compatible transformer whose output's
            first element is indexed as ``[batch, position, hidden]``.
        max_len: Length of the integer input sequence.
        num_classes: Number of output classes (width of the softmax layer).
        learning_rate: Learning rate for the Adam optimizer.

    Returns:
        A compiled ``Model`` with categorical cross-entropy loss and an
        accuracy metric; it expects one-hot encoded targets.
    """
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    sequence_output = transformer(input_word_ids)[0]
    # Keep only the hidden state of the first token of every sequence.
    cls_token = sequence_output[:, 0, :]
    out = Dense(num_classes, activation='softmax')(cls_token)

    model = Model(inputs=input_word_ids, outputs=out)
    # `learning_rate` is the supported keyword; `lr` is a deprecated alias
    # that newer Keras releases reject.
    model.compile(
        Adam(learning_rate=learning_rate),
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )
    # Left disabled: freezing the transformer layer.
    # model.layers[1].trainable = False

    return model

## TPU Configs

In [5]:
# Try to resolve a TPU cluster; a ValueError from the resolver is treated as
# "no TPU available".
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    tpu = None

# Without a TPU use the default strategy; otherwise connect to the cluster,
# initialise it and distribute across its replicas.
if not tpu:
    strategy = tf.distribute.get_strategy()
else:
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)

print("REPLICAS: ", strategy.num_replicas_in_sync)

Running on TPU  grpc://10.0.0.2:8470
REPLICAS:  8


In [6]:
# Sentinel passed to Dataset.prefetch() so tf.data tunes the buffer size itself.
AUTO = tf.data.experimental.AUTOTUNE

# Configuration
EPOCHS = 2  # number of epochs passed to model.fit
BATCH_SIZE = 16 * strategy.num_replicas_in_sync  # global batch: 16 examples per replica
MAX_LEN = 192  # sequence length used for tokenization and for the model input
MODEL = 'jplu/tf-xlm-roberta-large'  # checkpoint id loaded by AutoTokenizer / TFAutoModel

## Create fast tokenizer

In [7]:
# Load the tokenizer that belongs to the MODEL checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=513.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=5069051.0, style=ProgressStyle(descript…




## Load text data into memory

In [8]:
# NOTE(review): ISO-8859-1 decodes any byte sequence without error, so a file
# that is really UTF-8 would load silently with garbled non-ASCII characters;
# confirm the file's actual encoding.
df = pd.read_csv("/kaggle/input/sbercode-1/sentences.csv", encoding="ISO-8859-1")

df.head()

Unnamed: 0,id,review,intention,topic
0,f80a32d6-c714-11e6-b9f8-fa163e345ce9,Hey I am using what's app,OTHER,App-Feature/Functionality
1,f8456718-c714-11e6-b9f8-fa163e345ce9,Rajesh Beragi133a,OTHER,Other
2,f8aa1c5b-c714-11e6-b9f8-fa163e345ce9,UselEss it takes a big part of Phone Storage,OTHER,Feature/Functionality-Model
3,f9bd611d-c714-11e6-b9f8-fa163e345ce9,Bad,OTHER,Other
4,f9f9cddf-c714-11e6-b9f8-fa163e345ce9,Best app,OTHER,App


In [9]:
from sklearn.preprocessing import LabelEncoder

In [10]:
target_encoder = LabelEncoder()

In [11]:
# Map each 'intention' label to an integer class id; the fitted encoder is
# reused later to turn predicted ids back into label strings.
df['target'] = target_encoder.fit_transform(df['intention'])

df.head()

Unnamed: 0,id,review,intention,topic,target
0,f80a32d6-c714-11e6-b9f8-fa163e345ce9,Hey I am using what's app,OTHER,App-Feature/Functionality,3
1,f8456718-c714-11e6-b9f8-fa163e345ce9,Rajesh Beragi133a,OTHER,Other,3
2,f8aa1c5b-c714-11e6-b9f8-fa163e345ce9,UselEss it takes a big part of Phone Storage,OTHER,Feature/Functionality-Model,3
3,f9bd611d-c714-11e6-b9f8-fa163e345ce9,Bad,OTHER,Other,3
4,f9f9cddf-c714-11e6-b9f8-fa163e345ce9,Best app,OTHER,App,3


In [12]:
# Separate the 'OTHER' class from the remaining intentions.
df_other = df[df['intention'] == 'OTHER']
df_not_other = df[df['intention'] != 'OTHER']

# Downsample 'OTHER' to at most 40000 rows. The fixed random_state makes the
# training set reproducible across kernel restarts, and min() avoids the
# ValueError that sample() raises when fewer rows than requested exist.
n_other_rows = min(40000, len(df_other))
train = pd.concat([
    df_other[['review', 'target']].sample(n=n_other_rows, random_state=42),
    df_not_other[['review', 'target']],
])

train.head()

Unnamed: 0,review,target
123594,Addictive,3
90831,Just love it,3
320320,This app is amazing!,3
151647,Come on NPR.,3
46249,Nice,3


In [13]:
# Use the Keras bundled with TensorFlow, like every other Keras import in this
# notebook, instead of the separately installed standalone `keras` package.
from tensorflow.keras.utils import to_categorical

In [14]:
%%time 

x = regular_encode(train.review.values, tokenizer, maxlen=MAX_LEN)
# Pin the one-hot width to the number of labels known to the encoder;
# otherwise to_categorical infers it from the largest id present in `train`,
# which would shrink if the highest class were missing from the sample.
y = to_categorical(train.target, num_classes=len(target_encoder.classes_))

CPU times: user 20.5 s, sys: 379 ms, total: 20.8 s
Wall time: 20.8 s


In [15]:
from sklearn.model_selection import train_test_split

In [16]:
# Hold out 10% for validation. random_state makes the split reproducible;
# stratifying on the class id (argmax of the one-hot rows) keeps the class
# proportions equal in both parts of this imbalanced data set.
# NOTE(review): stratify requires at least two samples per class.
x_train, x_valid, y_train, y_valid = train_test_split(
    x, y, test_size=0.1, random_state=42, stratify=y.argmax(axis=1)
)

# x_train = x_train[:10000]
# y_train = y_train[:10000]

In [17]:
x_valid.shape

(12129, 192)

## Build dataset objects

In [18]:
# Training pipeline: endless stream of examples, shuffled through a
# 2048-element buffer, batched and prefetched.
train_dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train))
train_dataset = train_dataset.repeat()
train_dataset = train_dataset.shuffle(2048)
train_dataset = train_dataset.batch(BATCH_SIZE)
train_dataset = train_dataset.prefetch(AUTO)

# Validation pipeline: single pass, batched; the batches are cached so later
# passes reuse them.
valid_dataset = tf.data.Dataset.from_tensor_slices((x_valid, y_valid))
valid_dataset = valid_dataset.batch(BATCH_SIZE)
valid_dataset = valid_dataset.cache()
valid_dataset = valid_dataset.prefetch(AUTO)

## Load model into the TPU

In [19]:
%%time

# Create the pretrained transformer and the compiled classifier inside the
# strategy scope so their variables are created under the distribution
# strategy selected earlier.
with strategy.scope():
    transformer_layer = TFAutoModel.from_pretrained(MODEL)
    model = build_model(transformer_layer, max_len=MAX_LEN)

# Print the layer and parameter overview.
model.summary()

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=3271420488.0, style=ProgressStyle(descr…


Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_word_ids (InputLayer)  [(None, 192)]             0         
_________________________________________________________________
tf_roberta_model (TFRobertaM ((None, 192, 1024), (None 559890432 
_________________________________________________________________
tf_op_layer_strided_slice (T [(None, 1024)]            0         
_________________________________________________________________
dense (Dense)                (None, 5)                 5125      
Total params: 559,895,557
Trainable params: 559,895,557
Non-trainable params: 0
_________________________________________________________________
CPU times: user 2min 18s, sys: 44.7 s, total: 3min 3s
Wall time: 8min 49s


## Train Model

In [20]:
# train_dataset repeats forever, so Keras must be told how many batches form
# one epoch: the number of full batches in the training split.
n_steps = len(x_train) // BATCH_SIZE
train_history = model.fit(
    x=train_dataset,
    epochs=EPOCHS,
    steps_per_epoch=n_steps,
    validation_data=valid_dataset,
)

Epoch 1/2


  num_elements)


Epoch 2/2


## Saving weights

In [21]:
import pickle

In [22]:
%%time

with strategy.scope():
    w = model.get_weights()

# The weight arrays have different shapes. Passing the list straight to
# np.save relies on implicit ragged-to-object conversion, which newer NumPy
# deprecates and then rejects with a ValueError. Build the 1-D object array
# explicitly, one element per weight array (references only, no data copy).
weights_to_save = np.empty(len(w), dtype=object)
for idx, layer_weights in enumerate(w):
    weights_to_save[idx] = layer_weights

# Object arrays are stored via pickle; load them back with
# np.load('weights_5.npy', allow_pickle=True) and pass list(...) of the
# result to model.set_weights.
with open('weights_5.npy', 'wb') as f:
    np.save(f, weights_to_save, allow_pickle=True)

CPU times: user 2.19 s, sys: 6.29 s, total: 8.48 s
Wall time: 9.98 s


In [23]:
from IPython.display import FileLink
FileLink(r'weights_5.npy')

In [24]:
# Persist the fitted label encoder next to the weights so predicted class ids
# can be mapped back to intention labels outside this notebook.
# Only unpickle this file from a trusted source.
with open('label_encoder_5.pkl', 'wb') as f:
    pickle.dump(target_encoder, f)

In [25]:
# transformer_layer_1 = TFAutoModel.from_pretrained(MODEL)
# model_1 = build_model(transformer_layer_1, max_len=MAX_LEN)
# model_1.set_weights(w)

In [26]:
# model_1.predict(valid_dataset, verbose=1)

## Creating the final wrapper predictor

In [27]:
class IntentionPredictor:
    """Bundle a trained classifier with its tokenizer and label encoder.

    ``predict`` takes raw texts and returns the decoded labels, hiding the
    tokenization, batching, model inference and label decoding steps.
    """

    def __init__(self, model, tokenizer, label_encoder, batch_size=4, max_len=192, verbose=0):
        """Store the collaborators and inference settings.

        Args:
            model: Object with a Keras-style ``predict(dataset, verbose=...)``.
            tokenizer: Object exposing ``batch_encode_plus``.
            label_encoder: Object exposing ``inverse_transform`` (e.g. a
                fitted scikit-learn ``LabelEncoder``).
            batch_size: Number of encoded texts per inference batch.
            max_len: Sequence length passed to the tokenizer.
            verbose: Verbosity flag forwarded to ``model.predict``.
        """
        self._model = model
        self._tokenizer = tokenizer
        self._label_encoder = label_encoder
        self._batch_size = batch_size
        self._max_len = max_len
        self._verbose = verbose

    @staticmethod
    def _regular_encode(texts, tokenizer, maxlen=512):
        """Return the tokenizer's ``'input_ids'`` for ``texts`` as an array.

        Padding to ``maxlen`` is requested; attention masks and token-type
        ids are switched off because only the ids are used.
        """
        enc_di = tokenizer.batch_encode_plus(
            texts,
            return_attention_masks=False,
            return_token_type_ids=False,
            pad_to_max_length=True,
            max_length=maxlen
        )

        return np.array(enc_di['input_ids'])

    def predict(self, texts):
        """Predict a label for every text.

        Args:
            texts: A single string or a sequence / array of strings.

        Returns:
            Array of decoded labels, one per input text; an empty array when
            ``texts`` is empty.
        """
        # atleast_1d lets a single string be treated as a batch of one.
        input_texts = np.atleast_1d(np.asarray(texts))
        if input_texts.size == 0:
            # Nothing to classify: skip the tokenizer and the model entirely.
            return np.array([], dtype=object)

        x_test = self._regular_encode(input_texts, self._tokenizer, maxlen=self._max_len)

        test_dataset = (
            tf.data.Dataset
            .from_tensor_slices(x_test)
            .batch(self._batch_size)
        )

        y_test = np.asarray(self._model.predict(test_dataset, verbose=self._verbose))
        # One row of class probabilities per text -> index of the largest one.
        categories = np.argmax(y_test, axis=1)

        return self._label_encoder.inverse_transform(categories)

In [28]:
# Reuse the training batch size and sequence length for inference; verbose=1
# is forwarded to model.predict.
predictor = IntentionPredictor(model, tokenizer, target_encoder, batch_size=BATCH_SIZE, max_len=MAX_LEN, verbose=1)

## Getting predictions for the real dataset

In [30]:
# Load the tab-separated reviews file that is to be labelled.
df_test = pd.read_csv('/kaggle/input/sbercode-2/all_reviews_v3.tsv', sep='\t')

df_test.head()

Unnamed: 0.1,Unnamed: 0,Date,App version,Rating,Title,Review,Country,Link,Platform
0,0,2020-07-29 06:53:05,11.1.0,5,Сбербанк онлайн,Не скачивается стала обновлять и зависло прило...,ru,https://appstoreconnect.apple.com/WebObjects/i...,App Store
1,1,2020-07-29 06:28:50,11.1.0,3,.,Приложение не открывается после обновления!!!!...,ru,https://appstoreconnect.apple.com/WebObjects/i...,App Store
2,2,2020-07-29 05:23:33,11.1.0,3,Касательно работы обновлённой версии,"Добрый день, после последнего обновления пропа...",ru,https://appstoreconnect.apple.com/WebObjects/i...,App Store
3,3,2020-07-29 03:06:06,11.1.0,1,.,Не могу скачать так как требует подключения Wi-fi,ru,https://appstoreconnect.apple.com/WebObjects/i...,App Store
4,4,2020-07-29 01:47:35,11.1.0,1,ApplePay,Почему то не могу оплатить при минусе на балан...,ru,https://appstoreconnect.apple.com/WebObjects/i...,App Store


In [31]:
# True where 'Review' is present (not NaN); only these rows are sent to the
# predictor.
mask = ~pd.isna(df_test['Review'])

df_test[mask].head()

Unnamed: 0.1,Unnamed: 0,Date,App version,Rating,Title,Review,Country,Link,Platform
0,0,2020-07-29 06:53:05,11.1.0,5,Сбербанк онлайн,Не скачивается стала обновлять и зависло прило...,ru,https://appstoreconnect.apple.com/WebObjects/i...,App Store
1,1,2020-07-29 06:28:50,11.1.0,3,.,Приложение не открывается после обновления!!!!...,ru,https://appstoreconnect.apple.com/WebObjects/i...,App Store
2,2,2020-07-29 05:23:33,11.1.0,3,Касательно работы обновлённой версии,"Добрый день, после последнего обновления пропа...",ru,https://appstoreconnect.apple.com/WebObjects/i...,App Store
3,3,2020-07-29 03:06:06,11.1.0,1,.,Не могу скачать так как требует подключения Wi-fi,ru,https://appstoreconnect.apple.com/WebObjects/i...,App Store
4,4,2020-07-29 01:47:35,11.1.0,1,ApplePay,Почему то не могу оплатить при минусе на балан...,ru,https://appstoreconnect.apple.com/WebObjects/i...,App Store


In [32]:
# Predict an intention for every row that has review text and store it in a
# new 'Intention' column; rows outside `mask` receive no prediction.
df_test.loc[mask, 'Intention'] = predictor.predict(df_test.loc[mask, 'Review'].values)

df_test.loc[mask].head()



Unnamed: 0.1,Unnamed: 0,Date,App version,Rating,Title,Review,Country,Link,Platform,Intention
0,0,2020-07-29 06:53:05,11.1.0,5,Сбербанк онлайн,Не скачивается стала обновлять и зависло прило...,ru,https://appstoreconnect.apple.com/WebObjects/i...,App Store,OTHER
1,1,2020-07-29 06:28:50,11.1.0,3,.,Приложение не открывается после обновления!!!!...,ru,https://appstoreconnect.apple.com/WebObjects/i...,App Store,PROBLEM DISCOVERY
2,2,2020-07-29 05:23:33,11.1.0,3,Касательно работы обновлённой версии,"Добрый день, после последнего обновления пропа...",ru,https://appstoreconnect.apple.com/WebObjects/i...,App Store,OTHER
3,3,2020-07-29 03:06:06,11.1.0,1,.,Не могу скачать так как требует подключения Wi-fi,ru,https://appstoreconnect.apple.com/WebObjects/i...,App Store,PROBLEM DISCOVERY
4,4,2020-07-29 01:47:35,11.1.0,1,ApplePay,Почему то не могу оплатить при минусе на балан...,ru,https://appstoreconnect.apple.com/WebObjects/i...,App Store,PROBLEM DISCOVERY


In [33]:
# Write the whole frame, including rows without a prediction, as TSV and
# omit the index column.
df_test.to_csv('test_data_v5.tsv', sep='\t', index=False)