In [9]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_paths:
    print(input_path)

In [16]:
# Fetch tokenization.py from the TensorFlow models repository into the working directory.
# NOTE(review): --quiet suppresses wget's error output, so a failed download (e.g. if this
# path no longer exists on the master branch) goes unnoticed — confirm the file was fetched.
!wget --quiet https://raw.githubusercontent.com/tensorflow/models/master/official/nlp/bert/tokenization.py

In [17]:
!pip install bert-tensorflow

Collecting bert-tensorflow
  Downloading bert_tensorflow-1.0.4-py2.py3-none-any.whl (64 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.4/64.4 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: bert-tensorflow
Successfully installed bert-tensorflow-1.0.4
[0m

In [18]:
pip install bert-for-tf2

[0mNote: you may need to restart the kernel to use updated packages.


In [19]:
pip install bert-tokenizer

Collecting bert-tokenizer
  Using cached bert_tokenizer-0.1.5-py3-none-any.whl (1.2 MB)
Installing collected packages: bert-tokenizer
Successfully installed bert-tokenizer-0.1.5
[0mNote: you may need to restart the kernel to use updated packages.


In [20]:
from bert import bert_tokenization
BertTokenizer = bert_tokenization.FullTokenizer

In [21]:
import bert_tokenizer as tokenization

tokenization.tokenizer.FullTokenizer

bert_tokenizer.tokenizer.FullTokenizer

In [22]:
import bert_tokenizer as tokenizer

In [23]:
pip install tokenization

Collecting tokenization
  Downloading tokenization-1.0.7-py3-none-any.whl (10 kB)
Installing collected packages: tokenization
Successfully installed tokenization-1.0.7
[0mNote: you may need to restart the kernel to use updated packages.


In [24]:
import tokenization
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.keras.utils import to_categorical
from sklearn import preprocessing
from sklearn.model_selection import train_test_split


In [25]:
# TF-Hub handle of the BERT encoder to load (uncased English, L-12 / H-768 / A-12, version 2
# according to the URL path).
m_url = 'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/2'
# Wrap the hub module as a Keras layer; trainable=True is passed so the encoder weights are
# not frozen when the model built on top of it is trained.
bert_layer = hub.KerasLayer(m_url, trainable=True)

In [26]:
# Read the vocabulary file path and the lower-casing flag stored on the loaded hub object,
# so the tokenizer is configured from the same assets as the encoder itself.
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
# `bert_tokenization` comes from the earlier `from bert import bert_tokenization` cell.
# NOTE: this rebinds `tokenizer`, which an earlier cell bound to the `bert_tokenizer` module.
tokenizer = bert_tokenization.FullTokenizer(vocab_file, do_lower_case)

def bert_encode(texts, tokenizer, max_len=512):
    """Encode raw strings into the three fixed-length integer inputs fed to BERT.

    Each text is split with ``tokenizer.tokenize``, truncated so that it fits
    together with the ``[CLS]`` and ``[SEP]`` markers, converted to ids with
    ``tokenizer.convert_tokens_to_ids`` and right-padded with ``0`` up to
    ``max_len``.

    Args:
        texts: Iterable of strings to encode. A single ``str`` is treated as
            one text rather than being iterated character by character.
        tokenizer: Object exposing ``tokenize(text)`` (returns a list of
            tokens) and ``convert_tokens_to_ids(tokens)`` (returns a list of
            integer ids).
        max_len: Length of every encoded row, including the two special
            tokens. Must be at least 2.

    Returns:
        Tuple ``(token_ids, pad_masks, segment_ids)`` of integer arrays, each
        of shape ``(number_of_texts, max_len)``. ``pad_masks`` is 1 on real
        tokens and 0 on padding; ``segment_ids`` is all zeros.

    Raises:
        ValueError: If ``max_len`` is smaller than 2, because the row could
            not hold ``[CLS]`` and ``[SEP]``.
    """
    if max_len < 2:
        # With max_len < 2 the slice bound below goes negative and the rows
        # would silently come out longer than max_len.
        raise ValueError("max_len must be at least 2 to hold [CLS] and [SEP], got %r" % (max_len,))

    if isinstance(texts, str):
        # A bare string would otherwise be encoded one character per row.
        texts = [texts]

    all_tokens = []
    all_masks = []
    all_segments = []

    for text in texts:
        # Reserve two positions for the [CLS] / [SEP] markers.
        pieces = tokenizer.tokenize(text)[:max_len - 2]
        input_sequence = ["[CLS]"] + pieces + ["[SEP]"]
        pad_len = max_len - len(input_sequence)

        all_tokens.append(tokenizer.convert_tokens_to_ids(input_sequence) + [0] * pad_len)
        all_masks.append([1] * len(input_sequence) + [0] * pad_len)
        all_segments.append([0] * max_len)

    # dtype=int keeps NumPy's default integer type; the reshape gives an empty
    # input the shape (0, max_len) instead of a 1-D float array.
    return (
        np.array(all_tokens, dtype=int).reshape(-1, max_len),
        np.array(all_masks, dtype=int).reshape(-1, max_len),
        np.array(all_segments, dtype=int).reshape(-1, max_len),
    )

In [27]:
def build_model(bert_layer, max_len=512, num_classes=3, learning_rate=1e-5):
    """Build and compile a softmax classifier on top of a BERT encoder layer.

    The network takes the three BERT inputs (word ids, mask, segment ids),
    runs them through ``bert_layer``, keeps the hidden state at sequence
    position 0 and feeds it through Dense(64) -> Dropout -> Dense(32) ->
    Dropout -> Dense(``num_classes``, softmax).

    Args:
        bert_layer: Callable Keras layer that accepts
            ``[input_word_ids, input_mask, segment_ids]`` and returns
            ``(pooled_output, sequence_output)``.
        max_len: Sequence length of every input; must equal the ``max_len``
            used when encoding texts.
        num_classes: Number of output classes. The default of 3 reproduces
            the original architecture.
        learning_rate: Learning rate passed to the Adam optimizer.

    Returns:
        A compiled ``tf.keras`` model using categorical cross-entropy loss and
        an accuracy metric.
    """
    input_word_ids = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    input_mask = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="input_mask")
    segment_ids = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="segment_ids")

    # Only the per-token output is used; the pooled output is discarded.
    _, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])
    # Hidden state at position 0 — where bert_encode places the [CLS] token.
    clf_output = sequence_output[:, 0, :]

    net = tf.keras.layers.Dense(64, activation='relu')(clf_output)
    net = tf.keras.layers.Dropout(0.2)(net)
    net = tf.keras.layers.Dense(32, activation='relu')(net)
    net = tf.keras.layers.Dropout(0.2)(net)
    out = tf.keras.layers.Dense(num_classes, activation='softmax')(net)

    model = tf.keras.models.Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=out)
    # `learning_rate` is the supported keyword; `lr` is a deprecated alias that
    # newer Keras releases reject.
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )

    return model

In [28]:
# Build the classifier for 250-token inputs; the same max_len has to be passed to bert_encode
# when encoding texts, because the model's input layers are created with shape (max_len,).
model = build_model(bert_layer, max_len=250)
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_word_ids (InputLayer)    [(None, 250)]        0           []                               
                                                                                                  
 input_mask (InputLayer)        [(None, 250)]        0           []                               
                                                                                                  
 segment_ids (InputLayer)       [(None, 250)]        0           []                               
                                                                                                  
 keras_layer (KerasLayer)       [(None, 768),        109482241   ['input_word_ids[0][0]',         
                                 (None, 250, 768)]                'input_mask[0][0]',         

In [29]:
!conda install -y gdown

Collecting package metadata (current_repodata.json): done
Solving environment: / 
The environment is inconsistent, please check the package plan carefully
The following packages are causing the inconsistency:

  - rapidsai/linux-64::libcuml==21.12.00=cuda11_g04c4927f3_0
  - conda-forge/linux-64::abseil-cpp==20211102.0=h93e1e8c_3
  - rapidsai/linux-64::dask-cudf==21.12.02=cuda_11_py37_g06540b9b37_0
  - conda-forge/linux-64::pyarrow==5.0.0=py37h8cf84b7_35_cuda
  - rapidsai/linux-64::cuml==21.12.00=cuda11_py37_g04c4927f3_0
  - conda-forge/linux-64::grpc-cpp==1.45.2=he70e3f0_3
  - rapidsai/linux-64::libcudf==21.12.02=cuda11_g06540b9b37_0
  - conda-forge/linux-64::arrow-cpp==5.0.0=py37h846d386_35_cuda
  - rapidsai/linux-64::cudf==21.12.02=cuda_11_py37_g06540b9b37_0
  - conda-forge/noarch::parquet-cpp==1.5.1=2
  - conda-forge/linux-64::libabseil==20211102.0=cxx17_h48a1fff_3
done


  current version: 22.9.0
  latest version: 23.3.0

Please update conda by running

    $ conda update -n base -

In [30]:
!gdown --id 1166w2d83g-dPHOdE_WesyymKzNHDT2ZC

Downloading...
From (uriginal): https://drive.google.com/uc?id=1166w2d83g-dPHOdE_WesyymKzNHDT2ZC
From (redirected): https://drive.google.com/uc?id=1166w2d83g-dPHOdE_WesyymKzNHDT2ZC&confirm=t&uuid=b7096881-fdaa-4fce-8b9d-3a3abdd73fc1
To: /kaggle/working/model.h5
100%|██████████████████████████████████████| 1.32G/1.32G [00:13<00:00, 98.4MB/s]


In [31]:
from IPython.display import FileLink
FileLink(r'model.h5')

In [32]:
# Load the weights from model.h5 (the file the gdown cell reports writing to /kaggle/working)
# into the model built above.
# NOTE(review): presumably the file was saved from this same architecture — confirm.
model.load_weights('model.h5')

In [34]:
eng_text="Welcome to YC is your career more focused on the party thing.I had a desire to be with everyone. I wanted to make good people laugh by doing good comedy. Those desires were only at a young age. Tamil stand up comedy Kulla Piya how did you come was very good. It will not be shown to anyone. What do you think about Coimbatore?If you like it, you may be happy, there will be trees. People who know how to come and die. People who know how to come and die will always be there. Coimbatore people will come and be so kind. That&#39;s why Tanheemoguzhi is the end of the coin and the card company. This is all. Don&#39;t change your time with me. I&#39;m just for comedy. I got a lot of hits on this video and the comments run your videoMost of the comments come and say that he is a terrorist group and we are opposing our system, so who is opposing, and his history, we will give you the amount of time to seconds. Pannan also told me that now I am a studentYo, I was watching, even when I was coming, I felt like something like a kibit on the way to see everything."

In [47]:
x=["How mean you are. I will kill you if these persists. Its a very long day after hearing brutal comments"]

In [48]:
# Encode the sample text(s) in `x`; max_len=250 matches the length the model was built with.
# NOTE: despite the `train_` prefix, this is the input passed to model.predict in the next cell.
train_input = bert_encode(x, tokenizer, max_len=250)

In [49]:
test_pred = model.predict(train_input)



In [50]:
test_pred[0]

array([0.19789992, 0.60688466, 0.1952154 ], dtype=float32)

In [41]:
test_pred[0][0]

0.0022013104

0.0009964705