In [1]:
# Dependencies for the KoBERT inference stack.
# %pip (rather than !pip) installs into the environment of the running kernel.
# NOTE(review): tqdm, pandas and torch are not version-pinned here; pin them
# once known-good versions are confirmed so a fresh run stays reproducible.
%pip install numpy==1.23.1
%pip install mxnet==1.6.0
%pip install gluonnlp==0.9.1
%pip install tqdm pandas
%pip install sentencepiece==0.1.96
%pip install torch
%pip install transformers==4.28.1

Collecting gluonnlp==0.9.1
  Using cached gluonnlp-0.9.1.tar.gz (252 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: gluonnlp
  Building wheel for gluonnlp (setup.py) ... [?25l[?25hdone
  Created wheel for gluonnlp: filename=gluonnlp-0.9.1-cp310-cp310-linux_x86_64.whl size=564572 sha256=9695eeb678576633ee4ec8565a390352935e511d197126dc32d306c8ef2b5d85
  Stored in directory: /root/.cache/pip/wheels/fc/5b/9c/3295bb07f7c5544a96303a48988707816f44a536e8e1413922
Successfully built gluonnlp
Installing collected packages: gluonnlp
Successfully installed gluonnlp-0.9.1
Collecting sentencepiece==0.1.96
  Downloading sentencepiece-0.1.96-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentencepiece
  Attempting uninstall: sentencepiece
    Found existing installation: sentenc

In [2]:
# Install the KoBERT Hugging Face tokenizer straight from the upstream repository.
# Pinned to the commit this notebook was last run against so that later upstream
# changes cannot silently alter tokenization.
%pip install 'git+https://github.com/SKTBrain/KoBERT.git@47a69af87928fc24e20f571fe10c3cc9dd9af9a3#egg=kobert_tokenizer&subdirectory=kobert_hf'

Collecting kobert_tokenizer
  Cloning https://github.com/SKTBrain/KoBERT.git to /tmp/pip-install-f9kbf8o3/kobert-tokenizer_bb9cde93c6344e8da59738aeb930ccc7
  Running command git clone --filter=blob:none --quiet https://github.com/SKTBrain/KoBERT.git /tmp/pip-install-f9kbf8o3/kobert-tokenizer_bb9cde93c6344e8da59738aeb930ccc7
  Resolved https://github.com/SKTBrain/KoBERT.git to commit 47a69af87928fc24e20f571fe10c3cc9dd9af9a3
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: kobert_tokenizer
  Building wheel for kobert_tokenizer (setup.py) ... [?25l[?25hdone
  Created wheel for kobert_tokenizer: filename=kobert_tokenizer-0.1-py3-none-any.whl size=4633 sha256=b356daad55ae4c558cc7296be1a326818e20cf39f1a2c29ae82a747b33d89eaf
  Stored in directory: /tmp/pip-ephem-wheel-cache-tbqtrxmp/wheels/e9/1a/3f/a864970e8a169c176befa3c4a1e07aa612f69195907a4045fe
Successfully built kobert_tokenizer
Installing collected packages: kobert_tokenizer
Successfully ins

In [3]:
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import gluonnlp as nlp
import numpy as np
from tqdm import tqdm, tqdm_notebook

In [4]:
from kobert_tokenizer import KoBERTTokenizer
from transformers import BertModel

from transformers import AdamW
from transformers.optimization import get_cosine_schedule_with_warmup

In [5]:
# Load the KoBERT tokenizer from the 'skt/kobert-base-v1' checkpoint.
tokenizer = KoBERTTokenizer.from_pretrained('skt/kobert-base-v1')
# return_dict=False: BERTClassifier.forward unpacks the encoder output as a tuple.
bertmodel = BertModel.from_pretrained('skt/kobert-base-v1', return_dict=False)
# Build a gluonnlp BERT vocabulary from the tokenizer's SentencePiece file,
# using '[PAD]' as the padding token.
vocab = nlp.vocab.BERTVocab.from_sentencepiece(tokenizer.vocab_file, padding_token='[PAD]')
# Bare tokenize callable; this is what gets passed to BERTDataset in predict().
tok = tokenizer.tokenize

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


spiece.model:   0%|          | 0.00/371k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/244 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/432 [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'XLNetTokenizer'. 
The class this function is called from is 'KoBERTTokenizer'.


config.json:   0%|          | 0.00/535 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/369M [00:00<?, ?B/s]

In [6]:
class BERTDataset(Dataset):
    """Dataset of BERT input features paired with an integer label.

    Every row of ``dataset`` is indexed with ``sent_idx`` to obtain the text
    and with ``label_idx`` to obtain the label. All rows are transformed once,
    at construction time, with ``nlp.data.BERTSentenceTransform``.

    Args:
        dataset: Iterable of indexable rows.
        sent_idx: Index of the sentence inside each row.
        label_idx: Index of the label inside each row.
        bert_tokenizer: Tokenizer callable forwarded to the transform.
        vocab: Vocabulary forwarded to the transform.
        max_len: Forwarded to the transform as ``max_seq_length``.
        pad: Forwarded to the transform as ``pad``.
        pair: Forwarded to the transform as ``pair``.
    """

    def __init__(self, dataset, sent_idx, label_idx, bert_tokenizer, vocab, max_len,
                 pad, pair):
        to_features = nlp.data.BERTSentenceTransform(
            bert_tokenizer, max_seq_length=max_len, vocab=vocab, pad=pad, pair=pair)

        # Two separate passes over `dataset`: features first, labels second.
        self.sentences = list(map(lambda row: to_features([row[sent_idx]]), dataset))
        self.labels = list(map(lambda row: np.int32(row[label_idx]), dataset))

    def __getitem__(self, i):
        """Return the i-th feature tuple with its label appended as the last item."""
        features = self.sentences[i]
        label = self.labels[i]
        return features + (label,)

    def __len__(self):
        """Return the number of samples (one label per sample)."""
        return len(self.labels)

In [7]:
class BERTClassifier(nn.Module):
    """Linear classification head on top of a BERT encoder's pooled output.

    Args:
        bert: Encoder module. It is called with ``input_ids``,
            ``token_type_ids`` and ``attention_mask`` keyword arguments and
            must return a 2-tuple whose second element is the pooled output.
        hidden_size: Size of the pooled output fed to the linear layer.
        num_classes: Number of output classes.
        dr_rate: Dropout probability applied to the pooled output. ``None``
            or ``0`` disables dropout.
        params: Unused; kept so existing callers keep working.
    """

    def __init__(self,
                 bert,
                 hidden_size = 768,
                 num_classes=2,
                 dr_rate=None,
                 params=None):
        super(BERTClassifier, self).__init__()
        self.bert = bert
        self.dr_rate = dr_rate

        self.classifier = nn.Linear(hidden_size , num_classes)
        if dr_rate:
            self.dropout = nn.Dropout(p=dr_rate)

    def gen_attention_mask(self, token_ids, valid_length):
        """Build a float mask with 1.0 on the first ``valid_length[i]`` positions of row i.

        Args:
            token_ids: 2-D tensor; the mask has the same shape and device.
            valid_length: One length per row (tensor or sequence of ints).

        Returns:
            Float tensor shaped like ``token_ids``.
        """
        # Compare every column index against the row's valid length in one
        # vectorized step instead of slicing row by row.
        positions = torch.arange(token_ids.size(1), device=token_ids.device).unsqueeze(0)
        lengths = torch.as_tensor(valid_length, device=token_ids.device).reshape(-1, 1)
        return (positions < lengths).float()

    def forward(self, token_ids, valid_length, segment_ids):
        """Return the classifier logits for a batch.

        Args:
            token_ids: 2-D tensor of token ids.
            valid_length: Number of non-padding tokens per row.
            segment_ids: Tensor of segment ids; cast to long before use.

        Returns:
            Output of the linear layer applied to the (optionally dropped-out)
            pooled encoder output.
        """
        # The mask is already float and on token_ids' device.
        attention_mask = self.gen_attention_mask(token_ids, valid_length)

        _, pooler = self.bert(input_ids=token_ids,
                              token_type_ids=segment_ids.long(),
                              attention_mask=attention_mask)
        # Without dropout the pooled output goes straight to the classifier;
        # previously `out` was left unassigned in that case.
        out = self.dropout(pooler) if self.dr_rate else pooler
        return self.classifier(out)

In [8]:
# Mount Google Drive at /content/gdrive (Colab only). The checkpoint path used
# when loading the model points inside this mount, so this cell is required,
# not just a debugging aid.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [9]:
# Checkpoint file on the mounted Google Drive.
path = '/content/gdrive/MyDrive/trained_model.pth'
# Inference runs on CPU; the same device is passed as map_location below.
device = torch.device("cpu")
# Wrap the pretrained KoBERT encoder in the classification head (dropout p=0.5).
model = BERTClassifier(bertmodel,  dr_rate=0.5).to(device)
# NOTE(review): torch.load unpickles the file, so only load checkpoints from a
# trusted source.
model.load_state_dict(torch.load(path, map_location=device))

<All keys matched successfully>

In [10]:
# Maximum sequence length; predict() passes it to BERTDataset, which forwards
# it to BERTSentenceTransform as max_seq_length.
max_len = 64
# Batch size given to the DataLoader built in predict().
batch_size = 128

In [11]:
def predict(predict_sentence):
    """Classify one sentence with the loaded classifier and print the verdict.

    Relies on the notebook globals ``tok``, ``vocab``, ``max_len``,
    ``batch_size``, ``model`` and ``device``.

    Args:
        predict_sentence: Text to classify.

    Returns:
        int: Index of the highest-scoring class. Index 0 is printed as
        "normal" and index 1 as "criminal"; any other index prints nothing.
    """
    # BERTDataset needs a label column; '0' is a placeholder that is never used.
    samples = [[predict_sentence, '0']]
    dataset = BERTDataset(samples, 0, 1, tok, vocab, max_len, True, False)
    # num_workers=0: the dataset holds a single sample, so spawning worker
    # processes on every call only adds latency (notably per HTTP request).
    loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, num_workers=0)

    model.eval()
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        token_ids, valid_length, segment_ids, _ = next(iter(loader))
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        out = model(token_ids, valid_length, segment_ids)

    # One sentence in, so only the first row of logits is relevant.
    logits = out[0].detach().cpu().numpy()
    predicted = int(np.argmax(logits))

    if predicted == 0:
        print(f"{predict_sentence}: normal")
    elif predicted == 1:
        print(f"{predict_sentence}: criminal")
    return predicted

In [17]:
# debug
# Interactive smoke test: classify each typed sentence until the user
# enters "exit model".
sentence = input("input text : ")
while sentence != "exit model":
    predict(sentence)
    print("\n")
    sentence = input("input text : ")

input text : exit model


In [14]:
# Serving dependencies, pinned to the versions this notebook was last run with.
# %pip (rather than !pip) installs into the environment of the running kernel.
# NOTE(review): the serving cell imports only pyngrok and flask; flask-ngrok
# looks unused and could probably be dropped.
%pip install flask-ngrok==0.0.25
%pip install pyngrok==7.1.6

Collecting flask-ngrok
  Downloading flask_ngrok-0.0.25-py3-none-any.whl (3.1 kB)
Installing collected packages: flask-ngrok
Successfully installed flask-ngrok-0.0.25
Collecting pyngrok
  Downloading pyngrok-7.1.6-py3-none-any.whl (22 kB)
Installing collected packages: pyngrok
Successfully installed pyngrok-7.1.6


In [18]:
import os
from getpass import getpass

from pyngrok import ngrok
from flask import Flask, request, jsonify

# The ngrok authtoken is a credential and must never be stored in the notebook.
# It is read from the NGROK_AUTHTOKEN environment variable, with an interactive
# prompt as fallback. Any token that was ever committed in a notebook should be
# treated as leaked and rotated in the ngrok dashboard.
ngrok_token = os.environ.get("NGROK_AUTHTOKEN") or getpass("ngrok authtoken: ")
ngrok.set_auth_token(ngrok_token)

app = Flask(__name__)

port = 5001
# Open a public ngrok tunnel to the local Flask port.
public_url = ngrok.connect(port)
print(" * ngrok URL:", public_url)

@app.route('/predict_chat', methods=['POST'])
def predict_chat():
    """Classify the sentence in a JSON POST body.

    Expects ``{"sentence": "<text>"}`` and responds with
    ``{"value": <class index>}``. A body that is not a JSON object with a
    string ``sentence`` field gets a 400 response instead of an unhandled
    exception.
    """
    print("received")
    # silent=True returns None for a missing or invalid JSON body.
    data = request.get_json(silent=True)
    sentence = data.get('sentence') if isinstance(data, dict) else None
    if not isinstance(sentence, str):
        return jsonify({'error': "JSON body must contain a string field 'sentence'"}), 400
    result = predict(sentence)
    return jsonify({'value': result})

# Blocks the cell while serving; interrupt the kernel to stop.
app.run(port=port)


Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml
 * ngrok URL: NgrokTunnel: "https://3d46-34-125-221-251.ngrok-free.app" -> "http://localhost:5001"
 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5001
INFO:werkzeug:[33mPress CTRL+C to quit[0m


received


INFO:werkzeug:127.0.0.1 - - [17/Jun/2024 13:37:17] "POST /predict_chat HTTP/1.1" 200 -


안녕하세요: normal
received


INFO:werkzeug:127.0.0.1 - - [17/Jun/2024 13:37:57] "POST /predict_chat HTTP/1.1" 200 -


너 옷벗은 사진 나한테 보내봐: criminal
received


INFO:werkzeug:127.0.0.1 - - [17/Jun/2024 13:38:32] "POST /predict_chat HTTP/1.1" 200 -


전부 다 벗고 찍어야돼: criminal
received


INFO:werkzeug:127.0.0.1 - - [17/Jun/2024 13:38:46] "POST /predict_chat HTTP/1.1" 200 -


내꺼 보여줄테니까 너도 보내주면 안돼?: criminal
received


INFO:werkzeug:127.0.0.1 - - [17/Jun/2024 13:39:00] "POST /predict_chat HTTP/1.1" 200 -


너 사는데 찾아가서 내가 죽여버릴꺼야: criminal
received
너네 부모님까지 싹다 죽여버릴꺼야: criminal


INFO:werkzeug:127.0.0.1 - - [17/Jun/2024 13:39:22] "POST /predict_chat HTTP/1.1" 200 -


received
안녕하세요: normal


INFO:werkzeug:127.0.0.1 - - [17/Jun/2024 13:53:26] "POST /predict_chat HTTP/1.1" 200 -


received


INFO:werkzeug:127.0.0.1 - - [17/Jun/2024 13:54:12] "POST /predict_chat HTTP/1.1" 200 -


너 옷벗은 사진 나한테 다 보내: criminal
received


INFO:werkzeug:127.0.0.1 - - [17/Jun/2024 13:54:40] "POST /predict_chat HTTP/1.1" 200 -


너 죽여버리기 전에 당장 보내: criminal
received


INFO:werkzeug:127.0.0.1 - - [17/Jun/2024 13:54:57] "POST /predict_chat HTTP/1.1" 200 -


너 사는데 찾아가서 내가 죽여버릴꺼야: criminal
received
팬티까지 벗어: criminal


INFO:werkzeug:127.0.0.1 - - [17/Jun/2024 13:55:06] "POST /predict_chat HTTP/1.1" 200 -


received


INFO:werkzeug:127.0.0.1 - - [17/Jun/2024 13:55:15] "POST /predict_chat HTTP/1.1" 200 -


내꺼 보낼테니까 니 벗은 사진도 같이 보내: criminal
