# Load Model, Real Predictions

 - load wav file, STT

In [1]:
# 필요 import 문

import urllib3
import json
import base64
import kss
import torch
from torch import nn
import torch.nn.functional as F
from transformers import BertModel, BertTokenizer
import gluonnlp as nlp
from tqdm.notebook import tqdm
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from kobert_tokenizer import KoBERTTokenizer



In [2]:
## STT 관련 필요 코드

openApiURL = "http://aiopen.etri.re.kr:8000/WiseASR/Recognition"
accessKey = "624e30a6-6843-4f56-a417-3685c5ceda16"  # replace with your key
languageCode = "korean"
http = urllib3.PoolManager()

In [3]:
##### STT 변환 함수 정의 #####

def wav_to_stt(filename, openApiURL, languageCode, accessKey, http):
    result = []

    with open(filename, "rb") as file:
        audioContents = base64.b64encode(file.read()).decode("utf8")

    requestJson = {
        "argument": {
            "language_code": languageCode,
            "audio": audioContents
        }
    }

    response = http.request(
        "POST",
        openApiURL,
        headers={
            "Content-Type": "application/json; charset=UTF-8",
            "Authorization": accessKey
        },
        body=json.dumps(requestJson)
    )

    try:
        response_body = json.loads(response.data.decode('utf-8'))
    except json.JSONDecodeError:
        print("Failed to decode the response as JSON:")
        raise

    stt_result = response_body.get('return_object', {}).get('recognized', '')
    result = stt_result

    return result

In [4]:
def stt_to_kss(result):
    sentences = kss.split_sentences(result)
    return sentences

In [5]:
##### KoBERT 모델 관련 코드 #####

tokenizer = KoBERTTokenizer.from_pretrained('skt/kobert-base-v1')
bertmodel = BertModel.from_pretrained('skt/kobert-base-v1', return_dict=False)
vocab = nlp.vocab.BERTVocab.from_sentencepiece(tokenizer.vocab_file, padding_token='[PAD]')

class BERTClassifier(nn.Module):
    def __init__(self, bert, hidden_size=768, num_classes=2, dr_rate=None, params=None):
        super(BERTClassifier, self).__init__()
        self.bert = bert
        self.dr_rate = dr_rate

        self.classifier = nn.Linear(hidden_size, num_classes)
        if dr_rate:
            self.dropout = nn.Dropout(p=dr_rate)

    def gen_attention_mask(self, token_ids, valid_length):
        attention_mask = torch.zeros_like(token_ids)
        for i, v in enumerate(valid_length):
            attention_mask[i][:v] = 1
        return attention_mask.float()

    def forward(self, token_ids, valid_length, segment_ids):
        attention_mask = self.gen_attention_mask(token_ids, valid_length)

        _, pooler = self.bert(input_ids=token_ids, token_type_ids=segment_ids.long(), attention_mask=attention_mask.float().to(token_ids.device))
        if self.dr_rate:
            out = self.dropout(pooler)
        return self.classifier(out)

In [6]:
# 예측 후 확률값을 출력하는 함수

def predict_and_print(sentences):
    model.eval()

    # Tokenize all sentences at once
    tokenized_sent = tokenizer(
        sentences,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=128
    )

    token_ids = tokenized_sent['input_ids'].to(device)
    segment_ids = tokenized_sent['token_type_ids'].to(device)
    valid_length = torch.tensor([len(tokenizer.tokenize(sentence)) for sentence in sentences]).to(device)

    with torch.no_grad():
        logits = model(token_ids, valid_length, segment_ids)
        probabilities = F.softmax(logits, dim=1)
        _, predicted = torch.max(logits, 1)

    # Print the results
    for i, (sentence, label, probs) in enumerate(zip(sentences, predicted, probabilities)):
        print(f"Sentence {i + 1}: {sentence}")
        print("Predicted Label:", label.item())
        print("Predicted Probabilities:\n", probs.cpu().numpy())
        print("----------")

    print("Predicted Labels:", predicted)
    print("Predicted Probabilities:\n", probabilities)


In [7]:
# 그래픽 카드 사용

if torch.cuda.is_available():
    device = 'cuda:0'
#     print('현재 가상환경 GPU 사용 가능상태')
else:
    device = 'cpu'
#     print('GPU 사용 불가능 상태')

In [8]:
# model 정의

model = BERTClassifier(bertmodel, dr_rate=0.5).to(device)

In [9]:
# Load the model weights
model_path = '/KITA_Project/Final Model for KoBERT/fianl_vp_text_classification_model.pt'  # Path to your trained model checkpoint
model.load_state_dict(torch.load(model_path))

<All keys matched successfully>

## 상담 음성 예측

In [10]:
# wav 파일 불러오기
filename_counsel_path = './counsel_merge_1 (mp3cut.net).wav'

In [11]:
# wav to stt, stt to kss 진행

stt_result = wav_to_stt(filename_counsel_path, openApiURL, languageCode, accessKey, http)
sentences = stt_to_kss(stt_result)
# print(sentences)
predict_and_print(sentences)

[Kss]: Because there's no supported C++ morpheme analyzer, Kss will take pecab as a backend. :D
For your information, Kss also supports mecab backend.
We recommend you to install mecab or konlpy.tag.Mecab for faster execution of Kss.
Please refer to following web sites for details:
- mecab: https://github.com/hyunwoongko/python-mecab-kor
- konlpy.tag.Mecab: https://konlpy.org/en/latest/api/konlpy.tag/#mecab-class



Sentence 1: 네, 카드번호와 명의자 통화 부탁드립니다.
Predicted Label: 1
Predicted Probabilities:
 [3.178386e-04 9.996822e-01]
----------
Sentence 2: 확인되었습니다.
Predicted Label: 1
Predicted Probabilities:
 [0.00154106 0.998459  ]
----------
Sentence 3: 저는 추가로 넣어드렸습니다.
Predicted Label: 0
Predicted Probabilities:
 [0.9730498  0.02695015]
----------
Sentence 4: 더 필요한 사항 없으십니까?
Predicted Label: 1
Predicted Probabilities:
 [0.00185173 0.9981483 ]
----------
Sentence 5: 태양교육상담사 김민재였습니다.
Predicted Label: 1
Predicted Probabilities:
 [0.00178066 0.9982193 ]
----------
Sentence 6: 감사합니다.
Predicted Label: 1
Predicted Probabilities:
 [6.6386868e-04 9.9933606e-01]
----------
Predicted Labels: tensor([1, 1, 0, 1, 1, 1], device='cuda:0')
Predicted Probabilities:
 tensor([[3.1784e-04, 9.9968e-01],
        [1.5411e-03, 9.9846e-01],
        [9.7305e-01, 2.6950e-02],
        [1.8517e-03, 9.9815e-01],
        [1.7807e-03, 9.9822e-01],
        [6.6387e-04, 9.9934e-01]], device='cuda:0')


- predicted_probs는 각 클래스에 대한 확률을 나타냄
- 예를 들어, `[0.00154106 0.998459]`와 같은 출력
  - 첫 번째 클래스`(0)`의 확률이 0.15%이고 두 번째 클래스`(1)`의 확률이 99.84%임을 의미

## 보이스 피싱 음성 예측

In [12]:
# wav 파일 불러오기
filename_vp_path = './윤지수_보이스피싱범_testdata_[cut_20sec].wav'

In [13]:
# wav to stt, stt to kss 진행

stt_result = wav_to_stt(filename_vp_path, openApiURL, languageCode, accessKey, http)
sentences = stt_to_kss(stt_result)
# print(sentences)
predict_and_print(sentences)

Sentence 1: 여보세요 윤지수 씨 되실까요
Predicted Label: 0
Predicted Probabilities:
 [9.9950659e-01 4.9345376e-04]
----------
Sentence 2: 수고하십니다
Predicted Label: 0
Predicted Probabilities:
 [0.9978409  0.00215905]
----------
Sentence 3: 저는 대검찰청 특수부의 김수영 사무관이에요
Predicted Label: 0
Predicted Probabilities:
 [9.996401e-01 3.598673e-04]
----------
Sentence 4: 지금 저희가 담당하고 있는 사건 중에 전자 금융 거래망 건으로 참고인 조사차 연락드렸습니다
Predicted Label: 0
Predicted Probabilities:
 [9.9963391e-01 3.6608314e-04]
----------
Sentence 5: 몇 가지 확인 차 질문 좀 드리려고 전화드렸습니다
Predicted Label: 0
Predicted Probabilities:
 [9.9959558e-01 4.0444086e-04]
----------
Predicted Labels: tensor([0, 0, 0, 0, 0], device='cuda:0')
Predicted Probabilities:
 tensor([[9.9951e-01, 4.9345e-04],
        [9.9784e-01, 2.1590e-03],
        [9.9964e-01, 3.5987e-04],
        [9.9963e-01, 3.6608e-04],
        [9.9960e-01, 4.0444e-04]], device='cuda:0')


- predicted_probs는 각 클래스에 대한 확률을 나타냄
- 예를 들어, `[0.9978409  0.00215905]`와 같은 출력
  - 첫 번째 클래스`(0)`의 확률이 99.78%이고 두 번째 클래스`(1)`의 확률이 0.21%임을 의미