In [1]:
import librosa
import numpy as np

import os
import time

import boto3
from botocore.exceptions import ClientError

import gluonnlp as nlp

import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

from transformers import AutoTokenizer
from kobert.pytorch_kobert import get_pytorch_kobert_model
from kobert.utils import get_tokenizer

from torchvision.models.resnet import ResNet, BasicBlock

In [2]:
import pandas as pd

In [3]:
# BERT 모델, Vocabulary 불러오기
class BERTClassifier(nn.Module):
    def __init__(self,
                 bert,
                 hidden_size=768,
                 num_classes=6,  ##클래스 수 조정##
                 dr_rate=None,
                 params=None):
        super(BERTClassifier, self).__init__()
        self.bert = bert
        self.dr_rate = dr_rate

        self.classifier = nn.Linear(hidden_size, num_classes)
        if dr_rate:
            self.dropout = nn.Dropout(p=dr_rate)

    def gen_attention_mask(self, token_ids, valid_length):
        attention_mask = torch.zeros_like(token_ids)
        for i, v in enumerate(valid_length):
            attention_mask[i][:v] = 1
        return attention_mask.float()

    def forward(self, token_ids, valid_length, segment_ids):
        attention_mask = self.gen_attention_mask(token_ids, valid_length)

        _, pooler = self.bert(input_ids=token_ids, token_type_ids=segment_ids.long(),
                              attention_mask=attention_mask.float().to(token_ids.device), return_dict=False)
        if self.dr_rate:
            out = self.dropout(pooler)
        return self.classifier(out)


In [4]:
class BERTDataset(Dataset):
    def __init__(self, dataset, sent_idx, label_idx, bert_tokenizer, vocab, max_len,
                 pad, pair):
        transform = nlp.data.BERTSentenceTransform(
            bert_tokenizer, max_seq_length=max_len, vocab=vocab, pad=pad, pair=pair)

        self.sentences = [transform([i[sent_idx]]) for i in dataset]
        self.labels = [np.int32(i[label_idx]) for i in dataset]

    def __getitem__(self, i):
        return (self.sentences[i] + (self.labels[i],))

    def __len__(self):
        return (len(self.labels))


In [5]:
class ResNetModel(ResNet):
    def __init__(self, num_classes=8):
        super(ResNetModel, self).__init__(BasicBlock, [2, 2, 2, 2], num_classes=num_classes)
        self.conv1 = nn.Conv2d(4, 64, kernel_size=7, stride=2, padding=3, bias=False)

    def forward(self, x):
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxpool(x)

        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)

        x = self.avgpool(x)
        x = x.view(x.size(0), -1)
        x = self.fc(x)
        return x

In [6]:
device = torch.device('cpu')

In [7]:
# Setting prediction parameters
max_len = 60
batch_size = 64
learning_rate = 5e-5

print("Loading BERT model...")
# Load pre-trained model (weights)
bertmodel, vocab = get_pytorch_kobert_model()

# Load tokenizer from a local directory
# kobert_tokenizer = AutoTokenizer.from_pretrained("kobert_tokenizer", use_fast=False)
# tok = kobert_tokenizer.tokenize
print("Loading BERT tokenizer...")
tokenizer = get_tokenizer()
tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)

PATH = '/Users/yoohajun/Library/Mobile Documents/com~apple~CloudDocs/Hajun/Graduate_project/fastapi/KoBERT/'
kobert_model = BERTClassifier(bertmodel, dr_rate=0.5)
kobert_model.load_state_dict(torch.load(PATH + 'model_state_dict.pt', map_location=device))

Loading BERT model...
using cached model. /Users/yoohajun/Library/Mobile Documents/com~apple~CloudDocs/Hajun/Graduate_project/fastapi/Fusion/.cache/kobert_v1.zip
using cached model. /Users/yoohajun/Library/Mobile Documents/com~apple~CloudDocs/Hajun/Graduate_project/fastapi/Fusion/.cache/kobert_news_wiki_ko_cased-1087f8699e.spiece
Loading BERT tokenizer...
using cached model. /Users/yoohajun/Library/Mobile Documents/com~apple~CloudDocs/Hajun/Graduate_project/fastapi/Fusion/.cache/kobert_news_wiki_ko_cased-1087f8699e.spiece


<All keys matched successfully>

In [8]:
# Define the path to the saved model
model_path = '/Users/yoohajun/Library/Mobile Documents/com~apple~CloudDocs/Hajun/Graduate_project/fastapi/resnet-model/pytorch_resnet2.pt'
# Load the saved model
checkpoint = torch.load(model_path, map_location=torch.device('cpu'))
resnet_model = ResNetModel(num_classes=8)
resnet_model.load_state_dict(checkpoint['model_state_dict'])


<All keys matched successfully>

In [9]:
def scale_to_range(arr, target_range=(0, 1)):
    # Calculate exponential values for each element
    exp_arr = np.exp(arr)

    # Calculate the sum of all exponential values
    exp_sum = np.sum(exp_arr)

    # Calculate probability for each element by dividing its exponential value by the sum
    probs = exp_arr / exp_sum

    # Scale probabilities to target range
    scaled = (probs * (target_range[1] - target_range[0])) + target_range[0]

    return scaled

# decision diffusion을 수행할 때 range scaling을 하는 이유는 균일하게 특성을 통일해야한다고 생각했지만
# 오히려 scaling을 하게 되면 그 특성을 손상시키게 된다
# 정확도가 89 -> 82 로 낙하 (기본 Decision Tree)

In [10]:
def softmax(vals, idx):
    valscpu = vals.cpu().detach().squeeze(0)
    a = 0
    for i in valscpu:
        a += np.exp(i)
    return ((np.exp(valscpu[idx])) / a).item() * 100


In [11]:
def text_predict(predict_sentence, model=kobert_model):
    # print("predictsentence start:", predict_sentence)
    start = time.time()
    text_label = ['regular', 'help', 'robbery', 'sexual', 'theft', 'violence']
    data = [predict_sentence]
    # dataset_another = [data]

    transform = nlp.data.BERTSentenceTransform(tok, max_len, pad=True, pair=False)
    tokenized = transform(data)
    model.eval()

    # print([tokenized[0]])
    # token_ids = torch.tensor([tokenized[0]]).to(device)
    # segment_ids = torch.tensor([tokenized[2]]).to(device)
    token_ids = torch.tensor(np.array([tokenized[0]])).to(device)
    valid_length = [tokenized[1]]
    segment_ids = torch.tensor(np.array([tokenized[2]])).to(device)

    result = model(token_ids, valid_length, segment_ids)
    # print(result)
    idx = result.argmax().cpu().item()
    out_prob = result.detach().cpu().numpy()[0]

    # scaled_prob = scale_to_range(out_prob)
    # print(out_prob)
    # print("대사의 카테고리는:", text_label[idx])
    # print("대사 신뢰도는:", "{:.2f}%".format(softmax(result, idx)))
    end = time.time() - start
    # print("text predict 걸린 시간:", end)
    return out_prob

In [12]:
def audio_predict(file_location, model=resnet_model):
    try:
        audio_data, sr = librosa.load(file_location, sr=44100, duration=5)

        # Define label names
        label_names = ['regular_note','interior','exterior', 'help', 'robbery', 'sexual', 'theft', 'violence']

        # Calculate the spectrogram of the audio data
        spec = librosa.feature.melspectrogram(y=audio_data, sr=sr)

        # Convert the spectrogram to decibels
        spec_db = librosa.power_to_db(spec, ref=np.max)

        # Add an additional channel to the spectrogram
        spec_db = np.repeat(spec_db[:, :, np.newaxis], 4, axis=2)

        # Resize the spectrogram to match the input shape of the model
        spec_resized = np.resize(spec_db, (1, 4, 128, 128))

        # Normalize the spectrogram by z-score
        mean = np.mean(spec_resized)
        std = np.std(spec_resized)
        spec_resized = (spec_resized - mean) / std

        # Convert the spectrogram to a tensor and move it to the device
        spectrogram_tensor = torch.tensor(spec_resized, dtype=torch.float).to(device)

        # Set the model to evaluation mode
        model.eval()

        # Predict the probabilities for each class
        with torch.no_grad():
            out = model(spectrogram_tensor)

        # Get the index of the class with the highest probability
        predicted_class_index = torch.argmax(out, dim=1)

        label_index = predicted_class_index.item()

        # print("음성의 카테고리는:", label_names[label_index])
        # print("음성 신뢰도는:", "{:.2f}%".format(softmax(out, label_index)))
        prob = out.detach().cpu().numpy()[0]
        # scaled_prob = scale_to_range(prob)

        return prob

    except Exception as e:
        print(f'Error: {e}')
        return None

In [13]:
df = pd.read_csv('./fusion_train_no_feature.csv', encoding='utf8')

In [14]:
df

Unnamed: 0,audio_id,note,label,audio_dir
0,2.강도범죄_352620_label.wav,죽을래,강도범죄,/Users/yoohajun/Desktop/grad_audio/source/fusi...
1,2.강도범죄_352869_label.wav,죽고싶어,강도범죄,/Users/yoohajun/Desktop/grad_audio/source/fusi...
2,2.강도범죄_353054_label.wav,죽을래,강도범죄,/Users/yoohajun/Desktop/grad_audio/source/fusi...
3,2.강도범죄_353156_label.wav,죽을래,강도범죄,/Users/yoohajun/Desktop/grad_audio/source/fusi...
4,2.강도범죄_353550_label.wav,죽고싶냐,강도범죄,/Users/yoohajun/Desktop/grad_audio/source/fusi...
...,...,...,...,...
5402,일반남여_일반통합07_F_1529870693_41_수도권_실내_11997.wav,(SN:)그래 플라멩고에서는 기타가 아주 중요하던데 혹시 파두도 그런가,정상,/Users/yoohajun/Desktop/grad_audio/source/fusi...
5403,일반남여_일반통합07_F_1529870693_41_수도권_실내_12099.wav,난 장기휴가를 받으면 한 달 살기 방식으로 휴가를 가보는 게 로망인데,정상,/Users/yoohajun/Desktop/grad_audio/source/fusi...
5404,일반남여_일반통합07_F_1529870693_41_수도권_실내_12359.wav,알았어 나도 쉽게 생각했는데 그러면 안되겠다 싶네 고마워,정상,/Users/yoohajun/Desktop/grad_audio/source/fusi...
5405,일반남여_일반통합07_F_1529870693_41_수도권_실내_12415.wav,좋았겠다 연말 홍콩이 그렇게 좋다며,정상,/Users/yoohajun/Desktop/grad_audio/source/fusi...


In [15]:
# 음성 파일 id, 음성 파일 경로, note, 음성 파일 븐류 -> 1차 csv

# 오디오 확률 리스트, 텍스트 확률 리스트, 라벨

# 리스트 nested 해제 , 라벨

# diffusion layer ml 학습 - decision tree

In [16]:
# define a function to apply to each row of the DataFrame
def add_prediction(row):
    # call the audio_predict function with the audio_dir value
    audio_prediction = audio_predict(row['audio_dir'])
    # add the prediction values as a new column to the DataFrame
    if audio_prediction is not None:
        # add the prediction values as a new column to the DataFrame
        # temp = scale_to_range(audio_prediction)
        # temp_arr = temp.tolist()
        # row['audio_prediction'] = temp_arr
        row['audio_prediction'] = audio_prediction.tolist()
    else:
        row['audio_prediction'] = None

    text_prediction = text_predict(row['note'])

    if text_prediction is not None:
        # add the prediction values as a new column to the DataFrame
        row['text_prediction'] = text_prediction.tolist()
    else:
        row['text_prediction'] = None

    return row


In [17]:
# fill NaN values with empty string
df = df.fillna('empty')

In [None]:
from tqdm import tqdm
tqdm.pandas()
# apply the add_prediction function to each row of the DataFrame
df = df.progress_apply(add_prediction, axis=1)

In [19]:
len(df)

5407

In [20]:
# print the resulting DataFrame with the new 'prediction' column
df.head()

Unnamed: 0,audio_id,note,label,audio_dir,audio_prediction,text_prediction
0,2.강도범죄_352620_label.wav,죽을래,강도범죄,/Users/yoohajun/Desktop/grad_audio/source/fusi...,"[19.062274932861328, -16.566207885742188, -40....","[-2.1595633029937744, -0.8150117993354797, 3.0..."
1,2.강도범죄_352869_label.wav,죽고싶어,강도범죄,/Users/yoohajun/Desktop/grad_audio/source/fusi...,"[14.87557601928711, -12.88536548614502, -32.70...","[-1.7467725276947021, -1.3309136629104614, 3.1..."
2,2.강도범죄_353054_label.wav,죽을래,강도범죄,/Users/yoohajun/Desktop/grad_audio/source/fusi...,"[19.713226318359375, -13.963038444519043, -41....","[-2.1595633029937744, -0.8150117993354797, 3.0..."
3,2.강도범죄_353156_label.wav,죽을래,강도범죄,/Users/yoohajun/Desktop/grad_audio/source/fusi...,"[11.184046745300293, -3.842231273651123, -23.5...","[-2.1595633029937744, -0.8150117993354797, 3.0..."
4,2.강도범죄_353550_label.wav,죽고싶냐,강도범죄,/Users/yoohajun/Desktop/grad_audio/source/fusi...,"[12.278285026550293, -6.614180088043213, -27.3...","[-1.484963297843933, -1.370447039604187, 3.483..."


In [21]:
df.to_csv('./audio_text_label.csv')

In [22]:
df

Unnamed: 0,audio_id,note,label,audio_dir,audio_prediction,text_prediction
0,2.강도범죄_352620_label.wav,죽을래,강도범죄,/Users/yoohajun/Desktop/grad_audio/source/fusi...,"[19.062274932861328, -16.566207885742188, -40....","[-2.1595633029937744, -0.8150117993354797, 3.0..."
1,2.강도범죄_352869_label.wav,죽고싶어,강도범죄,/Users/yoohajun/Desktop/grad_audio/source/fusi...,"[14.87557601928711, -12.88536548614502, -32.70...","[-1.7467725276947021, -1.3309136629104614, 3.1..."
2,2.강도범죄_353054_label.wav,죽을래,강도범죄,/Users/yoohajun/Desktop/grad_audio/source/fusi...,"[19.713226318359375, -13.963038444519043, -41....","[-2.1595633029937744, -0.8150117993354797, 3.0..."
3,2.강도범죄_353156_label.wav,죽을래,강도범죄,/Users/yoohajun/Desktop/grad_audio/source/fusi...,"[11.184046745300293, -3.842231273651123, -23.5...","[-2.1595633029937744, -0.8150117993354797, 3.0..."
4,2.강도범죄_353550_label.wav,죽고싶냐,강도범죄,/Users/yoohajun/Desktop/grad_audio/source/fusi...,"[12.278285026550293, -6.614180088043213, -27.3...","[-1.484963297843933, -1.370447039604187, 3.483..."
...,...,...,...,...,...,...
5402,일반남여_일반통합07_F_1529870693_41_수도권_실내_11997.wav,(SN:)그래 플라멩고에서는 기타가 아주 중요하던데 혹시 파두도 그런가,정상,/Users/yoohajun/Desktop/grad_audio/source/fusi...,"[15.764503479003906, -9.406872749328613, -30.9...","[-1.5696812868118286, -1.2765368223190308, -0...."
5403,일반남여_일반통합07_F_1529870693_41_수도권_실내_12099.wav,난 장기휴가를 받으면 한 달 살기 방식으로 휴가를 가보는 게 로망인데,정상,/Users/yoohajun/Desktop/grad_audio/source/fusi...,"[11.892139434814453, -9.813698768615723, -26.1...","[4.747616291046143, -1.4685314893722534, 1.110..."
5404,일반남여_일반통합07_F_1529870693_41_수도권_실내_12359.wav,알았어 나도 쉽게 생각했는데 그러면 안되겠다 싶네 고마워,정상,/Users/yoohajun/Desktop/grad_audio/source/fusi...,"[9.303443908691406, -12.391471862792969, -24.7...","[-1.1037938594818115, -0.5025523900985718, -1...."
5405,일반남여_일반통합07_F_1529870693_41_수도권_실내_12415.wav,좋았겠다 연말 홍콩이 그렇게 좋다며,정상,/Users/yoohajun/Desktop/grad_audio/source/fusi...,"[14.91891098022461, -6.188786029815674, -28.14...","[-1.1456010341644287, -0.4835611879825592, -1...."


In [23]:
null_count = df['audio_prediction'].isnull().sum()
print("Null values count in audio_prediction column: ", null_count)


Null values count in audio_prediction column:  1169


In [24]:
df_new = df.dropna(subset=['audio_prediction'])

In [25]:
df_new

Unnamed: 0,audio_id,note,label,audio_dir,audio_prediction,text_prediction
0,2.강도범죄_352620_label.wav,죽을래,강도범죄,/Users/yoohajun/Desktop/grad_audio/source/fusi...,"[19.062274932861328, -16.566207885742188, -40....","[-2.1595633029937744, -0.8150117993354797, 3.0..."
1,2.강도범죄_352869_label.wav,죽고싶어,강도범죄,/Users/yoohajun/Desktop/grad_audio/source/fusi...,"[14.87557601928711, -12.88536548614502, -32.70...","[-1.7467725276947021, -1.3309136629104614, 3.1..."
2,2.강도범죄_353054_label.wav,죽을래,강도범죄,/Users/yoohajun/Desktop/grad_audio/source/fusi...,"[19.713226318359375, -13.963038444519043, -41....","[-2.1595633029937744, -0.8150117993354797, 3.0..."
3,2.강도범죄_353156_label.wav,죽을래,강도범죄,/Users/yoohajun/Desktop/grad_audio/source/fusi...,"[11.184046745300293, -3.842231273651123, -23.5...","[-2.1595633029937744, -0.8150117993354797, 3.0..."
4,2.강도범죄_353550_label.wav,죽고싶냐,강도범죄,/Users/yoohajun/Desktop/grad_audio/source/fusi...,"[12.278285026550293, -6.614180088043213, -27.3...","[-1.484963297843933, -1.370447039604187, 3.483..."
...,...,...,...,...,...,...
5402,일반남여_일반통합07_F_1529870693_41_수도권_실내_11997.wav,(SN:)그래 플라멩고에서는 기타가 아주 중요하던데 혹시 파두도 그런가,정상,/Users/yoohajun/Desktop/grad_audio/source/fusi...,"[15.764503479003906, -9.406872749328613, -30.9...","[-1.5696812868118286, -1.2765368223190308, -0...."
5403,일반남여_일반통합07_F_1529870693_41_수도권_실내_12099.wav,난 장기휴가를 받으면 한 달 살기 방식으로 휴가를 가보는 게 로망인데,정상,/Users/yoohajun/Desktop/grad_audio/source/fusi...,"[11.892139434814453, -9.813698768615723, -26.1...","[4.747616291046143, -1.4685314893722534, 1.110..."
5404,일반남여_일반통합07_F_1529870693_41_수도권_실내_12359.wav,알았어 나도 쉽게 생각했는데 그러면 안되겠다 싶네 고마워,정상,/Users/yoohajun/Desktop/grad_audio/source/fusi...,"[9.303443908691406, -12.391471862792969, -24.7...","[-1.1037938594818115, -0.5025523900985718, -1...."
5405,일반남여_일반통합07_F_1529870693_41_수도권_실내_12415.wav,좋았겠다 연말 홍콩이 그렇게 좋다며,정상,/Users/yoohajun/Desktop/grad_audio/source/fusi...,"[14.91891098022461, -6.188786029815674, -28.14...","[-1.1456010341644287, -0.4835611879825592, -1...."


In [27]:
audio_null_count = df_new['audio_prediction'].isnull().sum()
print("Null values count in audio_prediction column: ", audio_null_count)

text_null_count = df_new['text_prediction'].isnull().sum()
print("Null values count in audio_prediction column: ", text_null_count)

Null values count in audio_prediction column:  0
Null values count in audio_prediction column:  0


In [28]:
df_new.to_csv('./audio_text_label_null_removed.csv')

## Decision Diffusion Method with ML

In [29]:
import pandas as pd
import ast

In [30]:
df = pd.read_csv('./audio_text_label_null_removed.csv', encoding='utf8')

In [31]:
df.head()

Unnamed: 0.1,Unnamed: 0,audio_id,note,label,audio_dir,audio_prediction,text_prediction
0,0,2.강도범죄_352620_label.wav,죽을래,강도범죄,/Users/yoohajun/Desktop/grad_audio/source/fusi...,"[19.062274932861328, -16.566207885742188, -40....","[-2.1595633029937744, -0.8150117993354797, 3.0..."
1,1,2.강도범죄_352869_label.wav,죽고싶어,강도범죄,/Users/yoohajun/Desktop/grad_audio/source/fusi...,"[14.87557601928711, -12.88536548614502, -32.70...","[-1.7467725276947021, -1.3309136629104614, 3.1..."
2,2,2.강도범죄_353054_label.wav,죽을래,강도범죄,/Users/yoohajun/Desktop/grad_audio/source/fusi...,"[19.713226318359375, -13.963038444519043, -41....","[-2.1595633029937744, -0.8150117993354797, 3.0..."
3,3,2.강도범죄_353156_label.wav,죽을래,강도범죄,/Users/yoohajun/Desktop/grad_audio/source/fusi...,"[11.184046745300293, -3.842231273651123, -23.5...","[-2.1595633029937744, -0.8150117993354797, 3.0..."
4,4,2.강도범죄_353550_label.wav,죽고싶냐,강도범죄,/Users/yoohajun/Desktop/grad_audio/source/fusi...,"[12.278285026550293, -6.614180088043213, -27.3...","[-1.484963297843933, -1.370447039604187, 3.483..."


In [32]:
df['audio_prediction'].head()

0    [19.062274932861328, -16.566207885742188, -40....
1    [14.87557601928711, -12.88536548614502, -32.70...
2    [19.713226318359375, -13.963038444519043, -41....
3    [11.184046745300293, -3.842231273651123, -23.5...
4    [12.278285026550293, -6.614180088043213, -27.3...
Name: audio_prediction, dtype: object

In [33]:
df['audio_prediction'][0]

'[19.062274932861328, -16.566207885742188, -40.389774322509766, -9.753218650817871, -13.874068260192871, -19.5970458984375, -18.84337043762207, 6.017386436462402]'

In [34]:
df['audio_prediction'] = df['audio_prediction'].apply(lambda x: ast.literal_eval(str(x).replace('nan', 'None')))
df['text_prediction'] = df['text_prediction'].apply(lambda x: ast.literal_eval(str(x).replace('nan', 'None')))

In [35]:
try:del df['Unnamed: 0']
except: pass
try: del df['Unnamed: 0.1']
except: pass

In [36]:
df

Unnamed: 0,audio_id,note,label,audio_dir,audio_prediction,text_prediction
0,2.강도범죄_352620_label.wav,죽을래,강도범죄,/Users/yoohajun/Desktop/grad_audio/source/fusi...,"[19.062274932861328, -16.566207885742188, -40....","[-2.1595633029937744, -0.8150117993354797, 3.0..."
1,2.강도범죄_352869_label.wav,죽고싶어,강도범죄,/Users/yoohajun/Desktop/grad_audio/source/fusi...,"[14.87557601928711, -12.88536548614502, -32.70...","[-1.7467725276947021, -1.3309136629104614, 3.1..."
2,2.강도범죄_353054_label.wav,죽을래,강도범죄,/Users/yoohajun/Desktop/grad_audio/source/fusi...,"[19.713226318359375, -13.963038444519043, -41....","[-2.1595633029937744, -0.8150117993354797, 3.0..."
3,2.강도범죄_353156_label.wav,죽을래,강도범죄,/Users/yoohajun/Desktop/grad_audio/source/fusi...,"[11.184046745300293, -3.842231273651123, -23.5...","[-2.1595633029937744, -0.8150117993354797, 3.0..."
4,2.강도범죄_353550_label.wav,죽고싶냐,강도범죄,/Users/yoohajun/Desktop/grad_audio/source/fusi...,"[12.278285026550293, -6.614180088043213, -27.3...","[-1.484963297843933, -1.370447039604187, 3.483..."
...,...,...,...,...,...,...
4233,일반남여_일반통합07_F_1529870693_41_수도권_실내_11997.wav,(SN:)그래 플라멩고에서는 기타가 아주 중요하던데 혹시 파두도 그런가,정상,/Users/yoohajun/Desktop/grad_audio/source/fusi...,"[15.764503479003906, -9.406872749328613, -30.9...","[-1.5696812868118286, -1.2765368223190308, -0...."
4234,일반남여_일반통합07_F_1529870693_41_수도권_실내_12099.wav,난 장기휴가를 받으면 한 달 살기 방식으로 휴가를 가보는 게 로망인데,정상,/Users/yoohajun/Desktop/grad_audio/source/fusi...,"[11.892139434814453, -9.813698768615723, -26.1...","[4.747616291046143, -1.4685314893722534, 1.110..."
4235,일반남여_일반통합07_F_1529870693_41_수도권_실내_12359.wav,알았어 나도 쉽게 생각했는데 그러면 안되겠다 싶네 고마워,정상,/Users/yoohajun/Desktop/grad_audio/source/fusi...,"[9.303443908691406, -12.391471862792969, -24.7...","[-1.1037938594818115, -0.5025523900985718, -1...."
4236,일반남여_일반통합07_F_1529870693_41_수도권_실내_12415.wav,좋았겠다 연말 홍콩이 그렇게 좋다며,정상,/Users/yoohajun/Desktop/grad_audio/source/fusi...,"[14.91891098022461, -6.188786029815674, -28.14...","[-1.1456010341644287, -0.4835611879825592, -1...."


In [38]:
new_df = df[['audio_prediction', 'text_prediction', 'label']]

In [39]:
new_df['audio_prediction'][0]
# ['regular_note','interior','exterior', 'help', 'robbery', 'sexual', 'theft', 'violence']

[19.062274932861328,
 -16.566207885742188,
 -40.389774322509766,
 -9.753218650817871,
 -13.874068260192871,
 -19.5970458984375,
 -18.84337043762207,
 6.017386436462402]

In [40]:
new_df['text_prediction'][0]
# 'regular', 'help', 'robbery', 'sexual', 'theft', 'violence'

[-2.1595633029937744,
 -0.8150117993354797,
 3.0031497478485107,
 -1.3458218574523926,
 -1.2037882804870605,
 2.38088059425354]

In [41]:
new_df

Unnamed: 0,audio_prediction,text_prediction,label
0,"[19.062274932861328, -16.566207885742188, -40....","[-2.1595633029937744, -0.8150117993354797, 3.0...",강도범죄
1,"[14.87557601928711, -12.88536548614502, -32.70...","[-1.7467725276947021, -1.3309136629104614, 3.1...",강도범죄
2,"[19.713226318359375, -13.963038444519043, -41....","[-2.1595633029937744, -0.8150117993354797, 3.0...",강도범죄
3,"[11.184046745300293, -3.842231273651123, -23.5...","[-2.1595633029937744, -0.8150117993354797, 3.0...",강도범죄
4,"[12.278285026550293, -6.614180088043213, -27.3...","[-1.484963297843933, -1.370447039604187, 3.483...",강도범죄
...,...,...,...
4233,"[15.764503479003906, -9.406872749328613, -30.9...","[-1.5696812868118286, -1.2765368223190308, -0....",정상
4234,"[11.892139434814453, -9.813698768615723, -26.1...","[4.747616291046143, -1.4685314893722534, 1.110...",정상
4235,"[9.303443908691406, -12.391471862792969, -24.7...","[-1.1037938594818115, -0.5025523900985718, -1....",정상
4236,"[14.91891098022461, -6.188786029815674, -28.14...","[-1.1456010341644287, -0.4835611879825592, -1....",정상


In [43]:
target = new_df['label']

In [35]:
# audio_prob_df

In [44]:
audio_prob_df = new_df['audio_prediction'].apply(pd.Series)

In [45]:
audio_prob_df

Unnamed: 0,0,1,2,3,4,5,6,7
0,19.062275,-16.566208,-40.389774,-9.753219,-13.874068,-19.597046,-18.843370,6.017386
1,14.875576,-12.885365,-32.704639,-8.002541,-8.413226,-15.682615,-13.516604,3.813363
2,19.713226,-13.963038,-41.935040,-11.165486,-13.559433,-26.464722,-19.964052,6.281819
3,11.184047,-3.842231,-23.568636,-8.218171,-9.068943,-11.598783,-13.050208,2.658892
4,12.278285,-6.614180,-27.339249,-9.859420,-3.035055,-12.306230,-13.772902,0.111874
...,...,...,...,...,...,...,...,...
4233,15.764503,-9.406873,-30.937836,-15.016271,-8.383521,-16.754034,-17.965681,6.143479
4234,11.892139,-9.813699,-26.167070,-11.605628,-9.051976,-11.762973,-13.818318,4.723992
4235,9.303444,-12.391472,-24.783703,-6.603366,-4.273757,-14.372682,-6.405017,2.134245
4236,14.918911,-6.188786,-28.146217,-15.657052,-3.920493,-19.647785,-16.625849,3.432556


In [46]:
c = ['audio_prob_{}'.format(i) for i in range(8)]
audio_prob_df.columns = c
audio_prob_df

Unnamed: 0,audio_prob_0,audio_prob_1,audio_prob_2,audio_prob_3,audio_prob_4,audio_prob_5,audio_prob_6,audio_prob_7
0,19.062275,-16.566208,-40.389774,-9.753219,-13.874068,-19.597046,-18.843370,6.017386
1,14.875576,-12.885365,-32.704639,-8.002541,-8.413226,-15.682615,-13.516604,3.813363
2,19.713226,-13.963038,-41.935040,-11.165486,-13.559433,-26.464722,-19.964052,6.281819
3,11.184047,-3.842231,-23.568636,-8.218171,-9.068943,-11.598783,-13.050208,2.658892
4,12.278285,-6.614180,-27.339249,-9.859420,-3.035055,-12.306230,-13.772902,0.111874
...,...,...,...,...,...,...,...,...
4233,15.764503,-9.406873,-30.937836,-15.016271,-8.383521,-16.754034,-17.965681,6.143479
4234,11.892139,-9.813699,-26.167070,-11.605628,-9.051976,-11.762973,-13.818318,4.723992
4235,9.303444,-12.391472,-24.783703,-6.603366,-4.273757,-14.372682,-6.405017,2.134245
4236,14.918911,-6.188786,-28.146217,-15.657052,-3.920493,-19.647785,-16.625849,3.432556


In [47]:
audio_means = audio_prob_df.mean()

In [48]:
audio_prob_df = audio_prob_df.fillna(value=audio_means)

In [49]:
v = np.column_stack([df.audio_prediction.values.tolist()])
v[0][0]

19.062274932861328

In [50]:
text_prob_df = new_df['text_prediction'].apply(pd.Series)
c = ['text_prob_{}'.format(i) for i in range(6)]
text_prob_df.columns = c
text_prob_df

Unnamed: 0,text_prob_0,text_prob_1,text_prob_2,text_prob_3,text_prob_4,text_prob_5
0,-2.159563,-0.815012,3.003150,-1.345822,-1.203788,2.380881
1,-1.746773,-1.330914,3.154380,-1.338142,-1.199885,2.299360
2,-2.159563,-0.815012,3.003150,-1.345822,-1.203788,2.380881
3,-2.159563,-0.815012,3.003150,-1.345822,-1.203788,2.380881
4,-1.484963,-1.370447,3.483673,-1.421772,-1.278839,1.956226
...,...,...,...,...,...,...
4233,-1.569681,-1.276537,-0.463427,-0.437063,-0.319996,3.845335
4234,4.747616,-1.468531,1.110370,-1.298149,-1.232272,-1.747907
4235,-1.103794,-0.502552,-1.090245,4.209689,-1.124880,-0.655139
4236,-1.145601,-0.483561,-1.376934,4.087258,-1.160298,-0.236855


In [51]:
text_means = text_prob_df.mean()

In [52]:
text_prob_df = text_prob_df.fillna(value=text_means)

In [53]:
# 데이터프레임을 병렬로 연결하기
result = pd.concat([audio_prob_df, text_prob_df, target], axis=1)

In [54]:
result

Unnamed: 0,audio_prob_0,audio_prob_1,audio_prob_2,audio_prob_3,audio_prob_4,audio_prob_5,audio_prob_6,audio_prob_7,text_prob_0,text_prob_1,text_prob_2,text_prob_3,text_prob_4,text_prob_5,label
0,19.062275,-16.566208,-40.389774,-9.753219,-13.874068,-19.597046,-18.843370,6.017386,-2.159563,-0.815012,3.003150,-1.345822,-1.203788,2.380881,강도범죄
1,14.875576,-12.885365,-32.704639,-8.002541,-8.413226,-15.682615,-13.516604,3.813363,-1.746773,-1.330914,3.154380,-1.338142,-1.199885,2.299360,강도범죄
2,19.713226,-13.963038,-41.935040,-11.165486,-13.559433,-26.464722,-19.964052,6.281819,-2.159563,-0.815012,3.003150,-1.345822,-1.203788,2.380881,강도범죄
3,11.184047,-3.842231,-23.568636,-8.218171,-9.068943,-11.598783,-13.050208,2.658892,-2.159563,-0.815012,3.003150,-1.345822,-1.203788,2.380881,강도범죄
4,12.278285,-6.614180,-27.339249,-9.859420,-3.035055,-12.306230,-13.772902,0.111874,-1.484963,-1.370447,3.483673,-1.421772,-1.278839,1.956226,강도범죄
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4233,15.764503,-9.406873,-30.937836,-15.016271,-8.383521,-16.754034,-17.965681,6.143479,-1.569681,-1.276537,-0.463427,-0.437063,-0.319996,3.845335,정상
4234,11.892139,-9.813699,-26.167070,-11.605628,-9.051976,-11.762973,-13.818318,4.723992,4.747616,-1.468531,1.110370,-1.298149,-1.232272,-1.747907,정상
4235,9.303444,-12.391472,-24.783703,-6.603366,-4.273757,-14.372682,-6.405017,2.134245,-1.103794,-0.502552,-1.090245,4.209689,-1.124880,-0.655139,정상
4236,14.918911,-6.188786,-28.146217,-15.657052,-3.920493,-19.647785,-16.625849,3.432556,-1.145601,-0.483561,-1.376934,4.087258,-1.160298,-0.236855,정상


In [55]:
result.to_csv('./result_train_features.csv')

In [56]:
X = result.drop('label', axis=1)
y = result['label']

In [57]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [58]:
# 탐색할 매개변수들을 딕셔너리 형태로 정의
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [2, 3, 4, 5, 6, 7, 8, 9, 10],
    'min_samples_split': [2, 3, 4, 5],
    'min_samples_leaf': [1, 2, 3, 4, 5],
}
# 분류 알고리즘 생성하기
clf = DecisionTreeClassifier()

# GridSearchCV 객체 생성
grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=5)



In [210]:
# 분류 알고리즘 학습시키기
grid_search.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=DecisionTreeClassifier(),
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [2, 3, 4, 5, 6, 7, 8, 9, 10],
                         'min_samples_leaf': [1, 2, 3, 4, 5],
                         'min_samples_split': [2, 3, 4, 5]})

In [214]:
# 최적의 하이퍼파라미터로 학습된 모델을 가져오기
grid_best_model = grid_search.best_estimator_

In [215]:
# 학습 데이터에 대한 예측 수행
y_train_pred = grid_best_model.predict(X_train)

# 테스트 데이터에 대한 예측 수행
y_test_pred = grid_best_model.predict(X_test)


In [216]:
from sklearn.metrics import accuracy_score, f1_score

# 학습 데이터의 정확도 및 F1 점수 계산
train_accuracy = accuracy_score(y_train, y_train_pred)
train_f1 = f1_score(y_train, y_train_pred, average='macro')

# 테스트 데이터의 정확도 및 F1 점수 계산
test_accuracy = accuracy_score(y_test, y_test_pred)
test_f1 = f1_score(y_test, y_test_pred, average='macro')

print("Train Accuracy:", train_accuracy)
print("Test Accuracy:", test_accuracy)
print("Train F1 Score:", train_f1)
print("Test F1 Score:", test_f1)

Train Accuracy: 0.9584070796460177
Test Accuracy: 0.9363207547169812
Train F1 Score: 0.9179485895621918
Test F1 Score: 0.884731473689261


In [217]:
import pickle

pickle.dump(grid_search, open('./DT_model2.pkl', 'wb'))

In [218]:
# 예측하기
# prediction = grid_search.predict(new_data)

# print(prediction)

## DNN + Self-Attention

In [62]:
## 1. 데이터 불러오기

import pandas as pd

df = pd.read_csv('./result_train_features.csv', encoding='utf8')


In [76]:
df

Unnamed: 0.1,Unnamed: 0,audio_prob_0,audio_prob_1,audio_prob_2,audio_prob_3,audio_prob_4,audio_prob_5,audio_prob_6,audio_prob_7,text_prob_0,text_prob_1,text_prob_2,text_prob_3,text_prob_4,text_prob_5,label
0,0,19.062275,-16.566208,-40.389774,-9.753219,-13.874068,-19.597046,-18.843370,6.017386,-2.159563,-0.815012,3.003150,-1.345822,-1.203788,2.380881,강도범죄
1,1,14.875576,-12.885365,-32.704639,-8.002541,-8.413226,-15.682615,-13.516604,3.813363,-1.746773,-1.330914,3.154380,-1.338142,-1.199885,2.299360,강도범죄
2,2,19.713226,-13.963038,-41.935040,-11.165486,-13.559433,-26.464722,-19.964052,6.281819,-2.159563,-0.815012,3.003150,-1.345822,-1.203788,2.380881,강도범죄
3,3,11.184047,-3.842231,-23.568636,-8.218171,-9.068943,-11.598783,-13.050208,2.658892,-2.159563,-0.815012,3.003150,-1.345822,-1.203788,2.380881,강도범죄
4,4,12.278285,-6.614180,-27.339249,-9.859420,-3.035055,-12.306230,-13.772902,0.111874,-1.484963,-1.370447,3.483673,-1.421772,-1.278839,1.956226,강도범죄
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4233,4233,15.764503,-9.406873,-30.937836,-15.016271,-8.383521,-16.754034,-17.965681,6.143479,-1.569681,-1.276537,-0.463427,-0.437063,-0.319996,3.845335,정상
4234,4234,11.892139,-9.813699,-26.167070,-11.605628,-9.051976,-11.762973,-13.818318,4.723992,4.747616,-1.468531,1.110370,-1.298149,-1.232272,-1.747907,정상
4235,4235,9.303444,-12.391472,-24.783703,-6.603366,-4.273757,-14.372682,-6.405017,2.134245,-1.103794,-0.502552,-1.090245,4.209689,-1.124880,-0.655139,정상
4236,4236,14.918911,-6.188786,-28.146217,-15.657052,-3.920493,-19.647785,-16.625849,3.432556,-1.145601,-0.483561,-1.376934,4.087258,-1.160298,-0.236855,정상


In [87]:
df = df.drop('Unnamed: 0', axis=1)

In [88]:
df

Unnamed: 0,audio_prob_0,audio_prob_1,audio_prob_2,audio_prob_3,audio_prob_4,audio_prob_5,audio_prob_6,audio_prob_7,text_prob_0,text_prob_1,text_prob_2,text_prob_3,text_prob_4,text_prob_5,label
0,19.062275,-16.566208,-40.389774,-9.753219,-13.874068,-19.597046,-18.843370,6.017386,-2.159563,-0.815012,3.003150,-1.345822,-1.203788,2.380881,강도범죄
1,14.875576,-12.885365,-32.704639,-8.002541,-8.413226,-15.682615,-13.516604,3.813363,-1.746773,-1.330914,3.154380,-1.338142,-1.199885,2.299360,강도범죄
2,19.713226,-13.963038,-41.935040,-11.165486,-13.559433,-26.464722,-19.964052,6.281819,-2.159563,-0.815012,3.003150,-1.345822,-1.203788,2.380881,강도범죄
3,11.184047,-3.842231,-23.568636,-8.218171,-9.068943,-11.598783,-13.050208,2.658892,-2.159563,-0.815012,3.003150,-1.345822,-1.203788,2.380881,강도범죄
4,12.278285,-6.614180,-27.339249,-9.859420,-3.035055,-12.306230,-13.772902,0.111874,-1.484963,-1.370447,3.483673,-1.421772,-1.278839,1.956226,강도범죄
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4233,15.764503,-9.406873,-30.937836,-15.016271,-8.383521,-16.754034,-17.965681,6.143479,-1.569681,-1.276537,-0.463427,-0.437063,-0.319996,3.845335,정상
4234,11.892139,-9.813699,-26.167070,-11.605628,-9.051976,-11.762973,-13.818318,4.723992,4.747616,-1.468531,1.110370,-1.298149,-1.232272,-1.747907,정상
4235,9.303444,-12.391472,-24.783703,-6.603366,-4.273757,-14.372682,-6.405017,2.134245,-1.103794,-0.502552,-1.090245,4.209689,-1.124880,-0.655139,정상
4236,14.918911,-6.188786,-28.146217,-15.657052,-3.920493,-19.647785,-16.625849,3.432556,-1.145601,-0.483561,-1.376934,4.087258,-1.160298,-0.236855,정상


In [187]:
import torch
import torch.nn as nn
import torch.optim as optim

In [334]:
# Define the DNN model with self-attention
class DNNWithSelfAttention(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(DNNWithSelfAttention, self).__init__()
        num_heads = 7  # Number of heads for self-attention
        self.attention = nn.MultiheadAttention(input_dim, num_heads=num_heads)
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, 32)  # Additional fully connected layer
        self.fc3 = nn.Linear(32, 16)  # Additional fully connected layer
        self.fc4 = nn.Linear(16, output_dim)  # Additional fully connected layer
        
    def forward(self, x):
        # Check input shape
        if x.dim() == 3:
            batch_size, seq_length, input_dim = x.shape
            # Change shape for self-attention
            x = x.permute(1, 0, 2)  # (sequence length, batch size, input_dim)
            # Perform self-attention
            x, _ = self.attention(x, x, x)
            # Change shape back to original
            x = x.permute(1, 0, 2)  # (batch size, sequence length, input_dim)
        else:
            batch_size, input_dim = x.shape
            seq_length = 1
            # Add a fake batch dimension for self-attention
            x = x.unsqueeze(0)
            # Perform self-attention
            x, _ = self.attention(x, x, x)
            # Remove the fake batch dimension
            x = x.squeeze(0)
        # Apply fully connected layer 1
        x = torch.relu(self.fc1(x))
        # Apply fully connected layer 2
        x = torch.relu(self.fc2(x))  # Additional fully connected layer
        # Apply fully connected layer 3
        x = torch.relu(self.fc3(x))  # Additional fully connected layer
        # Apply fully connected layer 4
        x = self.fc4(x)  
                  
        return x


In [335]:
# Define the dataset and data loader
from sklearn.preprocessing import LabelEncoder

class Dataset(torch.utils.data.Dataset):
    def __init__(self, df):
        self.data = torch.FloatTensor(df.iloc[:, :-1].values)
        self.labels = torch.LongTensor(LabelEncoder().fit_transform(df.iloc[:, -1].values)) # Convert labels to torch.LongTensor after label encoding
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index):
        return self.data[index], self.labels[index]

In [336]:
unique_labels = df['label'].unique()

# 고유 값들 출력
print("Unique Labels:", unique_labels)

Unique Labels: ['강도범죄' '강제추행(성범죄)' '실내' '도움요청' '실외' '폭력범죄' '절도범죄' '정상']


In [337]:
# Define the input dimensions, hidden dimensions, and output dimensions
input_dim = 14 # Number of input features (number of columns in the dataframe excluding the label column)
hidden_dim = 64 # Number of hidden units in the fully connected layers

output_dim = 8 # Number of output classes (number of unique labels in the dataset)

In [338]:
# Create the DNN model
model = DNNWithSelfAttention(input_dim, hidden_dim, output_dim)

In [339]:
# Print the model architecture
print(model)

DNNWithSelfAttention(
  (attention): MultiheadAttention(
    (out_proj): NonDynamicallyQuantizableLinear(in_features=14, out_features=14, bias=True)
  )
  (fc1): Linear(in_features=14, out_features=64, bias=True)
  (fc2): Linear(in_features=64, out_features=32, bias=True)
  (fc3): Linear(in_features=32, out_features=16, bias=True)
  (fc4): Linear(in_features=16, out_features=8, bias=True)
)


In [340]:
# Define the criterion and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [341]:
from sklearn.model_selection import train_test_split
# 데이터를 train과 test로 나누기
df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)

In [342]:
# Train dataset과 DataLoader 생성
train_dataset = Dataset(df_train)
train_data_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

# Test dataset 생성
test_dataset = Dataset(df_test)


In [343]:
train_dataset.data

tensor([[  6.1045,   0.3931, -13.3324,  ...,  -0.5550,  -0.4058,   3.9634],
        [ 12.6984,  -9.6664, -29.0822,  ...,   0.5014,  -0.2412,   0.7384],
        [ 10.3179,  -0.7031, -23.4115,  ...,  -1.1811,  -1.0525,  -0.5453],
        ...,
        [  9.7913,  -7.6798, -25.6983,  ...,  -0.5354,  -0.4819,   3.9610],
        [ 11.2929, -11.9891, -26.3232,  ...,  -1.3724,  -1.6047,  -0.5246],
        [  6.7713,   2.0696, -17.5947,  ...,   4.1644,  -1.0542,  -0.7301]])

In [344]:
train_dataset.labels

tensor([7, 6, 0,  ..., 7, 6, 1])

In [345]:
train_data_loader

<torch.utils.data.dataloader.DataLoader at 0x7fdad63d6c50>

In [346]:
for inputs, labels in train_data_loader:
    print("Inputs shape: ", inputs.shape)
    print("Labels: ", labels)
    break

Inputs shape:  torch.Size([32, 14])
Labels:  tensor([7, 7, 5, 2, 2, 7, 1, 7, 5, 6, 6, 4, 5, 4, 6, 2, 1, 7, 3, 2, 2, 5, 7, 1,
        7, 4, 0, 7, 2, 1, 1, 2])


In [347]:
from sklearn.metrics import f1_score

# Train the model and record accuracy, loss for each epoch
num_epochs = 150
train_losses = []
train_accuracies = []
test_losses = []
test_accuracies = []

for epoch in range(num_epochs):
    train_loss_epoch = 0.0
    train_correct_epoch = 0
    train_total_epoch = 0
    model.train()  # Set model to training mode
    for inputs, labels in train_data_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        _, predicted = torch.max(outputs, 1)
        train_loss_epoch += loss.item() * inputs.size(0)
        train_correct_epoch += (predicted == labels).sum().item()
        train_total_epoch += labels.size(0)

    train_loss_epoch /= len(train_data_loader.dataset)
    train_accuracy_epoch = train_correct_epoch / train_total_epoch
    train_losses.append(train_loss_epoch)
    train_accuracies.append(train_accuracy_epoch)

    # Print the loss and accuracy for this epoch
    print(f'Epoch [{epoch + 1}/{num_epochs}], Train Loss: {train_loss_epoch:.4f}, Train Accuracy: {train_accuracy_epoch:.4f}')

    # Evaluate the model on test data and record accuracy, loss for each epoch
    model.eval()  # Set model to evaluation mode
    with torch.no_grad():
        test_loss_epoch = 0.0
        test_correct_epoch = 0
        test_total_epoch = 0
        test_outputs_all = torch.tensor([], dtype=torch.float32)
        test_labels_all = torch.tensor([], dtype=torch.long)
        for test_inputs, test_labels in DataLoader(test_dataset, batch_size=32, shuffle=False):
            test_outputs = model(test_inputs)
            test_loss = criterion(test_outputs, test_labels)
            test_loss_epoch += test_loss.item() * test_inputs.size(0)
            _, test_predicted = torch.max(test_outputs, 1)
            test_correct_epoch += (test_predicted == test_labels).sum().item()
            test_total_epoch += test_labels.size(0)
            test_outputs_all = torch.cat((test_outputs_all, test_outputs), dim=0)
            test_labels_all = torch.cat((test_labels_all, test_labels), dim=0)

        test_loss_epoch /= len(test_dataset)
        test_accuracy_epoch = test_correct_epoch / test_total_epoch
        test_losses.append(test_loss_epoch)
        test_accuracies.append(test_accuracy_epoch)

    # Print the loss and accuracy for this epoch
    print(f'Epoch [{epoch + 1}/{num_epochs}], Test Loss: {test_loss_epoch:.4f}, Test Accuracy: {test_accuracy_epoch:.4f}')

# Calculate F1 score
test_outputs_all = torch.argmax(test_outputs_all, dim=1)
test_labels_all = test_labels_all.cpu().numpy()
test_outputs_all = test_outputs_all.cpu().numpy()
f1 = f1_score(test_labels_all, test_outputs_all, average='macro')

# Print final train, test accuracy, loss and F1 score
print(f'Final Train Loss: {train_losses[-1]:.4f}, Train Accuracy: {train_accuracies[-1]:.4f}')
print(f'Final Test Loss: {test_losses[-1]:.4f}, Test Accuracy: {test_accuracies[-1]:.4f}, F1 Score: {f1:.4f}')

Epoch [1/150], Train Loss: 1.8283, Train Accuracy: 0.3327
Epoch [1/150], Test Loss: 1.4330, Test Accuracy: 0.4493
Epoch [2/150], Train Loss: 1.0298, Train Accuracy: 0.6319
Epoch [2/150], Test Loss: 0.7743, Test Accuracy: 0.7064
Epoch [3/150], Train Loss: 0.6499, Train Accuracy: 0.7587
Epoch [3/150], Test Loss: 0.5255, Test Accuracy: 0.8243
Epoch [4/150], Train Loss: 0.5047, Train Accuracy: 0.8257
Epoch [4/150], Test Loss: 0.4455, Test Accuracy: 0.8538
Epoch [5/150], Train Loss: 0.4304, Train Accuracy: 0.8422
Epoch [5/150], Test Loss: 0.3994, Test Accuracy: 0.8703
Epoch [6/150], Train Loss: 0.3806, Train Accuracy: 0.8555
Epoch [6/150], Test Loss: 0.3520, Test Accuracy: 0.8856
Epoch [7/150], Train Loss: 0.3454, Train Accuracy: 0.8640
Epoch [7/150], Test Loss: 0.3287, Test Accuracy: 0.8844
Epoch [8/150], Train Loss: 0.3215, Train Accuracy: 0.8761
Epoch [8/150], Test Loss: 0.2918, Test Accuracy: 0.8903
Epoch [9/150], Train Loss: 0.2973, Train Accuracy: 0.8782
Epoch [9/150], Test Loss: 0.26

In [348]:
torch.save(model.state_dict(), './self_attention_model.pt')

In [349]:
input_dim = 14 # Number of input features (number of columns in the dataframe excluding the label column)
hidden_dim = 64 # Number of hidden units in the fully connected layers

output_dim = 8

In [350]:
# Load the best model after training is complete
best_model = DNNWithSelfAttention(input_dim, hidden_dim, output_dim)  # Create a new instance of the model
best_model.load_state_dict(torch.load('self_attention_model.pt'))  # Load the state_dict of the best model
best_model.eval()  # Set the best model to evaluation mode

DNNWithSelfAttention(
  (attention): MultiheadAttention(
    (out_proj): NonDynamicallyQuantizableLinear(in_features=14, out_features=14, bias=True)
  )
  (fc1): Linear(in_features=14, out_features=64, bias=True)
  (fc2): Linear(in_features=64, out_features=32, bias=True)
  (fc3): Linear(in_features=32, out_features=16, bias=True)
  (fc4): Linear(in_features=16, out_features=8, bias=True)
)

Self-attention의 입력은 크기가 (sequence\ length,\ batch\ size,\ input\_dim)인 3D 텐서 x입니다.


``` x = x.permute(1, 0, 2)  # (sequence\ length,\ batch\ size,\ input\_dim) ```


$ \text{{self-attention}}(Q, K, V) = \text{{softmax}}\left(\frac{QK^T}{\sqrt{d_k}}\right) V$ 

여기서 Q, K, V는 각각 self-attention의 Query, Key, Value에 해당하는 텐서이고, d_k는 헤드당 key의 차원 수입니다.

DNNWithSelfAttention 모델에서는 nn.MultiheadAttention을 사용하여 self-attention을 수행합니다. 따라서 Q, K, V는 x로부터 자동으로 생성되며, nn.MultiheadAttention 내부에서는 위의 self-attention 수식이 사용됩니다.

이어서 x는 다음과 같이 다시 원래의 형태로 변형됩니다:
```
x = x.permute(1, 0, 2)  # (batch\ size,\ sequence\ length,\ input\_dim)
```

나머지 모델은 일반적인 딥 뉴럴 네트워크와 같이 fully connected 레이어를 통해 입력에 비선형 변환을 적용하는 것으로 표현

### reference

- https://uvadlc-notebooks.readthedocs.io/en/latest/tutorial_notebooks/tutorial6/Transformers_and_MHAttention.html

- https://arxiv.org/pdf/2109.12547.pdf ( Multi-modal Fusion using Fine-tuned Selfattention and Transfer Learning for Veracity
Analysis of Web Information)

### infence - self attention

In [351]:
import torch

# Load the best model after training is complete
input_dim = 14
hidden_dim = 64  # 예시로 임의로 설정
output_dim = 8   # 예시로 임의로 설정
best_model = DNNWithSelfAttention(input_dim, hidden_dim, output_dim)  # Create a new instance of the model
best_model.load_state_dict(torch.load('self_attention_model.pt', map_location=torch.device('cpu')))
best_model.eval()

DNNWithSelfAttention(
  (attention): MultiheadAttention(
    (out_proj): NonDynamicallyQuantizableLinear(in_features=14, out_features=14, bias=True)
  )
  (fc1): Linear(in_features=14, out_features=64, bias=True)
  (fc2): Linear(in_features=64, out_features=32, bias=True)
  (fc3): Linear(in_features=32, out_features=16, bias=True)
  (fc4): Linear(in_features=16, out_features=8, bias=True)
)

In [352]:
import torch.nn.functional as F

# Input data
input_data = [19.062274932861328,
              -16.566207885742188,
              -40.389774322509766,
              -9.753218650817871,
              -13.874068260192871,
              -19.5970458984375,
              -18.84337043762207,
              6.017386436462402, 
              -2.1595633029937744,
              -0.8150117993354797,
              3.0031497478485107,
              -1.3458218574523926,
              -1.2037882804870605,
              2.38088059425354]

Labels =  ['강도범죄','강제추행(성범죄)', '실내', '도움요청', '실외', '폭력범죄', '절도범죄', '정상']

# Convert the input data to a torch tensor
input_tensor = torch.tensor(input_data, dtype=torch.float32)
# Perform inference on the input data using the best model
with torch.no_grad():
    # Add a batch dimension to the input tensor
    input_tensor = input_tensor.unsqueeze(0)
    # Forward pass through the model
    output_tensor = best_model(input_tensor)
    # Remove the batch dimension from the output tensor and apply softmax
    output_tensor = F.softmax(output_tensor, dim=-1)
    # Convert the output tensor to a numpy array
    output_array = output_tensor.squeeze(0).numpy()

print("Output (softmax):", output_array)

# Get the predicted label index
predicted_label_idx = output_array.argmax()
# print(predicted_label_idx)
# Get the predicted label using the index
predicted_label = Labels[predicted_label_idx]

print("Predicted Label:", predicted_label)

Output (softmax): [9.9998510e-01 1.8568997e-15 2.3076009e-08 9.0280395e-18 7.6629485e-14
 5.7487672e-08 1.4621002e-05 1.9467386e-07]
Predicted Label: 강도범죄


In [353]:
# Shuffle the rows of the dataframe
result_shuffled = result.sample(frac=1, random_state=42)

# Extract 20 rows from the shuffled dataframe
result_sampled = result_shuffled.head(20)


In [354]:
# Extract the input data (excluding the last column) as a NumPy array
input_data_array = result_sampled.iloc[:, :-1].to_numpy()

print("Input Data Array:")
print(input_data_array)

Input Data Array:
[[  5.02774572  -2.58983421 -18.58485413  -9.53871059  -8.41992092
  -11.46984196  -8.5586462    0.89850485  -1.90088677   0.25716156
   -0.4955942   -1.10953557  -0.64811373   3.71773839]
 [ 11.71289062   1.79270589 -24.77228165 -11.57762432  -6.33594942
  -18.87142181 -17.11241341  -1.07602251  -0.97572452  -0.73356569
   -0.96332371   4.17968225  -1.03005266  -0.66918153]
 [  8.8684454  -22.48382759 -26.86816216  -3.11974454  -5.3022027
   -9.25043011  -7.87004995   0.42653406  -1.78040707  -0.86400372
   -0.6173259   -0.55498421  -0.40580994   3.96340084]
 [ 14.85270214  -3.0601778  -28.94112206 -11.61994362  -9.01618958
  -23.35820961 -13.75000763   2.56557512  -1.38470447   4.13305187
   -1.15236437  -0.7839973   -0.83570135   0.26239854]
 [  7.28918695   2.1308465  -17.12442398 -10.87859249  -2.42115974
   -8.66496468  -8.9953661    1.10778761   1.85835373  -1.15648806
    2.64362907  -1.4967401   -1.43094194  -0.40503064]
 [  6.36496878 -11.51178074 -26.372541

In [355]:
import torch
import torch.nn.functional as F
import numpy as np

def infer_and_predict_label(input_data, best_model, Labels):
    """
    Perform inference on the input data using the best model and predict the label.

    Args:
        input_data (np.ndarray): Input data as a NumPy array.
        best_model (nn.Module): Trained PyTorch model for inference.
        Labels (list): List of labels for prediction.

    Returns:
        predicted_label (str): Predicted label.
    """
    # Convert the input data to a torch tensor
    input_tensor = torch.tensor(input_data, dtype=torch.float32)
    # Perform inference on the input data using the best model
    with torch.no_grad():
        # Add a batch dimension to the input tensor
        input_tensor = input_tensor.unsqueeze(0)
        # Forward pass through the model
        output_tensor = best_model(input_tensor)
        # Remove the batch dimension from the output tensor and apply softmax
        output_tensor = F.softmax(output_tensor, dim=-1)
        # Convert the output tensor to a numpy array
        output_array = output_tensor.squeeze(0).numpy()

    # Get the predicted label index
    predicted_label_idx = output_array.argmax()
    # Get the predicted label using the index
    predicted_label = Labels[predicted_label_idx]

    return predicted_label

# Usage example
input_dim = 14
hidden_dim = 64  # 예시로 임의로 설정
output_dim = 8   # 예시로 임의로 설정
best_model = DNNWithSelfAttention(input_dim, hidden_dim, output_dim)  # Create a new instance of the model
best_model.load_state_dict(torch.load('self_attention_model.pt', map_location=torch.device('cpu')))
best_model.eval()



DNNWithSelfAttention(
  (attention): MultiheadAttention(
    (out_proj): NonDynamicallyQuantizableLinear(in_features=14, out_features=14, bias=True)
  )
  (fc1): Linear(in_features=14, out_features=64, bias=True)
  (fc2): Linear(in_features=64, out_features=32, bias=True)
  (fc3): Linear(in_features=32, out_features=16, bias=True)
  (fc4): Linear(in_features=16, out_features=8, bias=True)
)

In [356]:
# Loop over the input data array
for input_data in input_data_array:
    # Call the function with input_data, best_model, and Labels as arguments
    predicted_label = infer_and_predict_label(input_data, best_model, Labels)
    print("Predicted Label:", predicted_label)


Predicted Label: 정상
Predicted Label: 강제추행(성범죄)
Predicted Label: 정상
Predicted Label: 실내
Predicted Label: 도움요청
Predicted Label: 강도범죄
Predicted Label: 실외
Predicted Label: 강도범죄
Predicted Label: 절도범죄
Predicted Label: 실내
Predicted Label: 강제추행(성범죄)
Predicted Label: 실내
Predicted Label: 강도범죄
Predicted Label: 강제추행(성범죄)
Predicted Label: 절도범죄
Predicted Label: 정상
Predicted Label: 정상
Predicted Label: 절도범죄
Predicted Label: 실내
Predicted Label: 강제추행(성범죄)
