In [1]:
import os
import pandas as pd
import numpy as np
from tqdm import tqdm
import urllib.request
from sklearn import preprocessing
import tensorflow as tf
from transformers import BertTokenizer, TFBertModel
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

In [2]:
train_data=pd.read_excel('meta_list_labeling.xlsx')
train_data=train_data[train_data['has_lyric']==1]
train_data=train_data[:50]

In [4]:
tokenizer = BertTokenizer.from_pretrained("klue/bert-base")

In [5]:
max_seq_len = 128

In [6]:
def convert_tracks_to_features(tracks,labels,max_seq_len,tokenizer):
    input_ids, attention_masks, token_type_ids, data_labels = [], [], [] , []
    idx_encode = preprocessing.LabelEncoder()
    idx_encode.fit(labels)
    data_labels=idx_encode.transform(labels)
    
    label_idx = dict(zip(list(idx_encode.classes_), idx_encode.transform(list(idx_encode.classes_))))
    idx_label = {value: key for key, value in label_idx.items()}
    
    for track, label in tqdm(zip(tracks,labels),total=len(tracks)):
        
        input_id=tokenizer.encode(track,max_length=max_seq_len,pad_to_max_length=True)
        padding_count = input_id.count(tokenizer.pad_token_id)
        attention_mask = [1] * (max_seq_len - padding_count) + [0] * padding_count
        
        # token_type_id는 세그먼트 임베딩을 위한 것으로 이번 예제는 문장이 1개이므로 전부 0으로 통일.
        token_type_id = [0] * max_seq_len
        
        input_ids.append(input_id)
        attention_masks.append(attention_mask)
        token_type_ids.append(token_type_id)
        
    input_ids = np.array(input_ids, dtype=int)
    attention_masks = np.array(attention_masks, dtype=int)
    token_type_ids = np.array(token_type_ids, dtype=int)

    data_labels = np.asarray(data_labels, dtype=np.int32)

    return (input_ids, attention_masks, token_type_ids), data_labels,label_idx,idx_label

In [8]:
train_X, train_y,label_idx,idx_label  = convert_tracks_to_features(train_data['lyric'], train_data['mood'], max_seq_len=max_seq_len, tokenizer=tokenizer)

  0%|                                                                                           | 0/50 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
100%|█████████████████████████████████████████████████████████████████████████████████| 50/50 [00:00<00:00, 606.73it/s]


In [9]:
model = TFBertModel.from_pretrained("bert-base-multilingual-cased")

Some layers from the model checkpoint at bert-base-multilingual-cased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-multilingual-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [10]:
class TFBertForSequenceClassification(tf.keras.Model):
    def __init__(self, model_name):
        super(TFBertForSequenceClassification, self).__init__()
        self.bert = TFBertModel.from_pretrained(model_name, from_pt=True)
        self.classifier = tf.keras.models.Sequential()
        self.classifier.add(tf.keras.layers.Dense(16, activation='relu'))
        self.classifier.add(tf.keras.layers.Dense(32, activation='relu'))
        self.classifier.add(tf.keras.layers.Dense(16, activation='relu'))
        self.classifier.add(tf.keras.layers.Dense(3,
                                                kernel_initializer=tf.keras.initializers.TruncatedNormal(0.02),
                                                activation='softmax',
                                                name='classifier'))
        

    def call(self, inputs):
        input_ids, attention_mask, token_type_ids = inputs
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        cls_token = outputs[1]
        prediction = self.classifier(cls_token)

        return prediction

In [11]:
model = TFBertForSequenceClassification("bert-base-multilingual-cased")
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
loss = tf.keras.losses.SparseCategoricalCrossentropy()
model.compile(optimizer=optimizer, loss=loss, metrics = ['accuracy'])

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already

In [16]:
model.fit(train_X, train_y, epochs=10, batch_size=8, validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x19eea1fc688>

In [17]:
def sentiment_predict(new_sentence):
    input_id = tokenizer.encode(new_sentence, max_length=max_seq_len, pad_to_max_length=True)

    padding_count = input_id.count(tokenizer.pad_token_id)
    attention_mask = [1] * (max_seq_len - padding_count) + [0] * padding_count
    token_type_id = [0] * max_seq_len

    input_ids = np.array([input_id])
    attention_masks = np.array([attention_mask])
    token_type_ids = np.array([token_type_id])

    encoded_input = [input_ids, attention_masks, token_type_ids]
    score = model.predict(encoded_input)
    idx=np.argmax(score)
    return idx,score

In [18]:
test=pd.read_excel('meta_list_labeling.xlsx')
test=test[test['has_lyric']==1]
for i in range(101,300):
    
    lyric=test['lyric'][i]
    idx,score=sentiment_predict(lyric)
    sentiment=idx_label[idx]
    print(test['track_title'][i])
    print(sentiment)
    print(score[0][idx])

삐삐
gloomy
0.7981874
소녀
relaxed
0.41803378
Dance The Night Away
delighted
0.5812669
IDOL
gloomy
0.79787886
한숨
delighted
0.5785353
그대네요 (With 아이유)
delighted
0.58177024
Let It Go (From "Frozen"/Soundtrack Version)
gloomy
0.6857783
썸 (Feat. 릴보이 of 긱스)
gloomy
0.79903126
만약에
delighted
0.57948786
Something Just Like This
delighted
0.5769166
Santa Tell Me
delighted
0.5541659
Rolling In The Deep
gloomy
0.7908689
거짓말 거짓말 거짓말
delighted
0.5816304
I Miss You
delighted
0.57972443
Memories
gloomy
0.79017526
OOH-AHH하게
gloomy
0.79818344
애인 있어요
delighted
0.57884324
뿜뿜
delighted
0.5806379
시간을 달려서 (Rough)
gloomy
0.7988407
신청곡 (Feat. SUGA of BTS)
delighted
0.58043694
We Don't Talk Anymore (Feat. Selena Gomez)
gloomy
0.7866967
뜨거운 여름밤은 가고 남은 건 볼품없지만
gloomy
0.7988646
사랑 안해
delighted
0.5698147
노래방에서
gloomy
0.7989829
열애중
gloomy
0.798485
크리스마스니까
gloomy
0.79535294
안아줘
gloomy
0.65541154
Viva La Vida
gloomy
0.6744166
D (half moon)  (Feat. 개코)
gloomy
0.7222733
우산 (Feat. 윤하)
gloomy
0.79754966
HandClap
delighted
0.55

양화대교
gloomy
0.79846424
시차 (We Are) (Feat. 로꼬 & GRAY)
gloomy
0.78602225
나만 몰랐던 이야기
delighted
0.5741973
넘쳐흘러
gloomy
0.78831166
Call You Mine (Feat. Geologic of The Blue Scholars)
gloomy
0.7857047
Roller Coaster
delighted
0.575903
오래된 노래
relaxed
0.4182508
All For You
gloomy
0.7986778
이쁘다니까
gloomy
0.7976066
Maps
gloomy
0.7928686
제자리 걸음
gloomy
0.79700315
눈사람
relaxed
0.4177796
첫사랑
delighted
0.5789416
Hush
gloomy
0.69503486
어른
gloomy
0.7978685
삐딱하게 (Crooked)
gloomy
0.79720604
총 맞은 것처럼
delighted
0.5785423
별이 빛나는 밤
delighted
0.57805234
출발
delighted
0.55530226
귀로
gloomy
0.79746443
내 손을 잡아
gloomy
0.79913837
TOMBOY
delighted
0.55448693
내게 오는 길
delighted
0.5779788
11:11
gloomy
0.789142
취기를 빌려 (취향저격 그녀 X 산들)
gloomy
0.7647137
같은 시간 속의 너
gloomy
0.79857683
그 사람
gloomy
0.5040882
그게나야
gloomy
0.798366
이 사랑
delighted
0.5683291
그냥 (Just)
gloomy
0.791621
좋은 사람 있으면 소개시켜줘
gloomy
0.7990452
LIKEY
gloomy
0.7974411
I Love U Oh Thank U (Feat. 김태우)
gloomy
0.7994447
그리워하다
gloomy
0.6411554
Who are you
gloomy
0.7950213

우주를 건너
delighted
0.5812267
Chandelier
gloomy
0.7880563
