In [None]:
import tensorflow as tf
# Sanity check: ask TensorFlow for the name of the GPU device visible to this runtime.
tf.test.gpu_device_name()

'/device:GPU:0'

In [None]:
from google.colab import drive
# Mount Google Drive at the relative path 'gdrive'; the dataset and model paths used below start with it.
drive.mount('gdrive')

Mounted at gdrive


In [None]:
!pip install transformers

Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstalling PyYAML-3.13:
      Successfully uninstalled PyYAML-3.13
Successfully installed huggingface-hub-0.5.1 pyyaml-6.0 sacremoses-0.0.53 tokenizers-0.12.1 transformers-4.18.0


In [None]:
from transformers import RobertaTokenizer, RobertaForMaskedLM
import torch

# Release cached, currently unused GPU memory held by PyTorch before the model is loaded.
torch.cuda.empty_cache()

In [None]:
# Checkpoint identifier passed to from_pretrained() for both the tokenizer and the model.
model_name = 'roberta-base'

In [None]:
# Load the pretrained tokenizer and the masked-language-model weights for `model_name`.
tokenizer = RobertaTokenizer.from_pretrained(model_name)
model = RobertaForMaskedLM.from_pretrained(model_name)

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/478M [00:00<?, ?B/s]

In [None]:
# Root folder (relative to the working directory) that contains the dataset folders.
DEFAULT_FOLDER = 'gdrive/My Drive/diploma/'
# RULE and LANGUAGE are parts of the dataset file names: <split>_<RULE>_<LANGUAGE>.txt
RULE = 'cross'
LANGUAGE = 'eng'
# Split names, in the order the files are loaded for every genre.
data_names = ['train', 'test', 'val']
# Genres to load; each genre is a sub-folder of the dataset folder.
genres = ['rock', 'rap', 'pop', 'metal']
# True -> read from 'dataset_full_lines/', False -> read from 'dataset/'.
# The flag also doubles MIN_LEN and adds '_full_lines' to the model folder name further down.
full_lines = True

In [None]:
# Build one file path per (genre, split) pair, genre-major.
# Only the dataset folder differs between the full-lines and the regular variant.
dataset_dir = 'dataset_full_lines/' if full_lines else 'dataset/'
data_paths = [
    f'{DEFAULT_FOLDER}{dataset_dir}{genre}/{name}_{RULE}_{LANGUAGE}.txt'
    for genre in genres
    for name in data_names
]
data_paths

['gdrive/My Drive/diploma/dataset/rock/train_cross_eng.txt',
 'gdrive/My Drive/diploma/dataset/rock/test_cross_eng.txt',
 'gdrive/My Drive/diploma/dataset/rock/val_cross_eng.txt',
 'gdrive/My Drive/diploma/dataset/rap/train_cross_eng.txt',
 'gdrive/My Drive/diploma/dataset/rap/test_cross_eng.txt',
 'gdrive/My Drive/diploma/dataset/rap/val_cross_eng.txt',
 'gdrive/My Drive/diploma/dataset/pop/train_cross_eng.txt',
 'gdrive/My Drive/diploma/dataset/pop/test_cross_eng.txt',
 'gdrive/My Drive/diploma/dataset/pop/val_cross_eng.txt',
 'gdrive/My Drive/diploma/dataset/metal/train_cross_eng.txt',
 'gdrive/My Drive/diploma/dataset/metal/test_cross_eng.txt',
 'gdrive/My Drive/diploma/dataset/metal/val_cross_eng.txt']

In [None]:
import pickle

def get_data(paths, group_size=None):
    """Load pickled datasets and group them into consecutive chunks.

    Args:
        paths: Paths of the pickle files, ordered so that every ``group_size``
            consecutive entries belong together (in this notebook: the splits
            of one genre).
        group_size: Number of consecutive files per group. Defaults to
            ``len(data_names)``, the notebook-level list of split names.

    Returns:
        A list of groups. Each group is a list holding the unpickled content
        of ``group_size`` files, in the order given by ``paths``.

    Raises:
        ValueError: If ``group_size`` is not positive, or if ``len(paths)`` is
            not a multiple of it. The check runs before any file is read, so a
            trailing incomplete group is reported instead of silently dropped.
    """
    if group_size is None:
        group_size = len(data_names)
    if group_size <= 0:
        raise ValueError('group_size must be positive, got {}'.format(group_size))

    paths = list(paths)
    if len(paths) % group_size != 0:
        raise ValueError(
            'expected a multiple of {} paths, got {}'.format(group_size, len(paths))
        )

    loaded = []
    for data_path in paths:
        # SECURITY: pickle.load can execute arbitrary code; only read trusted files.
        with open(data_path, 'rb') as fp:
            loaded.append(pickle.load(fp))

    return [
        loaded[start:start + group_size]
        for start in range(0, len(loaded), group_size)
    ]

In [None]:
def get_data_shape(data):
    """Return the number of items in every split of every genre.

    Args:
        data: Nested list indexed as ``data[genre][split]``.

    Returns:
        A list with one inner list per genre, holding ``len(split)`` for each
        of that genre's splits.
    """
    return [[len(split) for split in genre_splits] for genre_splits in data]

In [None]:
def get_max_text_genres_len(genres):
    """Return the length of the longest name in ``genres``.

    Raises:
        ValueError: If ``genres`` is empty.
    """
    return max(len(name) for name in genres)

In [None]:
# Width of the genre column in the printed table: the longest genre name.
TEXT_GENRES_LEN = get_max_text_genres_len(genres)

def format_data(data):
    """Print one line per genre with the sizes of its first three splits.

    Genre names come from the notebook-level ``genres`` list (matched by
    position) and are right-aligned to ``TEXT_GENRES_LEN`` characters.

    Args:
        data: Nested list indexed as ``data[genre][split]``.
    """
    for idx, counts in enumerate(get_data_shape(data)):
        label = genres[idx].rjust(TEXT_GENRES_LEN)
        print(f'{label} ({counts[0]} {counts[1]} {counts[2]})')

In [None]:
# Load every pickle listed in data_paths; the result is grouped per genre by get_data.
data_genres = get_data(data_paths)

In [None]:
# Print the number of samples per genre and split for the data as loaded.
format_data(data_genres)

 rock (10300 1287 1288)
  rap (21847 2731 2731)
  pop (12675 1584 1585)
metal (10366 1296 1296)


In [None]:
import random

def get_random_data(nums, data, rng=None):
    """Draw a random subset of every split of every genre.

    Sampling uses ``choices``, i.e. it is done WITH replacement: the same
    item can appear several times in a subset, and ``nums[i]`` may exceed the
    size of the split.
    NOTE(review): if duplicate-free subsets are intended, ``sample`` would be
    required instead -- confirm the intent.

    Args:
        nums: ``nums[i]`` is the number of items to draw from the i-th split.
        data: Nested list indexed as ``data[genre][split]``.
        rng: Optional ``random.Random`` instance for reproducible sampling.
            When omitted, the global ``random`` module state is used.

    Returns:
        A nested list with the same genre/split layout as ``data`` where
        every split is replaced by its sampled list.

    Raises:
        IndexError: If a split is empty, or if a genre has more splits than
            ``nums`` has entries.
    """
    sampler = random if rng is None else rng

    return [
        [
            sampler.choices(split, k=nums[idx])
            for idx, split in enumerate(genre_splits)
        ]
        for genre_splits in data
    ]

In [None]:
# Number of samples drawn per genre for each split.
TRAIN = 10000
TEST, VALID = 1000, 1000
# Ordered like data_names (train, test, val); get_random_data indexes it by split position.
types_num = [TRAIN, TEST, VALID]

In [None]:
def concat_data(data, n_splits=None):
    """Merge the per-genre splits into one list per split.

    The genre lists are flattened into a single sequence of splits and every
    ``n_splits``-th entry, starting at offset ``i``, is concatenated into the
    i-th output list. When every genre has exactly ``n_splits`` splits, the
    i-th output is the i-th split of all genres, in genre order.

    Args:
        data: Nested list indexed as ``data[genre][split]``.
        n_splits: Number of splits per genre. Defaults to ``len(types_num)``,
            the notebook-level list of split sizes. The same value is used as
            the number of outputs and as the stride (the stride used to be
            hard-coded to 3).

    Returns:
        A list with ``n_splits`` lists of samples.
    """
    if n_splits is None:
        n_splits = len(types_num)

    # Flatten genre-major: [g0_split0, g0_split1, ..., g1_split0, ...]
    flat_splits = [split for genre_splits in data for split in genre_splits]

    return [
        [item for split in flat_splits[offset::n_splits] for item in split]
        for offset in range(n_splits)
    ]

In [None]:
import seaborn as sns

# Longest tokenized length within one list of texts
def get_max_tokens_len(data):
    """Return the largest token count among the texts in ``data``.

    Every text is encoded with the notebook-level ``tokenizer`` using
    ``max_length=512`` and ``truncation=True``.

    Raises:
        ValueError: If ``data`` is empty.
    """
    return max(
        len(tokenizer.encode(text, max_length=512, truncation=True))
        for text in data
    )

In [None]:
def get_max_len(data):
    """Return the largest token count over several lists of texts.

    Args:
        data: Iterable of text lists; each one is passed to
            ``get_max_tokens_len``.

    Raises:
        ValueError: If ``data`` is empty.
    """
    return max(get_max_tokens_len(texts) for texts in data)

In [None]:
# Sentinel larger than MIN_LEN so that the loop below runs at least once.
MAX_LEN = 1000
# Largest tokenized length accepted for the sampled data (despite the name, it is an upper bound).
MIN_LEN = 85

if full_lines:
    # Full-line samples are allowed twice the token budget.
    MIN_LEN = MIN_LEN * 2

# Re-draw the random subsets until the longest tokenized text does not exceed MIN_LEN.
# NOTE(review): there is no attempt limit; if samples longer than MIN_LEN keep being drawn,
# this loop does not terminate. Every iteration re-tokenizes the whole sampled dataset.
while MAX_LEN > MIN_LEN:
    data_genres_updated = get_random_data(types_num, data_genres)
    # format_data(data_genres_updated)
    # One list per split (train, test, val), concatenated over all genres.
    data_genres_concat = concat_data(data_genres_updated)
    MAX_LEN = get_max_len(data_genres_concat)
    # print(MAX_LEN)

# MAX_LEN now holds the longest tokenized length of the accepted sample; it is reused as the padding length.
MAX_LEN

84

In [None]:
# Ids 0-3 are the ones SongsTextsDataset excludes from masking; 50264 is the id it writes at the masked position.
# Presumably <s>, <pad>, </s>, <unk> and <mask> of the roberta-base vocabulary -- confirm against the tokenizer.
# NOTE(review): this list is not referenced by the code visible in this notebook.
special_tokens_roberta = [0, 1, 2, 3, 50264]

In [None]:
class SongsTextsDataset(torch.utils.data.Dataset):
  """Dataset that tokenizes a song text and masks its last regular token.

  Every item is encoded to exactly ``max_len`` token ids (truncated or
  padded). The labels are a copy of the unmasked ids, and the inputs have the
  last token that is not one of the excluded ids replaced by the mask id.

  NOTE(review): the labels keep every position, padding included, so a loss
  computed from them covers the whole sequence and not only the masked
  token -- confirm this is intended.

  Args:
    songs: Indexable collection of texts; each item is converted with ``str``.
    tokenizer: Object providing ``encode_plus`` (here: the RoBERTa tokenizer).
    max_len: Length every sample is truncated/padded to.
  """

  # Token ids that are never chosen as the masked position.
  # Presumably <s>, <pad>, </s>, <unk> of the roberta-base vocabulary -- confirm.
  _UNMASKABLE_TOKEN_IDS = (0, 1, 2, 3)
  # Id written at the masked position (presumably roberta-base's <mask>).
  _MASK_TOKEN_ID = 50264

  def __init__(self, songs, tokenizer, max_len):
    self.songs = songs
    self.tokenizer = tokenizer
    self.max_len = max_len

  def __len__(self):
    """Return the number of songs."""
    return len(self.songs)

  def __getitem__(self, item):
    """Return the masked sample at index ``item``.

    Returns:
      A dict with ``song_text`` (the raw string) and the 1-D tensors
      ``input_ids`` (with one masked position), ``attention_mask`` and
      ``labels`` (the unmasked ids). A text without any maskable token,
      e.g. an empty string, is returned unmasked instead of raising
      ``IndexError``.
    """
    song = str(self.songs[item])

    encoding = self.tokenizer.encode_plus(
      song,
      max_length=self.max_len,
      truncation=True,
      padding='max_length',
      return_tensors='pt',
    )

    input_ids = encoding['input_ids']
    # Keep the original ids as targets before the input is modified in place.
    labels = input_ids.detach().clone()

    # True where the token may be masked, i.e. it is none of the excluded ids.
    maskable = torch.ones_like(input_ids, dtype=torch.bool)
    for token_id in self._UNMASKABLE_TOKEN_IDS:
      maskable = maskable & (input_ids != token_id)

    for row in range(input_ids.shape[0]):
      positions = maskable[row].nonzero()
      if positions.numel() == 0:
        # Nothing to mask (only excluded tokens): leave the row untouched.
        continue
      # Mask the last maskable token of the row.
      input_ids[row, int(positions[-1])] = self._MASK_TOKEN_ID

    return {
      'song_text': song,
      'input_ids': input_ids.flatten(),
      'attention_mask': encoding['attention_mask'].flatten(),
      'labels': labels.flatten(),
    }

In [None]:
def create_data_loader(songs, tokenizer, max_len, batch_size, shuffle=False, num_workers=2):
  """Wrap ``songs`` in a SongsTextsDataset and return a DataLoader over it.

  Args:
    songs: Collection of song texts.
    tokenizer: Tokenizer handed to the dataset.
    max_len: Token length every sample is truncated/padded to.
    batch_size: Number of samples per batch.
    shuffle: Forwarded to the DataLoader. Defaults to False, which keeps the
      previous behaviour of iterating in the order of ``songs``; pass True
      for a training loader that should be reshuffled.
    num_workers: Number of loader worker processes (default 2, as before).

  Returns:
    A ``torch.utils.data.DataLoader`` yielding the dataset's dict samples.
  """
  ds = SongsTextsDataset(
    songs=songs,
    tokenizer=tokenizer,
    max_len=max_len
  )

  return torch.utils.data.DataLoader(
    ds,
    batch_size=batch_size,
    shuffle=shuffle,
    num_workers=num_workers
  )

In [None]:
# Samples per batch for all data loaders; also part of the model folder name.
BATCH_SIZE = 32

In [None]:
# data_genres_concat follows the order of data_names: index 0 = train, 1 = test, 2 = val.
# The validation and test loaders previously used each other's split.
# NOTE(review): the train loader iterates in the order of the concatenated list
# (genre by genre); no shuffling is configured here.
train_data_loader = create_data_loader(data_genres_concat[0], tokenizer, MAX_LEN, BATCH_SIZE)
val_data_loader = create_data_loader(data_genres_concat[2], tokenizer, MAX_LEN, BATCH_SIZE)
test_data_loader = create_data_loader(data_genres_concat[1], tokenizer, MAX_LEN, BATCH_SIZE)

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Move the model to the selected device (train/eval mode is set later, inside train_epoch / eval_model).
model.to(device)

In [None]:
# Number of passes over the training data; also used to size the learning-rate schedule.
# NOTE(review): the saved training output in this notebook shows "Epoch x/10", i.e. it comes from a run with a different value.
EPOCHS = 4

In [None]:
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup
import torch.nn as nn

# NOTE(review): `correct_bias` is an argument of the transformers AdamW implementation;
# keep that in mind if the optimizer class is ever swapped.
optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)
# One scheduler step is taken per training batch, so: batches per epoch * epochs.
total_steps = len(train_data_loader) * EPOCHS

# Linear learning-rate schedule over all training steps, without a warm-up phase.
scheduler = get_linear_schedule_with_warmup(
  optimizer,
  num_warmup_steps=0,
  num_training_steps=total_steps
)



In [None]:
import numpy as np

def train_epoch(model, data_loader, optimizer, device, scheduler, n_examples):
    """Train ``model`` for one pass over ``data_loader``.

    For every batch: forward pass with labels, per-sample accuracy check,
    backward pass, gradient clipping to norm 1.0, optimizer and scheduler
    step, then gradient reset.

    A sample counts as correct only when the decoded string of its complete
    predicted id sequence equals the decoded string of its complete label
    sequence (decoding uses the notebook-level ``tokenizer``).

    Args:
        model: Model returning an output with ``logits`` and ``loss``.
        data_loader: Yields dicts with ``input_ids``, ``attention_mask`` and
            ``labels`` tensors.
        optimizer: Optimizer stepped once per batch.
        device: Device the batch tensors are moved to.
        scheduler: Learning-rate scheduler stepped once per batch.
        n_examples: Denominator of the returned accuracy.

    Returns:
        Tuple ``(accuracy, mean_loss)``.
    """
    model = model.train()

    batch_losses = []
    n_correct = 0

    for batch in data_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels,
        )

        # Most likely token id at every position.
        preds = torch.max(outputs.logits, dim=-1).indices

        # Exact match of the decoded sequences, one comparison per sample.
        n_correct += sum(
            1
            for target_ids, pred_ids in zip(labels, preds)
            if tokenizer.decode(target_ids) == tokenizer.decode(pred_ids)
        )

        batch_loss = outputs.loss
        batch_losses.append(batch_loss.item())

        batch_loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

    return n_correct / n_examples, np.mean(batch_losses)

In [None]:
import numpy as np

def eval_model(model, data_loader, device, n_examples):
    """Evaluate ``model`` on ``data_loader`` without updating it.

    The model is switched to eval mode and all batches are processed under
    ``torch.no_grad()``. A sample counts as correct only when the decoded
    string of its complete predicted id sequence equals the decoded string of
    its complete label sequence (decoding uses the notebook-level
    ``tokenizer``).

    Args:
        model: Model returning an output with ``logits`` and ``loss``.
        data_loader: Yields dicts with ``input_ids``, ``attention_mask`` and
            ``labels`` tensors.
        device: Device the batch tensors are moved to.
        n_examples: Denominator of the returned accuracy.

    Returns:
        Tuple ``(accuracy, mean_loss)``.
    """
    model = model.eval()

    batch_losses = []
    n_correct = 0

    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels,
            )

            # Most likely token id at every position.
            preds = torch.max(outputs.logits, dim=-1).indices

            # Exact match of the decoded sequences, one comparison per sample.
            n_correct += sum(
                1
                for target_ids, pred_ids in zip(labels, preds)
                if tokenizer.decode(target_ids) == tokenizer.decode(pred_ids)
            )

            batch_losses.append(outputs.loss.item())

    return n_correct / n_examples, np.mean(batch_losses)

In [None]:
import os.path
from os import path

def create_folder(folder_name):
    """Create ``folder_name`` if nothing exists at that path yet.

    Missing parent folders are created as well, and a folder that appears
    between the existence check and the creation does not raise. A path that
    already exists (folder or file) is left untouched.

    Args:
        folder_name: Path of the folder to create.
    """
    if not os.path.exists(folder_name):
        os.makedirs(folder_name, exist_ok=True)

In [None]:
# Folder name pattern: <'mixed' | single genre>_cross_roberta_batch<BATCH_SIZE>_<LANGUAGE>[_full_lines]/
# NOTE(review): the root folder and the 'cross' part are literals here; they are not taken from DEFAULT_FOLDER / RULE.
model_prefix = 'mixed' if len(genres) > 1 else genres[0]
model_suffix = '_full_lines/' if full_lines else '/'
model_save_path = (
    'gdrive/My Drive/diploma/models/'
    + model_prefix
    + '_cross_roberta_batch' + str(BATCH_SIZE)
    + '_' + LANGUAGE
    + model_suffix
)

create_folder(model_save_path)

In [None]:
%%time

from collections import defaultdict

# Per-epoch metrics, keyed by 'train_acc', 'train_loss', 'val_acc', 'val_loss'.
history = defaultdict(list)
# Best validation accuracy seen so far; decides when a checkpoint is written.
best_accuracy = 0

for epoch in range(EPOCHS):

  print(f'Epoch {epoch + 1}/{EPOCHS}')
  print('-' * 10)

  # One optimisation pass over the training loader; accuracy is divided by the train split size.
  train_acc, train_loss = train_epoch(
    model,
    train_data_loader,    
    optimizer, 
    device, 
    scheduler, 
    len(data_genres_concat[0])
  )

  print(f'Train loss {train_loss} accuracy {train_acc}')

  # NOTE(review): n_examples is taken from data_genres_concat[2]; confirm this is the same
  # split val_data_loader was built from, otherwise the accuracy denominator is wrong.
  val_acc, val_loss = eval_model(
    model,
    val_data_loader,
    device, 
    len(data_genres_concat[2])
  )

  print(f'Val   loss {val_loss} accuracy {val_acc}')
  print()

  history['train_acc'].append(train_acc)
  history['train_loss'].append(train_loss)
  history['val_acc'].append(val_acc)
  history['val_loss'].append(val_loss)

  # Save model and tokenizer only when validation accuracy strictly improves.
  if val_acc > best_accuracy:
    model.save_pretrained(model_save_path)
    tokenizer.save_pretrained(model_save_path)
    best_accuracy = val_acc

Epoch 1/10
----------
Train loss 0.6552184106528759 accuracy 0.145775
Val   loss 0.07970468610525132 accuracy 0.1665

Epoch 2/10
----------
Train loss 0.06547804579138755 accuracy 0.21815
Val   loss 0.08419632011651992 accuracy 0.157

Epoch 3/10
----------
Train loss 0.055613979960978034 accuracy 0.291875
Val   loss 0.09265344506502152 accuracy 0.127

Epoch 4/10
----------
Train loss 0.0463389841273427 accuracy 0.380875
Val   loss 0.09781409937143326 accuracy 0.1295

Epoch 5/10
----------
Train loss 0.03871327591985464 accuracy 0.463425
Val   loss 0.10026461035013198 accuracy 0.12

Epoch 6/10
----------
Train loss 0.032897223494946955 accuracy 0.53065
Val   loss 0.10334183359146118 accuracy 0.118

Epoch 7/10
----------
Train loss 0.028612821700423955 accuracy 0.580225
Val   loss 0.10444082403182983 accuracy 0.1165

Epoch 8/10
----------
Train loss 0.025659300231188537 accuracy 0.61715
Val   loss 0.1053359135389328 accuracy 0.119

Epoch 9/10
----------


In [None]:
import matplotlib.pyplot as plt

# Accuracy per epoch as recorded in `history` by the training loop.
plt.plot(history['train_acc'], label='train accuracy')
plt.plot(history['val_acc'], label='validation accuracy')

plt.title('Training history')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend()
# Fix the y-axis to the full accuracy range so runs are visually comparable.
plt.ylim([0, 1])

In [None]:
# Final evaluation; eval_model returns (accuracy, mean loss) and only the accuracy is kept.
# NOTE(review): `model` is the in-memory state after the last epoch, not the best checkpoint
# written to model_save_path -- reload it first if the best model should be scored.
# NOTE(review): n_examples is taken from data_genres_concat[1]; confirm this is the same
# split test_data_loader was built from.
test_acc, _ = eval_model(
  model,
  test_data_loader,
  device,
  len(data_genres_concat[1])
)

test_acc