In [None]:
import pickle

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

import t5
import pandas as pd
import numpy as np
import random
import math

from tqdm import tqdm
tqdm.pandas()

from sklearn.model_selection import train_test_split


import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
device = torch.device("cpu")

torch.cuda.empty_cache()
from transformers import T5Tokenizer, MT5ForConditionalGeneration, T5ForConditionalGeneration, Adafactor

____

# Load data

In [None]:
# Dataset variants. Each name is the suffix of a pickle file that is loaded
# further down as '../Datasets/wikimusica_<name>.p'.
datasets = [
    'long',
    'long_fully_aligned',
    'long_softly_aligned',
    'short',
    'short_fully_aligned',
    'short_softly_aligned',
]

# Short aliases for the five variants used in this notebook.
# 'short' (index 3) deliberately gets no alias.
wa, ls, lr = datasets[:3]   # without_alignment, long_strict, long_relaxed
ss, sr = datasets[4:]       # short_strict, short_relaxed

# Pretrained checkpoint identifiers; the first one is selected here.
models = [
    't5-small',
    't5-base',
    't5-large',
    'google/mt5-small',
    'google/mt5-base',
]
model_version = models[0]

# Flag for the Spanish-character placeholder encoding.
# It is not read by any cell shown in this notebook.
spa_char_encode = True

In [None]:
def _load_dataset(variant):
    """Load '../Datasets/wikimusica_<variant>.p' and return the unpickled object.

    The file is opened in a `with` block so the handle is always closed.

    NOTE: pickle.load can execute arbitrary code; only load dataset files
    that come from a trusted source.
    """
    with open('../Datasets/wikimusica_'+variant+'.p', "rb") as dataset_file:
        return pickle.load(dataset_file)


# For every variant, element [1] is kept as the test inputs and element [3]
# as the test reference outputs.
# NOTE(review): the meaning of the other elements is not visible here
# (presumably the training split) - confirm against the dataset builder.
data_wa = _load_dataset(wa)
input_test_wa = data_wa[1]
output_test_wa = data_wa[3]

data_ls = _load_dataset(ls)
input_test_ls = data_ls[1]
output_test_ls = data_ls[3]

data_lr = _load_dataset(lr)
input_test_lr = data_lr[1]
output_test_lr = data_lr[3]

data_ss = _load_dataset(ss)
input_test_ss = data_ss[1]
output_test_ss = data_ss[3]

data_sr = _load_dataset(sr)
input_test_sr = data_sr[1]
output_test_sr = data_sr[3]

# Select samples for testing

In [None]:
# Select samples after manual checking
#
# Each list holds positions into the test split of ONE dataset variant
# (ss_ind indexes input_test_ss / output_test_ss, ls_ind indexes
# input_test_ls / output_test_ls, and so on). The lists hold
# 20 + 30 + 30 + 30 + 40 = 150 entries in total.

ss_ind = [0,   2,   3,   5,   7, 
          8,   10,  12,  14,  15,
          16,  18,  21,  22,  23,
          29,  30,  31,  32,  34,]

ls_ind = [35,  36,  37,  38,  41,
          42,  43,  44,  45,  46,
          47,  48,  49,  50,  51,
          52,  53,  55,  57,  59,
          60,  61,  62,  63,  66,
          70,  72,  76,  77,  80,]
              
sr_ind = [81,  82,  84,  85,  86,
          89,  90,  95,  96,  97,
          99,  100, 101, 102, 105,
          106, 107, 108, 109, 110,
          111, 112, 113, 114, 116,
          117, 118, 119, 120, 125]
              
# NOTE(review): 153 appears twice in lr_ind (4th and 5th row), so the same
# long-relaxed sample is selected twice. The second occurrence sits at
# position 104 of the combined 150-sample list, which becomes row 70 after
# the 50 `to_drop` positions are removed - this appears to be the duplicate
# that the export cell removes with `drop([70])`. The second value was
# possibly meant to be 163; confirm before changing, because the hard-coded
# positions further down depend on this exact selection.
lr_ind = [127, 129, 130, 131, 133,
          136, 137, 140, 142, 144,
          145, 146, 147, 148, 149,
          150, 151, 152, 153, 154, 
          158, 159, 160, 162, 153,  # <- second 153 (duplicate, see note above)
          166, 167, 168, 170, 173]

wa_ind = [175, 178, 179, 180, 182,
         184, 185, 187, 188, 190,
         191, 193, 196, 198, 200,
         201, 202, 203, 205, 206,
         207, 217, 224, 225, 228,
         229, 234, 240, 241, 242,       
         259, 277, 289, 312, 314,
         328, 341, 368, 374, 395,]

In [None]:
# Gather the hand-picked samples of every variant into two parallel lists:
# input_test[i] is the model input and output_ref_test[i] its reference text.
# The variant order (ss, ls, sr, lr, wa) matters: later cells address samples
# by their position in these lists.
input_test = list()
output_ref_test = list()

_selection = [
    (input_test_ss, output_test_ss, ss_ind),
    (input_test_ls, output_test_ls, ls_ind),
    (input_test_sr, output_test_sr, sr_ind),
    (input_test_lr, output_test_lr, lr_ind),
    (input_test_wa, output_test_wa, wa_ind),
]

# extend() instead of a side-effect list comprehension: same resulting lists,
# but the cell no longer prints a list of None values as its output.
for _inputs, _outputs, _indices in _selection:
    input_test.extend(_inputs[d] for d in _indices)
    output_ref_test.extend(_outputs[d] for d in _indices)

**Drop some of the selected samples (we want just 100)**

In [None]:
# Finally, instead of 150 samples, we will test over 100

# Positions (in the combined 150-sample lists) of the samples to discard.
to_drop = [14, 21, 26, 34, 35, 42, 48, 49, 51, 54,
           59, 65, 67, 68, 70, 80, 81, 84, 95, 96,
           97, 102, 103, 23, 25, 29, 31, 38, 39, 53,
           147, 145, 141, 140, 137, 135, 130, 129, 127, 125,
           119, 115, 113, 111, 57, 66, 72, 100, 109, 134]

# The positions above only make sense for the full 150-sample selection.
# This cell overwrites input_test / output_ref_test, so running it a second
# time would drop the wrong samples; fail with a clear message instead.
_n_selected = 150
if len(input_test) != _n_selected or len(output_ref_test) != _n_selected:
    raise ValueError(
        'Expected %d selected samples but found %d inputs and %d references; '
        're-run the sample selection cell before dropping samples.'
        % (_n_selected, len(input_test), len(output_ref_test)))

_drop_positions = set(to_drop)  # set: constant-time membership checks

new_input_test = [sample for i, sample in enumerate(input_test)
                  if i not in _drop_positions]
new_output_ref_test = [reference for i, reference in enumerate(output_ref_test)
                       if i not in _drop_positions]

input_test = new_input_test
output_ref_test = new_output_ref_test

**Check attribute count for selected samples** 

In [None]:
# For every selected sample, parse its model input into
#   input_attr[i]     : {attribute name: [values, ...]}
#   input_attr_num[i] : {attribute name: number of values}, all known
#                       attribute names present, 0 when absent
# and print the attributes for visual inspection.
input_attr_num = list()
input_attr = list()

# Iterate over the samples that are actually present. A fixed range(150)
# overruns the list once the drop step has reduced it to 100 samples.
for i, sample in enumerate(input_test):
    print(i)
    attr = {}
    attr_num = {'stagename':0,
                'birthname':0,

                'birthplace':0,
                'nation':0,
                'birthdate':0,
                'deathplace':0,
                'deathdate':0,

                'occupation':0,
                'instrument':0,
                'voice':0,
                'genre':0,
                'group':0,}

    # Input format: 'wikimusic: <name> | <value> • <name> | <value> • ...'
    for n in sample.split('wikimusic: ')[1].split(' • '):
        fields = n.split(' | ')
        a = fields[0]
        b = fields[1]

        # An attribute name outside the table above raises KeyError on purpose:
        # it signals an unexpected input format.
        attr_num[a] += 1
        # setdefault replaces the bare try/except that created the list on
        # first use (and would have hidden any other error).
        attr.setdefault(a, []).append(b)

        print(n)

    input_attr_num.append(attr_num)
    input_attr.append(attr)

    print('----')
    print('')

In [None]:
# One row per sample, one column per attribute name; each cell is the number
# of values that attribute has in the sample.
df_attr_num = pd.DataFrame(input_attr_num)
# Group the samples by how many 'group' values they have; count() then gives
# the number of samples in each of those groups (repeated in every column).
df_attr_num.groupby('group').count()

In [None]:
# Display the first selected model input to eyeball the raw input format.
input_test[0]

___

**Prepare input data for human evaluation**

In [None]:
# Attribute names per section of the text shown to human evaluators.
name = ['stagename', 'birthname']
birth = ['birthplace', 'nation', 'birthdate', 'deathplace', 'deathdate']
info = ['occupation', 'instrument', 'voice', 'genre', 'group']

input_1_name = list()
input_1_birth = list()
input_1_info = list()

for sample_attrs in input_attr:
    name_lines = []
    birth_lines = []
    info_lines = []

    for k, values in sample_attrs.items():
        # Anything that is neither a name nor a birth/death attribute goes to
        # the info section (the `info` list itself is not consulted).
        if k in name:
            section = name_lines
        elif k in birth:
            section = birth_lines
        else:
            section = info_lines
        section.extend(k + ': ' + e + '\n' for e in values)

    input_1_name.append(''.join(name_lines))
    input_1_birth.append(''.join(birth_lines))
    input_1_info.append(''.join(info_lines))


######


# One evaluator-facing text per sample: the three sections separated by a
# '-----' line. Built from the lists themselves instead of a fixed range(100),
# so it always covers exactly the parsed samples.
input_final = [
    name_text + '-----\n' + birth_text + '-----\n' + info_text
    for name_text, birth_text, info_text in zip(input_1_name, input_1_birth, input_1_info)
]


**Create Dataframe for evaluation**

In [None]:
# The three lists are parallel (same sample at the same position). Check that
# before pairing them up: a fixed range(100) would silently truncate longer
# lists or fail with an IndexError on shorter ones.
if not (len(output_ref_test) == len(input_test) == len(input_final)):
    raise ValueError(
        'Sample lists are out of sync: %d references, %d inputs, %d attribute texts.'
        % (len(output_ref_test), len(input_test), len(input_final)))

# One record per sample: reference text, raw model input and the
# evaluator-facing attribute listing.
test_records = [
    {
        'reference': reference,
        'input_for_model': model_input,
        'attributes': attributes,
    }
    for reference, model_input, attributes in zip(output_ref_test, input_test, input_final)
]

df_test = pd.DataFrame(test_records)

**Add unique IDs for tracing each generated text back to its model**

In [None]:
# Every text shown to evaluators gets an id that is unique across the whole
# frame, so a judged text can be traced back to its sample and model version.
# Versions 1-10 are model outputs (filled in later); version 11 is the
# human-written reference.
_n_versions = 11
_n_rows = df_test.shape[0]

# Pool with exactly one id per (row, version) pair: 1..1100 for 100 rows.
ids = list(range(1, _n_versions * _n_rows + 1))

for _version in range(1, _n_versions + 1):
    _text_col = 'text_model_v' + str(_version)

    # Placeholder text for the model versions; the reference for version 11.
    df_test[_text_col] = df_test['reference'] if _version == _n_versions else ''

    # Draw this version's ids, then take them out of the pool so that no id
    # is handed out twice. The remaining ids keep their order, so the random
    # draws are the same as with element-by-element list.remove().
    ids_to_remove = random.sample(ids, k=_n_rows)
    df_test[_text_col + '_id'] = ids_to_remove
    _taken = set(ids_to_remove)
    ids = [candidate for candidate in ids if candidate not in _taken]


# Shuffle data
# sample(frac=1) reorders the rows but keeps their original index labels.
# NOTE(review): no random seed is set, so ids and row order differ between runs.
df_test = df_test.sample(frac = 1)

_____

# Generate texts for evaluation

In [None]:
def encode_unseen_characters(text: str):
    """Replace selected accented Spanish characters by ASCII placeholders.

    Each of í, Í, ú, Ú, Á, Ó, ñ, Ñ becomes '%<letter>%' (for example
    'ñ' -> '%n%'); every other character is left unchanged.
    Returns the encoded string.
    """
    placeholders = {
        'í': '%i%',
        'Í': '%I%',
        'ú': '%u%',
        'Ú': '%U%',
        'Á': '%A%',
        'Ó': '%O%',
        'ñ': '%n%',
        'Ñ': '%N%',
    }
    # One pass over the text; the placeholders contain none of the mapped
    # characters, so this matches replacing them one after another.
    return text.translate(str.maketrans(placeholders))

def decode_unseen_characters(text: str):
    """Turn the '%<letter>%' placeholders back into accented characters.

    Inverse of the placeholder encoding for í, Í, ú, Ú, Á, Ó, ñ, Ñ.
    Returns the decoded string.
    """
    # Applied strictly in this order: with overlapping placeholders
    # (e.g. '%I%i%') the order decides which one is matched.
    replacements = (
        ('%i%', 'í'),
        ('%I%', 'Í'),
        ('%u%', 'ú'),
        ('%U%', 'Ú'),
        ('%A%', 'Á'),
        ('%O%', 'Ó'),
        ('%n%', 'ñ'),
        ('%N%', 'Ñ'),
    )
    for placeholder, character in replacements:
        text = text.replace(placeholder, character)

    return text

#### #### ####

In [None]:
# Device the loaded models are moved to. Same value as the `device` already
# assigned in the imports cell at the top of the notebook.
device = torch.device("cpu")

**Load models**

In [None]:
model_version = 't5-base'
t5_base_tokenizer = T5Tokenizer.from_pretrained(model_version)

####################

# Each checkpoint is a whole pickled model object.
# NOTE: torch.load unpickles arbitrary objects; only load trusted checkpoints.
# map_location=device deserializes straight onto the target device, so a
# checkpoint saved from a GPU also loads on a machine without CUDA.
# eval() switches off dropout: a model pickled while in training mode would
# otherwise generate with dropout still active.

model_name = 'jumping-jazz-13-3.pt' # t-5 vanila (long)

v1_vanila = torch.load('../Models/'+model_name, map_location=device)
v1_vanila.to(device)
v1_vanila.eval();

###

model_name = 'classic-puddle-20-3.pt'

v2_t5base_strict = torch.load('../Models/'+model_name, map_location=device)
v2_t5base_strict.to(device)
v2_t5base_strict.eval();

###

# Alternative relaxed checkpoint, currently not used:
#model_name = 'trim-firebrand-17-3.pt'
model_name = 'vivid-darkness-18-3.pt'

v6_t5base_relaxed = torch.load('../Models/'+model_name, map_location=device)
v6_t5base_relaxed.to(device)
v6_t5base_relaxed.eval();

In [None]:
model_version = 't5-small'
t5_small_tokenizer = T5Tokenizer.from_pretrained(model_version)

####################

# Each checkpoint is a whole pickled model object.
# NOTE: torch.load unpickles arbitrary objects; only load trusted checkpoints.
# map_location=device deserializes straight onto the target device, so a
# checkpoint saved from a GPU also loads on a machine without CUDA.
# eval() switches off dropout: a model pickled while in training mode would
# otherwise generate with dropout still active.

model_name = 'kind-hill-15-2.pt'

v7_t5small_strict = torch.load('../Models/'+model_name, map_location=device)
v7_t5small_strict.to(device)
v7_t5small_strict.eval();

###

model_name = 'noble-oath-3-5.pt'

v8_t5small_relaxed = torch.load('../Models/'+model_name, map_location=device)
v8_t5small_relaxed.to(device)
v8_t5small_relaxed.eval();

In [None]:
model_version = 'google/mt5-small'
mt5_small_tokenizer = T5Tokenizer.from_pretrained(model_version)

####################

# Each checkpoint is a whole pickled model object.
# NOTE: torch.load unpickles arbitrary objects; only load trusted checkpoints.
# map_location=device deserializes straight onto the target device, so a
# checkpoint saved from a GPU also loads on a machine without CUDA.
# eval() switches off dropout: a model pickled while in training mode would
# otherwise generate with dropout still active.

model_name = 'peach-star-28-7.pt'

v9_mt5small_strict = torch.load('../Models/'+model_name, map_location=device)
v9_mt5small_strict.to(device)
v9_mt5small_strict.eval();

###

model_name = 'zany-haze-27-4.pt'

v10_mt5small_relaxed = torch.load('../Models/'+model_name, map_location=device)
v10_mt5small_relaxed.to(device)
v10_mt5small_relaxed.eval();

___

In [None]:
def check_occupation(text:str):
    '''Make sure the model input mentions an occupation or an instrument.

    The input is returned unchanged when it contains 'occupation |' or
    'instrument |'. Otherwise a default occupation attribute
    (' • occupation | m%u%sico', already placeholder-encoded) is appended.
    '''
    describes_activity = ('occupation |' in text) or ('instrument |' in text)
    if describes_activity:
        return text

    return text + ' • occupation | m%u%sico'

**Generate eval text for each model**

In [None]:
def generate_text(input_text, model, tokenizer,
                  num_beams=10,
                  min_length=30,
                  num_return_sequences=1,
                  length_penalty=1,
                  no_repeat_ngram_size=0,
                  dynamic_min_length=False,
                  test=False):
    """Generate text for one model input with beam search.

    Parameters
    ----------
    input_text : str
        Raw model input; it is placeholder-encoded before tokenization.
    model :
        Model exposing `generate(...)` and `parameters()`.
    tokenizer :
        Tokenizer matching `model` (callable, with `tokenize` and `decode`).
    num_beams, min_length, num_return_sequences, length_penalty,
    no_repeat_ngram_size :
        Passed through to `model.generate`; `max_length` is fixed at 512.
    dynamic_min_length : bool
        When True, `min_length` is replaced by the token count of all
        attribute values (the text after '|' in each '•'-separated
        attribute) plus 10.
    test : bool
        When True, every generated sequence is printed and None is returned.

    Returns
    -------
    str or None
        With `test=False`, the decoded first sequence (special tokens removed,
        placeholders turned back into accented characters). None when
        `test=True` or when the model returns no sequence.
    """
    input_text = encode_unseen_characters(input_text)

    if dynamic_min_length:
        attrs = input_text.split('•')
        input_split = [a.split('|')[1].strip() for a in attrs]
        input_split = ' '.join(input_split)
        min_length = len(tokenizer.tokenize(input_split)) + 10

    features = tokenizer([input_text], return_tensors='pt')

    # Run the inputs on whatever device the model lives on, so the function
    # keeps working if the model is moved off the CPU.
    model_device = next(model.parameters()).device

    outputs = model.generate(input_ids=features['input_ids'].to(model_device),
                             attention_mask=features['attention_mask'].to(model_device),
                             max_length=512,
                             min_length=min_length,
                             num_beams=num_beams,
                             num_return_sequences=num_return_sequences,
                             length_penalty=length_penalty,
                             no_repeat_ngram_size=no_repeat_ngram_size)

    for output in outputs:
        t = tokenizer.decode(output, skip_special_tokens=True)
        t = decode_unseen_characters(t)

        if test:
            print('\n-- ** -- ** --\n')
            print(t)

        else:
            # Only the first sequence is returned, even when
            # num_return_sequences > 1.
            return t

In [None]:
# One entry per evaluated configuration:
# (output column, fine-tuned model, matching tokenizer, beam width).
# v2-v5 use the same strict t5-base model and differ only in beam width.
generation_runs = [
    ('text_model_v1',  v1_vanila,            t5_base_tokenizer,   1),
    ('text_model_v2',  v2_t5base_strict,     t5_base_tokenizer,   1),
    ('text_model_v3',  v2_t5base_strict,     t5_base_tokenizer,   2),
    ('text_model_v4',  v2_t5base_strict,     t5_base_tokenizer,   5),
    ('text_model_v5',  v2_t5base_strict,     t5_base_tokenizer,   10),
    ('text_model_v6',  v6_t5base_relaxed,    t5_base_tokenizer,   10),
    ('text_model_v7',  v7_t5small_strict,    t5_small_tokenizer,  10),
    ('text_model_v8',  v8_t5small_relaxed,   t5_small_tokenizer,  10),
    ('text_model_v9',  v9_mt5small_strict,   mt5_small_tokenizer, 10),
    ('text_model_v10', v10_mt5small_relaxed, mt5_small_tokenizer, 10),
]

# Generate one text per sample for each configuration, with a progress bar.
# To regenerate a single configuration, trim the table above to that entry.
for _column, _model, _tokenizer, _beams in generation_runs:
    df_test[_column] = df_test['input_for_model'].progress_apply(generate_text,
                                         model=_model,
                                         tokenizer=_tokenizer,
                                         num_beams=_beams)

**Correct some reference texts for evaluation**

In [None]:
# Hand-corrected reference texts. They overwrite the 'text_model_v11' column
# (the human reference shown to evaluators); the 'reference' column itself is
# left untouched.
# .loc addresses rows by index label, and the earlier shuffle with
# sample(frac=1) keeps the original labels, so each number below still refers
# to the sample's position in the 100-sample list built before shuffling.
df_test.loc[2, 'text_model_v11'] = 'Zhāng Huódīng (Baichen, Jilin, 1971), es una de las cantantes más famosas de ópera pekinesa.'
df_test.loc[21, 'text_model_v11'] = 'Basilio Antonio Fergus Alexander conocido artísticamente como Basilio (Ciudad de Panamá, Panamá, 13 de octubre de 1947 - Miami, Estados Unidos, 11 de octubre de 2009) fue un cantante panameño.'
df_test.loc[26, 'text_model_v11'] = 'Frankie Banali (Queens, 14 de noviembre de 1951) es un baterista estadounidense, reconocido por su trabajo con la banda de heavy metal multiplatino Quiet Riot, siendo el único miembro que queda de la formación clásica de la banda.'
df_test.loc[28, 'text_model_v11'] = 'Vagif Mustafazade (en azerí: Vaqif Mustafazadə; 16 de marzo de 1940-16 de diciembre de 1979) fue un músico azerbaiyano de jazz, pianista y compositor.'
df_test.loc[43, 'text_model_v11'] = 'Fernando Peña Soto (Lebrija, 1863-Utrera, circa 1930) más conocido como Popá Pinini o El Pinini, fue un cantaor flamenco.'
df_test.loc[59, 'text_model_v11'] = 'Philip David Charles Collins, más conocido como Phil Collins (Chiswick, Middlesex, Inglaterra, 30 de enero de 1951), es un baterista, cantante, compositor, productor y actor británico, y uno de los artistas de mayor éxito de la música pop y soft rock.'
df_test.loc[60, 'text_model_v11'] = 'Joey Belladonna (nacido como Joseph Bellardini el 13 de octubre de 1960 en Oswego, Nueva York) es un vocalista y baterista de heavy metal y thrash metal, más conocido por ser el cantante de la banda Anthrax.'
df_test.loc[61, 'text_model_v11'] = 'Roy Alan Lynes (n. 25 de octubre de 1943, Redhill, Surrey, Inglaterra) es un músico y compositor inglés, conocido por haber sido teclista de la banda de rock Status Quo'
df_test.loc[62, 'text_model_v11'] = 'Russell Freeman (Chicago, Illinois, 28 de mayo de 1926-Las Vegas, Nevada, 27 de junio de 2002) fue un pianista y compositor estadounidense de jazz.'
df_test.loc[65, 'text_model_v11'] = 'Thea Garret (Tarxien, Malta; 15 de marzo de 1992) es una cantante maltesa.'
# NOTE(review): the next text starts with a date and an unmatched ')' - the
# artist name and opening parenthesis appear to be missing. Confirm.
df_test.loc[69, 'text_model_v11'] = '8 de abril de 1956, Dolores, provincia de Buenos Aires), apodado el "Chacarero Cantor", es un popular cantante folclórico argentino.'
df_test.loc[72, 'text_model_v11'] = 'Sven Erik Kristiansen, más conocido como Maniac (Noruega, 4 de febrero de 1969), es el vocalista de la banda de black metal Skitliv. Maniac es conocido principalmente por haber sido el vocalista de la banda pionera del black metal noruego, Mayhem.'
df_test.loc[73, 'text_model_v11'] = 'Kelly Beatrice Carrion Kipnis (n. 11 de marzo de 1996 en Rhode Island, Estados Unidos), también conocida como Kelly Carrion es una Cantante, compositora y Filántropa estadounidense'
df_test.loc[78, 'text_model_v11'] = 'Sal Valentino (Salvatore Willard Spampinato, 8 de septiembre de 1942) es un cantante, compositor y productor discográfico estadounidense, reconocido por haber sido el cantante de la agrupación The Beau Brummels.'
df_test.loc[80, 'text_model_v11'] = 'Sara Haydee Barreto Retuerto (Distrito de Sumbilca, Huaral, 28 de mayo de 1969-Lima, 28 de mayo de 2007), más conocida como Muñequita Sally, fue una cantante folclórica de huayno y huaylasrh reconocida en el Perú.'
df_test.loc[81, 'text_model_v11'] = 'Leonardo Lozano Escalante, es un músico venezolano, concertista de cuatro venezolano, guitarra, compositor, arreglista y docente.'
df_test.loc[84, 'text_model_v11'] = 'Cinthia Santibáñez (nacida el 13 de julio de 1973), músico, es la vocalista y principal letrista de la banda chilena Crisálida, que toma elementos del rock, el Metal y la música progresiva.'
df_test.loc[85, 'text_model_v11'] = 'Daniel Paul Johns (Newcastle, 22 de abril de 1979) es un músico, vocalista, compositor, guitarrista y pianista australiano. Fue el líder de la banda de rock Silverchair.'
df_test.loc[86, 'text_model_v11'] = 'Gordon Matthew Thomas Sumner, CBE (Wallsend, Tyneside del Norte, Inglaterra, 2 de octubre de 1951), más conocido como Sting, es un músico británico que se desempeñó inicialmente como bajista, y más tarde como cantante y bajista del grupo musical The Police, formando luego su propia banda.'
df_test.loc[87, 'text_model_v11'] = 'Lee Min Hyuk (Seúl, 29 de noviembre de 1990), conocido como Minhyuk, es un cantante, rapero, actor y MC surcoreano. Es integrante de grupo masculino BtoB'
df_test.loc[91, 'text_model_v11'] = 'Steve Walsh (n. 15 de junio de 1951) es un músico, cantante y compositor conocido principalmente por su trabajo como miembro de la banda estadounidense de rock progresivo Kansas.'
df_test.loc[96, 'text_model_v11'] = 'Woodrow Wilson Guthrie (Okemah, Oklahoma, 14 de julio de 1912-Nueva York, 3 de octubre de 1967), conocido como Woody Guthrie, fue un músico y cantautor folk estadounidense.'
df_test.loc[99, 'text_model_v11'] = 'Iñaki Antón González (Bilbao, 3 de agosto de 1964), conocido como simplemente Iñaki Antón o con el apodo de Uoho, es un músico multiinstrumentista, compositor y productor de rock español. Fue guitarrista de la bandas de rock Extremoduro.'

# Save data for evaluation

**For human evaluation**

In [None]:
# Persist the evaluation frame for the human evaluation. The `with` block
# guarantees the file is flushed and closed even if pickling fails midway.
with open("../Evaluation/eval_data.p", "wb") as eval_file:
    pickle.dump(df_test, eval_file)

**For automatic evaluation**

In [None]:
def write_file(file_name, text,
               base_dir='/home/hlaboa-server/jupyter/TFM/nlg_wikimusica/Evaluation/'):
    """Write `text` to '<base_dir>/<file_name>.txt', replacing any existing file.

    Parameters
    ----------
    file_name : str
        File name without extension; '.txt' is appended.
    text : str
        Content to write.
    base_dir : str, optional
        Target directory. Defaults to the original hard-coded location, so
        existing calls behave as before; pass another directory to run the
        notebook on a different machine.

    Returns
    -------
    None
    """
    import os  # local import: `os` is not imported at the top of the notebook

    path = os.path.join(base_dir, file_name + '.txt')
    # Explicit UTF-8: the texts contain accented Spanish characters and must
    # not depend on the platform's default encoding. The `with` block closes
    # the file even if the write fails.
    with open(path, 'w', encoding='utf-8') as text_file:
        text_file.write(text)

In [None]:
# NOTE(review): `eval_data` is not assigned in any cell of this notebook, so
# this cell fails on a fresh kernel. It is presumably the frame saved to
# '../Evaluation/eval_data.p', possibly extended elsewhere - confirm and load
# it explicitly.

# Remove the duplicated 'Elizabeth Mary Landreaxu' sample (index label 70)
eval_data = eval_data.drop([70])

# Take just 80 rows for evaluation
eval_data = eval_data.iloc[:80]



# Column to export and, at the same position in file_names, the file it goes to.
# NOTE(review): 'text_model_v12' is not created anywhere in this notebook. Its
# file name 't5-base_relaxed_b10' matches the configuration generated into
# 'text_model_v6' - confirm which column is intended.
col_names = ['reference',
              'text_model_v1',
              'text_model_v2',
              'text_model_v3',
              'text_model_v4',
              'text_model_v5',
              'text_model_v12',
              'text_model_v7',
              'text_model_v8',
              'text_model_v9',
              'text_model_v10']

file_names = ['references',
              't5-base_vanilla_b0', 
              't5-base_strict_b0',
              't5-base_strict_b2',
              't5-base_strict_b5',
              't5-base_strict_b10',
              't5-base_relaxed_b10',
              't5-small_strict_b10',
              't5-small_relaxed_b10',
              'mt5-small_strict_b10',
              'mt5-small_relaxed_b10']

# Validate everything before writing anything, so a bad column name cannot
# leave a half-written set of evaluation files behind.
if len(col_names) != len(file_names):
    raise ValueError('col_names and file_names must have the same length '
                     '(%d vs %d).' % (len(col_names), len(file_names)))

missing_cols = [col for col in col_names if col not in eval_data.columns]
if missing_cols:
    raise KeyError('Columns missing from eval_data: ' + ', '.join(missing_cols))

# One text per line and one file per column; line k of every file belongs to
# the same sample.
for col, file_name in zip(col_names, file_names):
    text = '\n'.join(eval_data[col].tolist())
    write_file(file_name, text)

____