In [None]:
import pickle

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

import pandas as pd
import numpy as np
import random
import math

from sklearn.model_selection import train_test_split

# Load articles

In [None]:
def _load_pickle(path):
    """Unpickle and return the object stored at `path`.

    The file is opened in a `with` block so the handle is closed as soon as
    loading finishes instead of being left to the garbage collector.
    """
    # NOTE(review): pickle.load can execute arbitrary code embedded in the file;
    # only use it on pickles produced by this project or another trusted source.
    with open(path, "rb") as handle:
        return pickle.load(handle)


df_wikimusica_short = _load_pickle("../Data/df_wikimusica_short.p")
df_wikimusica_short_softly_aligned = _load_pickle("../Data/df_wikimusica_short_softly_aligned.p")
df_wikimusica_short_fully_aligned = _load_pickle("../Data/df_wikimusica_short_fully_aligned.p")

df_wikimusica_long = _load_pickle("../Data/df_wikimusica_long.p")
df_wikimusica_long_softly_aligned = _load_pickle("../Data/df_wikimusica_long_softly_aligned.p")
df_wikimusica_long_fully_aligned = _load_pickle("../Data/df_wikimusica_long_fully_aligned.p")

# Preprocess input data

In [None]:
def print_record_number():
    """Print, for each of the six WikiMusica frames, how many distinct `_id_` values it holds.

    Reads the module-level frames directly; takes no arguments and returns nothing.
    """
    labelled_frames = (
        ('Short: ', df_wikimusica_short),
        ('Long: ', df_wikimusica_long),
        ('Short Softly Aligned: ', df_wikimusica_short_softly_aligned),
        ('Long Softly Aligned: ', df_wikimusica_long_softly_aligned),
        ('Long Fully Aligned: ', df_wikimusica_long_fully_aligned),
        ('Short Fully Aligned: ', df_wikimusica_short_fully_aligned),
    )
    for label, frame in labelled_frames:
        distinct_ids = frame['_id_'].unique()
        print(label, len(distinct_ids))
    
####

# Report the distinct-id count of each frame as loaded, before any filtering is applied.
print_record_number()

___

In [None]:
# Keep, in every other frame, only the rows whose `_id_` also occurs in the
# short fully-aligned frame (that frame is the reference and is left as is).
ids = df_wikimusica_short_fully_aligned['_id_'].unique().tolist()

df_wikimusica_short = df_wikimusica_short.loc[lambda frame: frame['_id_'].isin(ids)]
df_wikimusica_short_softly_aligned = df_wikimusica_short_softly_aligned.loc[lambda frame: frame['_id_'].isin(ids)]

df_wikimusica_long = df_wikimusica_long.loc[lambda frame: frame['_id_'].isin(ids)]
df_wikimusica_long_softly_aligned = df_wikimusica_long_softly_aligned.loc[lambda frame: frame['_id_'].isin(ids)]
df_wikimusica_long_fully_aligned = df_wikimusica_long_fully_aligned.loc[lambda frame: frame['_id_'].isin(ids)]

____

In [None]:
# Re-count distinct ids now that the frames were restricted to the ids of the short fully-aligned frame.
print_record_number()

____

In [None]:
# Create dataset description

def describe_dataset(df):
    """Summarise how often each attribute occurs across artists.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format frame with at least the columns `_id_` (one id per
        artist) and `variable` (attribute name); each row is one attribute
        value. The frame is only read, never modified.

    Returns
    -------
    pandas.DataFrame
        Indexed by `variable`, sorted by '# of artists' (descending), with
        the columns:
        '# of artists'      artists that have the attribute at least once,
        '% of artists'      the same as a percentage of all distinct ids,
        '# total'           rows carrying the attribute,
        'mean per artist'   '# total' / '# of artists',
        'median per artist' median number of rows per artist having it.
    """
    num_artists = len(df['_id_'].unique())

    # First row of every (artist, attribute) pair -> one row per artist that has the attribute.
    artists_per_attr = (df.groupby(['_id_', 'variable'])
                          .head(1)
                          .groupby('variable')['_id_']
                          .count()
                          .rename('# of artists')
                          .to_frame())
    artists_per_attr['% of artists'] = (
        100 * (artists_per_attr['# of artists'] / num_artists)
    ).round(2)

    # All rows per attribute, most frequent first.
    total_per_attr = (df.groupby('variable')['_id_']
                        .count()
                        .sort_values(ascending=False)
                        .rename('# total')
                        .to_frame())

    # Every intermediate frame is indexed by 'variable', so a left index join lines them up.
    summary = total_per_attr.join(artists_per_attr, how='left')
    summary['mean per artist'] = (summary['# total'] / summary['# of artists']).round(2)

    # Rows per (artist, attribute) are counted with size() so the median uses the
    # same row-based definition as the mean and does not depend on some other
    # column (previously '_titulo_') being present and non-null.
    median_per_attr = (df.groupby(['_id_', 'variable'])
                         .size()
                         .groupby('variable')
                         .median()
                         .rename('median per artist')
                         .to_frame())
    summary = summary.join(median_per_attr, how='left')

    summary = summary.filter(['# of artists', '% of artists', '# total',
                              'mean per artist', 'median per artist'])
    return summary.sort_values('# of artists', ascending=False)


####

# Per-attribute summary of the short frame; the bare name on the last line makes the cell display the table.
df_wikimusica_short_desc = describe_dataset(df_wikimusica_short)
df_wikimusica_short_desc

In [None]:
# Per-attribute summary of the short softly-aligned frame, displayed as the cell output.
df_wikimusica_short_softly_aligned_desc = describe_dataset(df_wikimusica_short_softly_aligned)
df_wikimusica_short_softly_aligned_desc

In [None]:
# Per-attribute summary of the short fully-aligned frame, displayed as the cell output.
df_wikimusica_short_fully_aligned_desc = describe_dataset(df_wikimusica_short_fully_aligned)
df_wikimusica_short_fully_aligned_desc

_____

In [None]:
# Ids that have a 'nombre artistico' row in the short fully-aligned frame;
# all six frames are then narrowed to exactly those ids.
ids = (df_wikimusica_short_fully_aligned
       .loc[lambda frame: frame['variable'] == 'nombre artistico', '_id_']
       .unique()
       .tolist())

df_wikimusica_short = df_wikimusica_short.loc[lambda frame: frame['_id_'].isin(ids)]
df_wikimusica_short_softly_aligned = df_wikimusica_short_softly_aligned.loc[lambda frame: frame['_id_'].isin(ids)]
df_wikimusica_short_fully_aligned = df_wikimusica_short_fully_aligned.loc[lambda frame: frame['_id_'].isin(ids)]


df_wikimusica_long = df_wikimusica_long.loc[lambda frame: frame['_id_'].isin(ids)]
df_wikimusica_long_softly_aligned = df_wikimusica_long_softly_aligned.loc[lambda frame: frame['_id_'].isin(ids)]
df_wikimusica_long_fully_aligned = df_wikimusica_long_fully_aligned.loc[lambda frame: frame['_id_'].isin(ids)]

____

In [None]:
# Re-count distinct ids after keeping only ids that have a 'nombre artistico' row in the short fully-aligned frame.
print_record_number()

_____

In [None]:
# Recompute the summary of the short fully-aligned frame now that it only holds ids with a 'nombre artistico' row.
df_wikimusica_short_fully_aligned_desc = describe_dataset(df_wikimusica_short_fully_aligned)
df_wikimusica_short_fully_aligned_desc

## Select attributes for training

In [None]:
# Values of the 'variable' column that are kept for training. These are the same
# twelve attribute names that encode_attributes rewrites to English labels.
sel_attributes = ['nombre artistico', 'nombre nacimiento', 'genero', 'ocupacion', 
                  'nacimiento fecha', 'nacimiento lugar', 'instrumento', 'grupo',
                  'nacionalidad', 'fallecimiento lugar', 'fallecimiento fecha',
                  'tipo voz']

def filter_attributes(df, attr_list):
    """Return the rows of `df` whose 'variable' value is listed in `attr_list`.

    The input frame is not modified; row order, index labels and columns of
    the surviving rows are kept.
    """
    keep_row = df['variable'].isin(attr_list)
    return df.loc[keep_row]

#####

# Drop, from all six frames, every row whose attribute is not in sel_attributes.
df_wikimusica_short = filter_attributes(df_wikimusica_short,sel_attributes)
df_wikimusica_short_softly_aligned = filter_attributes(df_wikimusica_short_softly_aligned,sel_attributes)
df_wikimusica_short_fully_aligned = filter_attributes(df_wikimusica_short_fully_aligned,sel_attributes)
df_wikimusica_long = filter_attributes(df_wikimusica_long,sel_attributes)
df_wikimusica_long_softly_aligned = filter_attributes(df_wikimusica_long_softly_aligned,sel_attributes)
df_wikimusica_long_fully_aligned = filter_attributes(df_wikimusica_long_fully_aligned,sel_attributes)

In [None]:
# Summary of the short fully-aligned frame after it was reduced to the selected attributes.
df_wikimusica_short_fully_aligned_desc = describe_dataset(df_wikimusica_short_fully_aligned)
df_wikimusica_short_fully_aligned_desc

___

In [None]:
def encode_attributes(text):
    """Rewrite the Spanish attribute labels in `text` to their short English tags.

    Each 'label |' occurrence (label immediately followed by ' |') is replaced
    wherever it appears in the string; everything else is returned unchanged.
    """
    # Applied in this order, one full pass over the text per label.
    label_pairs = (
        ('nombre artistico |', 'stagename |'),
        ('nombre nacimiento |', 'birthname |'),
        ('ocupacion |', 'occupation |'),
        ('nacimiento lugar |', 'birthplace |'),
        ('nacimiento fecha |', 'birthdate |'),
        ('nacionalidad |', 'nation |'),
        ('fallecimiento lugar |', 'deathplace |'),
        ('fallecimiento fecha |', 'deathdate |'),
        ('grupo |', 'group |'),
        ('instrumento |', 'instrument |'),
        ('tipo voz |', 'voice |'),
        ('genero |', 'genre |'),
    )

    encoded = text
    for spanish_label, english_tag in label_pairs:
        encoded = encoded.replace(spanish_label, english_tag)

    return encoded

In [None]:
def generate_input_output(df):
    """Build the model input/output pairs for one frame and split them into train and test.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format frame with the columns `_id_`, `variable`, `value` and
        `text`. The frame is only read; no column is added to it.

    Returns
    -------
    list
        `[input_train, input_test, output_train, output_test]`, four plain
        Python lists. Inputs are strings of the form
        'wikimusic: <label> | <value> • <label> | <value> ...' (labels encoded
        by encode_attributes); outputs are the matching `text` values.
        The split is 80/20 with a fixed random_state of 89.
    """
    # Build the 'attribute | value' strings on a derived frame so the caller's
    # DataFrame (usually a filtered slice) is not mutated.
    pairs = df.assign(input=df['variable'] + ' | ' + df['value'])

    def _linearise(inputs):
        # Remove bullets *inside* each string so '•' can only appear as the
        # separator. Series.str.replace works on substrings; the previous
        # Series.replace only matched values that were exactly '•'.
        cleaned = inputs.str.replace('•', ' ', regex=False)
        return encode_attributes('wikimusic: ' + ' • '.join(cleaned))

    # One linearised input string per id.
    data = pairs.groupby('_id_')['input'].apply(_linearise).to_frame()

    # Attach the target text of each id and collapse the per-attribute repetitions.
    data = data.merge(df.filter(['_id_', 'text']), how='left', on='_id_')
    data = data.drop_duplicates()

    input_train, input_test, output_train, output_test = train_test_split(
        data['input'], data['text'], test_size=0.2, random_state=89)

    return [input_train.tolist(), input_test.tolist(),
            output_train.tolist(), output_test.tolist()]

In [None]:
# Build the train/test data for each of the six frames. Every result is the list
# [input_train, input_test, output_train, output_test] returned by generate_input_output.
wikimusica_short = generate_input_output(df_wikimusica_short)
wikimusica_short_softly_aligned = generate_input_output(df_wikimusica_short_softly_aligned)
wikimusica_short_fully_aligned = generate_input_output(df_wikimusica_short_fully_aligned)
wikimusica_long = generate_input_output(df_wikimusica_long)
wikimusica_long_softly_aligned = generate_input_output(df_wikimusica_long_softly_aligned)
wikimusica_long_fully_aligned = generate_input_output(df_wikimusica_long_fully_aligned)

# Save Preprocessed Data

In [None]:
def _save_pickle(obj, path):
    """Pickle `obj` to `path`.

    The file is opened in a `with` block so it is flushed and closed as soon
    as the dump finishes, instead of relying on garbage collection of an
    anonymous file handle.
    """
    with open(path, "wb") as handle:
        pickle.dump(obj, handle)


_save_pickle(wikimusica_short, "../Datasets/wikimusica_short.p")
_save_pickle(wikimusica_short_softly_aligned, "../Datasets/wikimusica_short_softly_aligned.p")
_save_pickle(wikimusica_short_fully_aligned, "../Datasets/wikimusica_short_fully_aligned.p")
_save_pickle(wikimusica_long, "../Datasets/wikimusica_long.p")
_save_pickle(wikimusica_long_softly_aligned, "../Datasets/wikimusica_long_softly_aligned.p")
_save_pickle(wikimusica_long_fully_aligned, "../Datasets/wikimusica_long_fully_aligned.p")