# Imports

In [None]:
import re
from unicodedata import normalize
import html

from tqdm.notebook import tqdm
tqdm.pandas()

import pickle
import gc
from itertools import compress
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from difflib import SequenceMatcher

import nltk
from nltk import SnowballStemmer

import spacy
nlp = spacy.load('es_core_news_sm')
nlp.max_length = 3500000

# Load articles

In [None]:
# Load the two article tables from their pickles. The files are opened with
# a context manager so the handles are closed as soon as loading finishes.
# NOTE: unpickling can execute arbitrary code; only load trusted files.
with open("../Data/df_wikimusica_short.p", "rb") as _fh:
    df_wikimusica_short = pickle.load(_fh)
# Moves the current index into a regular column and installs a fresh
# positional index (presumably this creates the 'index' column that the
# alignment functions below use as merge key; verify against the pickle).
df_wikimusica_short.reset_index(inplace=True)

with open("../Data/df_wikimusica_long.p", "rb") as _fh:
    df_wikimusica_long = pickle.load(_fh)
df_wikimusica_long.reset_index(inplace=True)

In [None]:
# Display the text of the first row of the short table as a quick sanity check.
df_wikimusica_short.text.iloc[0]

In [None]:
# Display the text of the first row of the long table as a quick sanity check.
df_wikimusica_long.text.iloc[0]

In [None]:
# Working copy of the short table for the "fully aligned" variant.
# NOTE(review): the same assignment is repeated in the following cell, which
# creates all four working copies, so this cell looks redundant.
df_wikimusica_short_fully_aligned = df_wikimusica_short.copy()

In [None]:
# Independent working copies of the short table, one per alignment variant,
# so that filtering one variant does not touch the loaded data or the other.
df_wikimusica_short_softly_aligned = df_wikimusica_short.copy()
df_wikimusica_short_fully_aligned = df_wikimusica_short.copy()

####

# Same for the long table.
df_wikimusica_long_softly_aligned = df_wikimusica_long.copy()
df_wikimusica_long_fully_aligned = df_wikimusica_long.copy()

___

____

# Functions

In [None]:
def check_alignment(attr_value:str, text:str, strict, stemm=False):
    """Decide whether an attribute value is mentioned in a text.

    Only the part of ``attr_value`` after the last ``'::'`` is considered.
    Both that value and ``text`` are tokenised with the module-level spaCy
    pipeline ``nlp``; punctuation, whitespace and stop-word tokens are
    ignored on both sides. An attribute token counts as *found* when its
    ``SequenceMatcher`` ratio against at least one text token exceeds 0.7.

    Args:
        attr_value: Attribute value, optionally prefixed with ``'<x>::'``.
        text: Text in which the attribute value is searched.
        strict: If truthy, every content token of the attribute must be
            found; otherwise one found token is enough.
        stemm: If truthy, tokens are compared as lower-cased, stripped
            surface forms. If falsy, they are additionally reduced with the
            Spanish Snowball stemmer before comparison.

    Returns:
        bool: ``True`` when the value is considered aligned with the text.
        When the attribute has no content tokens at all the result is
        ``True`` in strict mode and ``False`` otherwise.
    """
    # NOTE(review): ``stemm=True`` *disables* Snowball stemming, which reads
    # as inverted. It may be intentional (the caller pre-normalises the text
    # itself in that mode), so the behaviour is preserved here -- confirm.

    def is_content(token):
        return not (token.is_punct or token.is_space or token.is_stop)

    attr_doc = nlp(attr_value.split('::')[-1])
    attr_tokens = [token for token in attr_doc if is_content(token)]
    if not attr_tokens:
        # Nothing to look for: vacuously aligned in strict mode, never
        # aligned in lenient mode. Skips parsing the (possibly long) text.
        return bool(strict)

    if stemm:
        def normalise(token):
            return token.text_with_ws.lower().strip()
    else:
        stem = SnowballStemmer('spanish').stem

        def normalise(token):
            return stem(token.text_with_ws.lower().strip())

    # Normalise the text once instead of once per attribute token, and drop
    # repeated forms: only the existence of a similar form matters.
    text_forms = list(dict.fromkeys(
        normalise(token) for token in nlp(text) if is_content(token)))

    # SequenceMatcher caches its analysis of the second sequence, so the
    # attribute form is set once and each text form is swapped in as the
    # first sequence (same argument order as a fresh matcher per pair).
    matcher = SequenceMatcher(None)

    def is_found(attr_token):
        matcher.set_seq2(normalise(attr_token))
        for form in text_forms:
            matcher.set_seq1(form)
            if matcher.ratio() > 0.7:
                return True
        return False

    found = (is_found(token) for token in attr_tokens)
    # all()/any() stop at the first token that settles the verdict.
    return all(found) if strict else any(found)

        
####

def check_row_alignment(row, text_name='text', strict=True, stemm=False):
    """Row-wise adapter around ``check_alignment``.

    Reads the text from ``row[text_name]`` and the attribute value from
    ``row['value']`` and forwards them, together with ``strict`` and
    ``stemm``, to ``check_alignment``; its result is returned unchanged.
    """
    return check_alignment(
        text=row[text_name],
        attr_value=row['value'],
        strict=strict,
        stemm=stemm,
    )


####

def check_multinstrumentista(text: str):
    """Return True if ``text`` contains either spelling of the word.

    The search is a plain, case-sensitive substring test for
    'multinstrumentista' or 'multiinstrumentista'.
    """
    spellings = ('multinstrumentista', 'multiinstrumentista')
    return any(spelling in text for spelling in spellings)
        

####
                
def process_alignment(df, attr_list, strict=True, stemm=False, text_name='text'):
    """Remove the rows whose attribute value is not aligned with their text.

    For each attribute in ``attr_list`` the rows with ``variable == attr``
    are checked with ``check_row_alignment``. The verdict is merged back
    into ``df`` as a new ``'<attr>_aligned'`` column (NaN for rows of other
    attributes) and the rows whose verdict is ``False`` are dropped.
    Progress and row counts are printed for every attribute.

    Args:
        df: Frame with at least the columns ``'index'`` (merge key),
            ``'variable'``, ``'value'`` and ``text_name``.
            NOTE(review): ``'index'`` is assumed to be unique per row;
            duplicates would multiply rows in the merge -- confirm.
        attr_list: Attribute names to check, processed in order.
        strict: Forwarded to ``check_row_alignment``.
        stemm: Forwarded to ``check_row_alignment``. When truthy the text is
            first rewritten with literal replacements ('vocalista' and
            'cantante' become 'voz', 'ista' is removed) and the rewritten
            text is the one that gets checked.
        text_name: Name of the column holding the text.

    Returns:
        The filtered frame, carrying one extra ``'<attr>_aligned'`` column
        per processed attribute. The input frame is not modified.
    """
    for attr in attr_list:
        aligned_col = attr + '_aligned'

        # Explicit copy: a helper column may be added below, and writing
        # into a filtered slice of ``df`` is a chained assignment.
        data_filt = df[df['variable'] == attr].copy()

        if data_filt.empty:
            # A row-wise apply over an empty selection does not produce a
            # boolean Series, so use an empty verdict; the merge below then
            # just adds an all-NaN column and no row is dropped.
            aligned = pd.Series(False, index=data_filt.index, dtype=bool)
        else:
            # Local name so the ``text_name`` argument is not overwritten.
            check_col = text_name
            if stemm:
                # ``progress_apply`` is provided by ``tqdm.pandas()``.
                data_filt['stemm_text'] = data_filt[text_name].progress_apply(
                    lambda s: s.replace('vocalista', 'voz')
                               .replace('cantante', 'voz')
                               .replace('ista', ''))
                check_col = 'stemm_text'

            aligned = data_filt.progress_apply(check_row_alignment,
                                               axis='columns',
                                               strict=strict,
                                               stemm=stemm,
                                               text_name=check_col)

            if attr == 'instrumento':
                # A text that mentions a multi-instrumentalist is accepted
                # for any instrument; the original text is inspected here,
                # not the rewritten one.
                mask_is_multi = data_filt[text_name].progress_apply(check_multinstrumentista)
                aligned = aligned | mask_is_multi

        flags = pd.DataFrame({'index': data_filt['index'], aligned_col: aligned})
        df = df.merge(flags, how='left', on='index')

        print('Initial Shape: ', df.shape[0])
        print('----')
        print(attr)
        # ``== False`` is deliberate: rows of other attributes hold NaN in
        # this column and must be kept, so only explicit False is dropped.
        mask = (df[aligned_col] == False)
        df = df[~mask]
        print('Shape: ', df.shape[0])

    return df


#####


def discard_artists_4attr(df):
    """Drop every artist that has only one, two or three attribute rows.

    Rows are counted per ``'_id_'`` through the non-null values of the
    ``'index'`` column, once, before any row is removed. The artists with
    exactly 1, then 2, then 3 rows are removed in turn and the remaining
    row count is printed after each step. Returns the filtered frame.
    """
    print('\nInitial Shape: ', df.shape[0])
    print('----')

    rows_per_artist = df.groupby('_id_')['index'].count()

    for n_rows in (1, 2, 3):
        sparse_ids = rows_per_artist[rows_per_artist == n_rows].index
        keep = ~df['_id_'].isin(sparse_ids)
        df = df[keep]
        print('Shape: ', df.shape[0])

    return df

####

def save_dataset(df, name):
    """Pickle the dataset columns of ``df`` to ``../Data/<name>.p``.

    Only the columns '_id_', '_titulo_', '_url_', 'variable', 'value' and
    'text' are kept (columns missing from ``df`` are silently skipped by
    ``DataFrame.filter``) and the index is reset before saving.

    Args:
        df: Frame to save.
        name: File name without directory or extension.
    """
    df = df.filter(['_id_', '_titulo_', '_url_', 'variable', 'value', 'text']).reset_index(drop=True)
    # Context manager so the file is flushed and closed before reporting
    # success, instead of relying on garbage collection of the handle.
    with open("../Data/"+name+".p", "wb") as handle:
        pickle.dump(df, handle)
    print('dataset saved!')
    
####

____

# Alignment parameters

In [None]:
## SOFT ALIGNMENT
# Attribute groups used to build the "*_softly_aligned" datasets. Each list
# is passed to process_alignment in its own call further down.

# Checked with process_alignment's defaults (strict=True, stemm=False).
# The commented-out attributes appear in none of the soft lists, so they are
# not checked at all in this configuration.
soft_attrs_fully_aligned = [
    'nombre artistico',
    'nombre nacimiento',
    'ocupacion',
    'nacimiento lugar',
    #'nacimiento fecha',
    'fallecimiento lugar',
    #'fallecimiento fecha',
    'fallecimiento causa',
    'residencia',
    'pareja',
    'pareja periodo',
    'conyugue',
    'conyugue periodo',
    'conyugue razon fin',
    'progenitor',
    'hijo/a',
    'hermano/a',
    'grupo',
    'instrumento modelo',
    'premio año',
    'tipo voz',
    'idioma',
    #'genero',
    'sello',
    'sello periodo',
    'disco año',
    'single año',
    'estudios',
    'maestro/a',
    'alumno/a',
]

# Checked with strict=False.
soft_attrs_softly_aligned = [
    'nacionalidad',           # soft
    'premio',                 # soft
    'disco',                  # soft
    'single',                 # soft
    'religion',               # soft
    'actividad',              # soft
]

# Checked with stemm=True (strict stays at its default).
soft_attrs_stemm_aligned = [
    'instrumento'             # stemmization alignment
]

In [None]:
## FULL ALIGNMENT
# Attribute groups used to build the "*_fully_aligned" datasets. Each list
# is passed to process_alignment in its own call further down.

# Checked with process_alignment's defaults (strict=True, stemm=False).
full_attrs_fully_aligned = [
    'nombre artistico',
    'nombre nacimiento',
    'ocupacion',
    'nacimiento lugar',
    'nacimiento fecha',
    'nacionalidad',
    'fallecimiento lugar',
    'fallecimiento fecha',
    'fallecimiento causa', 
    'residencia',
    'pareja',
    'pareja periodo',
    'conyugue',
    'conyugue periodo',
    'conyugue razon fin',
    'progenitor',
    'hijo/a',
    'hermano/a',
    'grupo',
    'instrumento modelo',
    'premio',
    'premio año',
    'tipo voz',
    'idioma',
    'sello',
    'sello periodo',
    'disco',
    'disco año',
    'single',
    'single año',
    'estudios',
    'maestro/a',
    'alumno/a',
]
  
# Checked with strict=False.
full_attrs_softly_aligned = [
    'actividad',                # --       # split/full alignment 
]

# Checked with stemm=True (strict stays at its default).
full_attrs_stemm_aligned = [
    'genero',
    'instrumento'               # stemmization alignment
]

___

_____

# Model preprocessing

## df_wikimusica_short_softly_aligned

In [None]:
%%time

# Short articles, soft configuration: run the three alignment passes in
# order (default options, strict=False, stemm=True), then drop the artists
# left with fewer than four rows and save the result.
_passes = (
    (soft_attrs_fully_aligned, {}),
    (soft_attrs_softly_aligned, {'strict': False}),
    (soft_attrs_stemm_aligned, {'stemm': True}),
)
for _attrs, _options in _passes:
    df_wikimusica_short_softly_aligned = process_alignment(
        df_wikimusica_short_softly_aligned, _attrs, **_options)

df_wikimusica_short_softly_aligned = discard_artists_4attr(
    df_wikimusica_short_softly_aligned)

save_dataset(df_wikimusica_short_softly_aligned,
             'df_wikimusica_short_softly_aligned')

____

## df_wikimusica_short_fully_aligned

In [None]:
%%time

# Short articles, full configuration: run the three alignment passes in
# order (default options, strict=False, stemm=True), then drop the artists
# left with fewer than four rows and save the result.
_passes = (
    (full_attrs_fully_aligned, {}),
    (full_attrs_softly_aligned, {'strict': False}),
    (full_attrs_stemm_aligned, {'stemm': True}),
)
for _attrs, _options in _passes:
    df_wikimusica_short_fully_aligned = process_alignment(
        df_wikimusica_short_fully_aligned, _attrs, **_options)

df_wikimusica_short_fully_aligned = discard_artists_4attr(
    df_wikimusica_short_fully_aligned)

save_dataset(df_wikimusica_short_fully_aligned,
             'df_wikimusica_short_fully_aligned')

____

## df_wikimusica_long_softly_aligned

In [None]:
%%time

# Long articles, soft configuration: run the three alignment passes in
# order (default options, strict=False, stemm=True), then drop the artists
# left with fewer than four rows and save the result.
_passes = (
    (soft_attrs_fully_aligned, {}),
    (soft_attrs_softly_aligned, {'strict': False}),
    (soft_attrs_stemm_aligned, {'stemm': True}),
)
for _attrs, _options in _passes:
    df_wikimusica_long_softly_aligned = process_alignment(
        df_wikimusica_long_softly_aligned, _attrs, **_options)

df_wikimusica_long_softly_aligned = discard_artists_4attr(
    df_wikimusica_long_softly_aligned)

save_dataset(df_wikimusica_long_softly_aligned,
             'df_wikimusica_long_softly_aligned')

____

## df_wikimusica_long_fully_aligned

In [None]:
%%time

# Long articles, full configuration: run the three alignment passes in
# order (default options, strict=False, stemm=True), then drop the artists
# left with fewer than four rows and save the result.
_passes = (
    (full_attrs_fully_aligned, {}),
    (full_attrs_softly_aligned, {'strict': False}),
    (full_attrs_stemm_aligned, {'stemm': True}),
)
for _attrs, _options in _passes:
    df_wikimusica_long_fully_aligned = process_alignment(
        df_wikimusica_long_fully_aligned, _attrs, **_options)

df_wikimusica_long_fully_aligned = discard_artists_4attr(
    df_wikimusica_long_fully_aligned)

save_dataset(df_wikimusica_long_fully_aligned,
             'df_wikimusica_long_fully_aligned')

_____