# Imports

In [None]:
import re
from unicodedata import normalize
import html
from tqdm.notebook import tqdm
import pickle
import gc

# Load articles

In [9]:
data = pickle.load(open("../Data/data_v1.p", "rb"))

# Extract metadata

## HMTL tags

In [10]:
def decode_html_tags(data: list, key_name: str, new_key_name: str):

    '''This function decodes HTML tags from each article:     &lt;ref&gt;   -->  <ref> '''
    
    with tqdm(total=len(data)) as pbar:
        
        for d in data:
            d[new_key_name] = html.unescape(d[key_name])
            pbar.update(1)
        
    return data

####

data = decode_html_tags(data, 'body', 'body')

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=26789.0), HTML(value='')))




## HTML comments

In [11]:
def remove_html_comments(data: list, key_name: str, new_key_name: str):

    '''This function removes HTML comments from each article'''
    
    with tqdm(total=len(data)) as pbar:
        
        for d in data:
            d[new_key_name] = re.sub("<!--.+?-->", "", d[key_name])
            pbar.update(1)
        
    return data

####

data = remove_html_comments(data, 'body', 'body')

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=26789.0), HTML(value='')))




In [14]:
s = [d['body'] for d in data if d['_id_']=='213688'][0]
print(s)

{{Ficha de persona
|nombre = Nick Mason
|imagen = Nick_Mason_20060603_Fnac_08.jpg
|tamaño de imagen = 250px
|pie de imagen = Nick Mason, junio 2006.
|nombre de nacimiento = Nicholas Berkeley Mason
|fecha de nacimiento = {{Fecha de inicio|27|1|1944|edad}} {{bandera|Reino Unido}} 
|instrumento = [[batería (instrumento musical)|Batería]], [[Teclado electrónico|teclados]], [[bajo (instrumento musical)|bajo]], [[guitarra]]
|género = [[Rock progresivo]], [[rock psicodélico]], [[rock experimental]], [[rock instrumental]]
|ocupación = [[músico]], [[Productor discográfico|productor]], [[escritor]]
|años activo = [[1964]]-[[2018]]
|compañía discográfica = [[Capitol Records]], [[Columbia Records]], [[Sony Music Entertainment|Sony]], [[EMI]], [[Harvest Records|Harvest]]
|relacionados = [[Pink Floyd]]<br />[[Sigma 6 (banda)|Sigma 6]]<br />[[Sigma 6 (banda)|The Screaming Abdabs]]<br />[[Mason + Fenn]]<br />[[Nick Mason's Saucerful of Secrets]]<br />[[Robert Wyatt]]<br />[[Carla Bley]]<br />[[Michael

## Breake line tags

In [5]:
def replace_break_line_tags(data: list, key_name: str, new_key_name: str):

    '''This function replaces <br> HTML tags'''
    
    with tqdm(total=len(data)) as pbar:
        
        for d in data:
            
            text = d[key_name]
            
            br_line_tags = ['<br>', '<br >',
                            '<Br>', '<Br >',

                            '<br/>', '<br />', '<br/ >', '<br / >',
                            '<Br/>', '<Br />', '<Br/ >', '<Br / >',
                            '>br>',
                            
                            '&nbsp;', '&nbsp',
                            '<li>',
                            
                            '<span>', '</span>',
                             ]

            for br in br_line_tags:
                text = text.replace(br, '\n')

            d[new_key_name] = text
            
            pbar.update(1)

    return data

####

data = replace_break_line_tags(data, 'body', 'body')

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=26789.0), HTML(value='')))




## Double square brackets

In [6]:
def clean_double_square_brackets_metadata(data: list, key_name: str, new_key_name: str):
   
    '''This function cleans double square brackets'''
    
    with tqdm(total=len(data)) as pbar:
        
        for ind,d in enumerate(data):    
            texto_processed = d[key_name]
    
            left_index = 0
            right_index = len(texto_processed)
    
            while True:
                
                reg_resp_l = re.search(r'\[\[', texto_processed[left_index:right_index])

                if reg_resp_l == None:
                    break
            
                else:
            
                    left_index = reg_resp_l.span()[0] + left_index
                    reg_resp_r = re.search(r'\]\]', texto_processed[left_index:right_index])

                    if reg_resp_r == None:
                        print('¡warning!')
                        print(ind)
                        break

                    else:

                        right_index = reg_resp_r.span()[1] + left_index
                        note = texto_processed[left_index:right_index]
                        
                        note_replacements = note.split('|')
                        
                        if len(note_replacements)==1:
                            note_replacement = note_replacements[0]
                            left_padding = 0
                        elif len(note_replacements)==2:
                            note_replacement = '[[' + note_replacements[1]
                            left_padding = len(note_replacements[0]) - 2
                        else:
                            note_replacement = '[[]]'
                            left_padding = len(note) - 4
                        
                        texto_processed = note_replacement.join(texto_processed.split(note))
                        left_index = right_index-left_padding
                        right_index = len(texto_processed)
                
                
            d[new_key_name] = texto_processed

            pbar.update(1)
            
        return data

####

data = clean_double_square_brackets_metadata(data, 'body', 'body')         

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=26789.0), HTML(value='')))

21048
24730



## HTML

In [7]:
def extract_html_element(text: str):

    html_l = list()
    
    left_index = 0
    right_index = len(text)

    list_index = 0
    while True:

        reg_resp_l = re.search(r'<[^/!]*?>', text[left_index:right_index])
        if reg_resp_l == None:

            reg_resp_l = re.search(r'<![\s\S]*?>', text[left_index:right_index])        
            if reg_resp_l != None:

                text = ''.join(text.split(reg_resp_l.group()))

            else:

                reg_resp_l = re.search(r'<[^/]*?/>', text[left_index:right_index])
                if reg_resp_l == None:

                    break

                else:

                    tag_raw = reg_resp_l.group()
                    tag_name = tag_raw.split(' ')[0][1:]

                    try:
                        tag_value = tag_raw.split(' ')[1][:-2]
                    except:
                        tag_value = ''

                    tag = dict()
                    tag = {'tag': tag_name, 'value': tag_value}

                    list_index_str = '%«% '+ str(list_index) + ' %»%'
                    list_index += 1

                    text = list_index_str.join(text.split(tag_raw))

                    html_l.append(tag)

                    left_index = 0
                    right_index = len(text)


        else:

            tag_name = reg_resp_l.group()[1:-1].split(' ')[0]
            tag_raw_open = reg_resp_l.group()

            left_index = reg_resp_l.span()[1] + left_index

            reg_resp_r = re.search(r'</[\s\S]*?>', text[left_index:right_index])
            if reg_resp_r == None:

                break


            else:

                tag_raw_close = reg_resp_r.group()
                right_index = reg_resp_r.span()[0] + left_index

                reg_resp_l_2 = re.search(r'<[\s\S]*?>', text[left_index:right_index-1])
                if reg_resp_l_2 == None:

                    tag_value = text[left_index:right_index]
                    tag_raw = tag_raw_open + tag_value + tag_raw_close
                    tag = {'tag': tag_name, 'value': tag_value}

                    list_index_str = '%«% '+ str(list_index) + ' %»%'
                    list_index += 1

                    text = list_index_str.join(text.split(tag_raw))

                    html_l.append(tag)

                    left_index = 0
                    right_index = len(text)

                else:

                    left_index = reg_resp_l_2.span()[0] + left_index
                            
    return text, html_l

####


def extract_html_elements_metadata(data: list, key_name: str, new_key_name: str):
   
    '''This function extracts all the html elements from the text and replaces them with placeholders'''
    
    with tqdm(total=len(data)) as pbar:
        for d in data:
            
            
            text_processed, html_l = extract_html_element(d[key_name])
            d[new_key_name] = text_processed
            d['_metadata_html_'] = html_l

            pbar.update(1)
        
    return data

####


data = extract_html_elements_metadata(data, 'body', 'body')

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=26789.0), HTML(value='')))




## Curly brackets

In [9]:
def extract_curly_brackets_metadata(data: list, key_name: str, new_key_name: str):
   
    '''This function extracts all the curly brackets from the text and replaces them with placeholders'''
    
    with tqdm(total=len(data)) as pbar:
        for d in data:
            
            brackets_l = list()
            texto_processed = d[key_name]
    
            left_index = 0
            right_index = len(texto_processed)
    
            list_index = 0
            while True:
        
                reg_resp_l = re.search(r'{{', texto_processed[left_index:right_index])

                if reg_resp_l == None:
                    break
            
                else:
            
                    left_index = reg_resp_l.span()[0] + left_index
                    reg_resp_r = re.search(r'}}', texto_processed[left_index:right_index])

                    if reg_resp_r == None:
                        break

                    else:

                        right_index = reg_resp_r.span()[1] + left_index
                        reg_resp_l_2 = re.search(r'{{', texto_processed[left_index+2:right_index])

                        if reg_resp_l_2 == None:
                            bracket = re.search(r'{{[\s\S}]*?}}', texto_processed[left_index:]).group(0)
                            list_index_str = '%{% '+ str(list_index) + ' %}%'
                            texto_processed = list_index_str.join(texto_processed.split(bracket))

                            brackets_l.append(bracket)
                            list_index += 1

                            left_index = 0
                            right_index = len(texto_processed)

                        else:
                            left_index = reg_resp_l_2.span()[0] + left_index
                
                
            d[new_key_name] = texto_processed
            d['_metadata_brackets_'] = brackets_l

            pbar.update(1)
        
    return data


data = extract_curly_brackets_metadata(data, 'body', 'text')       

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=26789.0), HTML(value='')))




# Divide article parts

In [10]:
def extract_infobox(data: list):
    
    ''' This function extracts the infobox part of the articles'''
    
    pattern = r'{{Ficha de'
    pattern_persona = r'{{Ficha de persona'

    with tqdm(total=len(data)) as pbar:
        for d in data:
            for b in d['_metadata_brackets_']:
                response = re.match(pattern, b)
                if response:
                    d['infobox'] = b
                    
                    response_persona = re.match(pattern_persona, b)
                    if response_persona:
                        d['_tipo_'] = 'persona'
                    else:
                        d['_tipo_'] = 'grupo' 
                    
                    
            pbar.update(1)

        return data


data = extract_infobox(data)


####

#for d in data:
    #del d['body']

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=26789.0), HTML(value='')))




# Divide article parts

## Infobox keys

In [11]:
def normalize_string(text: str):
    
    '''This function normalizes a string'''
    
    text = text.lower().replace('_',' ').replace('-',' ').strip()

    # -> NFD & remove diacritical
    text = re.sub(r'([^n\u0300-\u036f]|n(?!\u0303(?![\u0300-\u036f])))[\u0300-\u036f]+',
                      r'\1', normalize( "NFD", text), 0, re.I)

    # -> NFC
    text = normalize('NFC', text)
    
    return text


#####


def get_infobox_attributes_pattern():
    
    '''This function returns the regex pattern to extract the attributes from each infobox'''
        
    pattern = r'(\n\|)([^=]*)(=)([^\n]*)' # group 2 -> attr. name , group 4 -> attr. value
    
    # \n\|      : ... cada atributo empieza por un salto de línea "\n" y el símbolo "|"
    # [^=]*     : ... luego, todo lo que viene es el nombre de la variable (group 2)
    # =         : ... hasta llegar a un símbolo "="
    # ([^\n]*)  : ... luego, todo lo que hay hasta un salto de línea "\n" es el valor de la variable (group 4)
    
    return pattern


#####

    
def extract_infobox_attributes(data: list):
    
    '''This function extracts and returns all the attributes and their values from each infobox'''
    
    pattern = get_infobox_attributes_pattern()

    with tqdm(total=len(data)) as pbar:
        for a in data:

            attr_names = [x.group(2).strip() for x in re.finditer(pattern, a['infobox'])]
            attr_values = [x.group(4).strip() for x in re.finditer(pattern, a['infobox'])]


            for i in range(len(attr_names)):

                attr_name = normalize_string(attr_names[i])
                attr_value = attr_values[i].strip()
                
                try:
                    if attr_name[0]=='|':
                        attr_name = attr_name[1:].strip()
                except:
                    pass
                
                attrs = attr_name.split('\n|')
                num_attrs = len(attrs)
                
                if num_attrs>1:
                    for n in attrs[:-1]:
                        a[n] = ''

                    a[attrs[-1]] = attr_value
                else:
                    a[attr_name] = attr_value
                
            pbar.update(1)
        
    return data


####

data = extract_infobox_attributes(data)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=26789.0), HTML(value='')))




## Attribute freq

In [12]:
def get_attr_frequency(data: list):
    
    '''This function returns the keys sorted by frecuency of appearence in the dataset'''
    
    attr_freq = {}
    
    for a in data:
        for k in list(a.keys()):

            if k in attr_freq.keys():
                attr_freq[k] += 1
            else:
                attr_freq[k] = 1

    attr_freq = {k: v for k, v in sorted(attr_freq.items(), key=lambda item: item[1], reverse=True)}

    num_attrs = len(attr_freq.keys())
    print(f'# of distinct attributes: {num_attrs}')
    
    return attr_freq

####

attr_frequency = get_attr_frequency(data)

# of distinct attributes: 1387


In [13]:
attr_frequency

{'_titulo_': 26789,
 '_id_': 26789,
 'body': 26789,
 '_metadata_html_': 26789,
 'text': 26789,
 '_metadata_brackets_': 26789,
 'infobox': 26789,
 '_tipo_': 26789,
 'nombre': 24867,
 'imagen': 23695,
 'relacionados': 17760,
 'ocupacion': 15551,
 'instrumento': 15338,
 'genero': 15147,
 'fecha de nacimiento': 14522,
 'nombre de nacimiento': 14326,
 'años activo': 13717,
 'compañia discografica': 12800,
 'alias': 12463,
 'pie de imagen': 12387,
 'origen': 10528,
 'fondo': 10520,
 'tiempo': 10493,
 'discografica': 10107,
 'tamaño de imagen': 10052,
 'estilo': 9876,
 'miembros': 9848,
 'lugar de nacimiento': 9735,
 'fecha de fallecimiento': 9599,
 'pagina web': 9138,
 'url': 9105,
 'subtitulo': 8535,
 'tamaño': 8223,
 'otros miembros': 6993,
 'conyuge': 6786,
 'estado': 6596,
 'hijos': 6480,
 'voz': 5765,
 'url2': 4062,
 'logo': 3875,
 'nacionalidad': 3833,
 'firma': 3671,
 'sitio web': 3374,
 'artistas relacionados': 3012,
 'pareja': 2747,
 'facebook': 2712,
 'twitter': 2615,
 'nacimiento'

## Extract new attributes

In [14]:
def get_new_attributes(text: str):
    
    new_attributes = list()
    
    pattern = '\|.*?='
    elements = re.findall(pattern, text)
    elements = elements[::-1]
    
    t = text
    
    if len(elements)>0:

        t = text.split(elements[-1])[0]
        
        for ind, e in enumerate(elements):
            
            new_key = e[1:-1].strip().lower()
            new_value = text.split(e)[1]
            
            if ind!=0:    
                new_value = new_value.split(elements[ind-1])[0]
            
            new_value = new_value.strip()
            new_attributes.append([new_key, new_value])

    return t, new_attributes


####

attr_l = list(attr_frequency.keys())[10:]
for a in data:
    keys_l = list(a.keys())
    for key in keys_l:
        if key in attr_l:
            a[key], new_attributes = get_new_attributes(a[key])
            for attr in new_attributes:
                a[attr[0]] = attr[1]

####

print(f'# of articles: {len(data)}')
attr_frequency = get_attr_frequency(data)

# of articles: 26789
# of distinct attributes: 1440


## Remove empty keys

In [15]:
def remove_empty_attributes(data: list):
    
    '''This function removes empty attributes from the dataset''' 
    
    for a in data:
        attrs = list(a.keys())
        for attr in attrs:
            if a[attr]=='':
                if attr == '_tipo_':
                    print(a[attr])
                del a[attr]
            
    return data

####

data_artists = remove_empty_attributes(data)
attr_frequency = get_attr_frequency(data)

# of distinct attributes: 1020


## Split musicians & groups

In [16]:
def split_musicians_and_groups(data: list):
    
    '''This function splits data between musicians and groups'''
    
    data_persons = list()
    data_groups = list()
    
    for a in data:
        if a['_tipo_'] == 'persona':
            data_persons.append(a)
        elif a['_tipo_'] == 'grupo':
            data_groups.append(a)
            
    return data_persons, data_groups

####

data_persons, data_groups = split_musicians_and_groups(data)

In [17]:
print(f'# of articles about persons: {len(data_persons)}')
attr_frequency_persons = get_attr_frequency(data_persons)

# of articles about persons: 15506
# of distinct attributes: 654


In [18]:
print(f'# of articles about groups: {len(data_groups)}')
attr_frequency_groups = get_attr_frequency(data_groups)

# of articles about groups: 11283
# of distinct attributes: 563


# Save the articles

In [19]:
pickle.dump(data_persons, open( "../Data/data_v2_persons.p", "wb"))
pickle.dump(data_groups, open( "../Data/data_v2_groups.p", "wb"))