# Imports

In [1]:
import re
from unicodedata import normalize
import html
from tqdm.notebook import tqdm
import pickle
import gc

# Load articles

In [2]:
# Load the raw article dump. The context manager closes the file handle as soon
# as unpickling finishes instead of leaving it open for the kernel's lifetime.
# NOTE(security): pickle.load can execute arbitrary code; only load files that
# were produced by this project.
with open("../DATA/artists_articles_raw.p", "rb") as raw_articles_file:
    data_articles_raw = pickle.load(raw_articles_file)

# Extract article parts

In [3]:
def extract_text_and_title(data: list):

    '''Build one record per raw article with its title and its text body.

    Each element of ``data`` is expected to be a string containing a
    ``<title>...</title>`` element and a ``<text ...>...</text>`` element.

    Returns a new list of dicts with the keys:
      * 'articulo_raw': the untouched input string,
      * 'id_titulo':    the content between ``<title>`` and ``</title>``,
      * 'texto_raw':    what follows the opening ``<text ...>`` tag inside the
                        matched text element (the closing ``</text>`` is still
                        part of it).

    Raises AttributeError when an article has no title or text element,
    because the corresponding regex search returns None.
    '''

    title_re = re.compile(r'<title>[\s\S]*?</title>')
    text_element_re = re.compile(r'<text[\s\S]*?>[\s\S]*?</text>')
    text_opening_re = re.compile(r'<text[\s\S]*?>')

    records = []

    with tqdm(total=len(data)) as progress:
        for raw_article in data:
            title_element = title_re.search(raw_article).group()
            text_element = text_element_re.search(raw_article).group()
            opening_tag = text_opening_re.search(text_element).group()

            records.append({
                'articulo_raw': raw_article,
                # Strip the surrounding <title> / </title> tags.
                'id_titulo': title_element[len('<title>'):-len('</title>')],
                # Keep what sits right after the opening <text ...> tag.
                'texto_raw': text_element.split(opening_tag)[1],
            })
            progress.update(1)

    return records

# Turn every raw article string into a record holding its title and text body.
data_artists = extract_text_and_title(data=data_articles_raw)

HBox(children=(IntProgress(value=0, max=26807), HTML(value='')))




# Clean articles

In [4]:
def clean_text(data: list):

    '''Cut each article's 'texto_raw' at the first ``</text>`` closing tag.

    Everything from the first ``</text>`` onwards is dropped; a text without
    that tag is left as it is. The records are modified in place and the same
    list is returned.
    '''

    closing_tag = '</text>'

    with tqdm(total=len(data)) as progress:
        for record in data:
            body, _, _ = record['texto_raw'].partition(closing_tag)
            record['texto_raw'] = body
            progress.update(1)

    return data
    
# Drop the trailing </text> tag from every article body.
data_artists = clean_text(data=data_artists)

HBox(children=(IntProgress(value=0, max=26807), HTML(value='')))




## Clean particular cases

In [5]:
# One-off repairs for specific articles, keyed by article title. Each entry is
# (snippet to look for, snippet to put in its place); most of them append a
# closing '}}' or ']]' to the snippet (presumably so the bracket extraction
# that follows sees balanced markup -- confirm against the source articles).
manual_markup_fixes = {
    'Daniela Carpio': ('{{{{', '{{'),
    'Ski Mask the Slump God': ('* escritor de canciones', '* escritor de canciones}}'),
    'Marcus &amp; Martinus': ('marcusandmartinus.com/', 'marcusandmartinus.com/]]'),
    'Virgil Popa': ('Virgil Popa.jpg', 'Virgil Popa.jpg}}'),
    'Rangel': ('= [[elcartelurbano]],', '= [[elcartelurbano]]}}'),
    'Jennifer Holliday': ('10|1960|edad', '10|1960|edad}}'),
    'Infinite (grupo musical)': ('[[Hoya (Cantante)|Hoya]]', '[[Hoya (Cantante)|Hoya]]}}'),
    'Coro de la Generalidad Valenciana': ('[[Francisco Hervás]]&lt;br /&gt;', '[[Francisco Hervás]]&lt;br /&gt;}}'),
    'Boddega': ('Roberto Jijón, Carlos del Campo', 'Roberto Jijón, Carlos del Campo}}'),
    'Andrea Bocelli': ('|idioma=español{{', '|idioma=español}}'),
}

with tqdm(total=len(data_artists)) as pbar:
    for d in data_artists:
        markup_fix = manual_markup_fixes.get(d['id_titulo'])
        if markup_fix is not None:
            broken_snippet, repaired_snippet = markup_fix
            d['texto_raw'] = d['texto_raw'].replace(broken_snippet, repaired_snippet)

        pbar.update(1)

HBox(children=(IntProgress(value=0, max=26807), HTML(value='')))




## Remove particular cases

In [6]:
def remove_particular_cases(data, to_remove_l):

    '''Remove, in place, every article whose title is listed in ``to_remove_l``.

    Args:
        data: list of article dicts; each one must have an 'id_titulo' key.
        to_remove_l: collection of (hashable) titles to discard.

    Returns:
        The same ``data`` list object, with the matching articles removed and
        the relative order of the remaining ones preserved.
    '''

    # A set gives constant-time membership checks for each article.
    titles_to_drop = set(to_remove_l)
    kept = []

    with tqdm(total=len(data)) as pbar:
        for d in data:
            if d['id_titulo'] not in titles_to_drop:
                kept.append(d)

            pbar.update(1)

    # Slice assignment rewrites the caller's list object in one pass, instead
    # of shifting the tail of the list once per removed element.
    data[:] = kept

    return data


# Titles of the entries that are discarded from the dataset by the call below.
titulos_to_remove = [
    'Wikipedia:Tablón de anuncios de los bibliotecarios/Portal/Archivo/Protección de artículos/2018/04',
    'Plantilla:Ficha de persona/doc',
    'Wikipedia:Café/Portal/Archivo/Propuestas/2009/09',
    'Wikipedia:Consultas de borrado/Giancarlo Monsalve',
    'Gauvain Sers',
    'Wernher von Braun',
    'Raymond Kurzweil',
    'Tomás Luis de Victoria',
    'Cristóbal de Morales',
    'Francisco Guerrero',
    'Juan Cabanilles',
    'Robert Moog',
    'Diego Ortiz',
    'Alejandro García Villalón («Virulo»)',
    'Jean-Pierre Christin',
    'Juan Navarro Hispalensis',
    'Keiji Fujiwara',
    'Phil Zimmermann',
]

data_artists = remove_particular_cases(data=data_artists, to_remove_l=titulos_to_remove)

HBox(children=(IntProgress(value=0, max=26807), HTML(value='')))




## Decode HTML tags

In [7]:
def decode_html_tags(data: list, key_name: str, new_key_name: str):

    '''Unescape HTML entities in one field of every article.

    For each record, ``record[key_name]`` is passed through ``html.unescape``
    (e.g. ``&lt;ref&gt;`` becomes ``<ref>``) and the result is stored under
    ``new_key_name``. Records are modified in place; the same list is returned.
    '''

    with tqdm(total=len(data)) as progress:
        for record in data:
            escaped_text = record[key_name]
            record[new_key_name] = html.unescape(escaped_text)
            progress.update(1)

    return data

# Store the HTML-unescaped version of 'texto_raw' under 'texto'.
data_artists = decode_html_tags(data=data_artists, key_name='texto_raw', new_key_name='texto')

HBox(children=(IntProgress(value=0, max=26789), HTML(value='')))




# Extract hidden elements

## Extract bracket structures

In [8]:
def _pull_out_brackets(text: str):

    '''Replace every ``{{...}}`` structure of ``text`` by a numbered placeholder.

    Structures are taken innermost first, so an outer structure is stored with
    the placeholders of the structures it contained. Returns the rewritten
    text and the list of extracted structures (list position == placeholder
    number).
    '''

    brackets_l = []
    search_from = 0

    while True:
        open_pos = text.find('{{', search_from)
        if open_pos == -1:
            break

        close_pos = text.find('}}', open_pos + 2)
        if close_pos == -1:
            # Opener without a closer: nothing more can be extracted.
            break

        nested_open = text.find('{{', open_pos + 2, close_pos)
        if nested_open != -1:
            # Another opener sits before the first closer, so the current one
            # is not innermost. Jump to the absolute position of the nested
            # opener; this always moves forward, also for adjacent openers
            # such as '{{{{'.
            search_from = nested_open
            continue

        bracket = text[open_pos:close_pos + 2]
        placeholder = '%{% ' + str(len(brackets_l)) + ' %}%'

        # Every identical occurrence gets the same placeholder.
        text = text.replace(bracket, placeholder)
        brackets_l.append(bracket)

        # The text changed: start over from the beginning.
        search_from = 0

    return text, brackets_l


def extract_bracket_structure(data: list, key_name: str, new_key_name: str):

    '''Extract the ``{{...}}`` structures from one field of every article.

    For each record, the text in ``record[key_name]`` is scanned for
    ``{{...}}`` structures. Each one is replaced by a placeholder of the form
    ``%{% N %}%`` (innermost structures first) and appended to a list, where
    ``N`` is its position in that list.

    Args:
        data: list of article dicts.
        key_name: key holding the text to process.
        new_key_name: key under which the text with placeholders is stored.

    Returns:
        The same ``data`` list. Each record gains ``new_key_name`` (processed
        text) and '_brackets_' (list of extracted structures). Text after an
        opener that is never closed is left untouched.
    '''

    with tqdm(total=len(data)) as pbar:
        for d in data:
            texto_processed, brackets_l = _pull_out_brackets(d[key_name])

            d[new_key_name] = texto_processed
            d['_brackets_'] = brackets_l

            pbar.update(1)

    return data


# Swap every {{...}} structure of the article body for a numbered placeholder.
data_artists = extract_bracket_structure(data=data_artists, key_name='texto', new_key_name='texto')

HBox(children=(IntProgress(value=0, max=26789), HTML(value='')))




## Extract infobox

In [9]:
def extract_infobox(data: list):

    '''Pick the infobox out of each article's extracted bracket structures.

    A structure counts as an infobox when it starts with ``{{Ficha de``. When
    a record has several, the last one in '_brackets_' is the one kept. The
    record then gets:
      * 'infobox':       the structure itself,
      * 'tipo_articulo': 'persona' if it starts with ``{{Ficha de persona``,
                         'grupo' otherwise.
    Records without such a structure are left unchanged. Records are modified
    in place and the same list is returned.
    '''

    infobox_start = r'{{Ficha de'
    person_start = r'{{Ficha de persona'

    with tqdm(total=len(data)) as progress:
        for record in data:
            candidates = [
                structure
                for structure in record['_brackets_']
                if re.match(infobox_start, structure)
            ]

            if candidates:
                chosen = candidates[-1]
                record['infobox'] = chosen
                record['tipo_articulo'] = (
                    'persona' if re.match(person_start, chosen) else 'grupo'
                )

            progress.update(1)

    return data


# Attach the infobox and the article type to every record that has one.
data_artists = extract_infobox(data=data_artists)

HBox(children=(IntProgress(value=0, max=26789), HTML(value='')))




## Extract references

In [10]:
def extract_references(data: list, key_name: str, new_key_name: str):

    '''Extract the HTML-like tags from one field of every article.

    The text in ``record[key_name]`` is processed repeatedly until no tag is
    left to handle:
      * ``<tag ...>content</...>`` pairs without nested tags are replaced by a
        placeholder ``%«% N %»%`` and stored as
        ``{'tag': <tag name>, 'value': <content>}``;
      * ``<!...>`` comments are deleted;
      * self-closing ``<tag ... />`` elements are replaced by a placeholder and
        stored as ``{'tag': <tag name>, 'value': <attribute text>}``.
    ``N`` is the position of the stored entry in the list. Identical tags share
    one placeholder because all occurrences are replaced at once.

    Args:
        data: list of article dicts.
        key_name: key holding the text to process (KeyError if missing).
        new_key_name: key under which the processed text is stored.

    Returns:
        The same ``data`` list. Each record gains ``new_key_name`` and
        ``'_references_' + new_key_name`` (list of extracted entries).
        Processing of a record stops early when an opening tag has no closing
        tag inside the current search window.
    '''

    with tqdm(total=len(data)) as pbar:
        for d in data:

            references_l = list()
            texto_processed = d[key_name]

            # Current search window, as absolute indexes into texto_processed.
            left_index = 0
            right_index = len(texto_processed)

            while True:

                window = texto_processed[left_index:right_index]
                reg_resp_l = re.search(r'<[^/!]*?>', window)

                if reg_resp_l is None:
                    # No opening tag in the window: look for a comment first.
                    reg_resp_comment = re.search(r'<![\s\S]*?>', window)
                    if reg_resp_comment is not None:
                        texto_processed = texto_processed.replace(reg_resp_comment.group(), '')

                        # The text changed, so the old window no longer lines
                        # up with it (it may have been narrowed to the inside
                        # of a tag). Start over on the whole text.
                        left_index = 0
                        right_index = len(texto_processed)
                        continue

                    # Then look for a self-closing element.
                    reg_resp_self = re.search(r'<[^/]*?/>', window)
                    if reg_resp_self is None:
                        break

                    tag_raw = reg_resp_self.group()

                    # Strip the leading '<' and trailing '/>' before splitting,
                    # so '<br/>' gives the name 'br' and the attribute text is
                    # kept whole whether or not a space precedes '/>'.
                    tag_name, _, tag_attributes = tag_raw[1:-2].partition(' ')
                    tag = {'tag': tag_name, 'value': tag_attributes.strip()}

                    list_index_str = '%«% ' + str(len(references_l)) + ' %»%'
                    texto_processed = texto_processed.replace(tag_raw, list_index_str)
                    references_l.append(tag)

                    left_index = 0
                    right_index = len(texto_processed)
                    continue

                # An opening tag was found: move the window start right after it.
                tag_raw_open = reg_resp_l.group()
                tag_name = tag_raw_open[1:-1].split(' ')[0]
                left_index = reg_resp_l.span()[1] + left_index

                reg_resp_r = re.search(r'</[\s\S]*?>', texto_processed[left_index:right_index])
                if reg_resp_r is None:
                    break

                tag_raw_close = reg_resp_r.group()
                right_index = reg_resp_r.span()[0] + left_index

                # NOTE(review): the nested-tag check stops one character short
                # of the closing tag, so a tag whose '>' is the very last
                # character of the content is not treated as nested. Kept as in
                # the original.
                reg_resp_l_2 = re.search(r'<[\s\S]*?>', texto_processed[left_index:right_index - 1])
                if reg_resp_l_2 is not None:
                    # Something tag-like sits inside: handle it first.
                    left_index = reg_resp_l_2.span()[0] + left_index
                    continue

                tag_value = texto_processed[left_index:right_index]
                tag_raw = tag_raw_open + tag_value + tag_raw_close
                tag = {'tag': tag_name, 'value': tag_value}

                list_index_str = '%«% ' + str(len(references_l)) + ' %»%'
                texto_processed = texto_processed.replace(tag_raw, list_index_str)
                references_l.append(tag)

                left_index = 0
                right_index = len(texto_processed)

            d[new_key_name] = texto_processed
            d['_references_' + new_key_name] = references_l

            pbar.update(1)

    return data

# Pull the tags out of the article body first and then out of the infobox,
# overwriting each field with its placeholder version.
for field_name in ('texto', 'infobox'):
    data_artists = extract_references(data_artists, field_name, field_name)

HBox(children=(IntProgress(value=0, max=26789), HTML(value='')))




HBox(children=(IntProgress(value=0, max=26789), HTML(value='')))




## Replace note separator

In [11]:
def replace_note_separator(data: list, key_name: str, new_key_name: str):

    '''Swap the ``|`` separator for ``%%`` inside ``[[...]]`` notes.

    Each record's ``record[key_name]`` is scanned left to right. A note runs
    from a ``[[`` up to the first ``]]`` that follows it; every ``|`` inside
    it becomes ``%%``, and all identical occurrences of that note in the text
    are rewritten at once. Pipes outside notes are kept. If a ``[[`` has no
    ``]]`` after it, a warning and the record's index are printed and the
    rest of that text is left as is.

    The result is stored under ``new_key_name``; records are modified in
    place and the same list is returned.
    '''

    with tqdm(total=len(data)) as progress:

        for position, record in enumerate(data):
            text = record[key_name]
            cursor = 0

            while True:
                note_start = text.find('[[', cursor)
                if note_start == -1:
                    break

                closer_at = text.find(']]', note_start)
                if closer_at == -1:
                    print('¡warning!')
                    print(position)
                    break

                note_end = closer_at + 2
                note = text[note_start:note_end]
                text = text.replace(note, note.replace('|', '%%'))

                # Resume from where the note ended before the rewrite.
                cursor = note_end

            record[new_key_name] = text
            progress.update(1)

    return data


# Rewrite the '|' separators found inside [[...]] notes of the infobox.
data_artists = replace_note_separator(data=data_artists, key_name='infobox', new_key_name='infobox')

HBox(children=(IntProgress(value=0, max=26789), HTML(value='')))




# Save the articles

In [12]:
# Persist the processed articles. The context manager flushes and closes the
# file as soon as the dump completes, rather than leaving that to garbage
# collection of an anonymous file object.
with open("../DATA/artists_articles.p", "wb") as articles_file:
    pickle.dump(data_artists, articles_file)