In [2]:
# Libraries

import numpy as np
import pandas as pd
import re
import emoji

import requests
from bs4 import BeautifulSoup
import html
import lxml

## Procesador de eventos

In [63]:
#### Function 1
def get_data(url):
    # Link to the event
    r = requests.get(url)
    # Pulling the data from the link
    soup = BeautifulSoup(r.text, "lxml")
    # Take event title and description
    # In the event description is where "accesibility" info is located
    event_title = soup.find(class_ = "plan-hero__title").text
    event_descr = soup.find(class_ = "plan-description mb-32")
    # Transform the event description into string for later processing
    event_descr = str(event_descr)
    # Get price info
    event_price = soup.find(class_ = "sidebarBuyingText sidebarWrapper__btn").text
    event_price = event_price.split("\xa0€")[0]

    return event_title, event_descr, event_price

#### Function 2
def separate_sections(event_descr):
    # 1) Split the data using the "<strong>" tag -> This way we separate the sections
    # 2) Split the data using the "</strong>" tag -> This way we separate titles from descriptions
    event_descr_items = [elem.split("</strong>") for elem in event_descr.split("<strong>")]

    return event_descr_items

#### Function 3
def separate_title_descr(event_descr_items):
    # Dict to save the info title-descriptions that we have in a list of lists
    title_descr = {}
    # This is to assign a numerical value as key to those descriptions without section title
    no_title_count = 1

    # Iterate over the list of lists, and for every list...
    for elem in event_descr_items:
        # If there's more than one element (that means, we have description and title)...
        if len(elem) > 1:
            # Then the first element will be the key and the second one will be the value in our new dict
            title_descr[elem[0]] = elem[1]
        # If we don't have two values (we are missing the title)...
        else:
            # Then, the key will be the numerical value we defined and the value will be the only value of the list, which should be the description
            title_descr[no_title_count] = elem[0]
            no_title_count += 1

    return title_descr

#### Function 4
def remove_html(title_descr):
    # This is to remove all the html tags from the text
    for key, value in title_descr.items():
        title_descr[key] = re.sub(r"\<.*?\>", "", value)

    return title_descr
    
#### Function 5
def remove_emojis(string, emoji_list):
    # We'll save all the actual characters (not emojis) here
    new_string = []
    # Iterate over the string
    for s in string:
        # If character not in the emoji list
        if s not in emoji_list:
            # append it to the new_string list
            new_string.append(s)
    
    # join the list into a new string
    return "".join(map(str, new_string))

#### Function 6
def remove_all_emojis(title_descr_without_html, emojis, info_names):
    
    for key, val in title_descr_without_html.items():
        if key in info_names:
            pass
        else:
            title_descr_without_html[key] = remove_emojis(title_descr_without_html[key], emojis)

    return title_descr_without_html

#### Function 7
def separate_info_sections(title_descr_without_html, info_names, icons_list):
    # Now I have to do all the processing with the iterators to pull the information from "Información general".

    # We'll need an iterator
    # iterator = new_dict["Información General"]
    for name in info_names:
        try:
            iterator = title_descr_without_html[name]
            break
        except:
            pass

    # We'll need an empty list too, save the subsections
    general_info_sections = []

    # Iterate over all the icons in the list
    for icon in icons_list:
        # If the icon is in the iterator (string)...
        if icon in iterator:
            # Split iterator using the icon and save it as the new iterator
            # Input: string
            # Output: list with 2 items
            iterator = iterator.split(icon)
            # Save the first item in our new list
            general_info_sections.append(iterator[0])

            # Check if there is still any icon left in the second element of the iterator
            if any(icon in iterator[1] for icon in icons_list):
                # If so, save it as the new iterator (a string again)
                iterator = iterator[1]

            # Else, save it in our new list.
            # As we don't have more icons in the second element, that means, we don't need to split the iterator anymore, since we already reach the last piece of info we needed
            else:
                general_info_sections.append(iterator[1])
        else:
            pass

    return general_info_sections

#### Function 8
def separate_info_sections_title_descr(general_info_sections):
    
    new_general_info_sections = []

    for elem in general_info_sections:
        if len(elem) > 1:
            new_general_info_sections.append(elem.split(":", 1))

    return new_general_info_sections

#### Function 9
def transform_info_sections(new_general_info_sections):
    sections = {}
    extra = 1

    for list_ in new_general_info_sections:
        if len(list_) > 1:
            sections[list_[0]] = list_[1]
        else:
            sections[extra] = list_[0]
            extra += 1

    return sections

#### Function 10
def add_rest(sections, event_title, event_price):
    rest = {"Event_title": event_title, "Event_price": event_price}
    final_dict = {**sections, **rest}
    return final_dict

#### Function 11
def create_df(dict1, dict2, index):
    # Creating dfs
    main_sections_df = pd.DataFrame(dict1, index = [index])
    subsections_df = pd.DataFrame(dict2, index = [index])

    # Joining dfs
    full_df = pd.merge(main_sections_df, subsections_df, how = "outer", left_index = True, right_index = True)

    return full_df

In [68]:
def processor(url, emojis, section_emojis, info_names, index):
    # Step 1: Get data
    try:
        event_title, event_descr, event_price = get_data(url)
    except:
        return "Step 1 error"

    # Step 2: Split event_descr into sections
    try:
        event_descr_items = separate_sections(event_descr)
    except:
        return "Step 2 error"

    # Step 3: Split sections into titles and descriptions
    try:
        title_descr = separate_title_descr(event_descr_items)
    except:
        return "Step 3 error"

    # Step 4: Some cleaning
    # 4.1 Remove html
    try:
        title_descr_without_html = remove_html(title_descr)
    except:
        return "Step 4.1 error"
    # 4.2 Remove emojis
    try:
        cleaned_dict = remove_all_emojis(title_descr_without_html, emojis, info_names)
    except:
        return "Step 4.2 error"

    # Step 5: Get data from "info" section
    try:
        general_info_sections = separate_info_sections(title_descr_without_html, info_names, section_emojis)
    except:
        return "Step 5 error"
    # 5.1 Split the data from "info" section into title and descr
    try:
        new_general_info_sections = separate_info_sections_title_descr(general_info_sections)
    except:
        return "Step 5.1 error"

    # Step 6: Make the "info" subsections a list
    try:
        info_sections = transform_info_sections(new_general_info_sections)
    except:
        return "Step 6 error"

    # Step 7: Join all the data together into a dict
    try:
        final_dict = add_rest(info_sections, event_title, event_price)
    except:
        return "Step 7 error"

    # Step 8: Create joined dataframe
    try:
        df = create_df(cleaned_dict, final_dict, index)
    except:
        return "Step 8 error"

    return df

In [75]:
url = "https://feverup.com/m/98148"
emojis = emoji.UNICODE_EMOJI["en"]
section_emojis = ["📅", "🕒", "⏳", "👤", "📍", "⚠️", "♿", "⌚", "❓", "🔗"]
info_names = ["Información", "Información General"]
index = 0

processor(url, emojis, section_emojis, info_names, index)

Unnamed: 0,1,Tickets,Información,Descripción,Fecha,Edad,Event_title,Event_price
0,,"Apúntate a ""Concierto de Pinturilla y La Pand...",📅 Fecha: domingos de junio por la mañana📍 Luga...,Pinturilla y la Pandilla Vainilla es un nuevo ...,domingos de junio por la mañana📍 Lugar: Espac...,todos los públicos💰 Precio estimado de 10€ a 22€,Concierto de Pinturilla y La Pandilla Vainill...,Join in!


# XXX

In [78]:
candlelight_urls = ['https://feverup.com/m/96831', 'https://feverup.com/m/97975', 'https://feverup.com/m/97053', 'https://feverup.com/m/100404', 'https://feverup.com/m/100896', 'https://feverup.com/m/95040', 'https://feverup.com/m/100405', 'https://feverup.com/m/98160', 'https://feverup.com/m/97280', 'https://feverup.com/m/84521']

emojis = emoji.UNICODE_EMOJI["en"]
section_emojis = ["📅", "🕒", "⏳", "👤", "📍", "⚠️", "♿", "⌚", "❓", "🔗"]
info_names = ["Información", "Información General", "Información:", "Información general"]
index = 0

dfs = []

for url_ in candlelight_urls:
    df_ = processor(url_, emojis, section_emojis, info_names, index)

    if type(df_) != str:
        dfs.append(df_)
        print(index, url_)
    else:
        print(index, url_, df_)

    index += 1

0 https://feverup.com/m/96831
1 https://feverup.com/m/97975
2 https://feverup.com/m/97053
3 https://feverup.com/m/100404
4 https://feverup.com/m/100896
5 https://feverup.com/m/95040
6 https://feverup.com/m/100405
7 https://feverup.com/m/98160
8 https://feverup.com/m/97280
9 https://feverup.com/m/84521


In [79]:
dfs[0]

Unnamed: 0,1,Tickets,Qué vas a disfrutar,Información:,Descripción,Valoraciones de otros usuarios,📍 Lugar,Fecha y hora,Duración aproximada,Edad,Accesibilidad,El uso de la mascarilla es obligatorio durante el conciertoPrograma,Event_title,Event_price
0,,Entrada zona A: muy buena visibilidad Entrada...,️ Una atmósfera muy íntima en una localización...,📍 Lugar: Teatro Goya📅 Fecha y hora: 7 de agost...,"¡Las mejores canciones de tu infancia, como nu...","Dre S. : ""Muy buena idea y excelente conciert...",Teatro Goya,7 de agosto a las 21:30,60 minutos (apertura de puertas 60 minutos an...,a partir de 8 años. Los menores de 16 años de...,recinto habilitado para personas en silla de ...,La Bella DurmienteEl Rey LeónAladdinLos Vengad...,Candlelight Open Air: Bandas Sonoras Mágicas,1500


## Sacar URLs

In [5]:
url_madrid = "https://feverup.com/"

r = requests.get('https://feverup.com/madrid')
soup = BeautifulSoup(r.text, 'lxml')
eventos_g = soup.findAll(class_ = "fv-carousel__feed")
eventos_g = soup.findAll(class_ = "fv-carousel__item")


lista_URLs = []
for evento in eventos_g:
    URL_evento = evento.find('a')['href']
    lista_URLs.append(URL_evento)

lista_completa = []
for url in lista_URLs:
    url_completa = url_madrid + url
    lista_completa.append(url_completa)
#lista_completa

In [6]:
batch = lista_completa[:20]

dfs = []
count = 1
error_count = 0
error_log = []

for url_ in batch:
    new_df = processor(url_)

    if type(new_df) != str:
        print(count, ":", url_)
        dfs.append(new_df)
    else:
        print(count, ":", url_ + " --> error")
        error_log.append(f"Iteración: {count} - link: {url_} - step with error: {new_df}")
        error_count += 1

    count += 1

1 : https://feverup.com//m/100044
2 : https://feverup.com//m/92796
3 : https://feverup.com//m/100122
4 : https://feverup.com//m/99411
5 : https://feverup.com//m/100404 --> error
6 : https://feverup.com//m/93001 --> error
7 : https://feverup.com//m/54872 --> error
8 : https://feverup.com//m/100892 --> error
9 : https://feverup.com//m/101265
10 : https://feverup.com//m/91274
11 : https://feverup.com//m/100122
12 : https://feverup.com//m/96831 --> error
13 : https://feverup.com//m/97053 --> error
14 : https://feverup.com//m/100404 --> error
15 : https://feverup.com//m/98160 --> error
16 : https://feverup.com//m/100896 --> error
17 : https://feverup.com//m/97975 --> error
18 : https://feverup.com//m/95040 --> error
19 : https://feverup.com//m/100938 --> error
20 : https://feverup.com//m/100405 --> error


In [7]:
error_count / count

0.6190476190476191

In [8]:
df = pd.concat(dfs)
print(df.shape)
df.columns

(7, 30)


Index([                                                      '1_x',
                                                         'Tickets',
                                             'Qué vas a disfrutar',
                                                     'Información',
                                                     'Descripción',
                                  'Valoraciones de otros usuarios',
                                                          ' Fecha',
                                                       ' Horarios',
                                                          ' Lugar',
                                                             '1_y',
                                                     'Event_title',
                                                     'Event_price',
                      'Agatha Ruiz de la Prada u Okuda San Miguel',
                                                    'Valoraciones',
                                                

In [9]:
df.head()

Unnamed: 0,1_x,Tickets,Qué vas a disfrutar,Información,Descripción,Valoraciones de otros usuarios,Fecha,Horarios,Lugar,1_y,...,Accesibilidad,para seleccionar tu asiento numerado,Duración del show,2,1,Botellas Select disponibles,Botellas Premium disponibles,¡accede sin colas entrando con Fever y reserva exclusiva!,Programa,Fecha y hora
0,,🎫 Entrada Adulto para 1 día completo o media j...,🌊 Un refrescante chapuzón en la enorme piscina...,📅 Fecha: todos los días🕒 Horarios: Entrada dí...,¡Comienza la temporada de piscina Club de Tiro...,"💬 Marta H. ⭐⭐⭐⭐⭐: Piscina limpia, césped cuida...",todos los días,Entrada día completo: de 11:00 a 20:30 Entra...,Club de Tiro de El Pardo🚗 Parking exclusivo p...,No está permitido introducir comida o bebida a...,...,,,,,,,,,,
0,,🎫 Entrada general (a partir de 12 años)🎫 Entra...,🍭 Acceso a un viaje a la imaginación que une e...,📅 Fechas: escoge el día que quieras al comprar...,Disfruta de una aventura interactiva en Sweet ...,,,escoge la opción que quieras al comprar📍 Luga...,,Aforo restringido de 110 personas por hora. N...,...,,,,,,,,,,
0,⭐ Meet Vincent van Gogh es una experiencia tri...,🎫 Entrada - incluye acceso a la exhibición y a...,🎨 Descubre una nueva forma de admirar las obra...,,¿Alguna vez has soñado con adentrarte en una p...,,,-Lunes a jueves y domingos: de 10:00 a 20:00-V...,Espacio Ibercaja Delicias - Paseo de las Deli...,"Puedes consultar las preguntas frecuentes, su...",...,accesible para personas con movilidad reducida,,,,,,,,,
0,,"🎫 Acceso a Corona Paradise, una experiencia se...",🌴 Coge tu mochila y prepara tu cámara porque… ...,📅 Fechas: del 1 de julio al 1 de agosto🕒 Horar...,Llega a Madrid la experiencia más divertida y...,,,varias opciones disponibles📍 Lugar: Palacio d...,,La seguridad de nuestros asistentes es una p...,...,"Al tratarse de un edificio antiguo, no cuenta...",,,,,,,,,
0,,"🎫 Entrada con butaca en zona A, B, C o D🎫 Entr...",🌟 La segunda temporada del aclamado espectácul...,📅 Fechas: a partir de septiembre de 2021🕒 Hora...,¡La segunda temporada del mayor espectáculo ho...,,,"-Martes, miércoles y jueves a las 20:00-Vierne...",,El espectáculo contiene luces estroboscópicas...,...,recinto accesible para PMR📱 Recibirás la info...,en la zona escogida(desmarca la opción predet...,150 minutos aprox. (descanso incluido)📍 Lugar...,Puedes consultar las medidas de seguridad que...,,,,,,
