# Latest version

In [None]:
# Function 1
def get_data(url, timeout=10):
    """Download an event page and pull out its title, description and price.

    Args:
        url: Address of the event page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely.

    Returns:
        A tuple ``(event_title, event_descr, event_price)``:
        ``event_title`` is the text of the title element, ``event_descr`` is
        the description element converted to a string (HTML tags included,
        for later processing) and ``event_price`` is the price text that
        precedes the non-breaking-space + euro-sign suffix.

    Raises:
        requests.RequestException: On connection problems, timeouts or a
            4xx/5xx response.
        AttributeError: If the title or price element is not found
            (``soup.find`` returns ``None`` in that case).
    """
    # Link to the event; the timeout keeps a stalled server from hanging us
    r = requests.get(url, timeout=timeout)
    # Fail early on an HTTP error instead of parsing an error page
    r.raise_for_status()
    # Pulling the data from the link
    soup = BeautifulSoup(r.text, "lxml")
    # Take event title and description
    # The event description is where the "accessibility" info is located
    event_title = soup.find(class_="plan-hero__title").text
    # Keep the description as an HTML string for later processing
    # NOTE(review): if the element is missing this becomes the string "None"
    event_descr = str(soup.find(class_="plan-description mb-32"))
    # Get price info and keep only what precedes the euro suffix
    event_price = soup.find(class_="sidebarBuyingText sidebarWrapper__btn").text
    event_price = event_price.split("\xa0€")[0]

    return event_title, event_descr, event_price

# Function 2
def separate_sections(event_descr):
    """Split the description HTML into per-section lists.

    The text is first cut at every ``<strong>`` tag (one chunk per section)
    and every chunk is then cut at ``</strong>`` so that a section title is
    separated from the text that follows it.

    Args:
        event_descr: Description HTML as a string.

    Returns:
        A list of lists of strings, one inner list per chunk.
    """
    sections = []
    for chunk in event_descr.split("<strong>"):
        # A chunk with a closing tag yields [title, body]; otherwise [text]
        sections.append(chunk.split("</strong>"))

    return sections

# Function 3
def separate_title_descr(event_descr_items):
    """Turn the list of section lists into a title -> description dict.

    Args:
        event_descr_items: List of lists of strings, as produced by
            ``separate_sections``.

    Returns:
        A dict. Lists with more than one item contribute
        ``first item -> second item``; lists with a single item have no
        title, so they are stored under a running integer key (1, 2, ...).
    """
    titled = {}
    # Next integer key to hand out to a description that has no title
    untitled_index = 1

    for parts in event_descr_items:
        if len(parts) <= 1:
            # Only a description is available: use the counter as its key
            titled[untitled_index] = parts[0]
            untitled_index += 1
            continue
        # Title and description are both present
        titled[parts[0]] = parts[1]

    return titled

# Function 4
def remove_html(new_dict):
    """Strip HTML tags from every value of ``new_dict``.

    The dict is modified in place and the same object is returned.

    Args:
        new_dict: Dict whose values are strings that may contain HTML tags.

    Returns:
        ``new_dict`` itself, with the tags removed from its values.
    """
    # Non-greedy match of anything enclosed in angle brackets
    tag_pattern = re.compile(r"\<.*?\>")
    for key in new_dict:
        new_dict[key] = tag_pattern.sub("", new_dict[key])

    return new_dict

# Function 5
def separate_info_sections(new_dict, info_names=None, icons_list=None):
    """Split the general-information text into its icon-delimited pieces.

    The text stored under the first key of ``info_names`` found in
    ``new_dict`` is cut at every occurrence of every icon in ``icons_list``.
    Unlike the earlier fixed-order implementation, the result does not
    depend on the order in which the icons appear in the text, and repeated
    icons no longer cause text to be dropped.

    Args:
        new_dict: Dict of section title -> section text (HTML already
            removed).
        info_names: Candidate keys for the information section, tried in
            order. Defaults to ``["Información", "Información General"]``.
        icons_list: Icons that delimit the sub-sections. Defaults to the
            built-in list of ten icons.

    Returns:
        A list with the pieces of text found between the icons, in reading
        order. An empty list is returned when the text contains none of the
        icons.

    Raises:
        KeyError: If none of ``info_names`` is a key of ``new_dict``.
    """
    if info_names is None:
        info_names = ["Información", "Información General"]
    if icons_list is None:
        icons_list = ["📅", "🕒", "⏳", "👤", "📍", "⚠️", "♿", "⌚", "❓", "🔗"]

    # Locate the information section under the first matching name
    for name in info_names:
        if name in new_dict:
            info_text = new_dict[name]
            break
    else:
        raise KeyError(
            "None of the information sections was found: {}".format(
                ", ".join(info_names)
            )
        )

    # Cut every piece at every icon; doing it icon by icon over all the
    # pieces keeps the reading order whatever order the icons come in
    pieces = [info_text]
    for icon in icons_list:
        pieces = [part for piece in pieces for part in piece.split(icon)]

    # No icon in the text: nothing to separate
    if len(pieces) == 1:
        return []

    return pieces

# Function 6
def separate_info_sections_title_descr(general_info_sections):
    """Split every information piece into title and description.

    Pieces of one character or less are discarded. Every other piece is cut
    at its first colon only, so colons inside the description are kept.

    Args:
        general_info_sections: List of strings.

    Returns:
        A list of lists: ``[title, description]`` when the piece contains a
        colon, otherwise a one-item list with the whole piece.
    """
    return [
        piece.split(":", 1)
        for piece in general_info_sections
        if len(piece) > 1
    ]

# Function 7
def transform_info_sections(general_info_sections_2):
    """Build a dict out of the ``[title, description]`` lists.

    Args:
        general_info_sections_2: List of lists of strings.

    Returns:
        A dict mapping the first item of each list to its second item.
        Lists with a single item are stored under a running integer key
        (1, 2, ...).
    """
    sections = {}
    # Next integer key for an entry that has no title
    next_extra = 1

    for entry in general_info_sections_2:
        if len(entry) <= 1:
            sections[next_extra] = entry[0]
            next_extra += 1
            continue
        sections[entry[0]] = entry[1]

    return sections

# Function 8
def add_rest(sections, event_title, event_price):
    """Return a copy of ``sections`` extended with the title and the price.

    Args:
        sections: Dict with the information sub-sections.
        event_title: Value stored under ``"Event_title"``.
        event_price: Value stored under ``"Event_price"``.

    Returns:
        A new dict; ``sections`` itself is left untouched. The two extra
        keys override entries of the same name in ``sections``.
    """
    return {
        **sections,
        "Event_title": event_title,
        "Event_price": event_price,
    }

In [None]:
def processor(url):
    """Run the whole scraping pipeline for one event page.

    Each of the eight steps is guarded separately so that a failure can be
    traced back to the step that caused it.

    Args:
        url: Address of the event page.

    Returns:
        A pandas DataFrame with one row that joins the cleaned description
        sections with the information sub-sections, the title and the
        price. If a step raises, the string ``"Error in step N"`` is
        returned instead (N being the number of the failing step).
    """
    # ``except Exception`` (rather than a bare ``except``) lets
    # KeyboardInterrupt and SystemExit propagate to the caller.

    # Step 1: download the page and extract title, description and price
    try:
        event_title, event_descr, event_price = get_data(url)
    except Exception:
        return "Error in step 1"
    # Step 2: split the description into sections
    try:
        event_descr_items = separate_sections(event_descr)
    except Exception:
        return "Error in step 2"
    # Step 3: build the title -> description dict
    try:
        new_dict = separate_title_descr(event_descr_items)
    except Exception:
        return "Error in step 3"
    # Step 4: remove the HTML tags
    try:
        new_dict = remove_html(new_dict)
    except Exception:
        return "Error in step 4"
    # Step 5: split the information section into its pieces
    try:
        general_info_sections = separate_info_sections(new_dict)
    except Exception:
        return "Error in step 5"
    # Step 6: split every piece into title and description
    try:
        general_info_sections_2 = separate_info_sections_title_descr(general_info_sections)
    except Exception:
        return "Error in step 6"
    # Step 7: turn the pieces into a dict
    try:
        sections = transform_info_sections(general_info_sections_2)
    except Exception:
        return "Error in step 7"
    # Step 8: add the title and the price
    try:
        final_dict = add_rest(sections, event_title, event_price)
    except Exception:
        return "Error in step 8"

    # Final step (not guarded: errors raised here reach the caller)
    # Creating dfs
    general_df = pd.DataFrame(new_dict, index=[0])
    sections_df = pd.DataFrame(final_dict, index=[0])

    # Joining dfs on their index
    full_df = pd.merge(general_df, sections_df, how="outer", left_index=True, right_index=True)

    return full_df

In [None]:
# Art
# Other event pages used while testing (kept for reference):
#url = "https://feverup.com/m/100122"
#url = "https://feverup.com/m/100982"
#url = "https://feverup.com/m/96379"
url = "https://feverup.com/m/100122"

# The same pipeline as processor(url), run step by step so that every
# intermediate result can be inspected
#processor(url)
# Step 1: Get data
event_title, event_descr, event_price = get_data(url)
# Step 2: Split event_descr into sections
event_descr_items = separate_sections(event_descr)
# Step 3: Split sections into titles and descriptions
title_descr = separate_title_descr(event_descr_items)
# Step 4: Some cleaning
# 4.1 Remove html
title_descr_without_html = remove_html(title_descr)

# 4.2 Remove emojis
# NOTE(review): `emoji.UNICODE_EMOJI` is believed to be unavailable in the
# 2.x releases of the emoji package - confirm against the installed version
emojis = emoji.UNICODE_EMOJI["en"]
info_names = ["Información", "Información General"]
# NOTE(review): `remove_all_emojis` is not defined in the visible part of this notebook
cleaned_dict = remove_all_emojis(title_descr_without_html, emojis, info_names)

# Step 5: Get data from "info" section
icons_list = ["📅", "🕒", "⏳", "👤", "📍", "⚠️", "♿", "⌚", "❓", "🔗"]
# NOTE(review): three arguments are passed here, while the definitions of
# `separate_info_sections` in this notebook accept a single one
general_info_sections = separate_info_sections(title_descr_without_html, info_names, icons_list)

# 5.1 Split the data from "info" section into title and descr
new_general_info_sections = separate_info_sections_title_descr(general_info_sections)

# Step 6: Turn the "info" subsections into a dict
info_sections = transform_info_sections(new_general_info_sections)

# Step 7: Join all the data together into a dict
final_dict = add_rest(info_sections, event_title, event_price)

# Step 8: Create joined dataframe
# NOTE(review): `create_df` is not defined in the visible part of this notebook
df = create_df(cleaned_dict, final_dict, 0)
# Display the result as the cell output
df

---------------------

In [None]:
# Scratch cell. Earlier attempts, kept for reference:
#re.split("Información", text)
#items = re.findall(r"(?<=strong).+", text_with_html)
# Split the raw HTML on the bare word "strong"; this matches both the opening
# and the closing tag and leaves their angle brackets inside the pieces
# NOTE(review): `text_with_html` is not defined in the visible part of this notebook
items = text_with_html.split("strong")

In [None]:
# Scratch cell: remove everything enclosed in angle brackets from the seventh
# piece of `items`
# NOTE(review): index 6 is presumably where the general-information text
# landed for the page under test - confirm before reusing on other pages
gen_info = re.sub(r"\<.*?\>", "", items[6])
# Display the result as the cell output
gen_info

In [None]:
# Scratch cell: copy of `info_items` with everything enclosed in angle
# brackets removed from each entry
info_items_2 = [re.sub(r"\<.*?\>", "", item) for item in info_items]

In [None]:
# Scratch cell: peel the leading fields off entry 6 by splitting on one icon
# after another
# Text before the 🕒 icon
gen_info = info_items_2[6].split("🕒")
date = gen_info[0]

# Text between the 🕒 and ⏳ icons
gen_info_2 = gen_info[1].split("⏳")
timetable = gen_info_2[0]

# Text between the ⏳ and 👤 icons
# NOTE(review): the name `time` would shadow the standard `time` module if
# the notebook imports it - confirm
gen_info_3 = gen_info_2[1].split("👤")
time = gen_info_3[0]
# Display the result as the cell output
time

In [None]:
# Scratch cell: prototype of the icon-splitting loop used by
# `separate_info_sections`
# `iterator` holds the text that is still to be split
# NOTE(review): this reads `info_items` (tags not removed), not `info_items_2` - confirm
iterator = info_items[6]

icons = ["📅", "🕒", "⏳", "👤", "📍", "⚠️", "♿", "⌚", "❓", "🔗"]
# Pieces of text collected so far
general_info = []
# Number of icons visited; incremented below but never read in this cell
count = 0

# The icons are tried in list order, so the pieces come out complete only
# when the icons appear in that same order in the text
for icon in icons:
    # Debug output, disabled:
    #print("icon:", icon)
    #print("iterator:\n", iterator)
    #print("-" * 20)
    if icon in iterator:
        # From here on `iterator` is a list: [text before the icon, text after it, ...]
        iterator = iterator.split(icon)
        #print("iterator[0]:\n", iterator[0])
        #print("-" * 20)
        general_info.append(iterator[0])
        #print("iterator[1]:\n", iterator[1])
        # The `icon` inside the generator is local to it and does not
        # change the loop variable
        if any(icon in iterator[1] for icon in icons):
            # More icons ahead: keep splitting the remainder (a string again)
            iterator = iterator[1]
        else:
            # Last piece reached; `iterator` stays a list for the remaining icons
            general_info.append(iterator[1])
    else:
        pass

    count += 1
    #print("-" * 50)

# Display the result as the cell output
general_info

In [None]:
# Function 1
def get_data(url, timeout=10):
    """Download an event page and pull out its title, description and price.

    Args:
        url: Address of the event page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely.

    Returns:
        A tuple ``(event_title, event_descr, event_price)``:
        ``event_title`` is the text of the title element, ``event_descr`` is
        the description element converted to a string (HTML tags included,
        for later processing) and ``event_price`` is the price text that
        precedes the non-breaking-space + euro-sign suffix.

    Raises:
        requests.RequestException: On connection problems, timeouts or a
            4xx/5xx response.
        AttributeError: If the title or price element is not found
            (``soup.find`` returns ``None`` in that case).
    """
    # Link to the event; the timeout keeps a stalled server from hanging us
    r = requests.get(url, timeout=timeout)
    # Fail early on an HTTP error instead of parsing an error page
    r.raise_for_status()
    # Pulling the data from the link
    soup = BeautifulSoup(r.text, "lxml")
    # Take event title and description
    # The event description is where the "accessibility" info is located
    event_title = soup.find(class_="plan-hero__title").text
    # Keep the description as an HTML string for later processing
    # NOTE(review): if the element is missing this becomes the string "None"
    event_descr = str(soup.find(class_="plan-description mb-32"))
    # Get price info and keep only what precedes the euro suffix
    event_price = soup.find(class_="sidebarBuyingText sidebarWrapper__btn").text
    event_price = event_price.split("\xa0€")[0]

    return event_title, event_descr, event_price

# Function 2
def separate_sections(event_descr):
    """Split the description HTML into per-section lists.

    Args:
        event_descr: Description HTML as a string.

    Returns:
        A list of lists of strings: the text is cut at every ``<strong>``
        tag to separate the sections, and each resulting chunk is cut at
        ``</strong>`` to separate a title from the text that follows it.
    """
    chunks = event_descr.split("<strong>")
    event_descr_items = []
    for chunk in chunks:
        title_and_body = chunk.split("</strong>")
        event_descr_items.append(title_and_body)

    return event_descr_items

# Function 3
def separate_title_descr(event_descr_items):
    """Turn the list of section lists into a title -> description dict.

    Args:
        event_descr_items: List of lists of strings, as produced by
            ``separate_sections``.

    Returns:
        A dict. Lists with more than one item contribute
        ``first item -> second item``; lists with a single item have no
        title, so they are stored under a running integer key (1, 2, ...).
    """
    # Dict to save the title-description pairs found in the list of lists
    title_descr = {}
    # Numerical key handed out to the descriptions without a section title
    no_title_count = 1

    for elem in event_descr_items:
        if len(elem) > 1:
            # Title and description: the first item is the key, the second the value
            title_descr[elem[0]] = elem[1]
        else:
            # No title: the counter is the key and the only item is the value
            title_descr[no_title_count] = elem[0]
            no_title_count += 1

    return title_descr

# Function 4
def remove_html(title_descr):
    """Strip HTML tags from every value of ``title_descr``.

    The dict is modified in place and the same object is returned.

    Args:
        title_descr: Dict whose values are strings that may contain HTML
            tags.

    Returns:
        ``title_descr`` itself, with the tags removed from its values.
    """
    # Remove everything enclosed in angle brackets (non-greedy match)
    for key, value in title_descr.items():
        title_descr[key] = re.sub(r"\<.*?\>", "", value)

    return title_descr

# Function 5
def separate_info_sections(title_descr_without_html, info_names=None, icons_list=None):
    """Split the general-information text into its icon-delimited pieces.

    The text stored under the first key of ``info_names`` found in
    ``title_descr_without_html`` is cut at every occurrence of every icon in
    ``icons_list``. Unlike the earlier fixed-order implementation, the
    result does not depend on the order in which the icons appear in the
    text, and repeated icons no longer cause text to be dropped.

    Args:
        title_descr_without_html: Dict of section title -> section text
            (HTML already removed).
        info_names: Candidate keys for the information section, tried in
            order. Defaults to ``["Información", "Información General"]``.
        icons_list: Icons that delimit the sub-sections. Defaults to the
            built-in list of ten icons.

    Returns:
        A list with the pieces of text found between the icons, in reading
        order. An empty list is returned when the text contains none of the
        icons.

    Raises:
        KeyError: If none of ``info_names`` is a key of
            ``title_descr_without_html``.
    """
    if info_names is None:
        info_names = ["Información", "Información General"]
    if icons_list is None:
        icons_list = ["📅", "🕒", "⏳", "👤", "📍", "⚠️", "♿", "⌚", "❓", "🔗"]

    # Locate the information section under the first matching name
    for name in info_names:
        if name in title_descr_without_html:
            info_text = title_descr_without_html[name]
            break
    else:
        raise KeyError(
            "None of the information sections was found: {}".format(
                ", ".join(info_names)
            )
        )

    # Cut every piece at every icon; doing it icon by icon over all the
    # pieces keeps the reading order whatever order the icons come in
    general_info_sections = [info_text]
    for icon in icons_list:
        general_info_sections = [
            part
            for piece in general_info_sections
            for part in piece.split(icon)
        ]

    # No icon in the text: nothing to separate
    if len(general_info_sections) == 1:
        return []

    return general_info_sections

# Function 6
def separate_info_sections_title_descr(general_info_sections):
    """Split every information piece into title and description.

    Args:
        general_info_sections: List of strings.

    Returns:
        A list of lists. Pieces of one character or less are skipped; every
        other piece is cut at its first colon only, giving
        ``[title, description]`` or, when there is no colon, a one-item
        list with the whole piece.
    """
    # Keep only the pieces that carry more than a single character
    meaningful = filter(lambda piece: len(piece) > 1, general_info_sections)

    return [piece.split(":", 1) for piece in meaningful]

# Function 7
def transform_info_sections(general_info_sections_2):
    """Build a dict out of the ``[title, description]`` lists.

    Args:
        general_info_sections_2: List of lists of strings.

    Returns:
        A dict mapping the first item of each list to its second item.
        Lists with a single item are stored under a running integer key
        (1, 2, ...).
    """
    result = {}
    # Next integer key for an entry that has no title
    untitled_key = 1

    for pair in general_info_sections_2:
        has_title = len(pair) > 1
        if not has_title:
            result[untitled_key] = pair[0]
            untitled_key += 1
            continue
        result[pair[0]] = pair[1]

    return result

# Function 8
def add_rest(sections, event_title, event_price):
    """Return a copy of ``sections`` extended with the title and the price.

    Args:
        sections: Dict with the information sub-sections.
        event_title: Value stored under ``"Event_title"``.
        event_price: Value stored under ``"Event_price"``.

    Returns:
        A new dict; ``sections`` itself is left untouched. The two extra
        keys override entries of the same name in ``sections``.
    """
    return {
        **sections,
        "Event_title": event_title,
        "Event_price": event_price,
    }

In [None]:
def processor(url):
    """Run the whole scraping pipeline for one event page.

    Each of the eight steps is guarded separately so that a failure can be
    traced back to the step that caused it.

    Args:
        url: Address of the event page.

    Returns:
        A pandas DataFrame with one row that joins the cleaned description
        sections with the information sub-sections, the title and the
        price. If a step raises, the string ``"Error in step N"`` is
        returned instead (N being the number of the failing step).
    """
    # ``except Exception`` (rather than a bare ``except``) lets
    # KeyboardInterrupt and SystemExit propagate to the caller.

    # Step 1: download the page and extract title, description and price
    try:
        event_title, event_descr, event_price = get_data(url)
    except Exception:
        return "Error in step 1"
    # Step 2: split the description into sections
    try:
        event_descr_items = separate_sections(event_descr)
    except Exception:
        return "Error in step 2"
    # Step 3: build the title -> description dict
    try:
        new_dict = separate_title_descr(event_descr_items)
    except Exception:
        return "Error in step 3"
    # Step 4: remove the HTML tags
    try:
        new_dict = remove_html(new_dict)
    except Exception:
        return "Error in step 4"
    # Step 5: split the information section into its pieces
    try:
        general_info_sections = separate_info_sections(new_dict)
    except Exception:
        return "Error in step 5"
    # Step 6: split every piece into title and description
    try:
        general_info_sections_2 = separate_info_sections_title_descr(general_info_sections)
    except Exception:
        return "Error in step 6"
    # Step 7: turn the pieces into a dict
    try:
        sections = transform_info_sections(general_info_sections_2)
    except Exception:
        return "Error in step 7"
    # Step 8: add the title and the price
    try:
        final_dict = add_rest(sections, event_title, event_price)
    except Exception:
        return "Error in step 8"

    # Final step (not guarded: errors raised here reach the caller)
    # Creating dfs
    general_df = pd.DataFrame(new_dict, index=[0])
    sections_df = pd.DataFrame(final_dict, index=[0])

    # Joining dfs on their index
    full_df = pd.merge(general_df, sections_df, how="outer", left_index=True, right_index=True)

    return full_df

------------------------------------------