In [3]:
import datetime
import json
import os
import random
from urllib.parse import quote

import openai
import pandas as pd
from bs4 import BeautifulSoup, Comment

In [4]:
# Shared OpenAI client. The key is read from the OPENAI_API_KEY environment
# variable so that no secret is stored in the notebook itself.
client = openai.OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

In [None]:
def html_to_json(html_content):
    """Convert an HTML document into a compact, JSON-serialisable structure.

    The markup is parsed with BeautifulSoup's ``html.parser`` and reduced to the
    parts that carry meaning:

    * text nodes become stripped strings (whitespace-only text is dropped);
    * ``<a>`` tags become ``{"text": ..., "link": ...}``, or ``{"link": ...}``
      when the anchor has no visible text; anchors with neither are dropped;
    * any other tag becomes ``{"content": [...]}``; a tag with exactly one
      surviving child is replaced by that child, and a tag with none is dropped;
    * ``<style>``, ``<script>`` and ``<code>`` subtrees are skipped, as are
      comments, doctypes, declarations and processing instructions.

    Args:
        html_content: HTML markup as a string.

    Returns:
        A string, a dict, or ``None`` when nothing meaningful is left
        (for example for an empty document).
    """
    # Local import: these node classes ship with the bs4 package the notebook
    # already imports, and are only needed inside this function.
    from bs4 import Declaration, Doctype, ProcessingInstruction

    # Tags whose content is noise for the downstream consumer.
    ignored_tags = ("style", "script", "code")
    # String-like nodes that are markup bookkeeping rather than page text.
    # Without this filter a "<!DOCTYPE html>" would surface as the text "html".
    ignored_nodes = (Comment, Doctype, Declaration, ProcessingInstruction)

    def parse_element(element):
        """Recursively reduce one parse-tree node; return None to drop it."""
        if isinstance(element, ignored_nodes):
            return None

        if element.name in ignored_tags:
            return None

        if element.name == "a":
            # Keep the anchor text next to the href: a bare URL gives little
            # context for judging relevance later on.
            href = element.get("href", "").strip()
            text = element.get_text(strip=True)
            if text:
                return {"text": text, "link": href}
            if href:
                return {"link": href}
            return None

        if element.name:
            # A tag: parse the children and keep only the meaningful ones.
            children = [
                parsed
                for parsed in (parse_element(child) for child in element.contents)
                if parsed is not None
            ]
            if not children:
                return None
            # Collapse wrappers that hold a single meaningful child.
            if len(children) == 1:
                return children[0]
            return {"content": children}

        # A text node (its name is None): keep it only if it is not blank.
        text = element.strip()
        return text if text else None

    soup = BeautifulSoup(html_content, "html.parser")
    return parse_element(soup)

In [None]:
# Raw HTML of a Meetup group's member list. Paste the page markup between the
# triple quotes; it is left empty here.
html_content = (
    """"""
)

In [15]:
# Reduce the pasted HTML to the compact JSON-like structure
parsed_json = html_to_json(html_content)

# Persist the structure so it can be inspected outside the notebook
serialized_json = json.dumps(parsed_json, indent=4)
with open("parsed_data.json", "w") as json_file:
    json_file.write(serialized_json)

print("JSON data has been saved to 'parsed_data.json'")


JSON data has been saved to 'parsed_data.json'


In [16]:
# Member entries in the shape produced by html_to_json (sample rows shown here).
# Each entry wraps a list that starts with the member's name/profile link,
# followed by role, join date and last-visit strings.
relevant_members = [
    {
        "content": [
            {
                "content": [
                    {"text": "Alex", "link": "https://www.meetup.com/members/357287860/"},
                    "Member",
                    "Joined Mar 11, 2022",
                    "Visited group today",
                ]
            },
            "icon",
        ]
    },
    {
        "content": [
            {
                "content": [
                    {"text": "Hugh", "link": "https://www.meetup.com/members/414770121/"},
                    "Member",
                    "Joined May 2, 2024",
                    "Visited group today",
                ]
            },
            "icon",
        ]
    },
]

In [32]:
# Campaign settings interpolated into the prompt and the message templates
target_group_name = "London-Spanish Cultural Exchange"  # group the invitation is for
source_group_name = "Spanish Conversational Club"  # group the recipients belong to
event_link = "link_to_event"  # placeholder for the event URL
num_members = 5384  # not referenced by the other visible cells

def generate_message(member_name='Default Name', max_length=500):
    """Ask the model for one invitation draft and validate it.

    Reads the module-level ``client``, ``target_group_name``,
    ``source_group_name`` and ``event_link``.

    Args:
        member_name: Name of the member the invitation is addressed to.
        max_length: Maximum accepted length of the draft, in characters. The
            same limit is quoted to the model in the prompt.

    Returns:
        The generated message, or a string starting with ``"Error:"`` that
        explains why the draft was rejected (empty, too long, contains a link
        other than the event link, contains ``<``/``>``, or contains one of
        ``$ % & @``).
    """
    prompt = f"""
    You are an event organizer for {target_group_name}. Generate a personalized invitation message for a Meetup member named {member_name}, who is a member of the {source_group_name} group. 
    The message should highlight the unique aspects of the event, such as curated participants, structured format, and high show-up rates. 
    Include a link to the event: {event_link}.
    Keep the message concise and engaging, ideally under {max_length} characters.
    """
    response = client.responses.create(
        model="gpt-4o",
        instructions="You are a marketing specialist for a Meetup group. Generate a personalized invitation message for a Meetup member.",
        input=prompt
    )

    # output_text is the SDK's concatenation of every text part in the response.
    # Indexing output[0].content[0].text instead raises when the first output
    # item is not a text message (for example a refusal).
    message = response.output_text
    if not message:
        return "Error: No valid response from the model."

    if len(message) > max_length:
        return "Error: Generated message is too long."

    # The prompt explicitly asks for the event link, so that link must not trip
    # the checks below; only text outside of it is inspected for stray links
    # and special characters.
    text_without_event_link = message.replace(event_link, "") if event_link else message

    if "http" in text_without_event_link:
        return "Error: Generated message contains a link."
    if "<" in message or ">" in message:
        return "Error: Generated message contains HTML tags."
    if any(char in text_without_event_link for char in ('$', '%', '&', '@')):
        return "Error: Generated message contains special characters."

    return message

In [33]:
# Generate a single draft with the default member name
raw_messages = [generate_message() for _ in range(1)]

In [34]:
# Display the generated drafts (the list is the cell's output)
raw_messages

["¡Hola Default Name!\n\nWe're excited to invite you to our London-Spanish Cultural Exchange event! Join a select group of Spanish enthusiasts for an evening of lively conversations and cultural immersion. With our structured format and high show-up rates, you’re sure to make meaningful connections and enhance your Spanish skills.\n\nDon't miss out—reserve your spot now: [link_to_event]\n\n¡Hasta pronto!\n\nThe London-Spanish Cultural Exchange Team"]

In [19]:
# Names interpolated into the outreach templates built by get_message
organizer_name = "Name"  # placeholder for the sender's name
target_group_name = "London-Spanish Cultural Exchange"
source_group_name = "Spanish Conversational Club"

def get_message(member_name, is_member_of=source_group_name):
    """Pick one of the outreach templates at random and fill it in.

    Reads the module-level ``organizer_name``, ``target_group_name`` and
    ``event_link``.

    Args:
        member_name: Name used in the greeting.
        is_member_of: Name of the group the recipient belongs to. The default
            is the value ``source_group_name`` had when this cell was run.

    Returns:
        A ``(variant_key, message)`` tuple; ``variant_key`` identifies the
        template that was chosen (for A/B tracking) and ``message`` is the
        filled-in text without trailing whitespace.
    """
    messages = {
        # cleaned up messages from raw_messages
        # Examples
        # NOTE(review): variant B hard-codes an event date — confirm it is still current.
       "UA 0.2 A": 
       f"""Hey {member_name}, I’m {organizer_name} and I’m the organizer for The {target_group_name} event. I saw you’re a member of {is_member_of} group on Meetup and I think you’re the perfect addition to ours as well.

What makes our events different? We carefully curate the participants list, craft the perfect agenda, and have high show up rates. Happy to answer any questions you have :)

Check out our upcoming event {event_link}
        """,
        "UA 0.2 B": 
        f"""Hey {member_name}, I'm {organizer_name}, organizing an intimate {target_group_name} on Friday February 7, 2025. I noticed from {is_member_of} group on meetup that you're interested in language practice, and I'd love to invite you to join us. 

What makes this exchange special? We keep our groups small and carefully matched, ensure everyone shows up (through advance commitment), and follow a structured format that guarantees equal practice time in both languages. No more awkward silences or uneven language splits! 

Come check out our upcoming event {event_link}
        """
        }

    selected_message_key = random.choice(list(messages))
    # The closing triple quotes sit on their own indented line, which would
    # otherwise append a newline and eight spaces to every outgoing message.
    message = messages[selected_message_key].rstrip()

    return selected_message_key, message

In [20]:
# Approximate length in days of each unit accepted in "Visited group N <unit> ago".
# Units are matched by prefix, so both "day" and "days" resolve to the same entry.
_DAYS_PER_UNIT = (("day", 1), ("week", 7), ("month", 30), ("year", 365))


def _parse_joined_date(joined_str):
    """Turn "Joined Apr 26, 2021" into "2021-04-26"; return the raw text if it does not parse."""
    raw_date = joined_str.replace("Joined ", "").strip()
    try:
        return datetime.datetime.strptime(raw_date, "%b %d, %Y").strftime("%Y-%m-%d")
    except ValueError:
        return raw_date


def _parse_days_since_visited(visited_str):
    """Turn "Visited group 2 days ago" into a day count; return None if unrecognised."""
    remainder = visited_str.replace("Visited group", "").strip().lower()
    if remainder == "today":
        return 0
    if remainder == "yesterday":
        return 1

    # Expected shape: ["2", "days", "ago"]
    parts = remainder.split()
    if len(parts) < 3:
        return None
    try:
        count = int(parts[0])
    except ValueError:
        return None

    for unit_prefix, days in _DAYS_PER_UNIT:
        if parts[1].startswith(unit_prefix):
            return count * days
    return None


def _friendly_first_name(name):
    """Return the first word of ``name``, cut at its first hyphen ("Ana-Maria Lopez" -> "Ana")."""
    first_word = next((part.strip() for part in name.split(' ') if part.strip()), "")
    first_piece = next((part.strip() for part in first_word.split('-') if part.strip()), "")
    # Fall back step by step so a hyphen-only or blank name cannot raise.
    return first_piece or first_word or name


def _member_id_from_link(link):
    """Return the last non-empty path segment of a profile link, or None if there is none."""
    segments = [segment for segment in link.split('/') if segment.strip()]
    return segments[-1] if segments else None


def _extract_members(obj, message_builder):
    """Recursively collect member records from nested ``{"content": [...]}`` structures."""
    if isinstance(obj, dict):
        if "content" in obj:
            return _extract_members(obj["content"], message_builder)
        return []
    if not isinstance(obj, list):
        return []

    head = obj[0] if obj else None
    if not (isinstance(head, dict) and "text" in head and "link" in head):
        # Not a member entry: keep searching in every sub-item.
        found = []
        for sub_item in obj:
            found.extend(_extract_members(sub_item, message_builder))
        return found

    # A list that starts with {"text": ..., "link": ...} describes one member.
    name = head["text"]
    link = head["link"]
    joined_date = None
    days_since_visited = None
    for item in obj[1:]:
        if not isinstance(item, str):
            continue
        if item.startswith("Joined "):
            joined_date = _parse_joined_date(item)
        elif item.startswith("Visited group"):
            days_since_visited = _parse_days_since_visited(item)

    member_id = _member_id_from_link(link)
    if member_id is None:
        # Without an id there is no conversation to open; keep the row anyway.
        message_link = None
    else:
        # Percent-encode the whole name so spaces, "&", "#" etc. cannot break the query string.
        encoded_name = quote(name, safe="")
        message_link = (
            "https://www.meetup.com/messages/?new_convo=true"
            f"&member_id={member_id}&name={encoded_name}"
        )

    ab, message = message_builder(member_name=_friendly_first_name(name))
    return [{
        "message_link": message_link,
        "message": message,
        "name": name,
        "link": link,
        "joined_date": joined_date,
        "days_since_last_visited": days_since_visited,
        "member_id": member_id,
        "ab": ab,
    }]


def parse_meetup_list(data, message_builder=None):
    """Extract one outreach record per member from parsed member-list data.

    ``data`` is expected to be a list of nested structures such as::

        [
          {
            "content": [
              {
                "content": [
                  { "text": "John", "link": "https://..." },
                  "Member",
                  "Joined Jan 1, 2020",
                  "Visited group 2 days ago"
                ]
              },
              "icon"
            ]
          },
          ...
        ]

    Any list that starts with a ``{"text": ..., "link": ...}`` dict is treated
    as one member. A single dict is accepted as well and handled like a
    one-item list; ``None`` yields an empty result.

    Args:
        data: Parsed member-list structure (list, dict or None).
        message_builder: Callable invoked as ``message_builder(member_name=...)``
            that returns ``(variant_key, message)``. Defaults to the
            module-level ``get_message``.

    Returns:
        A list of dicts with the keys:
          - message_link: URL that opens a new Meetup conversation with the
            member (None when no member id could be read from the link)
          - message: text returned by ``message_builder``
          - name: member name as found in the data
          - link: member profile link
          - joined_date: "YYYY-MM-DD", or the raw text if parsing fails, or None
          - days_since_last_visited: integer number of days, or None
          - member_id: last path segment of the profile link, or None
          - ab: variant key returned by ``message_builder``
    """
    if message_builder is None:
        # Resolved at call time so the default follows the current get_message.
        message_builder = get_message

    if data is None:
        return []
    if isinstance(data, dict):
        # Iterating a dict would only visit its keys and silently find nothing.
        data = [data]

    all_members = []
    for top_level_item in data:
        all_members.extend(_extract_members(top_level_item, message_builder))

    return all_members

In [21]:
# Build one outreach record per member. The raw input is deliberately NOT
# overwritten: reassigning relevant_members would make a second run of this cell
# parse already-parsed records and silently return an empty list.
new_members = parse_meetup_list(relevant_members)

In [22]:
# Export one row per member record, without the DataFrame index column
members_frame = pd.DataFrame(new_members)
members_frame.to_csv('parsed_messages.csv', index=False)