In [14]:
from bs4 import BeautifulSoup
import requests
import openai
import os

# Configure the OpenAI client from the environment rather than a literal key.
openai.api_key = os.getenv("OPENAI_API_KEY")

# Retrieve the HTML content of the event listing page.
url = 'https://sf.funcheap.com/events/san-francisco/'
# A timeout keeps the cell from hanging indefinitely on a stalled connection,
# and raise_for_status() stops an HTTP error page from being parsed as if it
# were the listing.
response = requests.get(url, timeout=30)
response.raise_for_status()
html = response.content

# Parse the HTML content using BeautifulSoup
soup = BeautifulSoup(html, 'lxml')

# Find the first table element; everything below is extracted from it.
table = soup.find('table')
if table is None:
    # soup.find returns None when nothing matches; fail with a clear message
    # instead of an AttributeError on the next line.
    raise RuntimeError(f"No <table> element found at {url}")

# Find all the links within the table
links = table.find_all('a')

# hrefs and headlines are parallel lists: entry i of each comes from the same
# anchor. link.get('href') is None for anchors that have no href attribute.
hrefs = [link.get('href') for link in links]
headlines = [link.text for link in links]

In [18]:
def generate_prompt(event):
    """Build the prompt that asks the model to turn event text into JSON.

    The prompt names the expected keys (venue, date, age, price, location),
    asks for double-quoted keys and values, and ends with the event text.

    Args:
        event: Text of the event to extract from; must be a str.

    Returns:
        The full prompt string.
    """
    schema = "{venue: venue_name, date: date_and_time, age: age_requirement, price: price, location: address}"
    instruction = (
        "Extract information from the event below and store it into a JSON "
        "with the following structure " + schema
        + " and enclose each key and value in the JSON structure with double quotes."
    )
    return instruction + "\n" + "Event: " + event

def generate_desc(event):
    """Build the prompt that asks the model for a short event summary.

    Args:
        event: Event description text to summarize; must be a str.

    Returns:
        A two-line prompt: the instruction, then the labelled description.
    """
    prompt_lines = (
        'Summarize the following event description below in a sentence or two to find the most important aspects that explain what the event is about.',
        'Event Description: ' + event,
    )
    return '\n'.join(prompt_lines)

In [19]:
import json 
import pandas as pd
import time

# Collect one dict per event and build the DataFrame once at the end:
# DataFrame.append was removed in pandas 2.0 and re-copied the whole frame on
# every call.
event_records = []

try:
    for link, headline in zip(hrefs, headlines):
        headline = headline.strip()
        # Skip anchors with no target or no visible text.
        if link is None or len(headline) == 0:
            continue

        response = requests.get(link)
        html = response.content
        soup = BeautifulSoup(html, 'html.parser')

        # soup.find returns None when the element is absent; skip such pages
        # instead of crashing on .get_text().
        title_tag = soup.find('h1')
        stats_div = soup.find('div', {'id': 'stats'})
        if title_tag is None or stats_div is None:
            print(f"Skipping {link}: missing <h1> or #stats element")
            continue
        event_name = title_tag.get_text()
        text = stats_div.get_text()

        # First completion: extract the structured fields as JSON.
        response = openai.Completion.create(
            model="text-davinci-003",
            prompt=generate_prompt(text),
            temperature=0.0,
            max_tokens=200,
        )

        response_text = response['choices'][0]['text'].strip()
        try:
            response_dict = json.loads(response_text)
        except ValueError as e:  # json.JSONDecodeError subclasses ValueError
            print(str(e))
            print(response_text)
            # Previously execution fell through here and reused an undefined
            # or stale response_dict; skip the event instead.
            continue
        if not isinstance(response_dict, dict):
            # Valid JSON that is not an object cannot take the extra keys below.
            print(response_text)
            continue

        # The description prompt is fed the text of every <p> on the page.
        description = ''.join(p.get_text() for p in soup.find_all('p'))

        # Pause between the two completion requests.
        time.sleep(2)
        # NOTE(review): timeout=0 is kept from the original call; confirm how
        # the installed openai client interprets it.
        event_response = openai.Completion.create(
            model="text-curie-001",
            prompt=generate_desc(description),
            temperature=0.3,
            max_tokens=500,
            timeout=0
        )

        event_text = event_response['choices'][0]['text'].strip()
        response_dict['event_description'] = event_text
        response_dict['event_name'] = event_name

        event_records.append(response_dict)
finally:
    # Build the frame even if a request raised part-way through, so the
    # events processed so far are still available in sf_events.
    sf_events = pd.DataFrame(event_records)
    
    


Timeout: Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600)

In [34]:
from opencage.geocoder import OpenCageGeocode
from shapely.geometry import Point

# Read the OpenCage key from the environment (the same way OPENAI_API_KEY is
# loaded) so the credential is not stored in the notebook.
key = os.environ.get("OPENCAGE_API_KEY")
if not key:
    raise RuntimeError("Set the OPENCAGE_API_KEY environment variable before geocoding.")
geocoder = OpenCageGeocode(key)

# One entry per row of sf_events, in row order; None marks rows that could
# not be geocoded so the list stays aligned with the frame.
locations = []

for address in sf_events['location']:
    # Rows whose extracted JSON had no usable location cannot be split below.
    if not isinstance(address, str) or not address.strip():
        locations.append(None)
        continue

    # Keep only the text before the first "CA" and re-append " CA".
    query = address.split("CA", 1)[0].strip() + " CA"
    results = geocoder.geocode(query)
    if not results:
        # No match returned for this query.
        locations.append(None)
        continue

    geometry = results[0]['geometry']
    # NOTE(review): the point is built as (lat, lng), as in the original;
    # confirm downstream consumers expect that order rather than (lng, lat).
    loc = Point(geometry['lat'], geometry['lng'])
    locations.append(loc)
    


In [22]:
# Store the geocoded points as a new column; `locations` must hold exactly one
# entry per row of sf_events, in row order.
sf_events['geolocation'] = locations

  arr = construct_1d_object_array_from_listlike(values)


## Get Embeddings

In [23]:
import pandas as pd
import tiktoken
from openai.embeddings_utils import get_embedding
import os
import openai

# The attribute is `api_key` (as set at the top of the notebook); assigning to
# `apikey` only created an unused attribute on the module.
openai.api_key = os.getenv("OPENAI_API_KEY")
# embedding model parameters
embedding_model = "text-embedding-ada-002"
embedding_encoding = "cl100k_base"  # this is the encoding for text-embedding-ada-002
max_tokens = 5000  # the maximum for text-embedding-ada-002 is 8191

In [24]:
def find_venue_type(venue):
    """Build the prompt asking the model for a short description of a venue.

    Args:
        venue: Venue name; must be a str.

    Returns:
        A two-line prompt: the instruction, then the labelled venue name.
    """
    instruction = 'Provide a short description of the following venue in SF? Utilize the venues website or online search to find the answer.'
    venue_line = 'Venue: ' + venue
    return '\n'.join((instruction, venue_line))

def find_neighborhood(venue):
    """Build the prompt asking the model which SF neighborhood a venue is in.

    Args:
        venue: Venue name; must be a str.

    Returns:
        The instruction followed by a newline and the labelled venue name.
    """
    header = "Given the following venue, provide the SF neighborhood that this venue is located in."
    venue_line = 'Venue: ' + venue
    return header + '\n' + venue_line

In [25]:
# Cache of venue name -> generated description, so each distinct venue is
# sent to the model only once.
venue_description = {}
# Column values in row order; assigned in one step after the loop instead of
# growing the column cell by cell with .loc.
venue_description_column = []

for venue in sf_events['venue']:
    # A missing or non-text venue cannot be concatenated into the prompt;
    # record an empty description rather than raising TypeError.
    if not isinstance(venue, str) or not venue.strip():
        venue_description_column.append('')
        continue

    if venue not in venue_description:
        response = openai.Completion.create(
            model="text-curie-001",
            prompt=find_venue_type(venue),
            temperature=0.3,
            max_tokens=100,
        )
        venue_description[venue] = response['choices'][0]['text'].strip()

    venue_description_column.append(venue_description[venue])

sf_events['venue_description'] = venue_description_column

In [26]:
# Cache of venue name -> neighborhood, so each distinct venue is sent to the
# model only once.
# NOTE(review): this rebinds `venue_description`, discarding the description
# cache built in the previous cell; the name is kept so any later code that
# reads it still sees the neighborhood mapping.
venue_description = {}
# Column values in row order; assigned in one step after the loop instead of
# growing the column cell by cell with .loc.
neighborhood_column = []

for venue in sf_events['venue']:
    # A missing or non-text venue cannot be concatenated into the prompt;
    # record an empty neighborhood rather than raising TypeError.
    if not isinstance(venue, str) or not venue.strip():
        neighborhood_column.append('')
        continue

    if venue not in venue_description:
        response = openai.Completion.create(
            model="text-curie-001",
            prompt=find_neighborhood(venue),
            temperature=0.0,
            max_tokens=50,
        )
        venue_description[venue] = response['choices'][0]['text'].strip()

    neighborhood_column.append(venue_description[venue])

sf_events['neighborhood'] = neighborhood_column

In [27]:
# Work on a stringified copy so the .str accessor is valid for every column.
fc_df_embeddings = sf_events.applymap(str)
encoding = tiktoken.get_encoding(embedding_encoding)

# (label, column) pairs in the order they appear in the combined text.
labelled_columns = [
    ("Venue: ", "venue"),
    ("; Venue Description: ", "venue_description"),
    ("; Event: ", "event_name"),
    ("; Event Description: ", "event_description"),
    ("; Price: ", "price"),
    ("; Neighborhood: ", "neighborhood"),
]

# Concatenate label + stripped value for each pair, left to right.
first_label, first_column = labelled_columns[0]
combined_text = first_label + fc_df_embeddings[first_column].str.strip()
for label, column in labelled_columns[1:]:
    combined_text = combined_text + label + fc_df_embeddings[column].str.strip()
sf_events["combined"] = combined_text

# Token count of each combined string under the selected encoding.
sf_events["n_tokens"] = sf_events["combined"].apply(
    lambda combined: len(encoding.encode(combined))
)

In [28]:
def _embed_text(text):
    """Return get_embedding's result for `text` using `embedding_model`."""
    return get_embedding(text, engine=embedding_model)


# One get_embedding call per row of the combined text column.
sf_events["embedding"] = sf_events["combined"].apply(_embed_text)

In [36]:
# Write sf_events to a CSV file; the path is relative to the working directory.
output_csv = "funcheap_with_embeddings.csv"
sf_events.to_csv(output_csv)