In [20]:
## Notes for the Apple M1 chip; PyTorch install guide: https://pytorch.org/get-started/locally/
## Related sentence-transformers GitHub issue: https://github.com/UKPLab/sentence-transformers/issues/1736

import torch
# Build a random 5x3 tensor and print it (presumably a quick check that the
# PyTorch install works on this machine).
x = torch.rand(5, 3)
print(x)


tensor([[0.6332, 0.1310, 0.6753],
        [0.9108, 0.8022, 0.8582],
        [0.9889, 0.3496, 0.3971],
        [0.8231, 0.8841, 0.6034],
        [0.3201, 0.3174, 0.1249]])


### Step 1: Import necessary libraries


In [21]:
import pandas as pd
import requests
import hopsworks
import exclude.key

In [22]:
from sentence_transformers import SentenceTransformer

### Step 2: Define Hopsworks and Google API keys and connect to Hopsworks

In [23]:

# --- API keys, read from the local `exclude.key` module ---
HOPSWORKS_API_KEY = exclude.key.HOPSWORKS_API_KEY
GOOGLE_API_KEY = exclude.key.GOOGLE_API_KEY

# --- Feature group identifiers: venue foot-traffic data ---
FEATURE_GROUP_NAME = "bars_near_london_bridge"
FEATURE_GROUP_VERSION = 3

# --- Feature group identifiers: venue description embeddings ---
FEATURE_GROUP_EMBEDDING_NAME = "venue_description_embeddings"
FEATURE_GROUP_EMBEDDING_VERSION = 2

# --- Log in to Hopsworks and grab the project's feature store handle ---
project = hopsworks.login(api_key_value=HOPSWORKS_API_KEY)
fs = project.get_feature_store()

Connection closed.
Connected. Call `.close()` to terminate connection gracefully.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/25749
Connected. Call `.close()` to terminate connection gracefully.


### Step 3: Retrieve Feature Group


In [41]:
def get_or_create_feature_group():
    """Return the venue foot-traffic feature group, creating it if the lookup fails.

    Looks up ``FEATURE_GROUP_NAME`` at ``FEATURE_GROUP_VERSION`` in the
    module-level feature store ``fs``. If the lookup raises an ``Exception``
    or yields ``None``, a new online-enabled feature group with the same name
    and version is created instead.

    Returns:
        The object returned by ``fs.get_feature_group`` or, on fallback,
        by ``fs.create_feature_group``.
    """
    try:
        fg = fs.get_feature_group(name=FEATURE_GROUP_NAME, version=FEATURE_GROUP_VERSION)
    except Exception:
        # `Exception` (not a bare `except:`) so that KeyboardInterrupt and
        # SystemExit still propagate instead of triggering a create call.
        fg = None

    # Treat a `None` lookup result the same as a failed lookup, so callers
    # never receive `None` and crash later on `.read()`.
    if fg is None:
        fg = fs.create_feature_group(
            name=FEATURE_GROUP_NAME,
            version=FEATURE_GROUP_VERSION,
            description="Foot traffic data for bars near London Bridge",
            primary_key=['venue_name', 'day', 'hour'],
            event_time='last_updated',
            online_enabled=True
        )
    return fg

# Fetch (or create) the venue feature group and read its contents into a DataFrame.
feature_group_venues = get_or_create_feature_group()
df = feature_group_venues.read()

# Display the DataFrame
print(df)



Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (4.61s) 
                            venue_name  \
0                           Nine Lives   
1      The Hung Drawn & Quartered, EC3   
2                El Vino London Bridge   
3               The Bridge Tap, London   
4                El Vino London Bridge   
...                                ...   
15115                 The Globe Tavern   
15116             Quarter Bar & Lounge   
15117                      The Mudlark   
15118                       Anchor Tap   
15119                          TwoRuba   

                                           venue_address        day  hour  \
0          8 Holyrood St, London SE1 2EL, United Kingdom   Saturday    14   
1      26-27 Great Tower St, London EC3R 5AQ, United ...   Saturday     7   
2      5a More London Pl, London SE1 2BY, United Kingdom    Tuesday    23   
3      32 Borough High St, London SE1 1XU, United Kin...    Tuesday     6   
4      5a More London P

### Step 4: Fetch Descriptions and Handle NaNs

In [42]:
def get_place_id(venue_name, api_key=GOOGLE_API_KEY, timeout=10):
    """Look up the Google Places ``place_id`` of the first match for a venue name.

    Sends a text query to the ``findplacefromtext`` endpoint, requesting only
    the ``place_id`` field.

    Args:
        venue_name: Free-text venue name used as the query input.
        api_key: Google API key sent as the ``key`` query parameter.
        timeout: Seconds to wait for the HTTP response before ``requests``
            raises a timeout error, so a stalled connection cannot hang the
            notebook indefinitely.

    Returns:
        The ``place_id`` of the first candidate, or ``None`` when the response
        carries no candidates.
    """
    endpoint = 'https://maps.googleapis.com/maps/api/place/findplacefromtext/json'
    params = {
        'input': venue_name,
        'inputtype': 'textquery',
        'fields': 'place_id',
        'key': api_key
    }
    response = requests.get(endpoint, params=params, timeout=timeout)
    place_data = response.json()
    # Use .get so a response without a 'candidates' key is treated as
    # "no match" instead of raising KeyError.
    candidates = place_data.get('candidates') or []
    if candidates:
        return candidates[0].get('place_id')
    return None

def get_place_details(place_id, api_key=GOOGLE_API_KEY, timeout=10):
    """Fetch details for one place from the Places API ``v1/places/{place_id}`` endpoint.

    The field mask limits the response to ``displayName``,
    ``formattedAddress``, ``editorialSummary`` and ``reviews``.

    Args:
        place_id: Identifier interpolated into the endpoint URL.
        api_key: Google API key sent in the ``X-Goog-Api-Key`` header.
        timeout: Seconds to wait for the HTTP response before ``requests``
            raises a timeout error, so a stalled connection cannot hang the
            notebook indefinitely.

    Returns:
        The decoded JSON body. The HTTP status is deliberately not checked
        here: ``get_detailed_venues_info`` inspects the body for an
        ``'error'`` key to skip failed lookups.
    """
    endpoint = f'https://places.googleapis.com/v1/places/{place_id}'
    headers = {
        'Content-Type': 'application/json',
        'X-Goog-Api-Key': api_key,
        'X-Goog-FieldMask': 'displayName,formattedAddress,editorialSummary,reviews'
    }
    response = requests.get(endpoint, headers=headers, timeout=timeout)
    return response.json()

def get_detailed_venues_info(venue_names):
    """Collect place details for every venue name that resolves successfully.

    For each name, the place id is resolved via ``get_place_id`` and the
    details are fetched via ``get_place_details``. Names without a place id,
    and detail responses containing an ``'error'`` key, are skipped.

    Args:
        venue_names: Iterable of venue names to look up.

    Returns:
        A list of detail dicts, in the order the names were processed.
    """
    collected = []
    for venue in venue_names:
        pid = get_place_id(venue)
        if not pid:
            # No match for this name; nothing to fetch.
            continue
        info = get_place_details(pid)
        if 'error' in info:
            # The details call reported a failure; drop this venue.
            continue
        collected.append(info)
    return collected

# Query the Places API once per distinct venue name rather than once per row.
venue_names = df['venue_name'].unique()
detailed_venues_info = get_detailed_venues_info(venue_names)

# Convert detailed venue info to DataFrame
detailed_venues_df = pd.DataFrame(detailed_venues_info)
# Keep only venues that have all three of displayName, editorialSummary and reviews.
# NOTE(review): because rows lacking editorialSummary or reviews are dropped here,
# the review/displayName fallbacks in extract_description look largely unreachable
# for missing values - confirm that discarding those venues is intended.
detailed_venues_df = detailed_venues_df.dropna(subset=['displayName', 'editorialSummary', 'reviews'])

# Print the cleaned DataFrame to debug
print(detailed_venues_df)


                                     formattedAddress  \
0                   8 Holyrood St, London SE1 2EL, UK   
1           26-27 Great Tower St, London EC3R 5AQ, UK   
2               5a More London Pl, London SE1 2BY, UK   
3              32 Borough High St, London SE1 1XU, UK   
4         206-208 Tower Bridge Rd, London SE1 2LL, UK   
6                      4 Hay's Ln, London SE1 2HB, UK   
7                  1 Duke St Hill, London SE1 2SW, UK   
8                 61 Royal Mint St, London E1 8LG, UK   
9              75 Borough High St, London SE1 1NH, UK   
11               10 Bermondsey St, London SE1 2ER, UK   
12                    Montague Cl, London SE1 9DA, UK   
13  Arch, 35B, 85B Southwark Bridge Rd, London SE1...   
14  Greater, Unit 26, Hay’s Galleria, London SE1 2...   
15             10-20 Redcross Way, London SE1 1TA, UK   
16                 2, 4 Tooley St, London SE1 2SY, UK   
18          8-18 London Bridge St, London SE1 9SG, UK   
21               King's Head Ya

### Step 5: Create Embeddings


In [44]:
def extract_description(row):
    """Pick the best available description text for one venue row.

    The row is converted with ``row.to_dict()`` and the first usable source
    is returned, in this order of preference:

    1. ``editorialSummary['text']``
    2. ``reviews[0]['text']['text']``
    3. ``displayName['text']``

    Args:
        row: Object exposing ``to_dict()`` (e.g. a DataFrame row).

    Returns:
        The selected text, or ``'No description available'`` when none of
        the sources has the expected structure.
    """
    fallback = 'No description available'
    record = row.to_dict()
    if not isinstance(record, dict):
        return fallback

    # 1) Editorial summary.
    summary = record.get('editorialSummary')
    if isinstance(summary, dict) and 'text' in summary:
        return summary['text']

    # 2) Text of the first review, which is itself nested under 'text'.
    reviews = record.get('reviews')
    if isinstance(reviews, list) and len(reviews) > 0:
        first_review = reviews[0]
        if isinstance(first_review, dict):
            review_text = first_review.get('text')
            if isinstance(review_text, dict) and 'text' in review_text:
                return review_text['text']

    # 3) Display name.
    display_name = record.get('displayName')
    if isinstance(display_name, dict) and 'text' in display_name:
        return display_name['text']

    return fallback

# Apply the extraction function row by row (axis=1) to add a 'description' column
detailed_venues_df['description'] = detailed_venues_df.apply(extract_description, axis=1)

# Initialize the SentenceTransformer model and move it to the CPU.
# NOTE(review): the load log reports device `mps`; the explicit `.to('cpu')` is
# presumably the workaround for the Apple Silicon issue linked at the top of the
# notebook - confirm.
model = SentenceTransformer('all-MiniLM-L6-v2').to('cpu')

def create_embeddings(df, model):
    """Encode the 'description' column and store the vectors on the same frame.

    Args:
        df: DataFrame with a 'description' column. It is modified in place:
            an 'embeddings' column is added (or overwritten).
        model: Object whose ``encode(list_of_texts)`` result supports
            ``.tolist()``.

    Returns:
        The same ``df`` object that was passed in.
    """
    texts = df['description'].tolist()
    vectors = model.encode(texts)
    # Store plain Python lists, one per row.
    df['embeddings'] = vectors.tolist()
    return df

# Create embeddings for the descriptions. create_embeddings adds the
# 'embeddings' column to the frame it is given and returns that same frame.
detailed_venues_df = create_embeddings(detailed_venues_df, model)
print(detailed_venues_df)


2024-06-12 13:54:49,681 INFO: Use pytorch device_name: mps
2024-06-12 13:54:49,683 INFO: Load pretrained SentenceTransformer: all-MiniLM-L6-v2



Batches:   0%|          | 0/1 [00:00<?, ?it/s]

                                     formattedAddress  \
0                   8 Holyrood St, London SE1 2EL, UK   
1           26-27 Great Tower St, London EC3R 5AQ, UK   
2               5a More London Pl, London SE1 2BY, UK   
3              32 Borough High St, London SE1 1XU, UK   
4         206-208 Tower Bridge Rd, London SE1 2LL, UK   
6                      4 Hay's Ln, London SE1 2HB, UK   
7                  1 Duke St Hill, London SE1 2SW, UK   
8                 61 Royal Mint St, London E1 8LG, UK   
9              75 Borough High St, London SE1 1NH, UK   
11               10 Bermondsey St, London SE1 2ER, UK   
12                    Montague Cl, London SE1 9DA, UK   
13  Arch, 35B, 85B Southwark Bridge Rd, London SE1...   
14  Greater, Unit 26, Hay’s Galleria, London SE1 2...   
15             10-20 Redcross Way, London SE1 1TA, UK   
16                 2, 4 Tooley St, London SE1 2SY, UK   
18          8-18 London Bridge St, London SE1 9SG, UK   
21               King's Head Ya

### Step 6: Insert Data into Feature Group


In [47]:
# Add an 'id' column (used as the feature group's primary key) from the frame's index.
# NOTE(review): the printed frame shows gaps in the index (e.g. 5 and 10 are
# missing after the dropna), so ids are not contiguous and presumably depend on
# the order of API results - confirm they are stable enough for a primary key.
detailed_venues_df['id'] = detailed_venues_df.index

def get_or_create_description_feature_group():
    """Return the venue-description embeddings feature group, creating it if the lookup fails.

    Looks up ``FEATURE_GROUP_EMBEDDING_NAME`` at
    ``FEATURE_GROUP_EMBEDDING_VERSION`` in the module-level feature store
    ``fs``. If the lookup raises an ``Exception`` or yields ``None``, a new
    online-enabled feature group keyed on ``id`` is created instead.

    Returns:
        The object returned by ``fs.get_feature_group`` or, on fallback,
        by ``fs.create_feature_group``.
    """
    try:
        fg = fs.get_feature_group(name=FEATURE_GROUP_EMBEDDING_NAME, version=FEATURE_GROUP_EMBEDDING_VERSION)
    except Exception:
        # `Exception` (not a bare `except:`) so that KeyboardInterrupt and
        # SystemExit still propagate instead of triggering a create call.
        fg = None

    # Treat a `None` lookup result the same as a failed lookup, so callers
    # never receive `None` and crash later on `.insert()` / `.read()`.
    if fg is None:
        fg = fs.create_feature_group(
            name=FEATURE_GROUP_EMBEDDING_NAME,
            version=FEATURE_GROUP_EMBEDDING_VERSION,
            description="Embeddings for venue descriptions",
            primary_key=['id'],
            online_enabled=True
        )
    return fg

# Fetch (or create) the embeddings feature group.
description_feature_group = get_or_create_description_feature_group()

# Insert data into the feature group
description_feature_group.insert(detailed_venues_df)

# Read back the feature group to ensure data insertion
# NOTE(review): the log shows an offline materialization job being launched by
# the insert; whether insert() blocks until that job finishes is not visible
# here, so this read may not yet include the rows just written - confirm.
feature_group_desc = get_or_create_description_feature_group()
df_desc = feature_group_desc.read()

print(df_desc)


Uploading Dataframe: 0.00% |          | Rows 0/27 | Elapsed Time: 00:00 | Remaining Time: ?

Launching job: venue_description_embeddings_2_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai/p/25749/jobs/named/venue_description_embeddings_2_offline_fg_materialization/executions
Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (0.94s) 
                                     formattedaddress  \
0           26-27 Great Tower St, London EC3R 5AQ, UK   
1                King's Head Yard, London SE1 1NA, UK   
2                   1-3 Tooley St, London SE1 2PF, UK   
3                     Montague Cl, London SE1 9DA, UK   
4                      4 Hay's Ln, London SE1 2HB, UK   
5           8-18 London Bridge St, London SE1 9SG, UK   
6                  1 Duke St Hill, London SE1 2SW, UK   
7               5a More London Pl, London SE1 2BY, UK   
8                     8 Bedale St, London SE1 9AL, UK   
9           171-173 Bermondsey St, London SE1 3UW, UK   
10  Arch, 35B, 85B Southwark Bridge Rd, L