In [2]:
# Install pandas, which the data-loading and data-preparation cells below import as `pd`.
!pip install pandas



In [3]:
import pandas as pd


# Load the raw TMDB export from the working directory and preview the first
# rows so the available columns can be inspected before processing.
df = pd.read_csv('tmdb.csv')
df.head()

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500
2,245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.sonypictures.com/movies/spectre/,206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...","[{""iso_3166_1"": ""GB"", ""name"": ""United Kingdom""...",2015-10-26,880674609,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466
3,250000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",http://www.thedarkknightrises.com/,49026,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,"[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-07-16,1084939099,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106
4,260000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://movies.disney.com/john-carter,49529,"[{""id"": 818, ""name"": ""based on novel""}, {""id"":...",en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}]","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-03-07,284139100,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124


In [4]:
import pandas as pd
import json


def collapse_genres(j):
  """
  Flatten a JSON-encoded genre list into a space-separated string of names.

  Args:
    j: JSON text encoding a list of objects that each carry a "name" key, e.g.
       '[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}]'.
       A value that is not str/bytes (such as the float NaN pandas uses for an
       empty CSV cell) or that is blank is treated as "no genres".

  Returns:
    str: The genre names sorted alphabetically and joined with single spaces,
    or "" when there are no usable names.

  Raises:
    json.JSONDecodeError: If `j` is non-blank text that is not valid JSON.
  """
  # pandas hands over NaN (a float) for missing cells; json.loads would raise
  # TypeError on it, so bail out before parsing.
  if not isinstance(j, (str, bytes, bytearray)) or not j.strip():
    return ""

  # `or []` covers a JSON `null` payload.
  entries = json.loads(j) or []

  # Skip entries without a usable name: a None in the list would make both
  # sorted() and " ".join() fail.
  names = [
    entry["name"]
    for entry in entries
    if isinstance(entry, dict) and isinstance(entry.get("name"), str) and entry["name"]
  ]
  return " ".join(sorted(names))


def combine_features(row):
  """
  Build the searchable text for one movie: its overview followed by its genres.

  Args:
    row: A mapping-like record (for example a pandas Series) holding string
         values under 'overview' and 'genres_name'.

  Returns:
    str | None: "<overview> <genres_name>". When a key is missing or the values
    cannot be concatenated, the problem and the row are printed and None is
    returned, so one bad record does not abort a whole DataFrame.apply pass.
  """
  try:
    return row['overview'] + " " + row["genres_name"]
  except Exception as err:
    # Catch Exception rather than using a bare `except:` so KeyboardInterrupt
    # and SystemExit still propagate and a long run can be interrupted.
    print("Error:", f"{type(err).__name__}: {err}", row)
    return None


def process_tmdb_csv(input_file, output_file):
  """
  Convert a TMDB movies CSV file into a line-delimited JSON feed file.

  Each output line is one JSON object with two keys:
    - `put`:    the document id string "id:hybrid-search:doc::<id>".
    - `fields`: an object with `doc_id`, `title` and `text`.

  Args:
    input_file (str): Path to the input CSV. It must contain the columns
                      'id', 'original_title', 'overview' and 'genres', where
                      'genres' holds a JSON list of objects with a "name" key.
    output_file (str): Path of the JSON-lines file to write.

  Workflow:
    1. Read the CSV into a DataFrame.
    2. Replace missing 'genres' cells with an empty JSON list, then derive
       'genres_name' from 'genres' with `collapse_genres`.
    3. Fill missing 'original_title', 'overview' and 'genres_name' values with
       empty strings.
    4. Build the 'text' column per row with `combine_features`.
    5. Keep 'id', 'original_title' and 'text', renamed to 'doc_id', 'title'
       and 'text'.
    6. Assemble the 'put' and 'fields' columns, print a preview, and write the
       result with `orient='records'` and `lines=True`.

  Returns:
    None. The result is written to `output_file`; the first rows are printed.

  Notes:
    - Relies on the helpers `collapse_genres` and `combine_features` being
      defined in the same namespace.

  Example Usage:
    >>> process_tmdb_csv("tmdb_movies.csv", "output_vespa.json")
  """
  movies = pd.read_csv(input_file)

  # An empty 'genres' cell is read as NaN, which is not parseable JSON; swap in
  # an empty list first so genre extraction cannot crash on missing data.
  movies['genres_name'] = movies['genres'].fillna('[]').apply(collapse_genres)
  for f in ['original_title', 'overview', 'genres_name']:
    movies[f] = movies[f].fillna('')

  movies["text"] = movies.apply(combine_features, axis=1)

  # rename() without inplace returns a new frame, so nothing below writes into
  # a slice of `movies` (the pattern behind SettingWithCopyWarning).
  docs = movies[['id', 'original_title', 'text']].rename(
    columns={'original_title': 'title', 'id': 'doc_id'}
  )

  # One record dict per movie becomes the nested 'fields' object; 'put' is the
  # document id derived from doc_id.
  df_result = pd.DataFrame({
    'put': [f"id:hybrid-search:doc::{doc_id}" for doc_id in docs['doc_id']],
    'fields': docs.to_dict(orient='records'),
  })

  print(df_result.head())
  df_result.to_json(output_file, orient='records', lines=True)


# Convert the raw TMDB CSV into clean_tmdb.jsonl, the line-delimited JSON file
# that the `vespa feed` cell further down ingests.
process_tmdb_csv("tmdb.csv", "clean_tmdb.jsonl")

                            put  \
0   id:hybrid-search:doc::19995   
1     id:hybrid-search:doc::285   
2  id:hybrid-search:doc::206647   
3   id:hybrid-search:doc::49026   
4   id:hybrid-search:doc::49529   

                                              fields  
0  {'doc_id': 19995, 'title': 'Avatar', 'text': '...  
1  {'doc_id': 285, 'title': 'Pirates of the Carib...  
2  {'doc_id': 206647, 'title': 'Spectre', 'text':...  
3  {'doc_id': 49026, 'title': 'The Dark Knight Ri...  
4  {'doc_id': 49529, 'title': 'John Carter', 'tex...  


In [5]:
# Install the Vespa CLI (provides the `vespa` command used in the next cells).
# --ignore-installed makes pip install it even if a copy is already present.
!pip3 install --ignore-installed vespacli

Collecting vespacli
  Downloading vespacli-8.391.23-py3-none-any.whl.metadata (15 kB)
Downloading vespacli-8.391.23-py3-none-any.whl (47.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.7/47.7 MB[0m [31m57.5 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: vespacli
Successfully installed vespacli-8.391.23


In [6]:
# Point the Vespa CLI at a locally running Vespa instance.
!vespa config set target local

In [7]:
# Block for up to 300 seconds until the local deploy API is ready.
!vespa status deploy --wait 300

Waiting up to [36m5m0s[0m for deploy API...
Deploy API at [36mhttp://127.0.0.1:19071[0m is [32mready[0m


In [8]:
# Deploy the application package named `app` (presumably the ./app directory
# next to this notebook) and wait up to 300 seconds for it to converge.
!vespa deploy --wait 300 app

Waiting up to [36m5m0s[0m for deploy API...
Uploading application package... done;1m⣷[0;22m

[32mSuccess:[0m Deployed [36m'app'[0m with session ID [36m2[0m
Waiting up to [36m5m0s[0m for deployment to converge...
Waiting up to [36m5m0s[0m for cluster discovery...
Waiting up to [36m5m0s[0m for container default...


In [9]:
# Feed the line-delimited JSON written by process_tmdb_csv into the Vespa
# container listening on localhost:8080.
!vespa feed -t http://localhost:8080 clean_tmdb.jsonl


{
  "feeder.operation.count": 4803,
  "feeder.seconds": 12.956,
  "feeder.ok.count": 4803,
  "feeder.ok.rate": 370.720,
  "feeder.error.count": 0,
  "feeder.inflight.count": 0,
  "http.request.count": 4803,
  "http.request.bytes": 1670890,
  "http.request.MBps": 0.129,
  "http.exception.count": 0,
  "http.response.count": 4803,
  "http.response.bytes": 429878,
  "http.response.MBps": 0.033,
  "http.response.error.count": 0,
  "http.response.latency.millis.min": 13,
  "http.response.latency.millis.avg": 72,
  "http.response.latency.millis.max": 506,
  "http.response.code.counts": {
    "200": 4803
  }
}


In [10]:
# Install pyvespa, the Python client imported as `vespa` in the next cell.
!pip install pyvespa

Collecting pyvespa
  Downloading pyvespa-0.50.0-py3-none-any.whl.metadata (18 kB)
Collecting requests-toolbelt (from pyvespa)
  Downloading requests_toolbelt-1.0.0-py2.py3-none-any.whl.metadata (14 kB)
Collecting docker (from pyvespa)
  Downloading docker-7.1.0-py3-none-any.whl.metadata (3.8 kB)
Collecting aiohttp (from pyvespa)
  Downloading aiohttp-3.10.10-cp313-cp313-macosx_11_0_arm64.whl.metadata (7.6 kB)
Collecting tenacity>=8.4.1 (from pyvespa)
  Using cached tenacity-9.0.0-py3-none-any.whl.metadata (1.2 kB)
Collecting fastcore>=1.7.8 (from pyvespa)
  Downloading fastcore-1.7.19-py3-none-any.whl.metadata (3.5 kB)
Collecting aiohappyeyeballs>=2.3.0 (from aiohttp->pyvespa)
  Downloading aiohappyeyeballs-2.4.3-py3-none-any.whl.metadata (6.1 kB)
Collecting aiosignal>=1.1.2 (from aiohttp->pyvespa)
  Downloading aiosignal-1.3.1-py3-none-any.whl.metadata (4.0 kB)
Collecting frozenlist>=1.1.1 (from aiohttp->pyvespa)
  Downloading frozenlist-1.5.0-cp313-cp313-macosx_11_0_arm64.whl.metadat

In [11]:
#
# https://github.com/vespa-engine/sample-apps/blob/master/news/src/python/user_search.py
# https://docs.vespa.ai/en/tutorials/news-5-recommendation.html
#

# pip install pyvespa
import pandas as pd
from vespa.application import Vespa
from vespa.io import VespaResponse, VespaQueryResponse


def display_hits_as_df(response: VespaQueryResponse, fields) -> pd.DataFrame:
    """Tabulate selected fields of every hit in a query response.

    Args:
        response: Query response whose `hits` attribute is iterated; each hit
            is indexed as ``hit["fields"][<name>]``.
        fields: Names of the fields to extract; they become the column names.

    Returns:
        A DataFrame with one row per hit and one column per requested field.

    Raises:
        KeyError: If a hit lacks "fields" or one of the requested names.
    """
    rows = [
        {name: hit["fields"][name] for name in fields}
        for hit in response.hits
    ]
    return pd.DataFrame(rows)


def keyword_search(app, search_query):
    """Run a keyword query and return the matching documents as a DataFrame.

    Args:
        app: Client object exposing ``query(body)``.
        search_query: Free-text query string sent as the "query" parameter.

    Returns:
        DataFrame with the "doc_id" and "title" of the returned hits; the YQL
        limits the result to 5 hits and "ranking" is set to "bm25".
    """
    request_body = dict(
        yql="select * from sources * where userQuery() limit 5",
        query=search_query,
        ranking="bm25",
    )
    return display_hits_as_df(app.query(request_body), ["doc_id", "title"])


def semantic_search(app, query):
    """Run a nearest-neighbor query on the query text's embedding.

    Args:
        app: Client object exposing ``query(body)``.
        query: Text sent as the "query" parameter; the request asks the server
            to embed it via ``embed(@query)`` as query tensor ``e``.

    Returns:
        DataFrame with the "doc_id" and "title" of the returned hits; the YQL
        limits the result to 5 hits and "ranking" is set to "semantic".
    """
    yql = "select * from sources * where ({targetHits:100}nearestNeighbor(embedding,e)) limit 5"
    # Use a separate local name so the `query` argument is not rebound to the
    # request dictionary.
    request_body = {
        "yql": yql,
        "query": query,
        "ranking": "semantic",
        "input.query(e)": "embed(@query)",
    }
    hits = app.query(request_body)
    return display_hits_as_df(hits, ["doc_id", "title"])


def get_embedding(doc_id, vespa_app=None):
    """Fetch the first stored document whose doc_id matches, with its embedding.

    Args:
        doc_id: Value interpolated into the YQL ``doc_id contains '<doc_id>'``
            clause. It is inserted verbatim, so it must not contain a single
            quote.
        vespa_app: Optional client exposing ``query(body)``. When omitted, the
            module-level `app` is used, which keeps existing one-argument
            calls working.

    Returns:
        The first hit of the response (requesting the doc_id, title, text and
        embedding fields), or None when the query returns no hits.
    """
    # Prefer an explicitly passed client so the function does not depend on a
    # global that is only assigned further down the notebook.
    client = app if vespa_app is None else vespa_app
    request_body = {
        "yql" : f"select doc_id, title, text, embedding from content.doc where doc_id contains '{doc_id}'",
        "hits": 1
    }
    hits = client.query(request_body).hits
    return hits[0] if hits else None


def query_movies_by_embedding(embedding_vector, vespa_app=None):
    """Query for the documents nearest to a given embedding.

    Args:
        embedding_vector: Embedding to search with; it is sent as
            ``str(embedding_vector)`` under
            'ranking.features.query(user_embedding)'.
        vespa_app: Optional client exposing ``query(body)``. When omitted, the
            module-level `app` is used, which keeps existing one-argument
            calls working.

    Returns:
        Whatever ``query`` returns for a request with 5 hits, targetHits 5 and
        'ranking.profile' set to 'recommendation'.
    """
    # Prefer an explicitly passed client so the function does not depend on a
    # global that is only assigned further down the notebook.
    client = app if vespa_app is None else vespa_app
    request_body = {
        'hits': 5,
        'yql': 'select * from content.doc where ({targetHits:5}nearestNeighbor(embedding, user_embedding))',
        'ranking.features.query(user_embedding)': str(embedding_vector),
        'ranking.profile': 'recommendation'
    }
    return client.query(request_body)


# Replace with the host and port of your local Vespa instance
# `app` is also read as a module-level global by get_embedding and
# query_movies_by_embedding, so it must be assigned before they are called.
app = Vespa(url="http://localhost", port=8080)

# Movie title used as the query text for both searches below.
query = "Harry Potter and the Half-Blood Prince"

# Keyword search (ranking "bm25"): print doc_id and title of the hits.
df = keyword_search(app, query)
print(df.head())

# Semantic search (ranking "semantic") for the same text. `df` is reused, so
# the keyword results above are overwritten.
df = semantic_search(app, query)
print(df.head())

# Look up the stored document with doc_id "767", then search for the documents
# nearest to its embedding.
# NOTE(review): get_embedding returns None when nothing matches, in which case
# the subscript on the following line raises TypeError.
emb = get_embedding("767")
results = query_movies_by_embedding(emb["fields"]["embedding"])
df = display_hits_as_df(results, ["doc_id", "title", "text"])
print(df.head())

  doc_id                                     title
0    767    Harry Potter and the Half-Blood Prince
1    671  Harry Potter and the Philosopher's Stone
2    674       Harry Potter and the Goblet of Fire
3    673  Harry Potter and the Prisoner of Azkaban
4  13967                               Miss Potter
   doc_id                                      title
0     767     Harry Potter and the Half-Blood Prince
1     675  Harry Potter and the Order of the Phoenix
2     672    Harry Potter and the Chamber of Secrets
3     674        Harry Potter and the Goblet of Fire
4  168705                                 BloodRayne
  doc_id                                      title  \
0    767     Harry Potter and the Half-Blood Prince   
1    675  Harry Potter and the Order of the Phoenix   
2    672    Harry Potter and the Chamber of Secrets   
3    671   Harry Potter and the Philosopher's Stone   
4    674        Harry Potter and the Goblet of Fire   

                                             