In [1]:
from google.cloud import storage

# Handle to the Cloud Storage bucket that later cells download from.
storage_client = storage.Client()
bucket = storage_client.bucket(bucket_name='ask-haaretz')

# Blob handle for a top-level "metadata_movie.json" object.
# NOTE(review): `result` is rebound to a list by a later cell before anything
# visible reads this handle -- confirm whether it is still needed.
result = bucket.blob(blob_name="metadata_movie.json")

In [2]:
# Collect the names of all objects in the bucket whose name ends with
# "metadata_movie.json"; `temp` is the work list consumed by the loading cell.
blobs = storage_client.list_blobs('ask-haaretz')
temp = [
    blob.name
    for blob in blobs
    if blob.name.endswith("metadata_movie.json")
]

In [3]:
import pandas as pd
import json

# Build one record per article: the parsed metadata JSON plus the article text
# taken from the sibling "embedding_full_article.json" object.
result = []
for metadata_path in temp:
    # download_as_bytes() replaces the deprecated download_as_string() alias;
    # json.loads accepts the returned bytes directly.
    record = json.loads(bucket.blob(metadata_path).download_as_bytes())

    # The article text lives next to the metadata file, under a different file name.
    embedding_path = metadata_path.replace("metadata_movie.json", "embedding_full_article.json")
    embedding_doc = json.loads(bucket.blob(embedding_path).download_as_bytes())

    record["text"] = embedding_doc['text']
    result.append(record)

# One row per article; columns are the metadata keys plus "text".
metadata = pd.DataFrame(result)
    


In [4]:
# Prompt template asking a model to produce training/evaluation pairs:
# a free-form Hebrew user query (Part 1) and a JSON object with fields
# extracted only from that query (Part 2).
# NOTE(review): no cell visible in this notebook passes this template to a
# model -- confirm whether it is still needed.
prompt = """Your task is to generate pairs of data for training and evaluating a language model. Each pair should consist of two parts:

**Part 1: User Query (Free-Form)**
Create a realistic user query in natural, conversational Hebrew. This query should resemble how a user would actually ask for information about a movie or TV show.  These queries should implicitly contain information related to: genre, type (movie or TV), director, producer, actors, distribution platform (e.g., Netflix, cinemas), movie length (for movies), language, number of seasons (for TV shows), release year, and number of episodes per season (for TV shows).  The query should **not explicitly list these categories, but rather incorporate them naturally into the phrasing.  The query should be focused and reflect a specific information need.**

**Part 2: Structured Information (JSON Format)**
Based **ONLY** on the user query you generated in Part 1, extract the relevant information and present it in a structured JSON format.  **Crucially, the information in the JSON should be derived *solely* from the user query. Do not add information that is not explicitly or implicitly present in the user query.**  For example, if the user query only mentions "drama," the genre in the JSON should primarily reflect "drama" or closely related subgenres *implied* by the query, and should not add genres like "crime" or "black comedy" unless those are clearly hinted at in the user query itself.

The JSON should use the following keys to represent the extracted information:


{
  "user_query": "...",  // Copy the user query from Part 1
  "query": "...",       // **EXTRACTED FROM USER QUERY ONLY**
  "genre": "...",      // **EXTRACTED FROM USER QUERY ONLY**
  "type": "...",       // "movie" or "tv" **EXTRACTED FROM USER QUERY ONLY**
  "director": "...",   // **EXTRACTED FROM USER QUERY ONLY**
  "producer": "...",   // **EXTRACTED FROM USER QUERY ONLY**
  "actors": ["...", "..."], // Array of actors **EXTRACTED FROM USER QUERY ONLY**
  "distribution_platform": "...", // **EXTRACTED FROM USER QUERY ONLY**
  "movie_length": "...", // e.g., "120 minutes" **EXTRACTED FROM USER QUERY ONLY**
  "language": "...",   // **EXTRACTED FROM USER QUERY ONLY**
  "number_of_seasons": "...", // Only for TV shows **EXTRACTED FROM USER QUERY ONLY**
  "release_year": "...", // **EXTRACTED FROM USER QUERY ONLY**
  "episodes_per_season": "..." // Only for TV shows **EXTRACTED FROM USER QUERY ONLY**
}
"""

In [5]:
from google import genai
import os
# genai client created with vertexai=False and an API key read from the
# GOOGLE_API_KEY environment variable (os.getenv yields None when it is unset).
client = genai.Client(
    vertexai=False,
    api_key=os.getenv('GOOGLE_API_KEY'),
)

# Each article row is repeated `scalar` times, so `scalar` queries are
# generated per article; ignore_index gives the result a fresh 0..n-1 index.
scalar = 1
df_multiplied = pd.concat([metadata[['text','article_id']]]*scalar, ignore_index=True)

# The instruction is identical for every article, so it is defined once
# instead of being rebuilt on each loop iteration.
prompt = "You will receive text and return a possible user query that attempts to retrieve the article. The user query should be in Hebrew and should be a natural, conversational query that a user might use to search for the article. The query should be focused and reflect a specific information need. the query dont have to include the name of the artwork, but include deatils about the artwork."

# Collect the generated queries in row order and attach them in one assignment,
# rather than writing into the frame while iterating over it.
full_queries = []
for txt in df_multiplied['text']:
    text = f"{prompt} \n\n Here is The Text: {txt}"
    response = client.models.generate_content(
        model='gemini-2.0-flash',
        contents=text,
    )
    full_queries.append(response.text)

df_multiplied['full_query'] = full_queries
df_multiplied = df_multiplied.sort_values('article_id')
    
    

In [6]:
from src.llm_api_client import ChatBot
import streamlit as st
from config.load_config import load_config

# Chatbot configuration, loaded once and shared by every query below.
config = load_config("config/config.yaml")

# Send every generated query through the chatbot and keep its output.
# `process_user_input` presumably returns a dict (it is item-assigned below) --
# TODO confirm against ChatBot.
responses = []
for idx, row in df_multiplied.iterrows():
    query = row.loc['full_query']
    # The message history is cleared and a new ChatBot is built for each query.
    st.session_state.messages = []
    llm_client = ChatBot(st.session_state, config)
    response = llm_client.process_user_input(query)
    # Tag the response with the article the query was generated from.
    response['article_id'] = row.loc['article_id']
    responses.append(response)

df_responses = pd.DataFrame(responses)

# Attach responses to their queries by position, not by article_id.
# article_id is not unique once `scalar` > 1 (or if an article appears twice),
# and merging on a duplicated key pairs every query of an article with every
# response of that article. `responses` was filled in row order, so row i of
# df_responses belongs to row i of df_multiplied.
# The '_x'/'_y' suffixes mirror the defaults merge() applied to clashing columns.
df_multiplied = df_multiplied.reset_index(drop=True).join(
    df_responses.drop(columns='article_id', errors='ignore'),
    lsuffix='_x',
    rsuffix='_y',
)





In [7]:
# Persist the query/response table to an Excel workbook in the working
# directory, without writing the row index.
df_multiplied.to_excel(excel_writer="movie_query.xlsx", index=False)

In [8]:
# Echo `query`, which is bound as the loop variable of the chatbot evaluation
# loop, so this shows whatever the last executed iteration left behind.
# NOTE(review): depends on kernel state, not on anything computed in this cell.
query

'אפשר בבקשה מידע על סרט ישראלי חדש שמדבר על משפחה חרדית מזרחית בבני ברק, והבן שלהם עילוי שרוצה להתקבל לישיבה ליטאית יוקרתית? הבנתי שהסרט עוסק בגזענות וקושי השתייכות.\n'

In [9]:
# Echo `response`, last bound inside the chatbot evaluation loop, where an
# 'article_id' key is added to it.
# NOTE(review): depends on kernel state; shows only the most recent iteration.
response

{'type': 'סרט',
 'language': 'עברית',
 'genre': 'דרמה, חברתי',
 'distribution_platform': 'Hot, \\u05e1\\u05d8\\u05d9\\u05e0\\u05d2',
 'query': 'משפחה חרדית מזרחית בני ברק ישיבה ליטאית',
 'article_id': '00000195-345b-d641-abfd-b77bc8b10000'}

In [10]:
# Display the frame of collected chatbot responses.
# NOTE(review): the recorded output has an 'Unnamed: 0' column that no visible
# cell creates -- presumably the frame was reloaded from a file in the kernel
# session; confirm this output is reproducible on a fresh run.
df_responses

Unnamed: 0,query,genre,type,language,article_id,director,actors,distribution_platform,release_Year,movie_length,location,producer,rating,festival,award
0,סדרה ישראלית חדשה על קומיקאי שהיה בכלא כנער,דרמת פשע,סדרה,עברית,00000194-2151-dda5-af9c-3dd9d89c0000,,,,,,,,,,
1,ביקורת על הסרט החדש של לוקה גואדנינו,,סרט,עברית,00000194-25d6-dcc4-a1d7-3df6703c0000,לוקה גואדנינו,,,,,,,,,
2,"ביקורת על סרט עם ניקול קידמן בתפקיד מנכ""לית חב...",,סרט,,00000194-2618-dd68-a3be-e6fc06680000,לא ידוע,ניקול קידמן,,,,,,,,
3,ביקורת על הסרט החדש של עמוס גיתאי,עיון,סרט,עברית,00000194-2681-ddb6-afdd-77e781220000,עמוס גיתאי,,,,,,,,,
4,"ביקורת על הסרט ""נוספרטו"" של רוברט אגרס",,סרט,,00000194-2b97-d9c2-a79e-2bd7330c0000,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57,סדרת טלוויזיה על שחקן פוטבול מפורסם שהורשע ברצ...,,סדרה,,00000195-1dbd-d100-a5d5-bfbf19f10000,,,,,,,,,,
58,סרט על אדריכל ניצול שואה שהיגר לאמריקה אחרי המ...,,סרט,,00000195-2238-df71-a5fd-eab99cff0000,,,Netflix,,,,,,ונציה,אוסקר
59,משפחה בדיקטטורה בברזיל שנות 70 סרט אבא נלקח,,סרט,עברית,00000195-2272-d293-a1d5-e67745670000,,,,,,,,,,
60,robbert de niro netflix series about cyber att...,,סדרה,,00000195-3138-d670-ad97-7dff432f0000,,Robert De Niro,Netflix,,,,,,,
