In [2]:
%pip uninstall -y numpy pandas boto3 faiss-cpu scipy
%pip install numpy pandas boto3 scikit-learn scipy

Found existing installation: numpy 2.0.2
Uninstalling numpy-2.0.2:
  Successfully uninstalled numpy-2.0.2
Found existing installation: pandas 2.2.2
Uninstalling pandas-2.2.2:
  Successfully uninstalled pandas-2.2.2
[0mFound existing installation: scipy 1.16.1
Uninstalling scipy-1.16.1:
  Successfully uninstalled scipy-1.16.1
Collecting numpy
  Downloading numpy-2.3.2-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (62 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.1/62.1 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pandas
  Downloading pandas-2.3.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (91 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m91.2/91.2 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting boto3
  Downloading boto3-1.40.16-py3-none-any.whl.metadata (6.7 kB)
Collecting scipy
  Downloading scipy-1.16.1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.

In [None]:
aws_access_key_id = ' '
aws_secret_access_key = ' / /'
aws_region = ' '

In [None]:
import pandas as pd
import boto3
import io

def read_s3_csv_to_dataframe(bucket_name, key_name, aws_access_key_id, aws_secret_access_key):
    """Reads a TSV file from S3 into a pandas DataFrame.

    Args:
        bucket_name (str): The S3 bucket name.
        key_name (str): The S3 object key (file name).
        aws_access_key_id (str): Your AWS access key ID.
        aws_secret_access_key (str): Your AWS secret access key.

    Returns:
        pandas.DataFrame: The DataFrame containing the CSV data, or None if an error occurs.
    """
    try:
        s3 = boto3.client('s3', aws_access_key_id=aws_access_key_id, aws_secret_access_key=aws_secret_access_key)
        obj = s3.get_object(Bucket=bucket_name, Key=key_name)
        df = pd.read_csv(io.BytesIO(obj['Body'].read()), sep='\t')
        return df
    except Exception as e:
        print(f"Error reading CSV from S3: {str(e)}")
        return None

bucket_name = 'motiverse-2025-data'  # Replace with your bucket name
key_name = 'video_features.tsv'  # Replace with your CSV file name in S3
df = read_s3_csv_to_dataframe(bucket_name, key_name, aws_access_key_id, aws_secret_access_key)

In [None]:
df.head()

Unnamed: 0,camera_media_id,event_timestamp,driver_name,vehicle_name,vehicle_year,latitude,longitude,city,state,street_name,...,road_type,weather,visibility,road_surface,road_signs,road_markings,vehicles_seen_with_color,location,terrain,extra_details
0,3084823540,2025-03-24 7:16:24,Mary Johnson,Freightliner Cascadia,2018.0,36.800201,-84.168529,Williamsburg,KY,I 75,...,divided highway,sunny,clear,asphalt,['speed limit'],"['lane dividers', 'solid yellow line']","['Black SUV', 'White Van', 'Green Semi-truck w...",highway,"['forest', 'hills']",['light traffic']
1,3078122527,2025-03-12 7:46:34,Patricia Brown,FORD F150,2018.0,39.172145,-77.536859,Leesburg,VA,James Monroe Hwy,...,2-lane,clear,good,asphalt,[],"['lane dividers', 'double yellow lines', 'soli...",['White pickup truck'],rural area,"['forest', 'farmland']",['shattered windshield']
2,3078122525,2025-03-12 7:46:35,Jennifer Miller,FORD F150,2018.0,39.170679,-77.536455,Leesburg,VA,James Monroe Hwy,...,2-lane,clear,good,asphalt,"['billboard', 'speed limit sign']","['lane dividers', 'double yellow lines']",['White pickup truck'],rural,"['forest', 'farmland', 'hills']",['shattered windshield']
3,3060540928,2025-02-07 2:10:21,Linda Garcia,PETERBILT 387,2006.0,28.547266,-81.498128,Orlovista,FL,East-West Expwy,...,divided highway,sunny,clear,concrete,"['Good Homes Rd exit sign', 'West Colonial Dr ...","['white dashed lane dividers', 'solid white li...","['White SUV', 'Black Semi-truck', 'Dark Sedan'...",highway,city,['windshield wipers active']
4,3076181611,2025-03-09 14:54:25,Elizabeth Wilson,Freightliner COLUMBIA,2004.0,42.228421,-71.651404,Northborough,MA,Massachusetts Tpk,...,highway,clear,dark,asphalt,[],['lane dividers'],"['Vehicle with red taillights', 'Vehicle with ...",highway,[],"['night driving', 'headlights on', 'camera vie..."


In [None]:
# --- B2: Define the AI Description Generation Function (FINAL REVISION FOR STRUCTURED NARRATIVE) ---
def generate_video_description(video_features_json):
    """
    Takes a JSON string of a video's features and uses an LLM to generate a
    consistent, structured narrative description ideal for embedding.
    """
    try:
        # Assumes aws_access_key_id, aws_secret_access_key, and aws_region are defined
        client = boto3.client("bedrock-runtime", aws_access_key_id=aws_access_key_id, aws_secret_access_key=aws_secret_access_key, region_name=aws_region)

        # --- THE NEW "STRUCTURED NARRATIVE" PROMPT ---
        # This prompt generates a consistent paragraph of text.
        prompt_text = f"""
You are a data processing agent. Your task is to convert vehicle event data from JSON format into a concise, descriptive paragraph. Adhere strictly to the sentence structures below, filling in the data from the provided JSON. Do not add any extra commentary.

**JSON DATA:**
{video_features_json}

**OUTPUT FORMAT:**

On [Insert event_timestamp], a [event_type] occurred involving a [vehicle_year] [vehicle_name] operated by [driver_name]. The specific event was a [collision_type]. The incident took place at [street_name] in [city], [state] (Coordinates: [latitude], [longitude]), within a [location] area characterized by [terrain] terrain. At the time of the event, the weather was [weather] with [visibility] visibility. The vehicle was on a [road_type] with a [road_surface] surface, which had [road_markings] and [road_signs] visible. Other vehicles seen included [List vehicles_seen_with_color]. Objects observed in the vicinity were [List objects_seen]. The vehicle made contact with [List objects_hit, or "no objects" if empty]. Additional recorded details include: [Insert extra_details].
"""

        messages = [{"role": "user", "content": [{"text": prompt_text}]}]

        # Using a low temperature for high consistency.
        inference_config = {"maxTokens": 512, "temperature": 0.1}

        # Confirmed working model ID from your notebook
        model_id = 'us.anthropic.claude-3-5-sonnet-20241022-v2:0'

        response = client.converse(modelId=model_id, messages=messages, inferenceConfig=inference_config)
        description = response["output"]["message"]["content"][0]["text"].strip()
        return description

    except ClientError as e:
        print(f"Bedrock Error: {e}")
        return f"Error generating description: {e}"
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return "Error generating description."

In [None]:
import tqdm

In [None]:
import boto3
from botocore.exceptions import ClientError 
import json
import tqdm
import concurrent.futures 

def generate_video_description(video_features_json):
    
    try:
        # Assumes aws_access_key_id, aws_secret_access_key, and aws_region are defined
        client = boto3.client("bedrock-runtime", aws_access_key_id=aws_access_key_id, aws_secret_access_key=aws_secret_access_key, region_name=aws_region)

        prompt_text = f"""
You are a data processing agent. Your task is to convert vehicle event data from JSON format into a concise, descriptive paragraph. Adhere strictly to the sentence structures below, filling in the data from the provided JSON. Do not add any extra commentary.

**JSON DATA:**
{video_features_json}

**OUTPUT FORMAT:**

On [Insert event_timestamp], a [event_type] occurred involving a [vehicle_year] [vehicle_name] operated by [driver_name]. The specific event was a [collision_type]. The incident took place at [street_name] in [city], [state] (Coordinates: [latitude], [longitude]), within a [location] area characterized by [terrain] terrain. At the time of the event, the weather was [weather] with [visibility] visibility. The vehicle was on a [road_type] with a [road_surface] surface, which had [road_markings] and [road_signs] visible. Other vehicles seen included [List vehicles_seen_with_color]. Objects observed in the vicinity were [List objects_seen]. The vehicle made contact with [List objects_hit, or "no objects" if empty]. Additional recorded details include: [Insert extra_details].
"""
        messages = [{"role": "user", "content": [{"text": prompt_text}]}]
        inference_config = {"maxTokens": 512, "temperature": 0.1}
        model_id = 'us.anthropic.claude-3-5-sonnet-20241022-v2:0'

        response = client.converse(modelId=model_id, messages=messages, inferenceConfig=inference_config)
        description = response["output"]["message"]["content"][0]["text"].strip()
        return description

    except ClientError as e:

        return f"Bedrock Error: {e}"
    except Exception as e:
        return f"An unexpected error occurred: {e}"

print("Generating LLM-assisted descriptions using all data columns for all 500 videos...")

cols_to_exclude = ['camera_media_id']
features_for_llm = [col for col in df.columns if col not in cols_to_exclude]
print(f"The following {len(features_for_llm)} columns will be sent to the LLM:\n{features_for_llm}")

json_data_list = [row[features_for_llm].to_json(indent=2) for index, row in df.iterrows()]


MAX_WORKERS = 30
descriptions = []

with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
    
    descriptions = list(tqdm.tqdm(executor.map(generate_video_description, json_data_list), total=len(json_data_list), desc="Generating Descriptions"))

df['llm_description'] = descriptions

print("\nDescription generation complete!")
print("Here are a few examples of the new, structured descriptions:")
for i in range(5):
    print(f"\n--- Example {i+1} ---")
    print(df['llm_description'].iloc[i])

Generating LLM-assisted descriptions using all data columns for all 500 videos...
The following 24 columns will be sent to the LLM:
['event_timestamp', 'driver_name', 'vehicle_name', 'vehicle_year', 'latitude', 'longitude', 'city', 'state', 'street_name', 'event_type', 'collision_type', 'objects_seen', 'objects_hit', 'road_type', 'weather', 'visibility', 'road_surface', 'road_signs', 'road_markings', 'vehicles_seen_with_color', 'location', 'terrain', 'extra_details', 'llm_description']


Generating Descriptions: 100%|██████████| 500/500 [02:17<00:00,  3.64it/s]


Description generation complete!
Here are a few examples of the new, structured descriptions:

--- Example 1 ---
On March 24, 2025, at 7:16:24, a collision occurred involving a 2018 Freightliner Cascadia operated by Mary Johnson. The specific event was a null. The incident took place at I 75 in Williamsburg, KY (Coordinates: 36.8002011, -84.1685292), within a highway area characterized by forest and hills terrain. At the time of the event, the weather was sunny with clear visibility. The vehicle was on a divided highway with a asphalt surface, which had lane dividers and solid yellow line and speed limit signs visible. Other vehicles seen included Black SUV, White Van, Green Semi-truck with flatbed trailer, Red car, White RV, White sedan, Light-colored semi-truck, and Dark-colored sedan. Objects observed in the vicinity were guardrails, trees, power lines, and utility poles. The vehicle made contact with no objects. Additional recorded details include: light traffic.

--- Example 2 --




In [None]:
import os

os.makedirs('content', exist_ok=True)

path = "content/revised_df.csv"

df.to_csv(path, index=False)

print(f"DataFrame saved to {path}")

DataFrame saved to content/revised_df.csv


In [1]:
import os

In [2]:
import pandas as pd

df = pd.read_csv("/content/revised_df.csv")
display(df.head())

Unnamed: 0,camera_media_id,event_timestamp,driver_name,vehicle_name,vehicle_year,latitude,longitude,city,state,street_name,...,weather,visibility,road_surface,road_signs,road_markings,vehicles_seen_with_color,location,terrain,extra_details,llm_description
0,3084823540,2025-03-24 7:16:24,Mary Johnson,Freightliner Cascadia,2018.0,36.800201,-84.168529,Williamsburg,KY,I 75,...,sunny,clear,asphalt,['speed limit'],"['lane dividers', 'solid yellow line']","['Black SUV', 'White Van', 'Green Semi-truck w...",highway,"['forest', 'hills']",['light traffic'],"On March 24, 2025, at 7:16:24, a collision occ..."
1,3078122527,2025-03-12 7:46:34,Patricia Brown,FORD F150,2018.0,39.172145,-77.536859,Leesburg,VA,James Monroe Hwy,...,clear,good,asphalt,[],"['lane dividers', 'double yellow lines', 'soli...",['White pickup truck'],rural area,"['forest', 'farmland']",['shattered windshield'],"On March 12, 2025 at 7:46:34, a collision occu..."
2,3078122525,2025-03-12 7:46:35,Jennifer Miller,FORD F150,2018.0,39.170679,-77.536455,Leesburg,VA,James Monroe Hwy,...,clear,good,asphalt,"['billboard', 'speed limit sign']","['lane dividers', 'double yellow lines']",['White pickup truck'],rural,"['forest', 'farmland', 'hills']",['shattered windshield'],"On March 12, 2025 at 7:46:35, a collision occu..."
3,3060540928,2025-02-07 2:10:21,Linda Garcia,PETERBILT 387,2006.0,28.547266,-81.498128,Orlovista,FL,East-West Expwy,...,sunny,clear,concrete,"['Good Homes Rd exit sign', 'West Colonial Dr ...","['white dashed lane dividers', 'solid white li...","['White SUV', 'Black Semi-truck', 'Dark Sedan'...",highway,city,['windshield wipers active'],"On 2025-02-07 2:10:21, a collision occurred in..."
4,3076181611,2025-03-09 14:54:25,Elizabeth Wilson,Freightliner COLUMBIA,2004.0,42.228421,-71.651404,Northborough,MA,Massachusetts Tpk,...,clear,dark,asphalt,[],['lane dividers'],"['Vehicle with red taillights', 'Vehicle with ...",highway,[],"['night driving', 'headlights on', 'camera vie...","On March 09, 2025 at 14:54:25, a collision occ..."


In [3]:
df['llm_description']

Unnamed: 0,llm_description
0,"On March 24, 2025, at 7:16:24, a collision occ..."
1,"On March 12, 2025 at 7:46:34, a collision occu..."
2,"On March 12, 2025 at 7:46:35, a collision occu..."
3,"On 2025-02-07 2:10:21, a collision occurred in..."
4,"On March 09, 2025 at 14:54:25, a collision occ..."
...,...
495,"On March 27, 2025 at 23:45:39, a hard corner o..."
496,"On March 27, 2025 at 23:29:22, a hard corner o..."
497,"On March 26, 2025 at 23:36:00, a hard corner o..."
498,Bedrock Error: An error occurred (ThrottlingEx...


In [None]:
import pandas as pd
import boto3
from botocore.exceptions import ClientError
import json
from tqdm import tqdm
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import time 
import random 


# --- Step 1: Build the Semantic Search Index (FIXED with Retry Logic) ---
print("Building Semantic (Meaning) Search Index with Amazon Titan...")

# --- FIX #1: Create the client ONCE outside the loop for efficiency ---
bedrock_client = boto3.client(
    "bedrock-runtime",
    aws_access_key_id=aws_access_key_id,
    aws_secret_access_key=aws_secret_access_key,
    region_name=aws_region
)

# --- FIX #2: A more robust function to handle API errors ---
def get_titan_embedding_with_backoff(text, client):
    max_retries = 5
    base_delay_seconds = 1  # Start with a 1-second delay

    for attempt in range(max_retries):
        try:
            model_id = 'amazon.titan-embed-text-v1'
            body = json.dumps({"inputText": text})
            response = client.invoke_model(
                body=body, modelId=model_id, contentType="application/json", accept="application/json"
            )
            response_body = json.loads(response['body'].read())
            return response_body['embedding']

        except ClientError as e:
            if e.response['Error']['Code'] == 'ThrottlingException':
                if attempt < max_retries - 1:
                    # Calculate wait time: (base_delay * 2^attempt) + random_jitter
                    wait_time = (base_delay_seconds * 2 ** attempt) + random.uniform(0, 1)
                    print(f"ThrottlingException caught. Retrying in {wait_time:.2f} seconds...")
                    time.sleep(wait_time)
                else:
                    print("Max retries reached. Failing.")
                    raise e  # Re-raise the exception if all retries fail
            else:
                print(f"An unexpected Bedrock error occurred: {e}")
                raise e
    return None

# # --- FIX #3: Adjusted loop to use the new robust function ---
# video_embeddings = []
# descriptions_list = df['llm_description'].tolist()

# # The tqdm loop now directly iterates over descriptions, no need for manual batching
# for text in tqdm(descriptions_list, desc="Generating Titan Embeddings"):
#     embedding = get_titan_embedding_with_backoff(text, bedrock_client)
#     if embedding:
#         video_embeddings.append(embedding)

# # --- The rest of your process remains the same ---
# if video_embeddings and len(video_embeddings) == len(descriptions_list):
#     video_embeddings = np.array(video_embeddings).astype('float32')
#     faiss.normalize_L2(video_embeddings) # Use FAISS's efficient normalization

#     d = video_embeddings.shape[1]
#     faiss_index = faiss.IndexFlatIP(d)
#     faiss_index.add(video_embeddings)
#     print("\nUPGRADED Semantic Index (FAISS) with Titan embeddings built successfully.")
# else:
#     print(f"\nWarning: Could only generate {len(video_embeddings)} out of {len(descriptions_list)} embeddings. The index will be incomplete.")


# # --- Step 2: Build the Keyword Search Index (TF-IDF) ---
# print("\nBuilding Keyword (Word-Matching) Search Index...")
# tfidf_vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 3))
# tfidf_matrix = tfidf_vectorizer.fit_transform(df['llm_description'].tolist())
# print("Keyword Index (TF-IDF) built successfully.")

Building Semantic (Meaning) Search Index with Amazon Titan...


In [8]:
pip install faiss_cpu

Collecting faiss_cpu
  Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (31.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m23.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss_cpu
Successfully installed faiss_cpu-1.12.0


In [None]:
import os
import pickle
import numpy as np
import faiss

# --- 1. Define the path for  saved files ---
# All the progress will be saved inside this single folder.
save_path = "hackathon_index"

# --- 2. Create the directory ---
# os.makedirs() creates the directory.
# exist_ok=True prevents an error if we run the script multiple times.
os.makedirs(save_path, exist_ok=True
print(f"Created directory and saving all files to: '{save_path}/'")

# --- 3. Save the individual components into the path ---

# Save the Titan Embeddings as a NumPy array
embeddings_file = os.path.join(save_path, "video_embeddings.npy")
np.save(embeddings_file, video_embeddings)
print(f"  -> Saved embeddings to: {embeddings_file}")

# save the FAISS Index
faiss_index_file = os.path.join(save_path, "faiss_index.bin")
faiss.write_index(faiss_index, faiss_index_file)
print(f"  -> Saved FAISS index to: {faiss_index_file}")

# save the TF-IDF Vectorizer (the vocabulary)
tfidf_vectorizer_file = os.path.join(save_path, "tfidf_vectorizer.pkl")
with open(tfidf_vectorizer_file, 'wb') as f:
    pickle.dump(tfidf_vectorizer, f)
print(f"  -> Saved TF-IDF vectorizer to: {tfidf_vectorizer_file}")

# save the TF-IDF Matrix (the scores)
tfidf_matrix_file = os.path.join(save_path, "tfidf_matrix.pkl")
with open(tfidf_matrix_file, 'wb') as f:
    pickle.dump(tfidf_matrix, f)
print(f"  -> Saved TF-IDF matrix to: {tfidf_matrix_file}")

# save the DataFrame with LLM descriptions for reference
df_file = os.path.join(save_path, "processed_data.csv")
df.to_csv(df_file, index=False)
print(f"  -> Saved DataFrame to: {df_file}")

print("\n✅ All progress has been successfully saved locally.")

In [None]:
import os
import pickle
import faiss

save_path = "/content/"

# Load the TF-IDF Vectorizer
tfidf_vectorizer_file = os.path.join(save_path, "tfidf_vectorizer.pkl")
with open(tfidf_vectorizer_file, 'rb') as f:
    tfidf_vectorizer = pickle.load(f)
print(f"Loaded TF-IDF vectorizer from: {tfidf_vectorizer_file}")

# Load the TF-IDF Matrix
tfidf_matrix_file = os.path.join(save_path, "tfidf_matrix.pkl")
with open(tfidf_matrix_file, 'rb') as f:
    tfidf_matrix = pickle.load(f)
print(f"Loaded TF-IDF matrix from: {tfidf_matrix_file}")

# Load the FAISS Index
faiss_index_file = os.path.join(save_path, "faiss_index.bin")
faiss_index = faiss.read_index(faiss_index_file)
print(f"Loaded FAISS index from: {faiss_index_file}")

Loaded TF-IDF vectorizer from: /content/tfidf_vectorizer.pkl
Loaded TF-IDF matrix from: /content/tfidf_matrix.pkl
Loaded FAISS index from: /content/faiss_index.bin


In [38]:
def hybrid_search(query, score_threshold=6):
    """
    Performs a hybrid search, gets a relevance score and reasoning from the LLM,
    prints the LLM's thinking, and returns IDs that meet the score threshold.
    """
    print(f"\n## Performing Hybrid Search for: '{query}' with score_threshold > {score_threshold}")
    candidates_to_check = 40

    # -- Part A & B: Candidate Retrieval (No changes here) --
    query_embedding = get_titan_embedding_with_backoff(query, bedrock_client)
    if not query_embedding: semantic_indices = []
    else:
        query_embedding_normalized = np.array([query_embedding]) / np.linalg.norm([query_embedding])
        query_embedding_normalized = np.array(query_embedding_normalized).astype('float32')
        distances, semantic_indices = faiss_index.search(query_embedding_normalized, candidates_to_check)
        semantic_indices = [i for i in semantic_indices[0] if i != -1]
    query_vector = tfidf_vectorizer.transform([query])
    cosine_similarities = cosine_similarity(query_vector, tfidf_matrix).flatten()
    keyword_indices = cosine_similarities.argsort()[-candidates_to_check:][::-1].tolist()
    combined_indices = list(dict.fromkeys(semantic_indices + keyword_indices))
    candidate_ids = df['camera_media_id'].iloc[combined_indices].tolist()
    candidate_docs = df['llm_description'].iloc[combined_indices].tolist()
    print(f"Found {len(candidate_ids)} unique high-quality candidates from hybrid search.")

    # -- Part D: Expert AI Scoring & Reasoning (FINAL PROMPT) --
    context_str = "".join([f'Video ID: "{vid_id}"\nDescription: "{desc}"\n\n' for vid_id, desc in zip(candidate_ids, candidate_docs)])

    prompt_text = (
    "You are a highly precise AI Search and Ranking Analyst. First assign *Base Scores* using strict facet matching "
    "with *normalization*, then ONLY if there are 2 or more perfect matches, apply a simple +1-per-risk tie-break. Output strict JSON.\n\n"
    "check the query against all possibilities"
    " "
    "## STEP 0 — Extract & NORMALIZE Query Facets (internal; do not output)\n"
    "Extract facets from the *User Query* and mark as *Required* if explicitly specified:\n"
    "• Actor/subject names  • Behavior/Event  • Location (street/city/state)  • Time (date/month/year/range)  • Vehicle attributes (color/class/make)\n"
    "Apply these normalization rules when checking evidence in the Description (case-insensitive):\n"
    "• *Vehicle attributes:* no world-knowledge inference (do NOT infer class from make/model; do NOT equate near colors).\n"
    "  Stop sign violation ('ran a stop sign'|'failed to stop at stop sign'|'blew the stop sign'); Guardrail hit ('hit/struck/sideswiped a guardrail');\n"
    "  T-bone ('T-bone'|'side-impact'|'broadside'|'perpendicular impact').\n"
    "Evidence rule: a facet counts only if supported by a *verbatim quote* from the Description *after applying the above normalization*.\n"

    "## STEP 1 — Base Relevance Score (1–8) — compute for ALL candidates first\n"
    "Score by *Required facet coverage* with evidence (note any normalization used in reasoning):\n"
    "• *8 (Perfect Match)* = ALL Required facets matched with evidence .\n"
    "• 7 = Missing exactly one Required facet (behavior/event matched).\n"
    "• 6 = Behavior/event matched; two or more other Required facets missing/ambiguous.\n"
    "• 4–5 = Partial/indirect relation; behavior only implied; weak/partial location or vehicle evidence.\n"
    "• 1–3 = Mostly irrelevant OR contradicts a Required facet.\n"
    "Location strictness: if a *street* is specified, you must find that street *by normalized equivalence* in the Description to reach Base=8; a city-only match is insufficient.\n"
    "Temporal strictness: for month/year constraints, quote the timestamp and state it falls within the requested month/year.\n"
    "Negations: if the query excludes something, any video containing it must receive *Final Score = 1*.\n"
    "After assigning Base Scores to every candidate, count how many have *Base=8* (call this *N8*). Do NOT output results until N8 is known.\n\n"

    "## STEP 2 — Tie-Break with Risk Factors (ONLY if N8 ≥ 2)\n"
    "Consider risk factors from all the features , apply reasoning to Each unique applicable label = +1 (no double counting).\n"
    "Ignore benign labels: Clear Weather, Good Visibility, Daytime, Overcast, Light Traffic, Moderate Traffic. Only *Heavy Traffic* counts as traffic risk (+1).\n"
    "Cap so *Final Score ≤ 10*. When N8 ≥ 2, reasoning for each Base=8 candidate must include:\n"
    "\"Tie detected among N8 Base=8 candidates; Risk Bonus=X (label1 + label2 + …)\". If N8 = 1, write \"No tie (N8=1); no risk bonus applied\".\n\n"

    "## STEP 3 — Final Score\n"
    "• If N8 ≥ 2 and candidate is Base=8: *Final Score = 8 + (# of unique applicable risk labels)* (max 10).\n"
    "• Otherwise: *Final Score = Base Score*.\n\n"

    "## OUTPUT — Strict JSON ONLY\n"
    "Return one JSON object with key \"ranked_results\" → list of objects with:\n"
    "  \"video_id\" (string), \"score\" (integer), \"reasoning\" (concise).\n"
    "For temporal matches, quote the timestamp (e.g., '2025-03-11 …') and state it falls within the requested month/year.\n"
    "Sort *descending by Final Score*. If scores tie exactly, sort by: (a) greater Required facet coverage, (b) more specific actor match (full name > first name), (c) ascending Video ID.\n\n"

    "### User Query\n"
    f"\"{query}\"\n\n"
    "### Candidate Videos\n"
    f"{context_str}\n\n"
    "Analyze, score, and rank all candidates using these rules, then provide the full list in the specified JSON format."
)




    try:
        client = boto3.client("bedrock-runtime", aws_access_key_id=aws_access_key_id, aws_secret_access_key=aws_secret_access_key, region_name=aws_region)
        model_id = 'us.anthropic.claude-3-5-sonnet-20241022-v2:0'

        messages = [{"role": "user", "content": [{"text": prompt_text}]}]
        inference_config = {"maxTokens": 4096, "temperature": 0.0}
        response = client.converse(modelId=model_id, messages=messages, inferenceConfig=inference_config)
        generated_text = response["output"]["message"]["content"][0]["text"]

        json_match = re.search(r'\{.*\}', generated_text, re.DOTALL)

        if json_match:
            json_str = json_match.group(0)
            result_data = json.loads(json_str)
            ranked_results = result_data.get("ranked_results", [])

            # --- Print the LLM's raw thinking ---
            print("\n--- LLM Reasoning ---")
            for result in ranked_results:
                vid_id = result.get('video_id', 'N/A')
                score = result.get('score', 0)
                reasoning = result.get('reasoning', 'No reasoning provided.')
                print(f"ID: {vid_id:<12} | Score: {score:<2} | Reasoning: {reasoning}")
            print("---------------------\n")
            # ------------------------------------

            final_video_ids = [
                str(result['video_id'])
                for result in ranked_results
                if 'score' in result and result.get('score', 0) > score_threshold
            ]

            print(f"LLM analyzed {len(ranked_results)} results. Found {len(final_video_ids)} videos meeting the threshold.")
            return final_video_ids
        else:
            print("Warning: Could not parse JSON from LLM output. Returning empty list.")
            return []

    except Exception as e:
        print(f"An error occurred during the AI search: {e}")
        return []

In [39]:
import pandas as pd
import boto3
from botocore.exceptions import ClientError
import json
from tqdm import tqdm
import re
from google.colab import auth
from googleapiclient.discovery import build
from googleapiclient.http import MediaIoBaseUpload
import io

# This assumes 'df', 'embedding_model', 'faiss_index', 'tfidf_vectorizer', 'tfidf_matrix',
# and your AWS credentials are all loaded and correct from the previous steps.


# --- Step 1: Load Your Uploaded Query File ---

# IMPORTANT: Make sure your uploaded file's name matches this variable.
input_filename = "/content/finalmotive.csv"

try:
    submission_df = pd.read_csv(input_filename)
    print(f"Successfully loaded '{input_filename}'.")
    if 'relevant_video_ids' not in submission_df.columns:
        submission_df['relevant_video_ids'] = ""
    print(f"Found {len(submission_df)} queries to process.")
except FileNotFoundError:
    print(f"ERROR: The file '{input_filename}' was not found.")
    print("Please upload your CSV file and make sure the filename is correct.")

print("\n" + "="*50 + "\n")


# --- Step 2: Iterate and Predict for Each Query ---
# We will now loop through every row in your file and generate the video IDs.

for row_index, row in tqdm(submission_df.iterrows(), total=submission_df.shape[0], desc="Processing Queries"):

    # Get the user's query from the 'query' column
    user_query = row['query']

    # Use our final hybrid search function to get the top 20 predicted video IDs
    # Ensure that 'index' and 'embedding_model' are defined globally
    predicted_ids_list = hybrid_search(user_query)

    # Convert the list of video IDs into a single, comma-separated string
    # FIX: Convert each ID to string before joining
    predicted_ids_str = ", ".join(str(id) for id in predicted_ids_list)

    # Place the resulting string into the 'relevant_video_ids' column
    submission_df.at[row_index, 'relevant_video_ids'] = predicted_ids_str


# --- Step 3: Save the Completed DataFrame to Google Drive ---
print("\n" + "="*50 + "\n")
print("All queries processed. Preparing to save the final file to Google Drive...")

try:
    # Authenticate to Google. A pop-up will ask for your permission.
    auth.authenticate_user()
    drive_service = build('drive', 'v3')

    # Convert the DataFrame to a CSV format in memory
    csv_buffer = io.BytesIO()
    submission_df.to_csv(csv_buffer, index=False)
    csv_buffer.seek(0)

    # Define the file name and metadata for your Google Drive
    file_metadata = {'name': 'submission.csv'}

    # Upload the file
    media = MediaIoBaseUpload(csv_buffer, mimetype='text/csv', resumable=True)
    file = drive_service.files().create(body=file_metadata,
                                        media_body=media,
                                        fields='id, webViewLink').execute()

    print("✅ Success! Your submission file has been saved to Google Drive.")
    print(f"File Name: {file_metadata['name']}")
    print(f"You can view it here: {file.get('webViewLink')}")
    print("\nPreview of the first 5 predictions:")
    display(submission_df)

except Exception as e:
    print(f"An error occurred while saving to Google Drive: {e}")
    print("Saving a local copy instead as 'submission.csv'.")
    submission_df.to_csv("submission.csv", index=False)

Successfully loaded '/content/finalmotive.csv'.
Found 10 queries to process.




Processing Queries:   0%|          | 0/10 [00:00<?, ?it/s]


## Performing Hybrid Search for: 'Show me red light violations under heavy traffic.' with score_threshold > 6
Found 64 unique high-quality candidates from hybrid search.


  submission_df.at[row_index, 'relevant_video_ids'] = predicted_ids_str
Processing Queries:  10%|█         | 1/10 [00:13<02:03, 13.74s/it]


--- LLM Reasoning ---
ID: 3077470742   | Score: 8  | Reasoning: Perfect match: Red light violation + heavy traffic explicitly mentioned. Timestamp '2025-03-11 09:34:40' verified.
ID: 3077367902   | Score: 8  | Reasoning: Perfect match: Red light violation + heavy traffic + construction zone. Timestamp '2025-03-11 06:49:53' verified.
ID: 3077356819   | Score: 8  | Reasoning: Perfect match: Red light violation + heavy traffic explicitly noted. Timestamp '2025-03-11 06:35:31' verified.
ID: 3077399502   | Score: 8  | Reasoning: Perfect match: Red light violation + heavy traffic + sun glare. Timestamp '2025-03-11 07:32:59' verified.
ID: 3071927436   | Score: 8  | Reasoning: Perfect match: Red light violation + heavy traffic mentioned. Timestamp '2025-02-21 08:12:37' verified.
ID: 3061441905   | Score: 6  | Reasoning: Has red light violation but traffic level not specified as heavy
---------------------

LLM analyzed 6 results. Found 5 videos meeting the threshold.

## Performing Hybrid Sea

Processing Queries:  20%|██        | 2/10 [00:27<01:48, 13.54s/it]


--- LLM Reasoning ---
ID: 3079637179   | Score: 8  | Reasoning: Perfect match - Rollover on dirt road in snowy mountain area. Evidence: 'specific event was a rollover', 'unpaved, rural road with a dirt surface', 'hills, mountains, and snowy terrain'. No tie (N8=1); no risk bonus applied.
ID: 3083727227   | Score: 7  | Reasoning: Rollover in snowy mountain area but not on dirt road (highway surface)
ID: 3081368365   | Score: 6  | Reasoning: Rollover in snowy forest area but road surface not specified as dirt
ID: 3073554205   | Score: 6  | Reasoning: Rollover in snowy area but on highway, not dirt road
ID: 3064006492   | Score: 5  | Reasoning: Rollover in snowy hills but on asphalt highway
---------------------

LLM analyzed 5 results. Found 2 videos meeting the threshold.

## Performing Hybrid Search for: 'Show me the tailgating events on a Kenworth T680 2025.' with score_threshold > 6
Found 53 unique high-quality candidates from hybrid search.


Processing Queries:  30%|███       | 3/10 [00:38<01:28, 12.65s/it]


--- LLM Reasoning ---
ID: 3083085362   | Score: 8  | Reasoning: Perfect match - Tailgating incident with 2025 KENWORTH T680. No tie (N8=1); no risk bonus applied.
ID: 3077352052   | Score: 6  | Reasoning: Partial match - Has KENWORTH T680 but 2019 model, not 2025
ID: 3083127742   | Score: 5  | Reasoning: Weak match - Has Kenworth but different model (T800) and year (2015)
ID: 3083080319   | Score: 5  | Reasoning: Weak match - Has Kenworth but different model (T880)
---------------------

LLM analyzed 4 results. Found 1 videos meeting the threshold.

## Performing Hybrid Search for: 'Find red light violations in rainy weather.' with score_threshold > 6
Found 59 unique high-quality candidates from hybrid search.


Processing Queries:  40%|████      | 4/10 [00:49<01:11, 11.97s/it]


--- LLM Reasoning ---
ID: 3089359636   | Score: 10 | Reasoning: Perfect match (red light violation + rainy weather) with evidence. Tie detected among 3 Base=8 candidates; Risk Bonus=3 (rainy + dark visibility + night time)
ID: 3056625676   | Score: 10 | Reasoning: Perfect match (red light violation + rainy weather) with evidence. Tie detected among 3 Base=8 candidates; Risk Bonus=2 (rainy + wet surface)
ID: 3062827855   | Score: 10 | Reasoning: Perfect match (red light violation + rainy weather) with evidence. Tie detected among 3 Base=8 candidates; Risk Bonus=2 (rainy + wet surface)
ID: 3061441905   | Score: 6  | Reasoning: Partial match: has red light violation but weather is overcast, not rainy
---------------------

LLM analyzed 4 results. Found 3 videos meeting the threshold.

## Performing Hybrid Search for: 'Find videos where a vehicle loses traction under clear weather in a desert region.' with score_threshold > 6
Found 70 unique high-quality candidates from hybrid search.


Processing Queries:  50%|█████     | 5/10 [01:02<01:02, 12.43s/it]


--- LLM Reasoning ---
ID: 3045068900   | Score: 8  | Reasoning: Perfect match - Explicit loss of control in clear weather in desert terrain. No tie (N8=1); no risk bonus applied.
ID: 3077352052   | Score: 6  | Reasoning: Partial match - Desert terrain and clear weather, but no loss of traction
ID: 3077250386   | Score: 6  | Reasoning: Partial match - Desert terrain and clear weather, but no loss of traction
ID: 3077307422   | Score: 6  | Reasoning: Partial match - Desert terrain and clear weather, but no loss of traction
ID: 3079557110   | Score: 6  | Reasoning: Partial match - Desert-like terrain and clear weather, but no loss of traction
---------------------

LLM analyzed 5 results. Found 1 videos meeting the threshold.

## Performing Hybrid Search for: 'Find collision videos involving an animal hit.' with score_threshold > 6
Found 52 unique high-quality candidates from hybrid search.


Processing Queries:  60%|██████    | 6/10 [01:14<00:49, 12.31s/it]


--- LLM Reasoning ---
ID: 3047065686   | Score: 10 | Reasoning: Perfect match: explicit deer collision. Tie detected among 5 Base=8 candidates; Risk Bonus=+2 (Dark visibility + Night driving)
ID: 3078930446   | Score: 10 | Reasoning: Perfect match: explicit deer collision. Tie detected among 5 Base=8 candidates; Risk Bonus=+2 (Dark visibility + Night driving)
ID: 3078122527   | Score: 8  | Reasoning: Perfect match: explicit deer collision. Tie detected among 5 Base=8 candidates; Risk Bonus=0 (no risk factors)
ID: 3060540928   | Score: 8  | Reasoning: Perfect match: explicit bird collision. Tie detected among 5 Base=8 candidates; Risk Bonus=0 (no risk factors)
ID: 3078122525   | Score: 8  | Reasoning: Perfect match: explicit bird collision. Tie detected among 5 Base=8 candidates; Risk Bonus=0 (no risk factors)
ID: 3043866918   | Score: 4  | Reasoning: Partial match: 'contact with something on the road' is ambiguous, could be animal but not explicitly stated
---------------------

LLM a

Processing Queries:  70%|███████   | 7/10 [01:27<00:37, 12.41s/it]


--- LLM Reasoning ---
ID: 3083061071   | Score: 10 | Reasoning: Tie detected among 5 Base=8 candidates; Risk Bonus=+2 (Nighttime + Heavy Traffic). Evidence: urban/highway area, traffic signal present, explicitly notes 'heavy traffic'
ID: 3083080697   | Score: 9  | Reasoning: Tie detected among 5 Base=8 candidates; Risk Bonus=+1 (Heavy Traffic). Evidence: urban area, traffic signals, explicitly notes 'heavy traffic'
ID: 3077300360   | Score: 9  | Reasoning: Tie detected among 5 Base=8 candidates; Risk Bonus=+1 (Heavy Traffic). Evidence: city/intersection area, traffic signals present, explicitly notes 'heavy traffic'
ID: 3077460842   | Score: 9  | Reasoning: Tie detected among 5 Base=8 candidates; Risk Bonus=+1 (Heavy Traffic). Evidence: city/intersection area, traffic signal present, explicitly notes 'heavy traffic'
ID: 3077399502   | Score: 9  | Reasoning: Tie detected among 5 Base=8 candidates; Risk Bonus=+1 (Heavy Traffic). Evidence: urban/commercial area, traffic signal present, e

Processing Queries:  80%|████████  | 8/10 [01:42<00:26, 13.11s/it]


--- LLM Reasoning ---
ID: 3069115953   | Score: 10 | Reasoning: Tie detected among 6 Base=8 candidates; Risk Bonus=3 (dark visibility + night driving + debris)
ID: 3069044285   | Score: 10 | Reasoning: Tie detected among 6 Base=8 candidates; Risk Bonus=2 (dark visibility + night driving)
ID: 3056938019   | Score: 9  | Reasoning: Tie detected among 6 Base=8 candidates; Risk Bonus=1 (sun glare)
ID: 3058473909   | Score: 9  | Reasoning: Tie detected among 6 Base=8 candidates; Risk Bonus=1 (night driving)
ID: 3065432030   | Score: 9  | Reasoning: Tie detected among 6 Base=8 candidates; Risk Bonus=1 (debris)
ID: 3054697434   | Score: 9  | Reasoning: Tie detected among 6 Base=8 candidates; Risk Bonus=1 (snow/ice)
---------------------

LLM analyzed 6 results. Found 6 videos meeting the threshold.

## Performing Hybrid Search for: 'Find collisions that did not hit a guardrail.' with score_threshold > 6
Found 76 unique high-quality candidates from hybrid search.


Processing Queries:  90%|█████████ | 9/10 [01:56<00:13, 13.53s/it]


--- LLM Reasoning ---
ID: 3069044285   | Score: 8  | Reasoning: Perfect match: Collision confirmed, contacted deer only (no guardrail). No tie (N8=1); no risk bonus applied.
ID: 3070300591   | Score: 8  | Reasoning: Perfect match: Collision confirmed, contacted 'rocky embankment, trees, and orange traffic cones' (no guardrail). No tie (N8=1); no risk bonus applied.
ID: 3083727227   | Score: 8  | Reasoning: Perfect match: Collision confirmed, contacted 'snowbank' only (no guardrail). No tie (N8=1); no risk bonus applied.
ID: 3062176673   | Score: 8  | Reasoning: Perfect match: Collision confirmed, contacted 'embankment' only (no guardrail). No tie (N8=1); no risk bonus applied.
ID: 3045068900   | Score: 8  | Reasoning: Perfect match: Collision confirmed, contacted 'road surface and desert terrain' only (no guardrail). No tie (N8=1); no risk bonus applied.
ID: 3064006492   | Score: 1  | Reasoning: Collision but explicitly hit guardrail (violates negative constraint)
ID: 3076181611   | S

Processing Queries: 100%|██████████| 10/10 [02:07<00:00, 12.71s/it]


--- LLM Reasoning ---
ID: 3077483202   | Score: 10 | Reasoning: Perfect match: hard brake + speed limit sign + raindrops on windshield. Tie detected among 2 Base=8 candidates; Risk Bonus=2 (moderate traffic + hilly terrain)
ID: 3077324436   | Score: 9  | Reasoning: Perfect match: hard brake + speed limit sign + raindrops on windshield. Tie detected among 2 Base=8 candidates; Risk Bonus=1 (wet surface)
ID: 3077260073   | Score: 7  | Reasoning: Partial match: hard brake + raindrops on windshield, but speed limit sign not mentioned
ID: 3056625676   | Score: 4  | Reasoning: Weak match: has rain on windshield but different event type (red light violation)
---------------------

LLM analyzed 4 results. Found 3 videos meeting the threshold.


All queries processed. Preparing to save the final file to Google Drive...





✅ Success! Your submission file has been saved to Google Drive.
File Name: submission.csv
You can view it here: https://drive.google.com/file/d/1j1jeutu3AZcEK69H2pSRw9FrQ1xusjiL/view?usp=drivesdk

Preview of the first 5 predictions:


Unnamed: 0,query_id,query,relevant_video_ids
0,1,Show me red light violations under heavy traffic.,"3077470742, 3077367902, 3077356819, 3077399502..."
1,2,Rollover occuring on a dirt road in a snowy mo...,"3079637179, 3083727227"
2,3,Show me the tailgating events on a Kenworth T6...,3083085362
3,4,Find red light violations in rainy weather.,"3089359636, 3056625676, 3062827855"
4,5,Find videos where a vehicle loses traction und...,3045068900
5,6,Find collision videos involving an animal hit.,"3047065686, 3078930446, 3078122527, 3060540928..."
6,7,Find heavy traffic congestion in an urban area...,"3083061071, 3083080697, 3077300360, 3077460842..."
7,8,Find t-bone collision at an intersection,"3069115953, 3069044285, 3056938019, 3058473909..."
8,9,Find collisions that did not hit a guardrail.,"3069044285, 3070300591, 3083727227, 3062176673..."
9,10,ٖFind hard brake events near a speed limit sig...,"3077483202, 3077324436, 3077260073"
