In [34]:
# ==============================================================================
# STEP 1: SETUP AND INITIALIZATION
# ==============================================================================
import vertexai
from vertexai.language_models import TextEmbeddingModel
import pandas as pd
import numpy as np
import os
from tqdm.notebook import tqdm
from sklearn.metrics.pairwise import cosine_similarity
import warnings
import time

# Suppress a known, harmless warning from the embedding model's client
warnings.filterwarnings("ignore", category=UserWarning, module='google.cloud.aiplatform.compat.services.prediction_service_client')

# --- Configuration for Vertex AI Workbench ---
PROJECT_ID = "hack-thelaw25cam-586"
LOCATION = "us-central1"
vertexai.init(project=PROJECT_ID, location=LOCATION)
print(f"Vertex AI initialized for project: {PROJECT_ID}")

# --- Load the Embedding Model ---
embedding_model = TextEmbeddingModel.from_pretrained("gemini-embedding-001")
print("Text Embedding Model loaded successfully.")


# ==============================================================================
# STEP 2: LOAD AND PREPARE THE DATABASE
# ==============================================================================

DATABASE_FILE = 'legal_arguments_database_merged.csv'
if not os.path.exists(DATABASE_FILE):
    raise FileNotFoundError(f"Database file not found: {DATABASE_FILE}. Please run the previous script first.")

db_df = pd.read_csv(DATABASE_FILE)
print(f"Database loaded with {len(db_df)} arguments.")

# Prepare the text for embedding, combining fields for rich context.
db_df['embedding_text'] = (
    "Argument: " + db_df['argument_summary'].fillna('') +
    " | Legal Basis: " + db_df['legal_basis'].fillna('') +
    " | Keywords: " + db_df['key_keywords'].fillna('') +
    " | Tribunal Reasoning: " + db_df['tribunal_reasoning'].fillna('')
)


# ==============================================================================
# STEP 3: INDEXING - GENERATE OR LOAD EMBEDDINGS (WITH BATCHING FIX)
# ==============================================================================

EMBEDDINGS_FILE = 'arguments_embeddings.npy'

# To ensure compatibility, we'll always regenerate if the embeddings file exists.
if os.path.exists(EMBEDDINGS_FILE):
    print(f"\nFound existing embeddings file ('{EMBEDDINGS_FILE}'). Deleting it to ensure compatibility with the current model...")
    os.remove(EMBEDDINGS_FILE)
    print("Old embeddings file deleted.")

# --- UPDATED: Processing one by one instead of in batches ---
print(f"\nGenerating new embeddings for {len(db_df)} arguments using 'text-embedding-004'.")
print("This will be slower as it processes requests individually, but respects API limits.")

all_embeddings = []

# Loop through each row of the DataFrame
for index, row in tqdm(db_df.iterrows(), total=db_df.shape[0], desc="Embedding Texts"):
    try:
        text_to_embed = row['embedding_text']
        # The API call now sends a list containing just one item
        embeddings_response = embedding_model.get_embeddings([text_to_embed])
        # Add the embedding values to our list
        all_embeddings.append(embeddings_response[0].values)
        # Adding a very small delay to be a good citizen to the API
        time.sleep(0.1) 
    except Exception as e:
        print(f"\nError embedding text at index {index}: {e}")
        # Append a placeholder (e.g., a zero vector) or handle as needed
        # For simplicity, we'll skip this one, but in production you might add a zero vector
        # that matches the embedding dimension (768 for this model).
        # all_embeddings.append([0] * 768) 
        continue

corpus_embeddings = np.array(all_embeddings)

# Save the newly generated embeddings
np.save(EMBEDDINGS_FILE, corpus_embeddings)
print(f"New embeddings generated and saved to '{EMBEDDINGS_FILE}'")
print(f"Corpus embeddings shape: {corpus_embeddings.shape}") 


# ==============================================================================
# STEP 4: SEMANTIC SEARCH FUNCTION (No changes needed here)
# ==============================================================================

def find_similar_arguments(query_text: str, top_n: int = 5):
    if query_text is None or not query_text.strip():
        print("Query text cannot be empty.")
        return pd.DataFrame()

    print(f"\nSearching for arguments similar to: '{query_text}'")

    # 1. Embed the user's query
    query_embedding_obj = embedding_model.get_embeddings([query_text])[0]
    query_embedding = np.array(query_embedding_obj.values)
    
    # Check for empty corpus
    if corpus_embeddings.shape[0] == 0:
        print("Corpus embeddings are empty. Cannot perform search.")
        return pd.DataFrame()
        
    # 2. Calculate Cosine Similarity
    similarities = cosine_similarity(
        query_embedding.reshape(1, -1),
        corpus_embeddings
    )[0] 

    # 3. Find the top N most similar arguments
    top_n_indices = np.argsort(similarities)[::-1][:top_n]

    # 4. Retrieve the results
    results_df = db_df.iloc[top_n_indices].copy()
    results_df['similarity_score'] = similarities[top_n_indices]

    return results_df


# ==============================================================================
# STEP 5: EXAMPLE USAGE (No changes needed here)
# ==============================================================================

my_new_argument = "The respondent state argues that the tribunal has no jurisdiction because the claimant did not respect the three-year statute of limitations, as they knew about the damage to their investment more than three years before filing the claim."

# Run the search
similar_results = find_similar_arguments(my_new_argument, top_n=5)

# Display the results
if not similar_results.empty:
    print("\n--- Top Similar Arguments Found ---")
    
    display_cols = [
        'similarity_score',
        'court_followed',
        'argument_summary',
        'tribunal_reasoning',
        'case_title',
        'party',
        'legal_basis',
    ]
    # Ensure all display columns exist to prevent KeyErrors
    for col in display_cols:
        if col not in similar_results.columns:
            similar_results[col] = 'N/A'

    display(similar_results[display_cols])
else:
    print("No results found.")


Vertex AI initialized for project: hack-thelaw25cam-586
Text Embedding Model loaded successfully.
Database loaded with 7365 arguments.

Generating new embeddings for 7365 arguments using 'text-embedding-004'.
This will be slower as it processes requests individually, but respects API limits.


Embedding Texts:   0%|          | 0/7365 [00:00<?, ?it/s]

New embeddings generated and saved to 'arguments_embeddings.npy'
Corpus embeddings shape: (7365, 3072)

Searching for arguments similar to: 'The respondent state argues that the tribunal has no jurisdiction because the claimant did not respect the three-year statute of limitations, as they knew about the damage to their investment more than three years before filing the claim.'

--- Top Similar Arguments Found ---


Unnamed: 0,similarity_score,court_followed,argument_summary,tribunal_reasoning,case_title,party,legal_basis
579,0.841048,No,The Respondent argued that the Claimant's clai...,The Tribunal found that the claims were not ti...,Infinito Gold v. Costa Rica,Respondent,"BIT Art. XII(3)(c), *Spence, Corona, ST-AD, Eu..."
397,0.82882,Yes,The Respondent argued that the Claimant's clai...,The tribunal agreed that the claims were time-...,Corona Materials v. Dominican Republic,Respondent,"DR-CAFTA Art. 10.18.1, DR-CAFTA Art. 10.3, DR-..."
4434,0.826506,Yes,The Respondent argued that the Tribunal lacked...,The Tribunal found that the issue of jurisdict...,Allard v. Barbados,Respondent,BIT Article XIII(3)(d)
395,0.825483,Yes,The Respondent argued that the Tribunal lacked...,The tribunal agreed with the Respondent and fo...,Corona Materials v. Dominican Republic,Respondent,DR-CAFTA Art. 10.18.1
589,0.816992,Yes,The Respondent argued that the Tribunal lacks ...,The Tribunal agreed that the TCA Decision was ...,Infinito Gold v. Costa Rica,Respondent,BIT Article XII(3)(c)


In [25]:
# ==============================================================================
# STEP 1: SETUP AND LOAD LIBRARIES
# ==============================================================================
import vertexai
from vertexai.language_models import TextEmbeddingModel
import pandas as pd
import numpy as np
import os
from sklearn.metrics.pairwise import cosine_similarity
import textwrap # Used for formatting long text nicely

print("Libraries loaded successfully.")

# ==============================================================================
# STEP 2: INITIALIZE VERTEX AI AND LOAD THE MODEL
# ==============================================================================
# --- Configuration for Vertex AI Workbench ---
PROJECT_ID = "hack-thelaw25cam-586"
LOCATION = "us-central1"
vertexai.init(project=PROJECT_ID, location=LOCATION)

# --- Load the EXACT SAME Embedding Model used for indexing ---
embedding_model = TextEmbeddingModel.from_pretrained("gemini-embedding-001")
print("Vertex AI Initialized and Embedding Model is ready.")


# ==============================================================================
# STEP 3: LOAD THE PRE-BUILT DATABASE AND EMBEDDINGS (WITH FIX)
# ==============================================================================
DATABASE_FILE = 'legal_arguments_database_100.csv'
EMBEDDINGS_FILE = 'arguments_embeddings.npy'

# --- Check if the required files exist ---
if not os.path.exists(DATABASE_FILE) or not os.path.exists(EMBEDDINGS_FILE):
    error_message = (
        "Error: Database or embeddings file not found!\n"
        f"Please ensure '{DATABASE_FILE}' and '{EMBEDDINGS_FILE}' are in the same directory.\n"
        "You may need to run the previous indexing script first."
    )
    raise FileNotFoundError(error_message)

# --- Load the data ---
print(f"Loading database from '{DATABASE_FILE}'...")
db_df = pd.read_csv(DATABASE_FILE)

# --- FIX: ENSURE ALL TEXT COLUMNS ARE TREATED AS STRINGS ---
# This prevents pandas from interpreting empty cells as NaN (float), which causes the error.
text_columns = [
    'argument_summary',
    'legal_basis',
    'key_keywords',
    'tribunal_reasoning',
    'case_title',
    'party',
    'document_title',
    'document_type'
]
for col in text_columns:
    if col in db_df.columns:
        # .astype(str) handles all cases including NaN, converting them to the string "nan"
        # which is safe for our purposes. We could also use .fillna('') first.
        db_df[col] = db_df[col].astype(str)

print(f"Loading embeddings from '{EMBEDDINGS_FILE}'...")
corpus_embeddings = np.load(EMBEDDINGS_FILE)

print("Data loaded and text columns cleaned successfully.")
# --- Sanity Check ---
if len(db_df) != len(corpus_embeddings):
    print("Warning: The number of rows in the CSV does not match the number of embeddings. Results may be inconsistent.")


# ==============================================================================
# STEP 4: DEFINE THE SEARCH AND DISPLAY FUNCTIONS (No changes needed)
# ==============================================================================

def find_similar_arguments(query_text: str, top_n: int = 5):
    if not query_text or not query_text.strip():
        print("Query text cannot be empty.")
        return pd.DataFrame()

    query_embedding_obj = embedding_model.get_embeddings([query_text])[0]
    query_embedding = np.array(query_embedding_obj.values).reshape(1, -1)
    similarities = cosine_similarity(query_embedding, corpus_embeddings)[0]
    top_n_indices = np.argsort(similarities)[::-1][:top_n]
    results_df = db_df.iloc[top_n_indices].copy()
    results_df['similarity_score'] = similarities[top_n_indices]
    return results_df

def display_results(results_df: pd.DataFrame):
    if results_df.empty:
        print("No results to display.")
        return

    print("\n\n=======================================================")
    print("           TOP SIMILAR ARGUMENTS FOUND")
    print("=======================================================")

    for index, row in results_df.iterrows():
        # Using .get() provides a default and is safer
        similarity_score = row.get('similarity_score', 0.0)
        
        print(f"\n--- Result #{results_df.index.get_loc(index) + 1} / Similarity: {similarity_score:.2%} ---")
        
        print(f"CASE: {row.get('case_title', 'N/A')}")
        print(f"PARTY WHO ARGUED: {row.get('party', 'N/A')}")
        # The .get() provides a default if the column lookup fails
        print(f"COURT FOLLOWED?: {row.get('court_followed', 'N/A')}\n")

        print("ARGUMENT SUMMARY:")
        # The .get() call here is now safe because we've ensured the column is string type
        summary = row.get('argument_summary', 'N/A')
        print(textwrap.fill(summary, width=100, initial_indent="  ", subsequent_indent="  "))
        
        print("\nTRIBUNAL'S REASONING (The Judgment):")
        reasoning = row.get('tribunal_reasoning', 'N/A')
        print(textwrap.fill(reasoning, width=100, initial_indent="  ", subsequent_indent="  "))
        
        print("\n-------------------------------------------------------")


# ==============================================================================
# STEP 5: RUN YOUR SEARCH
# ==============================================================================

# --- Define your new argument query here ---
my_new_argument = "The case should be dismissed because the investment was not made in accordance with the host state's laws, specifically regarding registration requirements."

# --- Execute the search and display the results ---
similar_results = find_similar_arguments(my_new_argument, top_n=10)
display_results(similar_results)

Libraries loaded successfully.
Vertex AI Initialized and Embedding Model is ready.
Loading database from 'legal_arguments_database_100.csv'...
Loading embeddings from 'arguments_embeddings.npy'...
Data loaded and text columns cleaned successfully.


           TOP SIMILAR ARGUMENTS FOUND

--- Result #1 / Similarity: 74.35% ---
CASE: Rand Investments v. Serbia
PARTY WHO ARGUED: Respondent
COURT FOLLOWED?: No

ARGUMENT SUMMARY:
  The BIT does not protect investments made in disregard of legal requirements.

TRIBUNAL'S REASONING (The Judgment):
  nan

-------------------------------------------------------

--- Result #2 / Similarity: 73.84% ---
CASE: Kaloti Metals v. Peru
PARTY WHO ARGUED: Respondent
COURT FOLLOWED?: nan

ARGUMENT SUMMARY:
  Investments made in violation of Peruvian law are not protected by investment treaties or the
  ICSID Convention.

TRIBUNAL'S REASONING (The Judgment):
  The Tribunal did not explicitly address this argument in its reasoning, as it found no investmen

In [37]:
# ==============================================================================
# FINAL, PRODUCTION-READY SEARCH SCRIPT
# ==============================================================================
import vertexai
from vertexai.language_models import TextEmbeddingModel
import pandas as pd
import numpy as np
import os
import json
from sklearn.metrics.pairwise import cosine_similarity
import warnings

# Suppress a known, harmless warning from the embedding model's client
warnings.filterwarnings("ignore", category=UserWarning, module='google.cloud.aiplatform.compat.services.prediction_service_client')

# --- Configuration ---
PROJECT_ID = "hack-thelaw25cam-586"
LOCATION = "us-central1"
DATABASE_FILE = 'legal_arguments_database_merged.csv'
EMBEDDINGS_FILE = 'arguments_embeddings.npy'
MODEL_NAME = "gemini-embedding-001"

# --- Initialize Vertex AI ---
vertexai.init(project=PROJECT_ID, location=LOCATION)
embedding_model = TextEmbeddingModel.from_pretrained(MODEL_NAME)
print(f"Vertex AI Initialized. Using embedding model: '{MODEL_NAME}'.")


# --- Load Data and Embeddings (assuming they are already correct) ---
if not os.path.exists(DATABASE_FILE) or not os.path.exists(EMBEDDINGS_FILE):
    raise FileNotFoundError(f"Database ('{DATABASE_FILE}') or embeddings ('{EMBEDDINGS_FILE}') not found. Please run the indexing script.")

db_df = pd.read_csv(DATABASE_FILE)
corpus_embeddings = np.load(EMBEDDINGS_FILE)
print(f"Loaded {len(db_df)} arguments and {len(corpus_embeddings)} embeddings successfully.")


# ==============================================================================
# SELF-CONTAINED SEARCH FUNCTION WITH JSON FIX
# ==============================================================================

def get_similar_arguments_as_json(query_text: str, top_n: int = 5) -> str:
    """
    Performs a semantic search and returns the top N results as a valid JSON string.
    This function handles missing data to ensure JSON compatibility.

    Args:
        query_text: The new legal argument to find matches for.
        top_n: The number of similar arguments to return.

    Returns:
        A JSON string representing a list of the top N similar arguments.
    """
    if not query_text or not query_text.strip():
        return json.dumps({"error": "Query text cannot be empty"}, indent=4)

    try:
        # 1. Embed the user's query
        query_embedding_obj = embedding_model.get_embeddings([query_text])[0]
        query_embedding = np.array(query_embedding_obj.values).reshape(1, -1)

        # 2. Calculate Cosine Similarity
        similarities = cosine_similarity(query_embedding, corpus_embeddings)[0]

        # 3. Find and retrieve the top N results
        top_n_indices = np.argsort(similarities)[::-1][:top_n]
        results_df = db_df.iloc[top_n_indices].copy()
        results_df['similarity_score'] = similarities[top_n_indices]

        # --- FIX: Replace all NaN values with a JSON-safe string 'N/A' ---
        # This is the crucial step to prevent the "NaN is not valid JSON" error.
        results_df = results_df.fillna("N/A")

        # 5. Format the cleaned results into a list of dictionaries
        output_list = []
        for _, row in results_df.iterrows():
            result_item = {
                "similarity_score": row['similarity_score'], # No .get() needed now
                "case_identifier": row['case_identifier'],
                "case_title": row['case_title'],
                "argument_summary": row['argument_summary'],
                "judgment": row['court_followed'],
                "judgment_summary": row['tribunal_reasoning']
            }
            output_list.append(result_item)

        # 6. Convert the list to a nicely formatted JSON string
        return json.dumps(output_list, indent=4)

    except Exception as e:
        print(f"An error occurred during search: {e}")
        return json.dumps({"error": str(e)}, indent=4)


# ==============================================================================
# EXAMPLE USAGE AND SAVING TO DISK
# ==============================================================================

# --- Define your argument query ---
my_new_argument = "The case should be dismissed because the investment was not made in accordance with the host state's laws, specifically regarding registration requirements."

# --- Execute the search to get the valid JSON output ---
json_results = get_similar_arguments_as_json(my_new_argument, top_n=10)

# --- Save the JSON string to a file ---
output_filename = "search_results.json"
with open(output_filename, 'w', encoding='utf-8') as f:
    f.write(json_results)

print(f"Search complete. Results saved to '{output_filename}'")

# --- Print the JSON to the console for immediate viewing ---
print("\n--- Valid JSON Output ---")
print(json_results)

Vertex AI Initialized. Using embedding model: 'gemini-embedding-001'.
Loaded 7365 arguments and 7365 embeddings successfully.
Search complete. Results saved to 'search_results.json'

--- Valid JSON Output ---
[
    {
        "similarity_score": 0.7700352386904998,
        "case_identifier": "IDS-524",
        "case_title": "South American Silver v. Bolivia",
        "argument_summary": "The Respondent argued that the claims were inadmissible because the investment was not made in accordance with the law.",
        "judgment": "No",
        "judgment_summary": "The Tribunal found that the Respondent had not shown that the alleged illegalities had the effect of making the investment unlawful."
    },
    {
        "similarity_score": 0.7652611457363432,
        "case_identifier": "IDS-576",
        "case_title": "Krederi v. Ukraine",
        "argument_summary": "Respondent argued that Claimant\u2019s investment was made in violation of Ukrainian law.",
        "judgment": "No",
        "