In [None]:
import pandas as pd
import json
from dotenv import load_dotenv
import os
import re
    
load_dotenv()

In [14]:
df = pd.read_csv("combined_canada_rag_cleaned.csv")

In [15]:
df.head()

Unnamed: 0,id,url,title,description,content,province,timestamp,content_length,language,source,document_type
0,Alberta_1_1757152647,https://www.alberta.ca/careers-fisheries-manag...,Careers – Fisheries management | Alberta.ca,A career in fisheries management offers many o...,A career in fisheries management offers many o...,Alberta,1757153000.0,4794,en,alberta_gov,html
1,Alberta_2_1757152647,https://www.alberta.ca/carrier-profiles-and-mo...,Carrier profiles and monitoring | Alberta.ca,Supports carriers' internal monitoring of on-r...,"\n\n\n\n * Effective December 2020, the Roads...",Alberta,1757153000.0,5891,en,alberta_gov,html
2,Alberta_3_1757152647,https://www.alberta.ca/careers-land-management,Careers – Land management | Alberta.ca,There are many rewarding career paths for land...,The Lands Division is responsible for providin...,Alberta,1757153000.0,6423,en,alberta_gov,html
3,Alberta_4_1757152648,https://www.alberta.ca/careers-wildlife-manage...,Careers – Wildlife management | Alberta.ca,A career in wildlife management offers opportu...,A career in Wildlife Management offers a contr...,Alberta,1757153000.0,3919,en,alberta_gov,html
4,Alberta_5_1757152648,https://www.alberta.ca/careers-agrology,Careers – Agrology | Alberta.ca,Agrologists can find meaningful work managing ...,Consider a career that enables you to make a d...,Alberta,1757153000.0,4653,en,alberta_gov,html


In [None]:
content_list = df[df["content_length"] < 10000]["content"].sample(20000).to_list()

In [None]:
import base64
import os
import json
import re
from google import genai
from google.genai import types


def extract_json_from_data(result):
  """Extract JSON from the response text"""

  try:
    json_match  = re.findall(r"\{[\s\S]*\}", result)[0]
    
    if not json_match:
      print("No JSON found in response")
      return None, None

    json_data = json.loads(json_match)
    triplet_list = [json_data["anchor"], json_data["positive"], json_data["negative"]]
    return json_data , triplet_list
  
  except json.JSONDecodeError as e:
    print(f"Error parsing JSON as : {e}")
    print(f"Response text : {result}")
    return None


def generate(content):

  prompt_template = f"""

You are an expert in creating training data for embedding models. Your task is to generate **one** high-quality triplet in JSON format for semantic similarity fine-tuning.

A triplet consists of:
- "anchor": a realistic user query or short question related to the document.
- "positive": a sentence or short paragraph **from the document** that directly answers or matches the anchor.
- "negative": a sentence or topic that is **plausibly related but semantically distinct** (e.g., different service, policy, or domain).

Rules:
1. Use only the provided document content.
2. The "positive" must be a direct excerpt or close paraphrase from the document.
3. The "negative" must be realistic but clearly unrelated in intent.
4. Output **only valid JSON** — no extra text, no markdown, no explanation.

Example :

Imagine your company, "Shibuya Financial" offers various complex financial products like investment trusts, NISA accounts (a tax-advantaged savings account), and home loans. Your customer support team uses an internal knowledge base to quickly find answers to customer questions.


 {{
    "anchor": "How do I open a NISA account?",
    "positive": "What is the procedure for starting a new tax-free investment account?",
    "negative": "I want to check the balance of my regular savings account."
  }},
  {{
    "anchor": "Are there fees for making an early repayment on a home loan?",
    "positive": "If I pay back my house loan early, will there be any costs?",
    "negative": "What is the management fee for this investment trust?"
  }},
  {{
    "anchor": "What is the coverage for medical insurance?",
    "positive": "Tell me about the benefits of the health insurance plan.",
    "negative": "What is the cancellation policy for my life insurance?"
  }}


Document content:

{str(content)}

"""


  client = genai.Client(
      api_key = os.environ.get("GOOGLE_API_KEY"),
    )
    


  model = "gemma-3-27b-it"
  contents = [
        types.Content(
            role="user",
            parts=[
                types.Part.from_text(text=prompt_template),
            ],
        ),
    ]
  generate_content_config = types.GenerateContentConfig(
    )

    # Collect all chunks
  full_response = ""

  try:
    for chunk in client.models.generate_content_stream(
      model=model,
      contents=contents,
      config=generate_content_config,
      ):
      if chunk.text:
          full_response += chunk.text
  except Exception as e:
    print(f"Error generating content: {e}")
    return None, None
  
  json_data, triplet = extract_json_from_data(full_response)

  return json_data, triplet


def main(content_list):
  json_data_list = []
  triplet_list = []
  counter = 0
  success_count = 0

  for content in content_list[:5]:

    json_data, triplet = generate(content)

    if json_data and triplet:
      
      json_data_list.append(json_data)
      triplet_list.append(triplet)
      success_count += 1
      print(f"✅ Successfully generated triplet {success_count}")

      with open("triplet.json", "w") as f:
        json.dump(json_data_list, f, indent=2)
      print("Saved triplets to triplets.json")

      with open("triplet_list.json", "w") as f:
        json.dump(triplet_list,f,indent=2)
      print("Saved triplet lists to triplets_list.json")

    else:
      print(f"❌ Failed to generate triplet for content {counter + 1}")

    counter += 1
    
    print(f"Processed: {counter}/{len(content_list)} - Success rate: {success_count}/{counter}")

    import time
    time.sleep(1)

    print(f"\nFinal results: {success_count} successful triplets out of {counter} attempts")
 

  return json_data_list, triplet_list


In [35]:
content_list

if __name__ == "__main__":
    if not content_list:
        print("Please define content_list with your documents")

    else:
        
        json_data_list, triplet_list = main(content_list)

✅ Successfully generated triplet 1
Processed: 1/10000 - Success rate: 1/1

Final results: 1 successful triplets out of 1 attempts
✅ Successfully generated triplet 2
Processed: 2/10000 - Success rate: 2/2

Final results: 2 successful triplets out of 2 attempts
✅ Successfully generated triplet 3
Processed: 3/10000 - Success rate: 3/3

Final results: 3 successful triplets out of 3 attempts
✅ Successfully generated triplet 4
Processed: 4/10000 - Success rate: 4/4

Final results: 4 successful triplets out of 4 attempts
✅ Successfully generated triplet 5
Processed: 5/10000 - Success rate: 5/5

Final results: 5 successful triplets out of 5 attempts
Saved triplets to triplets.json
Saved triplet lists to triplets_list.json
