In [None]:
import pandas as pd
import json
from dotenv import load_dotenv
import os
import re
    
load_dotenv()

In [14]:
df = pd.read_csv("combined_canada_rag_cleaned.csv")

In [38]:
df.head()

Unnamed: 0,id,url,title,description,content,province,timestamp,content_length,language,source,document_type
0,Alberta_1_1757152647,https://www.alberta.ca/careers-fisheries-manag...,Careers – Fisheries management | Alberta.ca,A career in fisheries management offers many o...,A career in fisheries management offers many o...,Alberta,1757153000.0,4794,en,alberta_gov,html
1,Alberta_2_1757152647,https://www.alberta.ca/carrier-profiles-and-mo...,Carrier profiles and monitoring | Alberta.ca,Supports carriers' internal monitoring of on-r...,"\n\n\n\n * Effective December 2020, the Roads...",Alberta,1757153000.0,5891,en,alberta_gov,html
2,Alberta_3_1757152647,https://www.alberta.ca/careers-land-management,Careers – Land management | Alberta.ca,There are many rewarding career paths for land...,The Lands Division is responsible for providin...,Alberta,1757153000.0,6423,en,alberta_gov,html
3,Alberta_4_1757152648,https://www.alberta.ca/careers-wildlife-manage...,Careers – Wildlife management | Alberta.ca,A career in wildlife management offers opportu...,A career in Wildlife Management offers a contr...,Alberta,1757153000.0,3919,en,alberta_gov,html
4,Alberta_5_1757152648,https://www.alberta.ca/careers-agrology,Careers – Agrology | Alberta.ca,Agrologists can find meaningful work managing ...,Consider a career that enables you to make a d...,Alberta,1757153000.0,4653,en,alberta_gov,html


In [39]:
content_list = df[df["content_length"] < 10000]["content"].sample(20000).to_list()

In [81]:
import base64
import os
import json
import re
from google import genai
from google.genai import types
import random
from datetime import datetime
import time

# Rate limiting configuration
RATE_LIMITS = {
    "requests_per_minute": 30,
    "tokens_per_minute": 15000,
    "requests_per_day": 14400
}

class RateLimiter:
    def __init__(self):
      self.request_times = []
      self.token_counts = []
      self.daily_request = 0
      self.daily_reset = datetime.now().date()

    def estimate_tokens(self,text):
      """Rough token estimation (1 token ≈ 4 characters for English)"""
      return max(1, len(text) // 4)
    
    def wait_if_needed(self,prompt_text=""):

      # Reset daily counter if it's a new day
      current_date = datetime.now().date()
      if current_date != self.daily_reset:
        self.daily_request = 0
        self.daily_reset = current_date

      if self.daily_request >= RATE_LIMITS["requests_per_day"]:
        print("Daily limit reached. Waiting until tomorrow...")
        time.sleep(86400)
        self.daily_request = 0
        self.daily_reset = datetime.now().date()

      # check minute limit
      current_time = time.time()
      one_minute_ago = current_time - 60

      # remove request older than 1 minutes
      valid_indices = [i for i, t in enumerate(self.request_times) if t > one_minute_ago]
      self.request_times = [self.request_times[i] for i in valid_indices]
      self.token_counts = [self.token_counts[i] for i in valid_indices]

      current_tokens = self.estimate_tokens(prompt_text)
      total_tokens_last_minute = sum(self.token_counts) + current_tokens

      if total_tokens_last_minute >= RATE_LIMITS["tokens_per_minute"]:
        # Find when we can make the next request without exceeding token limit
          if self.request_times:
            oldest_request = min(self.request_times)
            wait_time = 60 - (current_time - oldest_request) + 1
            print(f"Rate limit reached. Waiting {wait_time:.1f} seconds....")
            time.sleep(wait_time)
            #reseting after waiting
            self.request_times = [time.time()]
            self.token_counts = [current_tokens]

          else:
            time.sleep(60)

      if len(self.request_times) >= RATE_LIMITS["requests_per_minute"]:
        oldest_request = min(self.request_times)
        wait_time = 60 - (current_time - oldest_request) + 1
        print(f"Rate limit reached. Waiting {wait_time:.1f} seconds....")
        time.sleep(wait_time)
        # update times after the waiting
        self.request_times = [time.time()]
        self.token_counts = [current_tokens]

      else:
        # add small random delay to avoid bursting
        time.sleep(random.uniform(0.1,0.5))

      # Recording the request
      self.request_times.append(time.time())
      self.token_counts.append(current_tokens)
      self.daily_request += 1

rate_limiter = RateLimiter()


def extract_json_from_data(result):
    """Extract JSON from the response text"""

    if not result or result.strip() == "":
       print("Empty response receive")
       return None, None
    
    try:
        json_match = re.findall(r"\{[\s\S]*\}", result)
        
        if not json_match:
            print("No JSON found in response")
            print(f"Response was: {result[:500]}...")  
            return None, None

        cleaned_text = json_match[0]
        json_data = json.loads(cleaned_text)
        triplet_list = [json_data["anchor"], json_data["positive"], json_data["negative"]]
        return json_data, triplet_list
    
    except (json.JSONDecodeError, KeyError, IndexError) as e:
        print(f"Error parsing JSON: {e}")
        print(f"Response text: {result[:500]}...") 
        return None, None

def generate(content):

  prompt_template = f"""


You are an expert in creating training data for embedding models. Your task is to generate **one** high-quality triplet in JSON format for semantic similarity fine-tuning.

## STRICT REQUIREMENTS:

**1. LANGUAGE DETECTION & CONSISTENCY:**
   - FIRST, detect the primary language of the document content
   - Use that SAME language for ALL THREE elements (anchor, positive, negative)
   - If document contains multiple languages, choose the dominant one consistently
   - Output language must match document's primary language

**2. POSITIVE QUALITY:**
   - Must be a COMPLETE, grammatically correct sentence from the document
   - Should directly answer the anchor question
   - Avoid incomplete sentences, fragments, or lists
   - Paraphrase if necessary but stay faithful to original meaning

**3. NEGATIVE DISTINCTION:**
   - Must be from a COMPLETELY DIFFERENT domain/service/topic
   - Should not share any keywords or concepts with the anchor
   - Different beneficiary groups, different government services, different intents

**4. CONTENT VALIDATION:**
   - Ensure positive is actually present in or directly inferrable from the document
   - Avoid technical gibberish or broken sentences
   - Use natural, conversational language

## LANGUAGE DETECTION GUIDE:
- If document contains French terms like "Québec", "procédures", "admissible" → Use FRENCH
- If document contains English terms only → Use ENGLISH  
- Be consistent throughout the triplet

## OUTPUT FORMAT (JSON ONLY):
{{
    "anchor": "question in detected language",
    "positive": "complete sentence answering question", 
    "negative": "different domain topic in same language"
}}

## BAD → GOOD EXAMPLES:

❌ BAD: Mixed languages, incomplete positives
✅ GOOD: 
{{
    "anchor": "Quelles mesures les propriétaires peuvent-ils prendre contre les punaises de lit?",
    "positive": "Les propriétaires peuvent instaurer différentes mesures pour prévenir et éliminer les punaises des lits.",
    "negative": "Quelles sont les exigences pour obtenir un permis de construction?"
}}

❌ BAD: Broken/grammatically incorrect positives  
✅ GOOD:
{{
    "anchor": "How long do plant breeders' rights last in Canada?",
    "positive": "Plant breeders receive legal protection for up to 25 years for trees and vines, and 20 years for other plant varieties.",
    "negative": "What are the requirements for importing agricultural products?"
}}

Document content:
{str(content)}


"""
  try:
    
    # Apply rate limiting before making the request

    rate_limiter.wait_if_needed(prompt_template)


    client = genai.Client(
          api_key = os.environ.get("GOOGLE_API_KEY"),
    )
        


    model = "gemma-3-27b-it"
    contents = [
          types.Content(
              role="user",
              parts=[
                  types.Part.from_text(text=prompt_template),
              ],
          ),
      ]
    generate_content_config = types.GenerateContentConfig(
      max_output_tokens=500
    )

    # Collect all chunks
    full_response = ""

    for chunk in client.models.generate_content_stream(
      model=model,
      contents=contents,
      config=generate_content_config,
      ):
      if chunk.text:
          full_response += chunk.text

    if not full_response.strip():
      print("Empty repsonse from API")
      return None, None
    
    json_data, triplet = extract_json_from_data(full_response)
    return json_data, triplet

  except Exception as e:

    if "429" in str(e) or "quota" in str(e).lower():
      # Specific handling for quota errors
      retry_seconds = 60  # Wait a full minute for token reset
      print(f"Quota exceeded. Waiting {retry_seconds} seconds...")
      time.sleep(retry_seconds)
      # Reset rate limiter after quota error
      rate_limiter.request_times = []
      rate_limiter.token_counts = []

    else:
      print(f"Error in generate function: {e}")

    return None, None
  


def main(content_list):
  json_data_list = []
  triplet_list = []
  counter = 0
  success_count = 0
  error_count = 0

  # Need to clean the file before generating
  open("triplet.json", "w").close()
  open("triplet_list.json", "w").close()

  for content in content_list:
    counter += 1

    print(f"\n--- Processing content {counter}/{len(content_list)} ---")
    
    try:
       
      json_data, triplet = generate(content)

      if json_data and triplet:
        
        json_data_list.append(json_data)
        triplet_list.append(triplet)
        success_count += 1
        print(f"✅ Successfully generated triplet {success_count}")

        if success_count %10 == 0:
           
          with open("triplet.json", "w") as f:
            json.dump(json_data_list, f, indent=2)

          with open("triplet_list.json", "w") as f:
            json.dump(triplet_list,f,indent=2)
          print(f"💾 Progress saved at {success_count} triplets")

      else:
        error_count += 1
        print(f"❌ Failed to generate triplet (Error #{error_count})")

    except Exception as e:
      error_count += 1
      print(f"🚨 Unexpected error in main loop: {e}")

    success_rate = (success_count/counter) * 100
    print(f"📊 Progress: {counter}/{len(content_list)} | Success : {success_count} | Errors : {error_count} | Rate : {success_rate:.1f}%")

    # Save final results

    if json_data_list:
      with open("triplet.json", "w") as f:
        json.dump(json_data_list, f, indent=2)
      with open("triplet_list.json", "w") as f:
        json.dump(triplet_list, f, indent=2)

    print(f"\nFinal results : {success_count} successfully triplets out of {counter} attempts")
    print(f"Total errors : {error_count}")
    print(f"Success rate: {(success_count/counter)*100:.1f}%")

  return json_data_list, triplet_list


In [82]:
def is_high_quality(content):
    # Skip if content is too short
    if len(content.strip()) < 50:
        return False
    
    # Skip if contains low-quality indicators
    low_quality_patterns = [
        r"error", r"captcha", r"server", r"bot", r"temporarily limited",
        r"page when trying to access", r"your browser", r"request.*understand",
        r"higher volume.*requests", r"network.*normal"
    ]
    
    content_lower = content.lower()
    if any(re.search(pattern, content_lower) for pattern in low_quality_patterns):
        return False
    
    # Additional quality checks
    if content_lower.count(".") < 2:  # Too few sentences
        return False
        
    return True

filtered_content_list = [content for content in content_list if is_high_quality(content)]

In [None]:
filtered_content_list

if __name__ == "__main__":
    if not filtered_content_list:
        print("Please define content_list with your documents")

    else:
        
        json_data_list, triplet_list = main(filtered_content_list[:10000])


--- Processing content 1/25 ---
✅ Successfully generated triplet 1
📊 Progress: 1/25 | Success : 1 | Errors : 0 | Rate : 100.0%

Final results : 1 successfully triplets out of 1 attempts
Total errors : 0
Success rate: 100.0%

--- Processing content 2/25 ---
✅ Successfully generated triplet 2
📊 Progress: 2/25 | Success : 2 | Errors : 0 | Rate : 100.0%

Final results : 2 successfully triplets out of 2 attempts
Total errors : 0
Success rate: 100.0%

--- Processing content 3/25 ---
✅ Successfully generated triplet 3
📊 Progress: 3/25 | Success : 3 | Errors : 0 | Rate : 100.0%

Final results : 3 successfully triplets out of 3 attempts
Total errors : 0
Success rate: 100.0%

--- Processing content 4/25 ---
✅ Successfully generated triplet 4
📊 Progress: 4/25 | Success : 4 | Errors : 0 | Rate : 100.0%

Final results : 4 successfully triplets out of 4 attempts
Total errors : 0
Success rate: 100.0%

--- Processing content 5/25 ---
✅ Successfully generated triplet 5
📊 Progress: 5/25 | Success : 5 |