In [None]:
import os
import re
import google.generativeai as genai
from dotenv import load_dotenv

# Load variables from a .env file (if one is found) into the process
# environment; this must happen before the API key is read below.
load_dotenv()
# Configure Gemini API
# The key comes from the GOOGLE_CLOUD_API environment variable. Indexing
# os.environ raises KeyError when the variable is not set, so a missing key
# fails here rather than on the first request.
GOOGLE_API_KEY = os.environ['GOOGLE_CLOUD_API']
genai.configure(api_key=GOOGLE_API_KEY)
# Module-level model handle; categorize_qa() sends every prompt through it.
model = genai.GenerativeModel('gemini-2.0-flash')

# Known categories, mapping each category name to a one-line description of
# the questions that belong to it. The whole mapping is interpolated into the
# prompt built by categorize_qa(), and its keys are the only names accepted as
# a valid classification.
CATEGORIES = {
    "academic": "Questions about courses, programs, grades, exams, and academic policies",
    "student_services": "Questions about library, health services, and other campus facilities",
    "financial": "Questions about tuition, scholarships, financial aid, and fees",
    "campus_life": "Questions about housing, clubs, activities, and student life",
    "careers": "Questions about internships, jobs, career services, and employment",
    "about": "Questions about FPT history, organization, and general information",
}

def categorize_qa(question, answer):
    """Use Gemini to categorize a Q&A pair.

    Builds a prompt containing the ``CATEGORIES`` mapping plus the question
    and answer, sends it through the module-level ``model`` and maps the
    reply onto one of the known category names.

    Args:
        question: Question text to classify.
        answer: Answer text that accompanies the question.

    Returns:
        A key of ``CATEGORIES``, or ``'other'`` when the reply cannot be
        mapped to a known category or contains no text at all.
    """
    prompt = f"""
    Categorize the following question and answer into one of these categories:
    {CATEGORIES}
    
    Question: {question}
    Answer: {answer}
    
    Respond with only the category name (academic, student_services, financial, campus_life, careers, or about).
    """

    response = model.generate_content(prompt)
    try:
        raw_reply = response.text
    except ValueError:
        # The ``text`` accessor raises ValueError when the response carries no
        # text part (for example a blocked or empty candidate). Treat that as
        # "could not classify" so one bad item does not abort a whole batch.
        return 'other'

    category = raw_reply.strip().lower()
    if category in CATEGORIES:
        return category

    # The model does not always return the bare name: tolerate surrounding
    # quotes, backticks, markdown emphasis and trailing punctuation, and
    # accept "student services" / "campus-life" spellings of the keys.
    cleaned = category.strip(' \t\r\n`\'"*.:')
    cleaned = cleaned.replace(' ', '_').replace('-', '_')
    if cleaned in CATEGORIES:
        return cleaned
    return 'other'

def extract_qa_pairs(content):
    """Pull every question/answer pair out of markdown text.

    A pair starts at ``## Q: `` and the question is everything up to the
    first following line that begins with ``**A:** ``. The answer then runs
    (possibly across several lines) until the next ``## Q:`` heading or the
    end of the text.

    Args:
        content: Markdown text to scan.

    Returns:
        A list of ``(question, answer)`` tuples in document order, each
        string stripped of surrounding whitespace. Empty when nothing matches.
    """
    # DOTALL lets both lazy groups span newlines; the lookahead stops the
    # answer right before the next question heading without consuming it.
    pair_regex = re.compile(
        r'## Q: (.*?)\n\*\*A:\*\* (.*?)(?=\n## Q:|$)',
        flags=re.DOTALL,
    )

    pairs = []
    for raw_question, raw_answer in pair_regex.findall(content):
        pairs.append((raw_question.strip(), raw_answer.strip()))
    return pairs

def save_to_category_file(category, qa_pair, output_dir='../data/md/'):
    """Append a Q&A pair to the markdown file of its category.

    The target is ``<output_dir>/<category>.md``. When that file does not
    exist yet it is created and starts with a ``# <Category> Q&A`` heading;
    otherwise the pair is appended after the existing content.

    Args:
        category: Category name; used as the file stem and, title-cased, in
            the heading of a newly created file.
        qa_pair: Sequence whose first item is the question and whose second
            item is the answer.
        output_dir: Directory that holds the category files. Created when it
            does not exist. Defaults to ``'../data/md/'``.
    """
    # A plain string supports neither the ``/`` operator nor ``.exists()``,
    # so the path is built and probed with os.path instead.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    output_file = os.path.join(output_dir, f'{category}.md')

    # Decide before opening: append mode creates the file, after which the
    # existence check could no longer tell a new file from an old one.
    is_new_file = not os.path.exists(output_file)
    with open(output_file, 'a', encoding='utf-8') as f:
        if is_new_file:
            f.write(f'# {category.title()} Q&A\n\n')
        f.write(f'## Q: {qa_pair[0]}\n**A:** {qa_pair[1]}\n\n')

def main():
    """Categorize every Q&A pair of the input file and file it by category.

    Reads ``../data/md/qna_rag_testing.md``, extracts its Q&A pairs with
    ``extract_qa_pairs``, asks ``categorize_qa`` for a category for each pair,
    stores the pair through ``save_to_category_file`` and prints one progress
    line per pair showing the first 50 characters of the question.
    """
    source_path = '../data/md/qna_rag_testing.md'
    with open(source_path, 'r', encoding='utf-8') as handle:
        markdown_text = handle.read()

    # Each pair is a (question, answer) tuple; it is classified, then written
    # out, one pair at a time.
    for pair in extract_qa_pairs(markdown_text):
        question = pair[0]
        label = categorize_qa(*pair)
        save_to_category_file(label, pair)
        print(f'Categorized Q: {question[:50]}... as {label}')
