### Validation

In [2]:
import json
import os
import time
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime
from typing import Dict, List, Optional, Tuple

import numpy as np
import openai
import pandas as pd
from tqdm.notebook import tqdm

# SECURITY: API keys must never be hard-coded in source or notebooks. The key
# that was previously committed on this line has to be treated as compromised
# and revoked. Provide the key through the environment instead (for example
# `export OPENAI_API_KEY=...` before launching Jupyter).
if not os.environ.get('OPENAI_API_KEY'):
    raise RuntimeError(
        "OPENAI_API_KEY is not set; export it in the environment before "
        "running this notebook."
    )
# Initialize client (no key is passed explicitly; it is taken from the
# OPENAI_API_KEY environment variable checked above)
client = openai.OpenAI()

In [4]:
class ModelComparator:
    """Send identical queries to a fine-tuned and a base chat model.

    The comparator keeps the two model identifiers and a results directory,
    and produces a ``pandas.DataFrame`` with both answers and their response
    times for every test case.
    """

    # Column order of the frame returned by ``compare_responses``. Declared
    # once so that an empty result still carries the expected columns.
    _RESULT_COLUMNS = (
        'category',
        'query',
        'fine_tuned_response',
        'base_response',
        'ft_response_time',
        'base_response_time',
    )

    def __init__(self, 
                 fine_tuned_model: str = "ft:gpt-4o-mini-2024-07-18:personal::Af1GA1or",
                 base_model: str = "gpt-4o-mini-2024-07-18",
                 results_dir: str = '../data/final/model_comparison'):
        """Store the model names and make sure the results directory exists.

        Args:
            fine_tuned_model: Identifier of the fine-tuned model to query.
            base_model: Identifier of the base model to compare against.
            results_dir: Directory stored on the instance as ``results_dir``;
                it is created (including parents) if it does not exist yet.
        """
        self.fine_tuned_model = fine_tuned_model
        self.base_model = base_model
        self.results_dir = results_dir
        os.makedirs(self.results_dir, exist_ok=True)
        
    def get_model_response(self, model: str, query: str, system_prompt: str = "You are an expert poker strategist.") -> Tuple[Optional[str], Optional[float]]:
        """Get response from specified model with timing.

        Args:
            model: Model identifier passed to the chat completions endpoint.
            query: Text sent as the user message.
            system_prompt: Text sent as the system message.

        Returns:
            ``(content, elapsed_seconds)`` for the first choice on success,
            or ``(None, None)`` if the request raised any exception. The
            error is printed rather than re-raised so one failed request does
            not abort a whole comparison run.
        """
        # perf_counter is monotonic, so the measured latency cannot be skewed
        # by wall-clock adjustments the way time.time() can.
        start_time = time.perf_counter()
        try:
            response = client.chat.completions.create(
                model=model,
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": query}
                ],
                temperature=0.7
            )
            elapsed_time = time.perf_counter() - start_time
            return response.choices[0].message.content, elapsed_time
        except Exception as e:
            # Deliberately broad: this is a best-effort call per test case.
            print(f"Error getting response from {model}: {e}")
            return None, None

    def compare_responses(self, test_cases: List[Dict]) -> pd.DataFrame:
        """Compare responses between fine-tuned and base models.

        Args:
            test_cases: Dicts with a required ``'query'`` key and an optional
                ``'category'`` key (defaults to ``'general'``).

        Returns:
            A DataFrame with one row per test case for which both models
            returned a non-empty response. The columns are always
            ``_RESULT_COLUMNS``, even when no row qualified.

        Raises:
            KeyError: If a test case has no ``'query'`` key.
        """
        results = []
        skipped = 0
        
        for case in tqdm(test_cases, desc="Processing test cases"):
            query = case['query']
            category = case.get('category', 'general')
            
            # Get responses from both models
            ft_response, ft_time = self.get_model_response(self.fine_tuned_model, query)
            base_response, base_time = self.get_model_response(self.base_model, query)
            
            # A case is only comparable when both models produced text.
            if not (ft_response and base_response):
                skipped += 1
                continue

            results.append({
                'category': category,
                'query': query,
                'fine_tuned_response': ft_response,
                'base_response': base_response,
                'ft_response_time': ft_time,
                'base_response_time': base_time
            })
        
        if skipped:
            # Make dropped cases visible instead of silently shrinking the data.
            print(f"Skipped {skipped} test case(s) with a missing model response")

        # Explicit columns keep the schema stable when `results` is empty.
        return pd.DataFrame(results, columns=list(self._RESULT_COLUMNS))