## Validation on Model ID: ft:gpt-4o-mini-2024-07-18:personal::Af1GA1or

In [5]:
import openai
import json
import pandas as pd
from datetime import datetime
import os
from typing import List, Dict, Tuple
from tqdm.notebook import tqdm
import numpy as np
from concurrent.futures import ThreadPoolExecutor
import time
import logging
from datetime import datetime

In [6]:
os.environ['OPENAI_API_KEY'] = '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'
# Initialize client
client = openai.OpenAI()

In [7]:
class ModelComparator:
    def __init__(self, 
                 fine_tuned_model: str = "ft:gpt-4o-mini-2024-07-18:personal::Af1GA1or",
                 base_model: str = "gpt-4o-mini-2024-07-18"):
        self.fine_tuned_model = fine_tuned_model
        self.base_model = base_model
        self.results_dir = '../data/final/model_comparison'
        os.makedirs(self.results_dir, exist_ok=True)
        
        # Setup logging
        self.setup_logging()
        
    def setup_logging(self):
        """Setup detailed logging configuration"""
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        log_file = f'{self.results_dir}/comparison_{timestamp}.log'
        
        # Create formatter
        formatter = logging.Formatter(
            '%(asctime)s - %(levelname)s - %(message)s'
        )
        
        # Setup file handler
        file_handler = logging.FileHandler(log_file)
        file_handler.setFormatter(formatter)
        
        # Setup stream handler
        stream_handler = logging.StreamHandler()
        stream_handler.setFormatter(formatter)
        
        # Setup logger
        self.logger = logging.getLogger(f'ModelComparator_{timestamp}')
        self.logger.setLevel(logging.INFO)
        self.logger.addHandler(file_handler)
        self.logger.addHandler(stream_handler)
        
        self.logger.info(f"Initialized ModelComparator")
        self.logger.info(f"Fine-tuned model: {self.fine_tuned_model}")
        self.logger.info(f"Base model: {self.base_model}")
        
    def get_model_response(self, model: str, query: str, system_prompt: str = "You are an expert poker strategist.") -> Tuple[str, float]:
        """Get response from specified model with timing"""
        start_time = time.time()
        self.logger.info(f"Querying {model}")
        self.logger.debug(f"Query: {query[:100]}...")
        
        try:
            response = client.chat.completions.create(
                model=model,
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": query}
                ],
                temperature=0.7
            )
            elapsed_time = time.time() - start_time
            
            self.logger.info(f"Response received from {model} in {elapsed_time:.2f} seconds")
            self.logger.debug(f"Response: {response.choices[0].message.content[:100]}...")
            
            return response.choices[0].message.content, elapsed_time
            
        except Exception as e:
            self.logger.error(f"Error getting response from {model}: {e}")
            return None, None

    def compare_responses(self, test_cases: List[Dict]) -> pd.DataFrame:
        """Compare responses between fine-tuned and base models"""
        results = []
        total_cases = len(test_cases)
        
        self.logger.info(f"Starting comparison of {total_cases} test cases")
        
        for idx, case in enumerate(tqdm(test_cases, desc="Processing test cases")):
            query = case['query']
            category = case.get('category', 'general')
            
            self.logger.info(f"Processing case {idx+1}/{total_cases} - Category: {category}")
            
            # Get responses from both models
            ft_response, ft_time = self.get_model_response(self.fine_tuned_model, query)
            base_response, base_time = self.get_model_response(self.base_model, query)
            
            if ft_response and base_response:
                results.append({
                    'category': category,
                    'query': query,
                    'fine_tuned_response': ft_response,
                    'base_response': base_response,
                    'ft_response_time': ft_time,
                    'base_response_time': base_time
                })
                
                self.logger.info(f"Case {idx+1} completed - FT time: {ft_time:.2f}s, Base time: {base_time:.2f}s")
            else:
                self.logger.warning(f"Case {idx+1} failed to get responses")
        
        df = pd.DataFrame(results)
        
        # Log summary statistics
        self.logger.info("\nComparison Summary:")
        self.logger.info(f"Total cases processed: {len(df)}")
        self.logger.info(f"Average FT response time: {df['ft_response_time'].mean():.2f}s")
        self.logger.info(f"Average Base response time: {df['base_response_time'].mean():.2f}s")
        
        return df

    def save_results(self, df: pd.DataFrame):
        """Save comparison results with detailed logging"""
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        results_file = f'{self.results_dir}/comparison_results_{timestamp}.csv'
        
        try:
            df.to_csv(results_file, index=False)
            self.logger.info(f"Results saved to {results_file}")
            
            # Log detailed statistics
            self.logger.info("\nDetailed Statistics:")
            for category in df['category'].unique():
                cat_data = df[df['category'] == category]
                self.logger.info(f"\nCategory: {category}")
                self.logger.info(f"Cases: {len(cat_data)}")
                self.logger.info(f"Avg FT time: {cat_data['ft_response_time'].mean():.2f}s")
                self.logger.info(f"Avg Base time: {cat_data['base_response_time'].mean():.2f}s")
                
        except Exception as e:
            self.logger.error(f"Error saving results: {e}")

In [8]:
def create_test_suite():
    """Create comprehensive poker test scenarios"""
    
    test_cases = {
        # 1. General Concepts
        "basic_concepts": [
            {
                "category": "fundamentals",
                "query": "What are the key principles of position play in poker?"
            },
            {
                "category": "fundamentals",
                "query": "Explain pot odds and implied odds in simple terms."
            }
        ],
        
        # 2. Strategic Decision Making
        "strategic": [
            {
                "category": "preflop_strategy",
                "query": "You're UTG with 100BB and AKs. Two players behind you are known LAGs. What's your optimal play?"
            },
            {
                "category": "postflop_strategy",
                "query": "On A♠7♠2♣ board, you have K♠Q♠ in position against a tight player who c-bets 60% pot. Explain your thought process."
            }
        ],
        
        # 3. Complex Game Situations
        "complex_spots": [
            {
                "category": "multiway_pots",
                "query": "You're in a 3-way pot with JJ on K72r board, facing a bet and a raise. Stack depths are 150BB. Break down the decision."
            },
            {
                "category": "bluff_spots",
                "query": "On river (board: A♣7♠2♦5♣K♣), you have 8♦6♦ and villain checks to you. Previous action was standard. Analyze this bluff spot."
            }
        ],
        
        # 4. Tournament Specific
        "tournament": [
            {
                "category": "icm_pressure",
                "query": "Final table, you're second in chips with 25BB. Button shoves, you have AQo in BB. Three shorter stacks are sub-10BB. What's your call?"
            },
            {
                "category": "bubble_play",
                "query": "2 from money in MTT, you're chip leader. How do you adjust your button stealing range against different stack sizes?"
            }
        ],
        
        # 5. Player Type Adjustments
        "player_adjustments": [
            {
                "category": "exploitative",
                "query": "You notice villain is folding to 3-bets 80% of time. How do you adjust your 3-betting range from the blinds?"
            },
            {
                "category": "table_dynamics",
                "query": "You're at a table with two aggressive players on your left and three nitty players on right. How do you adjust your opening ranges?"
            }
        ],
        
        # 6. Hand Reading Exercises
        "hand_reading": [
            {
                "category": "range_analysis",
                "query": "BTN raises, you 3-bet BB with AK, they call. Flop T72r. They check back. Turn 3. What's their likely range and how should you proceed?"
            },
            {
                "category": "live_tells",
                "query": "In live poker, villain shows signs of discomfort and bets 30% pot on river after checking flop and turn. What range do you put them on?"
            }
        ],
        
        # 7. End Game Scenarios
        "end_game": [
            {
                "category": "heads_up",
                "query": "Heads-up, 20BB effective, you're on button with a 40/60 stack disadvantage. How do you adjust your preflop ranges?"
            },
            {
                "category": "final_table",
                "query": "3-handed at final table, you're shortest with 15BB. Describe optimal push/fold strategy from each position."
            }
        ],
        
        # 8. Bankroll and Mental Game
        "mental_game": [
            {
                "category": "tilt_control",
                "query": "You've lost three all-ins as 80% favorite. How do you maintain optimal decision making?"
            },
            {
                "category": "bankroll_management",
                "query": "Explain stop-loss strategies and their implementation in both cash games and tournaments."
            }
        ]
    }
    
    # Flatten the test cases for the comparator
    flat_test_cases = []
    for category, cases in test_cases.items():
        for case in cases:
            flat_test_cases.append({
                'category': f"{category}_{case['category']}",
                'query': case['query']
            })
    
    return flat_test_cases

In [9]:
# Run the comparison
def run_comprehensive_test():
    # Initialize comparator
    comparator = ModelComparator()
    
    # Get test cases
    test_cases = create_test_suite()
    
    # Run comparisons
    results_df = comparator.compare_responses(test_cases)
    
    # Save results
    comparator.save_results(results_df)
    
    return results_df

In [10]:
results = run_comprehensive_test()

# Display summary by category
print("\nResponse Analysis by Category:")
category_stats = results.groupby('category').agg({
    'ft_response_time': ['mean', 'std'],
    'base_response_time': ['mean', 'std']
}).round(2)

print(category_stats)

2024-12-16 04:23:27,773 - INFO - Initialized ModelComparator
2024-12-16 04:23:27,773 - INFO - Fine-tuned model: ft:gpt-4o-mini-2024-07-18:personal::Af1GA1or
2024-12-16 04:23:27,774 - INFO - Base model: gpt-4o-mini-2024-07-18
2024-12-16 04:23:27,774 - INFO - Starting comparison of 16 test cases


Processing test cases:   0%|          | 0/16 [00:00<?, ?it/s]

2024-12-16 04:23:27,795 - INFO - Processing case 1/16 - Category: basic_concepts_fundamentals
2024-12-16 04:23:27,796 - INFO - Querying ft:gpt-4o-mini-2024-07-18:personal::Af1GA1or
2024-12-16 04:23:31,414 - INFO - Response received from ft:gpt-4o-mini-2024-07-18:personal::Af1GA1or in 3.62 seconds
2024-12-16 04:23:31,415 - INFO - Querying gpt-4o-mini-2024-07-18
2024-12-16 04:23:43,691 - INFO - Response received from gpt-4o-mini-2024-07-18 in 12.28 seconds
2024-12-16 04:23:43,693 - INFO - Case 1 completed - FT time: 3.62s, Base time: 12.28s
2024-12-16 04:23:43,694 - INFO - Processing case 2/16 - Category: basic_concepts_fundamentals
2024-12-16 04:23:43,696 - INFO - Querying ft:gpt-4o-mini-2024-07-18:personal::Af1GA1or
2024-12-16 04:23:45,736 - INFO - Response received from ft:gpt-4o-mini-2024-07-18:personal::Af1GA1or in 2.04 seconds
2024-12-16 04:23:45,738 - INFO - Querying gpt-4o-mini-2024-07-18
2024-12-16 04:23:53,989 - INFO - Response received from gpt-4o-mini-2024-07-18 in 8.25 secon


Response Analysis by Category:
                                  ft_response_time       base_response_time  \
                                              mean   std               mean   
category                                                                      
basic_concepts_fundamentals                   2.83  1.11              10.26   
complex_spots_bluff_spots                     4.57   NaN               7.39   
complex_spots_multiway_pots                   6.24   NaN               9.52   
end_game_final_table                          3.01   NaN              14.51   
end_game_heads_up                             0.73   NaN              13.18   
hand_reading_live_tells                       1.39   NaN               4.56   
hand_reading_range_analysis                   0.90   NaN              12.05   
mental_game_bankroll_management               6.37   NaN               7.53   
mental_game_tilt_control                      0.61   NaN              13.70   
player_adjustments_e

In [31]:
def analyze_results(results_df: pd.DataFrame):
    """Analyze both timing and response content with enhanced metrics"""
    
    results_df['ft_response_length'] = results_df['fine_tuned_response'].str.len()
    results_df['base_response_length'] = results_df['base_response'].str.len()
    
    # 1. Performance Metrics
    print("\n=== Performance Metrics ===")
    performance = {
        'avg_ft_time': results_df['ft_response_time'].mean(),
        'avg_base_time': results_df['base_response_time'].mean(),
        'time_improvement': (results_df['base_response_time'].mean() - results_df['ft_response_time'].mean()) / results_df['base_response_time'].mean() * 100,
        'avg_ft_length': results_df['ft_response_length'].mean(),
        'avg_base_length': results_df['base_response_length'].mean(),
        'conciseness_improvement': (results_df['base_response_length'].mean() - results_df['ft_response_length'].mean()) / results_df['base_response_length'].mean() * 100
    }
    print("Overall Performance:")
    print(f"Time Improvement: {performance['time_improvement']:.2f}%")
    print(f"Conciseness Improvement: {performance['conciseness_improvement']:.2f}%")

    # 2. Category-wise Analysis
    print("\n=== Category Performance ===")
    category_stats = results_df.groupby('category').agg({
        'ft_response_time': ['mean', 'std'],
        'base_response_time': ['mean', 'std'],
        'ft_response_length': 'mean',
        'base_response_length': 'mean'
    }).round(2)
    
    # Add efficiency metrics
    category_stats['time_efficiency'] = ((category_stats[('base_response_time', 'mean')] - 
                                        category_stats[('ft_response_time', 'mean')]) / 
                                       category_stats[('base_response_time', 'mean')] * 100)
    category_stats['length_efficiency'] = ((category_stats[('base_response_length', 'mean')] - 
                                          category_stats[('ft_response_length', 'mean')]) / 
                                         category_stats[('base_response_length', 'mean')] * 100)
    print(category_stats)

    # 3. Response Quality Analysis
    print("\n=== Response Quality Metrics ===")
    poker_terms = ['GTO', 'range', 'equity', 'position', 'stack', 'ICM', 'raise', 'fold', 'call', 
                  'bet', 'check', 'pot odds', 'implied odds', 'bluff', '3-bet', 'probability', 'river']
    
    term_usage = pd.DataFrame()
    for term in poker_terms:
        term_usage[f'ft_{term}'] = results_df['fine_tuned_response'].str.count(term)
        term_usage[f'base_{term}'] = results_df['base_response'].str.count(term)
    
    term_stats = {
        'ft_term_density': term_usage.filter(like='ft_').sum(axis=1).mean() / results_df['ft_response_length'].mean(),
        'base_term_density': term_usage.filter(like='base_').sum(axis=1).mean() / results_df['base_response_length'].mean()
    }
    print(f"Technical Term Density:")
    print(f"Fine-tuned model: {term_stats['ft_term_density']:.4f}")
    print(f"Base model: {term_stats['base_term_density']:.4f}")

     # 4. Save Analysis Results - Fixed Version
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    analysis_file = f'../data/final/detailed_analysis_{timestamp}.json'
    
    # Convert MultiIndex DataFrame to serializable format
    category_stats_dict = {}
    for category in category_stats.index:
        category_stats_dict[category] = {
            'ft_response_time': {
                'mean': float(category_stats.loc[category, ('ft_response_time', 'mean')]),
                'std': float(category_stats.loc[category, ('ft_response_time', 'std')])
            },
            'base_response_time': {
                'mean': float(category_stats.loc[category, ('base_response_time', 'mean')]),
                'std': float(category_stats.loc[category, ('base_response_time', 'std')])
            },
            'ft_response_length': float(category_stats.loc[category, 'ft_response_length']),
            'base_response_length': float(category_stats.loc[category, 'base_response_length']),
            'time_efficiency': float(category_stats.loc[category, 'time_efficiency']),
            'length_efficiency': float(category_stats.loc[category, 'length_efficiency'])
        }

    analysis_results = {
        'overall_performance': {
            'avg_ft_time': float(performance['avg_ft_time']),
            'avg_base_time': float(performance['avg_base_time']),
            'time_improvement': float(performance['time_improvement']),
            'avg_ft_length': float(performance['avg_ft_length']),
            'avg_base_length': float(performance['avg_base_length']),
            'conciseness_improvement': float(performance['conciseness_improvement'])
        },
        'category_stats': category_stats_dict,
        'term_stats': {
            'ft_term_density': float(term_stats['ft_term_density']),
            'base_term_density': float(term_stats['base_term_density'])
        },
        'detailed_responses': results_df[['category', 'query', 
                                        'fine_tuned_response', 
                                        'base_response']].to_dict('records')
    }

    try:
        os.makedirs('../data/final', exist_ok=True)
        with open(analysis_file, 'w') as f:
            json.dump(analysis_results, f, indent=2)
        print(f"\nAnalysis saved to: {analysis_file}")
    except Exception as e:
        print(f"Error saving results: {e}")
        
    return analysis_results

In [12]:
# Run comprehensive test and analysis
results = run_comprehensive_test()


=== Response Time Analysis ===
                                  ft_response_time       base_response_time  \
                                              mean   std               mean   
category                                                                      
basic_concepts_fundamentals                   3.20  1.09               9.48   
complex_spots_bluff_spots                     4.10   NaN              12.65   
complex_spots_multiway_pots                   4.91   NaN              22.46   
end_game_final_table                          1.63   NaN              13.73   
end_game_heads_up                             0.82   NaN              10.22   
hand_reading_live_tells                       1.57   NaN              16.25   
hand_reading_range_analysis                   0.92   NaN              11.37   
mental_game_bankroll_management               5.92   NaN              10.68   
mental_game_tilt_control                      0.47   NaN              10.25   
player_adjustments_e

FileNotFoundError: [Errno 2] No such file or directory: '../results/detailed_analysis_20241216_132358.json'

In [22]:
results

Unnamed: 0,category,query,fine_tuned_response,base_response,ft_response_time,base_response_time,ft_response_length,base_response_length
0,basic_concepts_fundamentals,What are the key principles of position play i...,Position play in poker is crucial for maximizi...,Position is one of the most critical concepts ...,3.968587,9.751271,1219,3072
1,basic_concepts_fundamentals,Explain pot odds and implied odds in simple te...,**Pot Odds:** Think of pot odds as a quick dec...,Sure! Let's break down pot odds and implied od...,2.42286,9.218197,960,2028
2,strategic_preflop_strategy,You're UTG with 100BB and AKs. Two players beh...,"With AKs UTG and two known LAGs behind, the op...",When you are under the gun (UTG) with 100 big ...,0.845608,3.219036,246,1749
3,strategic_postflop_strategy,"On A♠7♠2♣ board, you have K♠Q♠ in position aga...","On the A♠7♠2♣ board with K♠Q♠, you're in a fav...",When facing a tight player who c-bets 60% of t...,2.049343,12.303279,896,2637
4,complex_spots_multiway_pots,"You're in a 3-way pot with JJ on K72r board, f...","With JJ on a K72r board in a 3-way pot, facing...",In a 3-way pot with a board of K72 rainbow (no...,4.911726,22.461336,1736,2695
5,complex_spots_bluff_spots,"On river (board: A♣7♠2♦5♣K♣), you have 8♦6♦ an...","With the board A♣7♠2♦5♣K♣ and your hand 8♦6♦, ...","In this spot, you are holding 8♦6♦ on a board ...",4.101735,12.645228,1379,2761
6,tournament_icm_pressure,"Final table, you're second in chips with 25BB....",With AQo in the BB against a Button shove and ...,"In this situation, you're facing a shove from ...",0.632607,8.985323,150,1797
7,tournament_bubble_play,"2 from money in MTT, you're chip leader. How d...",As the chip leader in a 2 from the money situa...,When you're the chip leader at a final table i...,2.831744,7.08718,807,2868
8,player_adjustments_exploitative,You notice villain is folding to 3-bets 80% of...,"With an 80% fold rate to 3-bets, widen your 3-...",When facing a villain who folds to 3-bets 80% ...,0.819991,9.926422,269,2243
9,player_adjustments_table_dynamics,You're at a table with two aggressive players ...,"With aggressive players on your left, tighten ...","In this situation, you need to adjust your ope...",1.021913,5.621661,268,2548


In [26]:
analysis = analyze_results(results)


=== Performance Metrics ===
Overall Performance:
Time Improvement: 79.90%
Conciseness Improvement: 69.45%

=== Category Performance ===
                                  ft_response_time       base_response_time  \
                                              mean   std               mean   
category                                                                      
basic_concepts_fundamentals                   3.20  1.09               9.48   
complex_spots_bluff_spots                     4.10   NaN              12.65   
complex_spots_multiway_pots                   4.91   NaN              22.46   
end_game_final_table                          1.63   NaN              13.73   
end_game_heads_up                             0.82   NaN              10.22   
hand_reading_live_tells                       1.57   NaN              16.25   
hand_reading_range_analysis                   0.92   NaN              11.37   
mental_game_bankroll_management               5.92   NaN              10.

### Second Round of More Comprehensive Test

In [29]:
def create_comprehensive_test_suite():
    """Create comprehensive poker test scenarios optimized for fine-tuned model comparison.
       Each scenario pushes the model to demonstrate nuanced reasoning, GTO understanding,
       interpretability of solver-like outputs, and advanced strategic concepts.
       
       Added #7: A carefully-crafted prompt designed to showcase a scenario where a base model
       would likely fail (hallucinate or give incorrect reasoning), but a fine-tuned model 
       would produce a coherent, GTO-aligned and contextually correct explanation.
    """
    
    test_cases = {
        # 1. Basic Concepts & Fundamentals
        "basic_concepts": [
            {
                "category": "fundamentals",
                "query": "Explain the role of 'position' in poker and why acting last can significantly alter your strategy."
            },
            {
                "category": "fundamentals",
                "query": "How do pot odds and implied odds work together when deciding whether to chase a draw?"
            },
            {
                "category": "fundamentals",
                "query": "Describe how stack depth influences the range of hands you should open from early position."
            },
            {
                "category": "fundamentals",
                "query": "What are the core differences between a balanced GTO range and an exploitative range?"
            },
            {
                "category": "fundamentals",
                "query": "Why is it important to understand blocker effects, and how do they impact your bluffing frequency?"
            },
            {
                "category": "fundamentals",
                "query": "Explain how understanding distribution of hand equities across different board textures can guide decision-making."
            }
        ],
        
        # 2. Strategic Decision Making (Preflop & Postflop)
        "strategic": [
            {
                "category": "preflop_strategy",
                "query": "Under the Gun with 100BB, holding AKo in a 9-handed game with two aggressive players to your left: how should GTO strategy inform your opening and 4-bet responses?"
            },
            {
                "category": "preflop_strategy",
                "query": "Facing a BTN raise and you're in the SB with QJs and 60BB effective stacks. Villain is a competent reg. How does GTO handle calling vs. 3-betting here?"
            },
            {
                "category": "postflop_strategy",
                "query": "On a dry A♣7♦2♠ flop, you hold K♠Q♠ in position against a balanced opponent who c-bets small. According to GTO principles, how should you mix calls, raises, and folds?"
            },
            {
                "category": "postflop_strategy",
                "query": "You hold QQ on a J♥T♥9♣ board facing a pot-sized lead from a straightforward player. Discuss the GTO approach to calling vs. raising and how future streets might play out."
            },
            {
                "category": "postflop_strategy",
                "query": "In a 3-bet pot, you hold A♠K♣ on a K♦J♦3♠ board as the aggressor. GTO recommendations often suggest certain bet sizings. How do you choose between small, medium, or large c-bets?"
            },
            {
                "category": "postflop_strategy",
                "query": "You 4-bet preflop with JJ and see a T-high flop rainbow. Opponent calls your c-bet. How do GTO solutions balance turn double-barrels vs. pot control lines?"
            }
        ],
        
        # 3. Complex Spots (Multiway Pots, Bluff Spots, Mixed Frequencies)
        "complex_spots": [
            {
                "category": "multiway_pots",
                "query": "You're in a 3-way pot with JJ on a K-7-2 rainbow board facing a bet and a raise at 150BB depth. Discuss how GTO ranges narrow and the recommended folding frequencies."
            },
            {
                "category": "multiway_pots",
                "query": "SB calls, BB calls, you opened from MP with AQo. Flop: T♣8♣6♦. SB leads for 1/3 pot. How does a GTO approach weigh calling vs. raising and what turn cards most benefit your range?"
            },
            {
                "category": "bluff_spots",
                "query": "On a river A♣7♠2♦5♣K♣ board, holding 8♦6♦, GTO strategies often identify certain blocker effects. How do solver-driven recommendations decide whether to bluff in this scenario?"
            },
            {
                "category": "bluff_spots",
                "query": "You 3-bet pre with AKo and got called. The runout is Q♠5♦2♥–7♣–3♣ and villain checks river. How do GTO strategies determine if a missed AK should bluff at low frequency or give up?"
            },
            {
                "category": "balance_sizing",
                "query": "In a single-raised pot, you have a mid-strength made hand on a dynamic board. How does GTO splitting of bet sizes maintain balanced strategies and protect your range?"
            },
            {
                "category": "turn_barreling",
                "query": "You hold a semi-bluff draw on the turn after opponent calls flop c-bet. GTO solutions often suggest mixing turn barrel frequencies. Explain the factors influencing these mixed strategies."
            }
        ],
        
        # 4. Tournament Specific (ICM, Bubble, Final Table)
        "tournament": [
            {
                "category": "icm_pressure",
                "query": "Final table scenario: you’re 2nd in chips with AQo facing a shove from the shortest stack on your left. How do GTO solutions adjust calling ranges under ICM pressure?"
            },
            {
                "category": "icm_pressure",
                "query": "As chip leader near the bubble, you have KJs on the BTN. Two short stacks remain. According to GTO under ICM conditions, how does your opening range shift?"
            },
            {
                "category": "bubble_play",
                "query": "On the direct money bubble, you’re a mid-stack in CO with A7s. GTO adjustments for bubble scenarios often tighten or loosen ranges. Which factors guide this decision?"
            },
            {
                "category": "bubble_play",
                "query": "3 off the money in a MTT, you pick up 99 UTG. With multiple short stacks behind, how do GTO-based ICM calculations influence your open or fold decision?"
            },
            {
                "category": "final_table",
                "query": "5-handed at the final table, blinds are large, and pay jumps are significant. How do GTO strategies modify open-raising frequencies to preserve chip EV and account for ICM?"
            },
            {
                "category": "short_stack_icm",
                "query": "With 12BB as a short stack facing a min-raise from a big stack, how do GTO push/fold solutions differ under ICM pressure compared to chip EV-only strategies?"
            }
        ],
        
        # 5. Hand Reading & Live Dynamics (Range Construction, Live Tells)
        "hand_reading": [
            {
                "category": "range_analysis",
                "query": "BTN raises, you 3-bet from BB with AK. They call. Flop: T-7-2 rainbow. GTO range analysis: how do you categorize BTN’s likely holdings and choose a c-bet frequency?"
            },
            {
                "category": "range_analysis",
                "query": "MP opens, CO calls, you squeeze from SB with KQs. CO calls. Flop J-8-3 with two spades. Discuss CO’s likely range and whether GTO recommends a c-bet, check, or mixed strategy."
            },
            {
                "category": "live_tells",
                "query": "In a live setting, villain’s body language suggests discomfort as they bet turn small after checking flop. How might GTO analysis integrate (or ignore) such meta-information?"
            },
            {
                "category": "live_tells",
                "query": "Villain’s hand trembles slightly making a large river bet. You hold a marginal bluff-catcher. Without relying purely on heuristics, how could GTO principles still guide a decision?"
            },
            {
                "category": "combo_counting",
                "query": "On a K-Q-4 rainbow board, you suspect villain’s range is capped. How do you use combo counting and GTO frequencies to decide on a bluff or thin value bet?"
            },
            {
                "category": "node_locking",
                "query": "Describe how node-locking in a solver scenario could adjust villain’s ranges based on observed tendencies and how that reflects in GTO-like advice."
            }
        ],
        
        # 6. End Game Scenarios (Heads-Up, Short Stacks, Hyper-Turbos)
        "end_game": [
            {
                "category": "heads_up",
                "query": "Heads-up with 20BB effective. As BTN, how does GTO strategy suggest adjusting limp-raise frequencies to exploit a slightly passive BB opponent?"
            },
            {
                "category": "heads_up",
                "query": "HU scenario: Opponent overfolds to 3-bets. You have 30BB effective. How does GTO-exploitative hybrid suggest shifting from a balanced approach?"
            },
            {
                "category": "final_table",
                "query": "3-handed at the final table, you hold K9s on the BTN with a small stack in the blinds. GTO-based push/fold charts guide what range in this exact scenario?"
            },
            {
                "category": "final_table",
                "query": "4-handed, you’re second in chips facing an aggressive chip leader. GTO-inspired strategies often recommend certain defense frequencies from the BB. How do you tailor these defenses?"
            },
            {
                "category": "short_stack",
                "query": "With 15BB on the BTN and a nitty BB player, how do ICM and GTO combined push/fold solutions differ from pure chip EV solutions?"
            },
            {
                "category": "short_stack",
                "query": "In a hyper-turbo SNG with 10BB effective stacks, how do GTO Nash charts inform your all-in calling ranges against a wide-shoving opponent?"
            }
        ],
        
        # 7. Specially Crafted Challenge Scenario (Base Model vs. Fine-Tuned Model)
        # This scenario includes a subtle GTO concept that a generic model might misunderstand or invent facts for.
        # The fine-tuned model should provide a coherent, GTO-aligned reasoning process and correct recommendation.
        "challenge_scenario": [
            {
                "category": "gto_subtlety",
                "query": "On a monotone flop J♣8♣4♣ in a 3-bet pot (BTN vs. CO), GTO solutions often choose mixed lines with specific micro-frequency splits. Holding A♣K♦ as the caller, how do you decide between a small lead, a check-raise, or a delayed action on the turn, and what exact reasoning justifies this choice?"
            }
        ]
    }
    
    # Flatten the test cases into a single list
    flat_test_cases = []
    for category, cases in test_cases.items():
        for case in cases:
            flat_test_cases.append({
                'category': f"{category}_{case['category']}",
                'query': case['query']
            })
    
    return flat_test_cases

In [30]:
# Modify the run_comprehensive_test function to use the new test suite
def run_comprehensive_test():
    # Initialize comparator
    comparator = ModelComparator()
    
    # Get comprehensive test cases
    test_cases = create_comprehensive_test_suite()  # Changed this line
    
    # Run comparisons
    comprehensive_results_df = comparator.compare_responses(test_cases)
    
    # Save results
    comparator.save_results(comprehensive_results_df)
    
    return comprehensive_results_df

# Run the test
comprehensive_results = run_comprehensive_test()

2024-12-16 14:08:59,476 - INFO - Initialized ModelComparator
2024-12-16 14:08:59,476 - INFO - Fine-tuned model: ft:gpt-4o-mini-2024-07-18:personal::Af1GA1or
2024-12-16 14:08:59,477 - INFO - Base model: gpt-4o-mini-2024-07-18
2024-12-16 14:08:59,477 - INFO - Starting comparison of 37 test cases


Processing test cases:   0%|          | 0/37 [00:00<?, ?it/s]

2024-12-16 14:08:59,480 - INFO - Processing case 1/37 - Category: basic_concepts_fundamentals
2024-12-16 14:08:59,481 - INFO - Querying ft:gpt-4o-mini-2024-07-18:personal::Af1GA1or
2024-12-16 14:09:04,679 - INFO - Response received from ft:gpt-4o-mini-2024-07-18:personal::Af1GA1or in 5.20 seconds
2024-12-16 14:09:04,685 - INFO - Querying gpt-4o-mini-2024-07-18
2024-12-16 14:09:13,224 - INFO - Response received from gpt-4o-mini-2024-07-18 in 8.54 seconds
2024-12-16 14:09:13,225 - INFO - Case 1 completed - FT time: 5.20s, Base time: 8.54s
2024-12-16 14:09:13,226 - INFO - Processing case 2/37 - Category: basic_concepts_fundamentals
2024-12-16 14:09:13,227 - INFO - Querying ft:gpt-4o-mini-2024-07-18:personal::Af1GA1or
2024-12-16 14:09:14,661 - INFO - Response received from ft:gpt-4o-mini-2024-07-18:personal::Af1GA1or in 1.43 seconds
2024-12-16 14:09:14,662 - INFO - Querying gpt-4o-mini-2024-07-18
2024-12-16 14:09:28,950 - INFO - Response received from gpt-4o-mini-2024-07-18 in 14.29 second


=== Performance Metrics ===


KeyError: 'ft_response_length'

In [32]:
analysis = analyze_results(comprehensive_results)


=== Performance Metrics ===
Overall Performance:
Time Improvement: 76.24%
Conciseness Improvement: 74.37%

=== Category Performance ===
                                ft_response_time       base_response_time  \
                                            mean   std               mean   
category                                                                    
basic_concepts_fundamentals                 3.59  2.31              11.34   
challenge_scenario_gto_subtlety             5.47   NaN              26.15   
complex_spots_balance_sizing                3.62   NaN              12.91   
complex_spots_bluff_spots                   2.42  0.75              10.04   
complex_spots_multiway_pots                 1.71  0.46              11.56   
complex_spots_turn_barreling                2.79   NaN              18.50   
end_game_final_table                        4.29  3.76              10.24   
end_game_heads_up                           1.39  0.39               9.20   
end_game_short_s

### Mid-analysis Report

#### Performance Improvements
Time Improvement: 76.24%
Conciseness Improvement: 74.37%

#### Category-specific Insights
- Best Time Efficiency: Mental game (95.41%), Tournament ICM (92.99%)
- Best Length Efficiency: Mental game (94.64%), ICM pressure (91.65%)
- Technical Term Density: Fine-tuned (0.0117) > Base (0.0094)

In [56]:
import textstat
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModel
import torch
import re
import pandas as pd
import numpy as np
from typing import Dict, Any, List, Optional, Union
from dataclasses import dataclass
import json
import os

In [62]:
@dataclass
class ResponseMetrics:
    """Data class for storing response metrics"""
    semantic_similarity: float
    term_density: float
    readability_score: float
    action_density: float
    strategic_depth: float

class PokerResponseAnalyzer:
    def __init__(self):
        # Initialize NLTK resources
        for resource in ['punkt', 'stopwords', 'averaged_perceptron_tagger']:
            try:
                nltk.download(resource, quiet=True)
            except Exception as e:
                print(f"Warning: Failed to download NLTK resource {resource}: {str(e)}")
        
        # Set up device
        self.device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
        print(f"Using device: {self.device}")
        
        # Initialize transformer model with error handling
        try:
            self.tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
            self.model = AutoModel.from_pretrained('bert-base-uncased').to(self.device)
            print(f"Model device: {next(self.model.parameters()).device}")
        except Exception as e:
            print(f"Error initializing BERT model: {str(e)}")
            raise
        
        # Load resources
        self.poker_terms = self._load_poker_terms()
        self.strategic_concepts = self._load_strategic_concepts()
        self.action_verbs = self._load_action_verbs()

    def enhance_analysis(self, data: Dict) -> Dict[str, Any]:
        """Enhance the analysis with additional metrics and comparisons"""
        try:
            # Convert detailed_responses to DataFrame
            responses_df = pd.DataFrame(data.get('detailed_responses', []))
            
            # Process metrics
            enhanced_data = {
                'quality_metrics': self.analyze_technical_accuracy(responses_df),
                'actionability': self.measure_actionable_advice(responses_df),
                'strategic_depth': self.evaluate_strategic_depth(responses_df),
                'comparative': self.perform_comparative_analysis(responses_df),
                'ux_metrics': self.analyze_user_experience(responses_df)
            }
            
            # Calculate overall metrics
            enhanced_data['overall_performance'] = self.calculate_overall_performance(
                enhanced_data['quality_metrics'],
                enhanced_data['actionability'],
                enhanced_data['strategic_depth']
            )
            
            # Add term statistics
            enhanced_data['term_stats'] = self.calculate_term_stats(enhanced_data['quality_metrics'])
            
            # Add category statistics from original data
            if 'category_stats' in data:
                enhanced_data['category_stats'] = data['category_stats']
            
            return enhanced_data
                
        except Exception as e:
            print(f"Error in enhance_analysis: {str(e)}")
            raise

    def analyze_technical_accuracy(self, df: pd.DataFrame) -> Dict[str, Dict[str, float]]:
        """Analyze technical accuracy of responses"""
        metrics = {}
        
        try:
            for idx, row in df.iterrows():
                ft_response = row.get('fine_tuned_response', '')
                base_response = row.get('base_response', '')
                
                metrics[f'sample_{idx}'] = {
                    'ft': {
                        'term_density': self._calculate_term_density(ft_response),
                        'math_accuracy': self._verify_mathematical_statements(ft_response),
                        'strategic_consistency': self._check_strategic_consistency(ft_response)
                    },
                    'base': {
                        'term_density': self._calculate_term_density(base_response),
                        'math_accuracy': self._verify_mathematical_statements(base_response),
                        'strategic_consistency': self._check_strategic_consistency(base_response)
                    }
                }
            
            return metrics
        except Exception as e:
            print(f"Error in analyze_technical_accuracy: {str(e)}")
            return {}

    def measure_actionable_advice(self, df: pd.DataFrame) -> Dict[str, Dict[str, Any]]:
        """Measure the actionability of advice"""
        actionability_metrics = {}
        
        try:
            for idx, row in df.iterrows():
                ft_response = row.get('fine_tuned_response', '')
                base_response = row.get('base_response', '')
                
                # Process action verbs
                ft_actions = self._extract_action_verbs(ft_response)
                base_actions = self._extract_action_verbs(base_response)
                
                # Process decision points
                ft_decisions = self._identify_decision_points(ft_response)
                base_decisions = self._identify_decision_points(base_response)
                
                # Count implementation steps
                ft_steps = self._count_implementation_steps(ft_response)
                base_steps = self._count_implementation_steps(base_response)
                
                # Calculate word counts for density
                ft_words = len(word_tokenize(ft_response))
                base_words = len(word_tokenize(base_response))
                
                actionability_metrics[f'sample_{idx}'] = {
                    'ft': {
                        'action_density': len(ft_actions) / ft_words if ft_words > 0 else 0,
                        'decision_points': len(ft_decisions),
                        'implementation_steps': ft_steps
                    },
                    'base': {
                        'action_density': len(base_actions) / base_words if base_words > 0 else 0,
                        'decision_points': len(base_decisions),
                        'implementation_steps': base_steps
                    }
                }
            
            return actionability_metrics
        except Exception as e:
            print(f"Error in measure_actionable_advice: {str(e)}")
            return {}

    def evaluate_strategic_depth(self, df: pd.DataFrame) -> Dict[str, Dict[str, Any]]:
        """Evaluate the strategic depth of responses"""
        depth_metrics = {}
        
        try:
            for idx, row in df.iterrows():
                ft_response = row.get('fine_tuned_response', '')
                base_response = row.get('base_response', '')
                query = row.get('query', '')
                
                depth_metrics[f'sample_{idx}'] = {
                    'ft': {
                        'reasoning_levels': self._analyze_reasoning_levels(ft_response),
                        'concept_coverage': self._measure_concept_coverage(ft_response),
                        'situation_adaptation': self._assess_situation_adaptation(ft_response, query)
                    },
                    'base': {
                        'reasoning_levels': self._analyze_reasoning_levels(base_response),
                        'concept_coverage': self._measure_concept_coverage(base_response),
                        'situation_adaptation': self._assess_situation_adaptation(base_response, query)
                    }
                }
            
            return depth_metrics
        except Exception as e:
            print(f"Error in evaluate_strategic_depth: {str(e)}")
            return {}

    def analyze_user_experience(self, df: pd.DataFrame) -> Dict[str, Dict[str, Any]]:
        """Analyze user experience metrics"""
        ux_metrics = {}
        
        try:
            for idx, row in df.iterrows():
                ft_response = row.get('fine_tuned_response', '')
                base_response = row.get('base_response', '')
                
                ux_metrics[f'sample_{idx}'] = {
                    'ft': {
                        'readability': {
                            'flesch_reading_ease': textstat.flesch_reading_ease(ft_response),
                            'flesch_kincaid_grade': textstat.flesch_kincaid_grade(ft_response),
                            'gunning_fog': textstat.gunning_fog(ft_response)
                        },
                        'structure': self._analyze_response_structure(ft_response),
                        'info_density': self._calculate_information_density(ft_response)
                    },
                    'base': {
                        'readability': {
                            'flesch_reading_ease': textstat.flesch_reading_ease(base_response),
                            'flesch_kincaid_grade': textstat.flesch_kincaid_grade(base_response),
                            'gunning_fog': textstat.gunning_fog(base_response)
                        },
                        'structure': self._analyze_response_structure(base_response),
                        'info_density': self._calculate_information_density(base_response)
                    }
                }
            
            return ux_metrics
        except Exception as e:
            print(f"Error in analyze_user_experience: {str(e)}")
            return {}

    def perform_comparative_analysis(self, df: pd.DataFrame) -> Dict[str, Dict[str, float]]:
        """Perform comparative analysis using BERT embeddings"""
        comparative_metrics = {}
        
        try:
            for idx, row in df.iterrows():
                ft_response = row.get('fine_tuned_response', '')
                base_response = row.get('base_response', '')
                query = row.get('query', '')
                
                # Get embeddings
                ft_emb = self._get_embeddings(ft_response)
                base_emb = self._get_embeddings(base_response)
                query_emb = self._get_embeddings(query)
                
                # Calculate similarities
                ft_similarity = float(cosine_similarity(ft_emb, query_emb)[0][0])
                base_similarity = float(cosine_similarity(base_emb, query_emb)[0][0])
                
                comparative_metrics[f'sample_{idx}'] = {
                    'ft_semantic_similarity': ft_similarity,
                    'base_semantic_similarity': base_similarity,
                    'relative_improvement': ((ft_similarity - base_similarity) / base_similarity * 100) 
                        if base_similarity > 0 else 0
                }
            
            return comparative_metrics
        except Exception as e:
            print(f"Error in perform_comparative_analysis: {str(e)}")
            return {}

    def _load_poker_terms(self) -> Dict[str, List[str]]:
        """Load poker-specific terminology"""
        return {
            'basic_terms': ['call', 'raise', 'fold', 'bet', 'check', 'all-in'],
            'positions': ['UTG', 'MP', 'CO', 'BTN', 'SB', 'BB'],
            'hand_types': ['pair', 'straight', 'flush', 'full house', 'quads'],
            'technical_terms': ['GTO', 'EV', 'ICM', 'SPR', 'MDF', 'polarized'],
            'strategic_terms': ['range', 'equity', 'blockers', 'protection', 'balance']
        }

    def _load_strategic_concepts(self) -> Dict[str, List[str]]:
        """Load strategic poker concepts"""
        return {
            'basic_strategy': ['position', 'pot odds', 'implied odds', 'stack size'],
            'advanced_strategy': ['range advantage', 'board coverage', 'removal effects'],
            'tournament_concepts': ['ICM pressure', 'bubble factor', 'pay jumps'],
            'exploitative_concepts': ['player tendencies', 'adjustments', 'tells'],
            'game_theory': ['equilibrium', 'mixed strategy', 'optimal play']
        }

    def _load_action_verbs(self) -> Dict[str, List[str]]:
        """Load poker-specific action verbs"""
        return {
            'basic_actions': ['bet', 'call', 'raise', 'fold', 'check'],
            'advanced_actions': ['isolate', 'squeeze', '3-bet', '4-bet', 'jam'],
            'strategic_actions': ['balance', 'protect', 'exploit', 'adjust', 'target']
        }

    def _get_embeddings(self, text: str) -> torch.Tensor:
        """Get BERT embeddings with explicit device placement"""
        try:
            inputs = self.tokenizer(text, return_tensors='pt', padding=True, truncation=True)
            inputs = {k: v.to(self.device) for k, v in inputs.items()}
            
            with torch.no_grad():
                outputs = self.model(**inputs)
                embeddings = outputs.last_hidden_state.mean(dim=1).cpu()
            return embeddings
        except Exception as e:
            print(f"Error generating embeddings: {str(e)}")
            # Return zero tensor as fallback
            return torch.zeros((1, self.model.config.hidden_size))

    def _calculate_term_density(self, text: str) -> float:
        """Calculate poker term density in text"""
        words = word_tokenize(text.lower())
        total_words = len(words)
        term_count = 0
        
        for category in self.poker_terms.values():
            term_count += sum(1 for term in category if term.lower() in words)
            
        return term_count / total_words if total_words > 0 else 0

    def _verify_mathematical_statements(self, text: str) -> float:
        """Verify mathematical accuracy of poker calculations"""
        patterns = {
            'pot_odds': r'(\d+(?:\.\d+)?)\s*%\s*pot odds',
            'equity': r'(\d+(?:\.\d+)?)\s*%\s*equity',
            'implied_odds': r'(\d+(?:\.\d+)?)\s*:\s*1'
        }
        
        math_statements = {}
        for key, pattern in patterns.items():
            matches = re.finditer(pattern, text.lower())
            math_statements[key] = [float(m.group(1)) for m in matches]
        
        consistency_score = 0
        if math_statements['pot_odds']:
            consistency_score += all(0 <= x <= 100 for x in math_statements['pot_odds'])
        if math_statements['equity']:
            consistency_score += all(0 <= x <= 100 for x in math_statements['equity'])
        
        total_checks = len(patterns)
        actual_checks = sum(1 for key in math_statements if math_statements[key])
        return consistency_score / actual_checks if actual_checks > 0 else 0

    def _check_strategic_consistency(self, text: str) -> float:
        """Check consistency of strategic advice"""
        sentences = sent_tokenize(text)
        contradictions = 0
        strategy_statements = []
        
        for sent in sentences:
            if any(term in sent.lower() for term in self.strategic_concepts['basic_strategy']):
                strategy_statements.append(sent)
        
        for i, stmt1 in enumerate(strategy_statements):
            for stmt2 in strategy_statements[i+1:]:
                if self._detect_contradiction(stmt1, stmt2):
                    contradictions += 1
        
        total_possible = len(strategy_statements) * (len(strategy_statements) - 1) / 2
        return 1 - (contradictions / total_possible) if total_possible > 0 else 1

    def _detect_contradiction(self, stmt1: str, stmt2: str) -> bool:
        """Detect potential contradictions between statements"""
        emb1 = self._get_embeddings(stmt1)
        emb2 = self._get_embeddings(stmt2)
        
        similarity = cosine_similarity(emb1, emb2)
        
        opposing_pairs = [
            ('always', 'never'),
            ('increase', 'decrease'),
            ('aggressive', 'passive'),
            ('wide', 'tight'),
            ('call', 'fold'),
            ('raise', 'check')
        ]
        
        stmt1_lower = stmt1.lower()
        stmt2_lower = stmt2.lower()
        
        for word1, word2 in opposing_pairs:
            if (word1 in stmt1_lower and word2 in stmt2_lower) or \
               (word2 in stmt1_lower and word1 in stmt2_lower):
                if similarity < 0.8:
                    return True
        return False

    def _extract_action_verbs(self, text: str) -> List[str]:
        """Extract poker-specific action verbs"""
        words = word_tokenize(text)
        pos_tags = nltk.pos_tag(words)
        action_verbs = []
        
        for word, tag in pos_tags:
            if tag.startswith('VB'):
                if any(word.lower() in actions for actions in self.action_verbs.values()):
                    action_verbs.append(word)
        return action_verbs

    def _identify_decision_points(self, text: str) -> List[str]:
        """Identify clear decision points in response"""
        sentences = sent_tokenize(text)
        decision_points = []
        
        decision_indicators = ['if', 'when', 'should', 'choose', 'decide']
        for sent in sentences:
            if any(indicator in sent.lower() for indicator in decision_indicators):
                decision_points.append(sent)
        return decision_points

    def _count_implementation_steps(self, text: str) -> int:
        """Count clear implementation steps"""
        sentences = sent_tokenize(text)
        step_indicators = ['first', 'second', 'then', 'next', 'finally', 'step']
        numbered_steps = len(re.findall(r'^\d+\.', text, re.MULTILINE))
        indicator_steps = sum(1 for sent in sentences if any(ind in sent.lower() for ind in step_indicators))
        return numbered_steps + indicator_steps

    def _analyze_reasoning_levels(self, text: str) -> Dict[str, int]:
        """Analyze levels of strategic reasoning"""
        sentences = sent_tokenize(text)
        reasoning_levels = {
            'level1': 0,  # Direct strategy
            'level2': 0,  # Opponent consideration
            'level3': 0   # Meta-game/adaptation
        }
        
        for sent in sentences:
            if any(term in sent.lower() for term in self.strategic_concepts['game_theory']):
                reasoning_levels['level1'] += 1
            if any(term in sent.lower() for term in self.strategic_concepts['exploitative_concepts']):
                reasoning_levels['level2'] += 1
            if any(term in sent.lower() for term in ['adjust', 'adapt', 'exploit', 'change']):
                reasoning_levels['level3'] += 1
        return reasoning_levels

    def _measure_concept_coverage(self, text: str) -> Dict[str, float]:
        """Measure coverage of poker concepts"""
        concept_coverage = {}
        for category, concepts in self.strategic_concepts.items():
            coverage = sum(1 for concept in concepts if concept.lower() in text.lower())
            concept_coverage[category] = coverage / len(concepts) if concepts else 0
        return concept_coverage

    def _assess_situation_adaptation(self, response: str, query: str) -> float:
        """Assess how well response adapts to specific situation"""
        query_keywords = set(word_tokenize(query.lower()))
        response_keywords = set(word_tokenize(response.lower()))
        
        relevance = len(query_keywords.intersection(response_keywords)) / len(query_keywords) if query_keywords else 0
        situation_specific = any(term in response.lower() for term in ['in this case', 'given the', 'in this situation'])
        
        return (relevance + (1 if situation_specific else 0)) / 2

    def _analyze_response_structure(self, text: str) -> Dict[str, int]:
        """Analyze the structure of the response"""
        return {
            'paragraphs': len(text.split('\n\n')),
            'bullet_points': len(re.findall(r'•|\*|\-\s', text)),
            'numbered_items': len(re.findall(r'^\d+\.', text, re.MULTILINE)),
            'sections': len(re.findall(r'#{1,3}\s', text))
        }

    def _calculate_information_density(self, text: str) -> Dict[str, float]:
        """Calculate information density metrics"""
        words = word_tokenize(text)
        stop_words = set(stopwords.words('english'))
        
        total_words = len(words)
        content_words = len([w for w in words if w.lower() not in stop_words])
        poker_terms = sum(1 for w in words if any(w.lower() in terms for terms in self.poker_terms.values()))
        
        return {
            'content_ratio': content_words / total_words if total_words > 0 else 0,
            'poker_term_ratio': poker_terms / total_words if total_words > 0 else 0,
            'avg_sentence_length': len(words) / len(sent_tokenize(text)) if sent_tokenize(text) else 0
        }

    def calculate_overall_performance(self, quality_metrics: Dict, 
                               actionability: Dict, 
                               strategic_depth: Dict) -> Dict[str, float]:
        """Calculate overall performance metrics"""
        try:
            # Extract numeric values from metrics
            ft_metrics = []
            base_metrics = []
            
            for metrics in [quality_metrics, actionability, strategic_depth]:
                for sample in metrics.values():
                    if isinstance(sample, dict) and 'ft' in sample and 'base' in sample:
                        # Convert nested dictionaries to averages
                        ft_val = np.mean([v for v in sample['ft'].values() 
                                        if isinstance(v, (int, float))])
                        base_val = np.mean([v for v in sample['base'].values() 
                                        if isinstance(v, (int, float))])
                        ft_metrics.append(ft_val)
                        base_metrics.append(base_val)
            
            # Calculate averages
            avg_ft = np.mean(ft_metrics) if ft_metrics else 0
            avg_base = np.mean(base_metrics) if base_metrics else 0
            
            return {
                'overall_ft_score': float(avg_ft),
                'overall_base_score': float(avg_base),
                'overall_improvement': float(((avg_ft - avg_base) / avg_base * 100) 
                                        if avg_base > 0 else 0)
            }
        except Exception as e:
            print(f"Error calculating overall performance: {str(e)}")
            return {
                'overall_ft_score': 0,
                'overall_base_score': 0,
                'overall_improvement': 0
            }

    def calculate_term_stats(self, quality_metrics: Dict) -> Dict[str, float]:
        """Calculate term density statistics"""
        try:
            ft_densities = []
            base_densities = []
            
            for metrics in quality_metrics.values():
                if isinstance(metrics, dict) and 'ft' in metrics and 'base' in metrics:
                    if 'term_density' in metrics['ft']:
                        ft_densities.append(metrics['ft']['term_density'])
                    if 'term_density' in metrics['base']:
                        base_densities.append(metrics['base']['term_density'])
            
            return {
                'ft_term_density': np.mean(ft_densities) if ft_densities else 0,
                'base_term_density': np.mean(base_densities) if base_densities else 0
            }
        except Exception as e:
            print(f"Error calculating term stats: {str(e)}")
            return {
                'ft_term_density': 0,
                'base_term_density': 0
            }

    def analyze_by_category(self, df: pd.DataFrame) -> Dict[str, Dict[str, Any]]:
        """Analyze responses grouped by category"""
        category_metrics = {}
        
        for category in df['category'].unique():
            category_data = df[df['category'] == category]
            category_metrics[category] = {
                'ft_response_time': {
                    'mean': category_data['ft_response_time'].mean(),
                    'std': category_data['ft_response_time'].std()
                },
                'base_response_time': {
                    'mean': category_data['base_response_time'].mean(),
                    'std': category_data['base_response_time'].std()
                },
                'ft_response_length': category_data['ft_response_length'].mean(),
                'base_response_length': category_data['base_response_length'].mean()
            }
        
        return category_metrics

    def _process_response(self, row: pd.Series) -> Dict[str, str]:
        """Process a single response row"""
        return {
            'fine_tuned_response': row.get('fine_tuned_response', ''),
            'base_response': row.get('base_response', ''),
            'category': row.get('category', ''),
            'query': row.get('query', '')
        }

In [63]:
def process_analysis_data(file_path: str, analyzer: PokerResponseAnalyzer) -> Dict[str, Any]:
    """
    Process the analysis data from JSON file and structure it for visualization
    
    Args:
        file_path: Path to the JSON analysis file
        analyzer: Instance of PokerResponseAnalyzer
    """
    try:
        # Load JSON file
        with open(file_path, 'r') as f:
            analysis_data = json.load(f)
        
        # Convert detailed_responses to DataFrame
        responses_df = pd.DataFrame(analysis_data['detailed_responses'])
        
        # Get enhanced analysis from analyzer
        enhanced_metrics = analyzer.enhance_analysis({
            'detailed_responses': analysis_data['detailed_responses'],
            'category_stats': analysis_data['category_stats']
        })
        
        # Structure the data for analysis
        structured_data = {
            'responses': {
                idx: {
                    'category': row['category'],
                    'query': row['query'],
                    'fine_tuned_response': row['fine_tuned_response'],
                    'base_response': row['base_response']
                }
                for idx, row in responses_df.iterrows()
            },
            'quality_metrics': enhanced_metrics['quality_metrics'],
            'actionability': enhanced_metrics['actionability'],
            'strategic_depth': enhanced_metrics['strategic_depth'],
            'term_stats': enhanced_metrics['term_stats'],
            'category_stats': enhanced_metrics['category_stats'],
            'metadata': {
                'sample_count': len(responses_df),
                'categories': list(analysis_data['category_stats'].keys()),
                'analysis_date': os.path.basename(file_path).split('_')[2].split('.')[0]
            }
        }
        
        return structured_data
        
    except Exception as e:
        print(f"Error processing analysis data: {str(e)}")
        raise

def print_analysis_results(enhanced_analysis: Dict[str, Any]) -> None:
    """Print formatted analysis results"""
    try:
        print("\nAnalysis Results:")
        print("=" * 50)
        
        # Quality Metrics
        print("\nQuality Metrics:")
        print("-" * 20)
        for sample, metrics in enhanced_analysis['quality_metrics'].items():
            print(f"\n{sample}:")
            print(f"Fine-tuned: {metrics['ft']}")
            print(f"Base: {metrics['base']}")
        
        # Actionability Metrics
        print("\nActionability Metrics:")
        print("-" * 20)
        for sample, metrics in enhanced_analysis['actionability'].items():
            print(f"\n{sample}:")
            print(f"Fine-tuned action density: {metrics['ft']['action_density']:.3f}")
            print(f"Base action density: {metrics['base']['action_density']:.3f}")
        
        # Strategic Depth
        print("\nStrategic Depth:")
        print("-" * 20)
        for sample, metrics in enhanced_analysis['strategic_depth'].items():
            print(f"\n{sample}:")
            print("Fine-tuned reasoning levels:", metrics['ft']['reasoning_levels'])
            print("Base reasoning levels:", metrics['base']['reasoning_levels'])
        
        # Term Usage
        print("\nTerm Usage:")
        print("-" * 20)
        term_stats = enhanced_analysis['term_stats']
        print(f"Fine-tuned term density: {term_stats['ft_term_density']:.3f}")
        print(f"Base term density: {term_stats['base_term_density']:.3f}")
        
        # Metadata
        print("\nMetadata:")
        print("-" * 20)
        print(f"Total samples: {enhanced_analysis['metadata']['sample_count']}")
        print(f"Categories: {len(enhanced_analysis['metadata']['categories'])}")
        print(f"Analysis date: {enhanced_analysis['metadata']['analysis_date']}")
            
    except Exception as e:
        print(f"Error printing analysis results: {str(e)}")

In [64]:
# Initialize analyzer
analyzer = PokerResponseAnalyzer()

# Process the analysis data
file_path = '../data/final/detailed_analysis_20241216_142035.json'
structured_data = process_analysis_data(file_path, analyzer)

# Print results
print_analysis_results(structured_data)

Using device: mps


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Model device: mps:0

Analysis Results:

Quality Metrics:
--------------------

sample_0:
Fine-tuned: {'term_density': 0.0, 'math_accuracy': 0, 'strategic_consistency': 1.0}
Base: {'term_density': 0.0065040650406504065, 'math_accuracy': 0, 'strategic_consistency': 0.987012987012987}

sample_1:
Fine-tuned: {'term_density': 0.02666666666666667, 'math_accuracy': 0, 'strategic_consistency': 1.0}
Base: {'term_density': 0.0060882800608828, 'math_accuracy': 0.0, 'strategic_consistency': 1.0}

sample_2:
Fine-tuned: {'term_density': 0.0028735632183908046, 'math_accuracy': 0, 'strategic_consistency': 1.0}
Base: {'term_density': 0.00641025641025641, 'math_accuracy': 0, 'strategic_consistency': 1.0}

sample_3:
Fine-tuned: {'term_density': 0.023809523809523808, 'math_accuracy': 0, 'strategic_consistency': 1}
Base: {'term_density': 0.007155635062611807, 'math_accuracy': 0.0, 'strategic_consistency': 1}

sample_4:
Fine-tuned: {'term_density': 0.030303030303030304, 'math_accuracy': 0, 'strategic_consis

In [82]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from typing import Dict, List
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from scipy import stats

class PokerResponseVisualizer:
    def __init__(self, style='plotly'):
        """Initialize visualizer with style preference."""
        self.style = style
        self.colors = {
            'ft': '#2ecc71',  # Green for fine-tuned
            'base': '#3498db', # Blue for base
            'background': '#f9f9f9',
            'grid': '#ecf0f1'
        }
        if style == 'matplotlib':
            plt.style.use('seaborn')
            
    def visualize_comprehensive_analysis(self, analysis_results: Dict):
        """Create comprehensive visualization dashboard."""
        if self.style == 'plotly':
            return self._create_plotly_dashboard(analysis_results)
        else:
            return self._create_matplotlib_dashboard(analysis_results)

    def _create_plotly_dashboard(self, results: Dict):
        """Create interactive Plotly dashboard."""
        # Create subplot figure
        fig = make_subplots(
            rows=3, cols=2,
            subplot_titles=(
                'Quality Metrics Comparison',
                'Strategic Depth Analysis',
                'Actionability Metrics',
                'Response Time Analysis',
                'Information Density',
                'Overall Performance'
            )
        )
    
        # 1. Quality Metrics
        quality_metrics = self._prepare_quality_metrics(results)
        fig.add_trace(
            go.Bar(
                name='Fine-tuned Model',
                x=quality_metrics['metrics'],
                y=quality_metrics['ft_scores'],
                marker_color='blue'
            ),
            row=1, col=1
        )
        fig.add_trace(
            go.Bar(
                name='Base Model',
                x=quality_metrics['metrics'],
                y=quality_metrics['base_scores'],
                marker_color='red'
            ),
            row=1, col=1
        )
        
        # 2. Strategic Depth
        strategic_depth = self._prepare_strategic_depth(results)
        fig.add_trace(
            go.Heatmap(
                z=[strategic_depth['ft_values'], strategic_depth['base_values']],
                x=strategic_depth['levels'],
                y=['Fine-tuned', 'Base'],
                colorscale='Viridis'
            ),
            row=1, col=2
        )
        # 3. Actionability Metrics Bar Chart
        actionability = self._prepare_actionability_metrics(results)
        fig.add_trace(
            go.Bar(
                x=actionability['metrics'],
                y=actionability['ft_scores'],
                name='Fine-tuned',
                marker_color=self.colors['ft']
            ),
            row=2, col=1
        )
        fig.add_trace(
            go.Bar(
                x=actionability['metrics'],
                y=actionability['base_scores'],
                name='Base',
                marker_color=self.colors['base']
            ),
            row=2, col=1
        )

        # 4. Readability Scores Bar Chart
        readability = self._prepare_readability_metrics(results)
        fig.add_trace(
            go.Bar(
                x=readability['metrics'],
                y=readability['ft_scores'],
                name='Fine-tuned',
                marker_color=self.colors['ft']
            ),
            row=2, col=2
        )
        fig.add_trace(
            go.Bar(
                x=readability['metrics'],
                y=readability['base_scores'],
                name='Base',
                marker_color=self.colors['base']
            ),
            row=2, col=2
        )

        # 5. Semantic Similarity Scatter Plot
        semantic_similarity = self._prepare_semantic_similarity(results)
        fig.add_trace(
            go.Scatter(
                x=semantic_similarity['ft_similarity'],
                y=semantic_similarity['base_similarity'],
                mode='markers',
                name='Semantic Similarity',
                marker=dict(
                    size=10,
                    color=self.colors['ft'],
                    symbol='circle'
                )
            ),
            row=3, col=1
        )
        fig.add_trace(
            go.Scatter(
                x=semantic_similarity['ft_similarity'],
                y=semantic_similarity['base_similarity'],
                mode='markers',
                name='',
                marker=dict(
                    size=10,
                    color=self.colors['base'],
                    symbol='circle'
                )
            ),
            row=3, col=1
        )

        # 6. Information Density Scatter Plot
        information_density = self._prepare_information_density(results)
        fig.add_trace(
            go.Scatter(
                x=information_density['ft_content_ratio'],
                y=information_density['ft_info_density'],
                mode='markers',
                name='Fine-tuned',
                marker=dict(
                    size=10,
                    color=self.colors['ft'],
                    symbol='circle'
                )
            ),
            row=3, col=2
        )
        fig.add_trace(
            go.Scatter(
                x=information_density['base_content_ratio'],
                y=information_density['base_info_density'],
                mode='markers',
                name='Base',
                marker=dict(
                    size=10,
                    color=self.colors['base'],
                    symbol='circle'
                )
            ),
            row=3, col=2
        )

        # Update layout
        fig.update_layout(
            height=1200,
            width=1600,
            showlegend=True,
            template='plotly_white',
            title_text="Comprehensive Poker Response Analysis"
        )

        return fig

    def _create_matplotlib_dashboard(self, results: Dict):
        """Create static Matplotlib dashboard."""
        fig = plt.figure(figsize=(20, 15))
        gs = fig.add_gridspec(3, 2)
        
        # 1. Quality Metrics Radar
        ax1 = fig.add_subplot(gs[0, 0], polar=True)
        self._plot_quality_radar(results, ax1)
        
        # 2. Strategic Depth Heatmap
        ax2 = fig.add_subplot(gs[0, 1])
        self._plot_strategic_depth_heatmap(results, ax2)
        
        # 3. Actionability Metrics
        ax3 = fig.add_subplot(gs[1, 0])
        self._plot_actionability_metrics(results, ax3)
        
        # 4. Readability Scores
        ax4 = fig.add_subplot(gs[1, 1])
        self._plot_readability_scores(results, ax4)
        
        # 5. Semantic Similarity Scatter
        ax5 = fig.add_subplot(gs[2, 0])
        self._plot_semantic_similarity(results, ax5)
        
        # 6. Information Density Scatter
        ax6 = fig.add_subplot(gs[2, 1])
        self._plot_information_density(results, ax6)
        
        plt.tight_layout()
        return fig

    def _prepare_quality_metrics(self, results: Dict) -> Dict:
        metrics = []
        ft_scores = []
        base_scores = []
        
        metric_keys = ['term_density', 'math_accuracy', 'strategic_consistency']
        
        # Collect metric values from all samples
        for metric_data in results['quality_metrics'].values():
            # metric_data has the form: {'ft': {...}, 'base': {...}}
            for key in metric_keys:
                if isinstance(metric_data['ft'], dict) and key in metric_data['ft']:
                    metrics.append(key)
                    ft_scores.append(metric_data['ft'][key])
                    base_scores.append(metric_data['base'][key])
        
        return {
            'metrics': metrics,
            'ft_scores': ft_scores,
            'base_scores': base_scores
        }


    def _prepare_strategic_depth(self, results: Dict) -> Dict:
        """Prepare strategic depth metrics for visualization."""
        try:
            ft_values = []
            base_values = []
            levels = ['Level 1', 'Level 2', 'Level 3']
            
            # Calculate means for each reasoning level across all samples
            for level in ['level1', 'level2', 'level3']:
                ft_avg = np.mean([v['ft']['reasoning_levels'][level] 
                                  for v in results['strategic_depth'].values()])
                base_avg = np.mean([v['base']['reasoning_levels'][level] 
                                   for v in results['strategic_depth'].values()])
                ft_values.append(ft_avg)
                base_values.append(base_avg)
            
            return {
                'levels': levels,
                'ft_values': ft_values,
                'base_values': base_values,
                'concept_coverage': {
                    'ft': np.mean([v['ft']['concept_coverage']['basic_strategy'] 
                                   for v in results['strategic_depth'].values()]),
                    'base': np.mean([v['base']['concept_coverage']['basic_strategy'] 
                                     for v in results['strategic_depth'].values()])
                }
            }
        except Exception as e:
            print(f"Error preparing strategic depth: {str(e)}")
            return {
                'levels': ['Level 1', 'Level 2', 'Level 3'],
                'ft_values': [0, 0, 0],
                'base_values': [0, 0, 0],
                'concept_coverage': {'ft': 0, 'base': 0}
            }

    def _prepare_actionability_metrics(self, results: Dict) -> Dict:
        ft_action_density = [v['ft']['action_density'] for v in results['actionability'].values()]
        ft_decision_points = [v['ft']['decision_points'] for v in results['actionability'].values()]
        ft_implementation_steps = [v['ft']['implementation_steps'] for v in results['actionability'].values()]
        
        base_action_density = [v['base']['action_density'] for v in results['actionability'].values()]
        base_decision_points = [v['base']['decision_points'] for v in results['actionability'].values()]
        base_implementation_steps = [v['base']['implementation_steps'] for v in results['actionability'].values()]

        ft_scores = [
            np.mean(ft_action_density),
            np.mean(ft_decision_points),
            np.mean(ft_implementation_steps)
        ]
        base_scores = [
            np.mean(base_action_density),
            np.mean(base_decision_points),
            np.mean(base_implementation_steps)
        ]
        
        return {
            'metrics': ['Action Density', 'Decision Points', 'Implementation Steps'],
            'ft_scores': ft_scores,
            'base_scores': base_scores
        }


    def _prepare_readability_metrics(self, results: Dict) -> Dict:
        ft_fre = [v['ft']['readability']['flesch_reading_ease'] for v in results['ux_metrics'].values()]
        ft_fk = [v['ft']['readability']['flesch_kincaid_grade'] for v in results['ux_metrics'].values()]
        ft_gf = [v['ft']['readability']['gunning_fog'] for v in results['ux_metrics'].values()]
        
        base_fre = [v['base']['readability']['flesch_reading_ease'] for v in results['ux_metrics'].values()]
        base_fk = [v['base']['readability']['flesch_kincaid_grade'] for v in results['ux_metrics'].values()]
        base_gf = [v['base']['readability']['gunning_fog'] for v in results['ux_metrics'].values()]

        ft_scores = [
            np.mean(ft_fre),
            np.mean(ft_fk),
            np.mean(ft_gf)
        ]
        base_scores = [
            np.mean(base_fre),
            np.mean(base_fk),
            np.mean(base_gf)
        ]
        
        return {
            'metrics': ['Flesch Reading Ease', 'Flesch-Kincaid Grade', 'Gunning Fog'],
            'ft_scores': ft_scores,
            'base_scores': base_scores
        }

    def _prepare_semantic_similarity(self, results: Dict) -> Dict:
        ft_similarity = [v['ft_semantic_similarity'] for v in results['comparative'].values()]
        base_similarity = [v['base_semantic_similarity'] for v in results['comparative'].values()]
        return {
            'ft_similarity': ft_similarity,
            'base_similarity': base_similarity
        }


    def _prepare_information_density(self, results: Dict) -> Dict:
        ft_content_ratio = [v['ft']['info_density']['content_ratio'] for v in results['ux_metrics'].values()]
        base_content_ratio = [v['base']['info_density']['content_ratio'] for v in results['ux_metrics'].values()]
        ft_poker_term_ratio = [v['ft']['info_density']['poker_term_ratio'] for v in results['ux_metrics'].values()]
        base_poker_term_ratio = [v['base']['info_density']['poker_term_ratio'] for v in results['ux_metrics'].values()]

        return {
            'ft_content_ratio': ft_content_ratio,
            'base_content_ratio': base_content_ratio,
            'ft_info_density': ft_poker_term_ratio,
            'base_info_density': base_poker_term_ratio
        }


    def _plot_quality_radar(self, results: Dict, ax):
        """Plot quality metrics radar chart using matplotlib."""
        quality = self._prepare_quality_metrics(results)
        metrics = quality['metrics']
        ft_scores = quality['ft_scores']
        base_scores = quality['base_scores']
        num_metrics = len(metrics)

        angles = np.linspace(0, 2 * np.pi, num_metrics, endpoint=False).tolist()
        angles += angles[:1]  # Complete the loop

        ft_scores += [ft_scores[0]]
        base_scores += [base_scores[0]]

        ax.plot(angles, ft_scores, 'o-', linewidth=2, label='Fine-tuned', color=self.colors['ft'])
        ax.fill(angles, ft_scores, alpha=0.25, color=self.colors['ft'])
        ax.plot(angles, base_scores, 'o-', linewidth=2, label='Base', color=self.colors['base'])
        ax.fill(angles, base_scores, alpha=0.25, color=self.colors['base'])

        ax.set_xticks(angles[:-1])
        ax.set_xticklabels(metrics)
        ax.set_title('Quality Metrics Radar', pad=20)
        ax.legend(loc='upper right', bbox_to_anchor=(0.1, 0.1))

    def _plot_strategic_depth_heatmap(self, results: Dict, ax):
        """Plot strategic depth heatmap using matplotlib.

        We have levels (e.g., Level 1, Level 2, Level 3) along the columns
        and models (Fine-tuned, Base) along the rows.
        """
        strategic_depth = self._prepare_strategic_depth(results)
        levels = strategic_depth['levels']
        ft_values = strategic_depth['ft_values']  # a list of 3 values (one per level)
        base_values = strategic_depth['base_values']  # a list of 3 values (one per level)

        # Create a 2D array for the heatmap: two rows (FT, Base), three columns (levels)
        data = np.array([ft_values, base_values])

        sns.heatmap(
            data,
            annot=True,
            fmt=".2f",
            cmap='viridis',
            ax=ax,
            cbar=True,
            linewidths=.5
        )
        ax.set_yticklabels(['Fine-tuned', 'Base'], rotation=0)
        ax.set_xticklabels(levels, rotation=45)
        ax.set_xlabel('Reasoning Levels', labelpad=10)
        ax.set_ylabel('Model', labelpad=10)
        ax.set_title('Strategic Depth Heatmap', pad=20)

    def _plot_actionability_metrics(self, results: Dict, ax):
        """Plot actionability metrics bar chart using matplotlib."""
        action = self._prepare_actionability_metrics(results)
        metrics = action['metrics']
        ft_scores = action['ft_scores']
        base_scores = action['base_scores']

        x = np.arange(len(metrics))
        width = 0.35

        ax.bar(x - width/2, ft_scores, width, label='Fine-tuned', color=self.colors['ft'])
        ax.bar(x + width/2, base_scores, width, label='Base', color=self.colors['base'])

        ax.set_xticks(x)
        ax.set_xticklabels(metrics, rotation=45)
        ax.set_ylabel('Average Score')
        ax.set_title('Actionability Metrics')
        ax.legend()

    def _plot_readability_scores(self, results: Dict, ax):
        """Plot readability scores bar chart using matplotlib."""
        readability = self._prepare_readability_metrics(results)
        metrics = readability['metrics']
        ft_scores = readability['ft_scores']
        base_scores = readability['base_scores']

        x = np.arange(len(metrics))
        width = 0.35

        ax.bar(x - width/2, ft_scores, width, label='Fine-tuned', color=self.colors['ft'])
        ax.bar(x + width/2, base_scores, width, label='Base', color=self.colors['base'])

        ax.set_xticks(x)
        ax.set_xticklabels(metrics, rotation=45)
        ax.set_ylabel('Score')
        ax.set_title('Readability Scores')
        ax.legend()

    def _plot_semantic_similarity(self, results: Dict, ax):
        """Plot semantic similarity scatter plot using matplotlib."""
        semantic = self._prepare_semantic_similarity(results)
        ft_similarity = semantic['ft_similarity']
        base_similarity = semantic['base_similarity']

        ax.scatter(ft_similarity, base_similarity, color=self.colors['ft'], label='Fine-tuned', alpha=0.7)
        ax.scatter(ft_similarity, base_similarity, color=self.colors['base'], label='Base', alpha=0.7)

        ax.set_xlabel('Fine-tuned Semantic Similarity')
        ax.set_ylabel('Base Semantic Similarity')
        ax.set_title('Semantic Similarity Comparison')
        ax.legend()

    def _plot_information_density(self, results: Dict, ax):
        """Plot information density scatter plot using matplotlib."""
        density = self._prepare_information_density(results)
        ft_content = density['ft_content_ratio']
        base_content = density['base_content_ratio']
        ft_info = density['ft_info_density']
        base_info = density['base_info_density']

        ax.scatter(ft_content, ft_info, color=self.colors['ft'], label='Fine-tuned', alpha=0.7)
        ax.scatter(base_content, base_info, color=self.colors['base'], label='Base', alpha=0.7)

        ax.set_xlabel('Content Ratio')
        ax.set_ylabel('Poker Term Ratio')
        ax.set_title('Information Density Comparison')
        ax.legend()

    def _plot_quality_radar(self, results: Dict, ax):
        """Plot quality metrics radar chart using matplotlib."""
        quality = self._prepare_quality_metrics(results)
        metrics = quality['metrics']
        ft_scores = quality['ft_scores']
        base_scores = quality['base_scores']
        num_metrics = len(metrics)

        angles = np.linspace(0, 2 * np.pi, num_metrics, endpoint=False).tolist()
        angles += angles[:1]  # Complete the loop

        ft_scores += [ft_scores[0]]
        base_scores += [base_scores[0]]

        ax.plot(angles, ft_scores, 'o-', linewidth=2, label='Fine-tuned', color=self.colors['ft'])
        ax.fill(angles, ft_scores, alpha=0.25, color=self.colors['ft'])
        ax.plot(angles, base_scores, 'o-', linewidth=2, label='Base', color=self.colors['base'])
        ax.fill(angles, base_scores, alpha=0.25, color=self.colors['base'])

        ax.set_xticks(angles[:-1])
        ax.set_xticklabels(metrics)
        ax.set_title('Quality Metrics Radar')
        ax.legend(loc='upper right', bbox_to_anchor=(0.1, 0.1))

    def _plot_strategic_depth_heatmap(self, results: Dict, ax):
        """Plot strategic depth heatmap using matplotlib."""
        strategic_depth = self._prepare_strategic_depth(results)
        categories = strategic_depth['categories']
        levels = strategic_depth['levels']
        ft_values = strategic_depth['ft_values'][0]
        base_values = strategic_depth['base_values'][0]

        data = np.array([ft_values, base_values])

        sns.heatmap(
            data,
            annot=True,
            fmt=".2f",
            cmap='viridis',
            ax=ax,
            cbar=True,
            linewidths=.5
        )
        ax.set_yticklabels(strategic_depth['categories'], rotation=0)
        ax.set_xticklabels(levels, rotation=45)
        ax.set_xlabel('Reasoning Levels')
        ax.set_ylabel('Models')
        ax.set_title('Strategic Depth Heatmap')

    def _plot_actionability_metrics(self, results: Dict, ax):
        """Plot actionability metrics bar chart using matplotlib."""
        action = self._prepare_actionability_metrics(results)
        metrics = action['metrics']
        ft_scores = action['ft_scores']
        base_scores = action['base_scores']

        x = np.arange(len(metrics))
        width = 0.35

        ax.bar(x - width/2, ft_scores, width, label='Fine-tuned', color=self.colors['ft'])
        ax.bar(x + width/2, base_scores, width, label='Base', color=self.colors['base'])

        ax.set_xticks(x)
        ax.set_xticklabels(metrics, rotation=45)
        ax.set_ylabel('Average Score')
        ax.set_title('Actionability Metrics')
        ax.legend()

    def _plot_readability_scores(self, results: Dict, ax):
        """Plot readability scores bar chart using matplotlib."""
        readability = self._prepare_readability_metrics(results)
        metrics = readability['metrics']
        ft_scores = readability['ft_scores']
        base_scores = readability['base_scores']

        x = np.arange(len(metrics))
        width = 0.35

        ax.bar(x - width/2, ft_scores, width, label='Fine-tuned', color=self.colors['ft'])
        ax.bar(x + width/2, base_scores, width, label='Base', color=self.colors['base'])

        ax.set_xticks(x)
        ax.set_xticklabels(metrics, rotation=45)
        ax.set_ylabel('Score')
        ax.set_title('Readability Scores')
        ax.legend()

    def _plot_semantic_similarity(self, results: Dict, ax):
        """Plot semantic similarity scatter plot using matplotlib."""
        semantic = self._prepare_semantic_similarity(results)
        ft_similarity = semantic['ft_similarity']
        base_similarity = semantic['base_similarity']

        ax.scatter(ft_similarity, base_similarity, color=self.colors['ft'], label='Fine-tuned', alpha=0.7)
        ax.scatter(ft_similarity, base_similarity, color=self.colors['base'], label='Base', alpha=0.7)

        ax.set_xlabel('Fine-tuned Semantic Similarity')
        ax.set_ylabel('Base Semantic Similarity')
        ax.set_title('Semantic Similarity Comparison')
        ax.legend()

    def _plot_information_density(self, results: Dict, ax):
        """Plot information density scatter plot using matplotlib."""
        density = self._prepare_information_density(results)
        ft_content = density['ft_content_ratio']
        base_content = density['base_content_ratio']
        ft_info = density['ft_info_density']
        base_info = density['base_info_density']

        ax.scatter(ft_content, ft_info, color=self.colors['ft'], label='Fine-tuned', alpha=0.7)
        ax.scatter(base_content, base_info, color=self.colors['base'], label='Base', alpha=0.7)

        ax.set_xlabel('Content Ratio')
        ax.set_ylabel('Poker Term Ratio')
        ax.set_title('Information Density Comparison')
        ax.legend()

    def analyze_model_differences(self, results: Dict) -> Dict:
        """Analyze statistical differences between models."""
        return {
            'time_improvement': self._calculate_time_improvement(results),
            'length_improvement': self._calculate_length_improvement(results),
            'quality_difference': self._calculate_quality_difference(results),
            'statistical_tests': self._perform_statistical_tests(results)
        }

    def _calculate_quality_difference(self, results: Dict) -> Dict:
        """Calculate quality metric differences."""
        quality = self._prepare_quality_metrics(results)
        return {
            metric: ft - base
            for metric, ft, base in zip(
                quality['metrics'],
                quality['ft_scores'],
                quality['base_scores']
            )
        }

    def _perform_statistical_tests(self, results: Dict) -> Dict:
        tests = {}
        
        # Extract term density
        # quality_metrics format: {'sample_0': {'ft': {...}, 'base': {...}}, ...}
        # Each 'ft' and 'base' dict in quality_metrics contains 'term_density', 'math_accuracy', 'strategic_consistency'
        ft_term_density = []
        base_term_density = []
        ft_math = []
        base_math = []
        ft_consistency = []
        base_consistency = []
        
        for sample_key, sample_metrics in results['quality_metrics'].items():
            if 'term_density' in sample_metrics['ft']:
                ft_term_density.append(sample_metrics['ft']['term_density'])
                base_term_density.append(sample_metrics['base']['term_density'])
            if 'math_accuracy' in sample_metrics['ft']:
                ft_math.append(sample_metrics['ft']['math_accuracy'])
                base_math.append(sample_metrics['base']['math_accuracy'])
            if 'strategic_consistency' in sample_metrics['ft']:
                ft_consistency.append(sample_metrics['ft']['strategic_consistency'])
                base_consistency.append(sample_metrics['base']['strategic_consistency'])

        # Run T-tests for quality metrics if there's data
        from scipy import stats
        
        if ft_term_density and base_term_density:
            t_stat, p_value = stats.ttest_ind(ft_term_density, base_term_density)
            tests['term_density'] = {'t_statistic': t_stat, 'p_value': p_value}
        if ft_math and base_math:
            t_stat, p_value = stats.ttest_ind(ft_math, base_math)
            tests['mathematical_accuracy'] = {'t_statistic': t_stat, 'p_value': p_value}
        if ft_consistency and base_consistency:
            t_stat, p_value = stats.ttest_ind(ft_consistency, base_consistency)
            tests['strategic_consistency'] = {'t_statistic': t_stat, 'p_value': p_value}
        
        # Actionability metrics
        # actionability format: {'sample_0': {'ft': {...}, 'base': {...}}, ...}
        # Each 'ft' and 'base' dict in actionability contains 'action_density', 'decision_points', 'implementation_steps'
        ft_action_density = [m['ft']['action_density'] for m in results['actionability'].values()]
        base_action_density = [m['base']['action_density'] for m in results['actionability'].values()]
        ft_decision_points = [m['ft']['decision_points'] for m in results['actionability'].values()]
        base_decision_points = [m['base']['decision_points'] for m in results['actionability'].values()]

        # Test action_density
        if ft_action_density and base_action_density:
            t_stat, p_value = stats.ttest_ind(ft_action_density, base_action_density)
            tests['action_density'] = {'t_statistic': t_stat, 'p_value': p_value}
        
        # Test decision_points
        if ft_decision_points and base_decision_points:
            t_stat, p_value = stats.ttest_ind(ft_decision_points, base_decision_points)
            tests['decision_points'] = {'t_statistic': t_stat, 'p_value': p_value}

        # Strategic Depth
        # strategic_depth format: {'sample_0': {'ft': {...}, 'base': {...}}, ...}
        # reasoning_levels: {'level1', 'level2', 'level3'}
        ft_reasoning = []
        base_reasoning = []
        for v in results['strategic_depth'].values():
            ft_levels = v['ft']['reasoning_levels']
            base_levels = v['base']['reasoning_levels']
            # sum of all levels as a measure of total reasoning depth
            ft_total = ft_levels['level1'] + ft_levels['level2'] + ft_levels['level3']
            base_total = base_levels['level1'] + base_levels['level2'] + base_levels['level3']
            ft_reasoning.append(ft_total)
            base_reasoning.append(base_total)
        
        if ft_reasoning and base_reasoning:
            t_stat, p_value = stats.ttest_ind(ft_reasoning, base_reasoning)
            tests['reasoning_depth'] = {'t_statistic': t_stat, 'p_value': p_value}
        
        # Semantic Similarity
        # comparative format: {'sample_0': {'ft_semantic_similarity':..., 'base_semantic_similarity':..., ...}, ...}
        ft_similarity = [m['ft_semantic_similarity'] for m in results['comparative'].values()]
        base_similarity = [m['base_semantic_similarity'] for m in results['comparative'].values()]
        if ft_similarity and base_similarity:
            t_stat, p_value = stats.ttest_ind(ft_similarity, base_similarity)
            tests['semantic_similarity'] = {'t_statistic': t_stat, 'p_value': p_value}
        
        # Compute effect sizes for each test
        for metric, test_results in tests.items():
            # Determine which groups to use for effect size calculation
            if metric == 'term_density':
                group1, group2 = ft_term_density, base_term_density
            elif metric == 'mathematical_accuracy':
                group1, group2 = ft_math, base_math
            elif metric == 'strategic_consistency':
                group1, group2 = ft_consistency, base_consistency
            elif metric == 'action_density':
                group1, group2 = ft_action_density, base_action_density
            elif metric == 'decision_points':
                group1, group2 = ft_decision_points, base_decision_points
            elif metric == 'reasoning_depth':
                group1, group2 = ft_reasoning, base_reasoning
            elif metric == 'semantic_similarity':
                group1, group2 = ft_similarity, base_similarity
            else:
                continue
            
            # Calculate Cohen's d
            d = self._cohens_d(group1, group2)
            test_results['effect_size'] = d
            test_results['effect_magnitude'] = self._interpret_effect_size(d)
        
        return tests

    def _cohens_d(self, group1: List[float], group2: List[float]) -> float:
        """Calculate Cohen's d effect size."""
        n1, n2 = len(group1), len(group2)
        var1, var2 = np.var(group1, ddof=1), np.var(group2, ddof=1)
        
        # Pooled standard deviation
        pooled_se = np.sqrt(((n1 - 1) * var1 + (n2 - 1) * var2) / (n1 + n2 - 2))
        
        # Cohen's d
        return (np.mean(group1) - np.mean(group2)) / pooled_se if pooled_se != 0 else 0

    def _interpret_effect_size(self, d: float) -> str:
        """Interpret Cohen's d effect size magnitude."""
        d = abs(d)
        if d < 0.2:
            return "negligible"
        elif d < 0.5:
            return "small"
        elif d < 0.8:
            return "medium"
        else:
            return "large"

    def _calculate_time_improvement(self, results: Dict) -> float:
        """Calculate time improvement percentage based on response generation times."""
        try:
            ft_times = [v['ft_response_time'] for v in results['ux_metrics'].values()]
            base_times = [v['base_response_time'] for v in results['ux_metrics'].values()]
            
            if not base_times:
                return 0.0
                
            return ((np.mean(base_times) - np.mean(ft_times)) / np.mean(base_times)) * 100
        except KeyError:
            print("Warning: Response time data not found in results")
            return 0.0

    def _calculate_length_improvement(self, results: Dict) -> float:
        """Calculate length improvement percentage based on response lengths."""
        try:
            ft_lengths = [len(v['fine_tuned_response'].split()) for v in results['responses'].values()]
            base_lengths = [len(v['base_response'].split()) for v in results['responses'].values()]
            
            if not base_lengths:
                return 0.0
                
            return ((np.mean(base_lengths) - np.mean(ft_lengths)) / np.mean(base_lengths)) * 100
        except KeyError:
            print("Warning: Response length data not found in results")
            return 0.0

    def plot_statistical_analysis(self, results: Dict):
        """Create statistical analysis plots."""
        tests = self._perform_statistical_tests(results)
        
        if self.style == 'plotly':
            return self._create_plotly_stats_dashboard(tests)
        else:
            return self._create_matplotlib_stats_dashboard(tests)

    def _create_plotly_stats_dashboard(self, tests: Dict):
        """Create statistical analysis dashboard using Plotly."""
        fig = make_subplots(
            rows=2, cols=2,
            subplot_titles=(
                'P-Values by Metric',
                'Effect Sizes',
                'T-Statistics',
                'Statistical Significance Summary'
            )
        )
        
        # Extract data
        metrics = list(tests.keys())
        p_values = [test['p_value'] for test in tests.values()]
        effect_sizes = [abs(test['effect_size']) for test in tests.values()]
        t_stats = [abs(test['t_statistic']) for test in tests.values()]
        
        # 1. P-Values Bar Chart
        fig.add_trace(
            go.Bar(
                x=metrics,
                y=p_values,
                name='P-Value',
                marker_color='rgba(55, 128, 191, 0.7)'
            ),
            row=1, col=1
        )
        fig.add_hline(y=0.05, line_dash="dash", line_color="red", row=1, col=1)
        
        # 2. Effect Sizes Bar Chart
        fig.add_trace(
            go.Bar(
                x=metrics,
                y=effect_sizes,
                name='Effect Size',
                marker_color='rgba(50, 171, 96, 0.7)'
            ),
            row=1, col=2
        )
        
        # 3. T-Statistics Bar Chart
        fig.add_trace(
            go.Bar(
                x=metrics,
                y=t_stats,
                name='|T-Statistic|',
                marker_color='rgba(219, 64, 82, 0.7)'
            ),
            row=2, col=1
        )
        
        # 4. Significance Summary Heatmap
        significance_matrix = np.zeros((len(metrics), 3))
        for i, test in enumerate(tests.values()):
            significance_matrix[i, 0] = test['p_value'] < 0.05
            significance_matrix[i, 1] = abs(test['effect_size'])
            significance_matrix[i, 2] = abs(test['t_statistic'])
        
        fig.add_trace(
            go.Heatmap(
                z=significance_matrix,
                x=['Significant', 'Effect Size', '|T-Stat|'],
                y=metrics,
                colorscale='RdYlBu'
            ),
            row=2, col=2
        )
        
        # Update layout
        fig.update_layout(
            height=800,
            width=1200,
            showlegend=False,
            title_text="Statistical Analysis Dashboard"
        )
        
        return fig

    def _create_matplotlib_stats_dashboard(self, tests: Dict):
        """Create statistical analysis dashboard using Matplotlib."""
        fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 12))
        
        # Extract data
        metrics = list(tests.keys())
        p_values = [test['p_value'] for test in tests.values()]
        effect_sizes = [abs(test['effect_size']) for test in tests.values()]
        t_stats = [abs(test['t_statistic']) for test in tests.values()]
        
        # 1. P-Values Bar Chart
        ax1.bar(metrics, p_values, color='skyblue')
        ax1.axhline(y=0.05, color='red', linestyle='--')
        ax1.set_title('P-Values by Metric')
        ax1.set_xticklabels(metrics, rotation=45)
        ax1.set_ylabel('P-Value')
        
        # 2. Effect Sizes Bar Chart
        ax2.bar(metrics, effect_sizes, color='lightgreen')
        ax2.set_title('Effect Sizes')
        ax2.set_xticklabels(metrics, rotation=45)
        ax2.set_ylabel('|Effect Size|')
        
        # 3. T-Statistics Bar Chart
        ax3.bar(metrics, t_stats, color='salmon')
        ax3.set_title('|T-Statistics|')
        ax3.set_xticklabels(metrics, rotation=45)
        ax3.set_ylabel('|T-Statistic|')
        
        # 4. Significance Summary Heatmap
        significance_matrix = np.zeros((len(metrics), 3))
        for i, test in enumerate(tests.values()):
            significance_matrix[i, 0] = test['p_value'] < 0.05
            significance_matrix[i, 1] = abs(test['effect_size'])
            significance_matrix[i, 2] = abs(test['t_statistic'])
        
        sns.heatmap(
            significance_matrix,
            xticklabels=['Significant', 'Effect Size', '|T-Stat|'],
            yticklabels=metrics,
            cmap='RdYlBu',
            ax=ax4
        )
        ax4.set_title('Significance Summary')
        
        plt.tight_layout()
        return fig
        

    def save_visualizations(self, fig, filename: str):
        """Save visualizations to file."""
        if self.style == 'plotly':
            fig.write_html(f"../data/final/{filename}.html")
            fig.write_image(f"../data/final/{filename}.png")
        else:
            fig.savefig(f"../data/final/{filename}.png", dpi=300, bbox_inches='tight')


In [83]:
# Initialize both analyzer and visualizer
analyzer = PokerResponseAnalyzer()
visualizer = PokerResponseVisualizer(style='plotly')

try:
    # 1. Load raw data
    with open('../data/final/detailed_analysis_20241216_142035.json', 'r') as f:
        raw_data = json.load(f)
    
    # 2. Process with analyzer
    enhanced_analysis = analyzer.enhance_analysis(raw_data)
    
    # Debug: Print structure of enhanced analysis
    print("\nEnhanced Analysis Structure:")
    for key, value in enhanced_analysis.items():
        print(f"\n{key}:")
        if isinstance(value, dict):
            sample_item = next(iter(value.items()))
            print(f"  Sample item: {sample_item}")
    
    # 3. Structure data for visualization
    visualization_data = {
        'responses': {
            idx: {
                'fine_tuned_response': resp['fine_tuned_response'],
                'base_response': resp['base_response'],
                'query': resp['query']
            }
            for idx, resp in enumerate(raw_data['detailed_responses'])
        },
        'quality_metrics': enhanced_analysis['quality_metrics'],
        'actionability': enhanced_analysis['actionability'],
        'strategic_depth': enhanced_analysis['strategic_depth'],
        'comparative': enhanced_analysis.get('comparative', {}),
        'ux_metrics': enhanced_analysis.get('ux_metrics', {})
    }
    
    # Debug: Print structure of visualization data
    print("\nVisualization Data Structure:")
    for key, value in visualization_data.items():
        print(f"\n{key}:")
        if isinstance(value, dict):
            sample_item = next(iter(value.items()))
            print(f"  Sample item: {sample_item}")
    
    # 4. Generate visualizations
    dashboard = visualizer.visualize_comprehensive_analysis(visualization_data)
    stats_dashboard = visualizer.plot_statistical_analysis(visualization_data)
    
    # 5. Save and display
    visualizer.save_visualizations(dashboard, 'poker_analysis_results')
    visualizer.save_visualizations(stats_dashboard, 'poker_statistical_analysis')
    
    print("\nDisplaying Analysis Dashboard...")
    dashboard.show()
    print("\nDisplaying Statistical Analysis Dashboard...")
    stats_dashboard.show()
    
except Exception as e:
    print(f"Error in visualization: {str(e)}")
    traceback.print_exc()

Using device: mps



`resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Model device: mps:0

Enhanced Analysis Structure:

quality_metrics:
  Sample item: ('sample_0', {'ft': {'term_density': 0.0, 'math_accuracy': 0, 'strategic_consistency': 1.0}, 'base': {'term_density': 0.0065040650406504065, 'math_accuracy': 0, 'strategic_consistency': 0.987012987012987}})

actionability:
  Sample item: ('sample_0', {'ft': {'action_density': 0.0, 'decision_points': 2, 'implementation_steps': 0}, 'base': {'action_density': 0.0032520325203252032, 'decision_points': 7, 'implementation_steps': 10}})

strategic_depth:
  Sample item: ('sample_0', {'ft': {'reasoning_levels': {'level1': 0, 'level2': 0, 'level3': 0}, 'concept_coverage': {'basic_strategy': 0.25, 'advanced_strategy': 0.0, 'tournament_concepts': 0.0, 'exploitative_concepts': 0.0, 'game_theory': 0.0}, 'situation_adaptation': 0.3055555555555556}, 'base': {'reasoning_levels': {'level1': 0, 'level2': 1, 'level3': 1}, 'concept_coverage': {'basic_strategy': 0.25, 'advanced_strategy': 0.0, 'tournament_concepts': 0.0, 'exp


Precision loss occurred in moment calculation due to catastrophic cancellation. This occurs when the data are nearly identical. Results may be unreliable.




Displaying Statistical Analysis Dashboard...


### Re-run with saves

In [84]:
import pandas as pd

# Initialize both analyzer and visualizer
analyzer = PokerResponseAnalyzer()
visualizer = PokerResponseVisualizer(style='plotly')

try:
    # 1. Load raw data
    with open('../data/final/detailed_analysis_20241216_142035.json', 'r') as f:
        raw_data = json.load(f)
    
    # 2. Process with analyzer
    enhanced_analysis = analyzer.enhance_analysis(raw_data)
    
    # Convert enhanced_analysis data into DataFrames
    quality_metrics_df = pd.DataFrame.from_dict({
        sample: {
            "ft_term_density": metrics['ft']['term_density'],
            "ft_math_accuracy": metrics['ft']['math_accuracy'],
            "ft_strategic_consistency": metrics['ft']['strategic_consistency'],
            "base_term_density": metrics['base']['term_density'],
            "base_math_accuracy": metrics['base']['math_accuracy'],
            "base_strategic_consistency": metrics['base']['strategic_consistency'],
        }
        for sample, metrics in enhanced_analysis['quality_metrics'].items()
    }, orient='index')

    actionability_df = pd.DataFrame.from_dict({
        sample: {
            "ft_action_density": metrics['ft']['action_density'],
            "ft_decision_points": metrics['ft']['decision_points'],
            "ft_implementation_steps": metrics['ft']['implementation_steps'],
            "base_action_density": metrics['base']['action_density'],
            "base_decision_points": metrics['base']['decision_points'],
            "base_implementation_steps": metrics['base']['implementation_steps'],
        }
        for sample, metrics in enhanced_analysis['actionability'].items()
    }, orient='index')

    strategic_depth_df = pd.DataFrame.from_dict({
        sample: {
            "ft_level1": metrics['ft']['reasoning_levels']['level1'],
            "ft_level2": metrics['ft']['reasoning_levels']['level2'],
            "ft_level3": metrics['ft']['reasoning_levels']['level3'],
            "base_level1": metrics['base']['reasoning_levels']['level1'],
            "base_level2": metrics['base']['reasoning_levels']['level2'],
            "base_level3": metrics['base']['reasoning_levels']['level3'],
        }
        for sample, metrics in enhanced_analysis['strategic_depth'].items()
    }, orient='index')

    # 3. Save DataFrames as CSV files for further analysis
    quality_metrics_df.to_csv('../data/final/quality_metrics.csv')
    actionability_df.to_csv('../data/final/actionability_metrics.csv')
    strategic_depth_df.to_csv('../data/final/strategic_depth.csv')
    
    # Debug: Print sample rows from the DataFrames
    print("\nSample from Quality Metrics DataFrame:")
    print(quality_metrics_df.head())
    
    print("\nSample from Actionability Metrics DataFrame:")
    print(actionability_df.head())
    
    print("\nSample from Strategic Depth DataFrame:")
    print(strategic_depth_df.head())
    
    # 4. Generate visualizations
    visualization_data = {
        'responses': {
            idx: {
                'fine_tuned_response': resp['fine_tuned_response'],
                'base_response': resp['base_response'],
                'query': resp['query']
            }
            for idx, resp in enumerate(raw_data['detailed_responses'])
        },
        'quality_metrics': enhanced_analysis['quality_metrics'],
        'actionability': enhanced_analysis['actionability'],
        'strategic_depth': enhanced_analysis['strategic_depth'],
        'comparative': enhanced_analysis.get('comparative', {}),
        'ux_metrics': enhanced_analysis.get('ux_metrics', {})
    }
    
    dashboard = visualizer.visualize_comprehensive_analysis(visualization_data)
    stats_dashboard = visualizer.plot_statistical_analysis(visualization_data)
    
    # 5. Save and display visualizations
    visualizer.save_visualizations(dashboard, 'poker_analysis_results')
    visualizer.save_visualizations(stats_dashboard, 'poker_statistical_analysis')
    
    print("\nDisplaying Analysis Dashboard...")
    dashboard.show()
    print("\nDisplaying Statistical Analysis Dashboard...")
    stats_dashboard.show()

except Exception as e:
    print(f"Error in visualization: {str(e)}")
    traceback.print_exc()

Using device: mps



`resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Model device: mps:0

Sample from Quality Metrics DataFrame:
          ft_term_density  ft_math_accuracy  ft_strategic_consistency  \
sample_0         0.000000                 0                       1.0   
sample_1         0.026667                 0                       1.0   
sample_2         0.002874                 0                       1.0   
sample_3         0.023810                 0                       1.0   
sample_4         0.030303                 0                       1.0   

          base_term_density  base_math_accuracy  base_strategic_consistency  
sample_0           0.006504                 0.0                    0.987013  
sample_1           0.006088                 0.0                    1.000000  
sample_2           0.006410                 0.0                    1.000000  
sample_3           0.007156                 0.0                    1.000000  
sample_4           0.012170                 0.0                    1.000000  

Sample from Actionability Metric


Precision loss occurred in moment calculation due to catastrophic cancellation. This occurs when the data are nearly identical. Results may be unreliable.




Displaying Statistical Analysis Dashboard...


## Reasoning & QA

In [103]:
import json
import textstat
from typing import Dict, List, Any


class PokerStrategyQA:
    """Benchmark for testing poker reasoning and QA accuracy."""
    
    def __init__(self, json_path: str):
        """Initialize with path to JSON data."""
        with open(json_path, 'r') as f:
            data = json.load(f)
        
        # Extract detailed responses
        self.responses = data.get('detailed_responses', [])
        self.categories = self._group_by_category(self.responses)
        print(f"Loaded {len(self.responses)} responses across {len(self.categories)} categories.")

    def _group_by_category(self, responses: List[Dict[str, Any]]) -> Dict[str, List[Dict]]:
        """Group responses by category."""
        grouped = {}
        for response in responses:
            category = response.get('category', 'uncategorized')
            grouped.setdefault(category, []).append(response)
        return grouped

    def run_tests(self) -> Dict[str, Any]:
        """Run all QA reasoning tests and aggregate results."""
        results = {}
        for category, cases in self.categories.items():
            print(f"Processing category: {category} ({len(cases)} cases)")
            results[category] = self._run_category_test(cases)
        
        return results

    def _run_category_test(self, cases: List[Dict]) -> Dict[str, Any]:
        """Evaluate all cases in a category."""
        metrics = {
            "ft_conciseness": 0.0, "base_conciseness": 0.0,
            "ft_clarity": 0.0, "base_clarity": 0.0,
            "ft_reasoning_depth": 0.0, "base_reasoning_depth": 0.0,
        }
        total_cases = len(cases)

        for case in cases:
            evaluation = self._evaluate_case(case)
            for key in metrics:
                metrics[key] += evaluation[key]

        # Average the metrics
        for key in metrics:
            metrics[key] /= total_cases

        # Calculate improvement percentages
        improvement = {
            "conciseness_improvement": self._calculate_improvement(metrics["ft_conciseness"], metrics["base_conciseness"]),
            "clarity_improvement": self._calculate_improvement(metrics["ft_clarity"], metrics["base_clarity"]),
            "reasoning_depth_improvement": self._calculate_improvement(metrics["ft_reasoning_depth"], metrics["base_reasoning_depth"]),
        }

        return {"metrics": metrics, "improvement": improvement}

    def _evaluate_case(self, case: Dict) -> Dict[str, float]:
        """Evaluate a single case for conciseness, clarity, and reasoning depth."""
        ft_response = case.get("fine_tuned_response", "")
        base_response = case.get("base_response", "")

        return {
            "ft_conciseness": self._evaluate_conciseness(ft_response),
            "base_conciseness": self._evaluate_conciseness(base_response),
            "ft_clarity": self._evaluate_clarity(ft_response),
            "base_clarity": self._evaluate_clarity(base_response),
            "ft_reasoning_depth": self._evaluate_reasoning_depth(ft_response),
            "base_reasoning_depth": self._evaluate_reasoning_depth(base_response),
        }

    def _evaluate_conciseness(self, response: str) -> float:
        """Evaluate conciseness based on word count."""
        word_count = len(response.split())
        if word_count <= 50:
            return 1.0
        elif word_count <= 100:
            return 0.75
        elif word_count <= 200:
            return 0.5
        return 0.25

    def _evaluate_clarity(self, response: str) -> float:
        """Evaluate clarity using Flesch Reading Ease."""
        try:
            score = textstat.flesch_reading_ease(response)
            return min(1.0, score / 100)  # Normalize score to [0, 1]
        except Exception:
            return 0.0

    def _evaluate_reasoning_depth(self, response: str) -> float:
        """Evaluate reasoning depth based on logical terms."""
        reasoning_terms = ["because", "therefore", "thus", "if", "then", "hence", "strategy", "logical", "reasoning"]
        count = sum(response.lower().count(term) for term in reasoning_terms)
        return min(1.0, count / 5)  # Normalize score to [0, 1]

    def _calculate_improvement(self, ft_score: float, base_score: float) -> float:
        """Calculate percentage improvement."""
        if base_score > 0:
            return ((ft_score - base_score) / base_score) * 100
        return 0.0

    def save_results(self, results: Dict[str, Any], output_path: str = "../data/final/qa_test_results.json"):
        """Save test results to JSON."""
        with open(output_path, "w") as f:
            json.dump(results, f, indent=2)
        print(f"Results saved to {output_path}")


# Run the new test
qa_benchmark = PokerStrategyQA(json_path="../data/final/detailed_analysis_20241216_142035.json")
results = qa_benchmark.run_tests()
qa_benchmark.save_results(results)

# Print a summary
for category, data in results.items():
    print(f"\n--- {category.replace('_', ' ').title()} ---")
    for metric, value in data["metrics"].items():
        print(f"{metric.replace('_', ' ').title()}: {value:.2f}")
    for improvement, value in data["improvement"].items():
        print(f"{improvement.replace('_', ' ').title()}: {value:.2f}%")

Loaded 37 responses across 19 categories.
Processing category: basic_concepts_fundamentals (6 cases)
Processing category: strategic_preflop_strategy (2 cases)
Processing category: strategic_postflop_strategy (4 cases)
Processing category: complex_spots_multiway_pots (2 cases)
Processing category: complex_spots_bluff_spots (2 cases)
Processing category: complex_spots_balance_sizing (1 cases)
Processing category: complex_spots_turn_barreling (1 cases)
Processing category: tournament_icm_pressure (2 cases)
Processing category: tournament_bubble_play (2 cases)
Processing category: tournament_final_table (1 cases)
Processing category: tournament_short_stack_icm (1 cases)
Processing category: hand_reading_range_analysis (2 cases)
Processing category: hand_reading_live_tells (2 cases)
Processing category: hand_reading_combo_counting (1 cases)
Processing category: hand_reading_node_locking (1 cases)
Processing category: end_game_heads_up (2 cases)
Processing category: end_game_final_table (2 c