# Polish Language Model Evaluation with Polish GLUE

This notebook evaluates and compares three language models on Polish language tasks using the Polish GLUE (KLEJ) benchmark:

1. **Bielik-11B-v2.3-Instruct** - Specialized Polish language model
2. **Gemma-3-4B-IT** - Multilingual model from Google
3. **Phi-4-mini-instruct** - Multilingual model from Microsoft

The Polish GLUE (KLEJ) benchmark includes multiple tasks specifically designed to evaluate language models on Polish language understanding.

## Tasks in Polish GLUE (KLEJ)

1. **CDSC-E** - Compositional Distributional Semantics Corpus - Entailment task
2. **CDSC-R** - Compositional Distributional Semantics Corpus - Relatedness task
3. **CBD** - Cyberbullying Detection dataset
4. **PolEmo2-in** - Sentiment analysis (in-domain)
5. **PolEmo2-out** - Sentiment analysis (out-of-domain)
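6. **DYK** - Did You Know (Czy wiesz?) - Question answering: deciding whether a given answer correctly answers the question
7. **PSC** - Polish Summaries Corpus - Detecting whether two summaries describe the same article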
8. **AR** - Allegro Reviews - Sentiment analysis

## Environment Setup

In [None]:
# Add parent directory to path to allow importing from src
import sys
import os
sys.path.append(os.path.abspath('..'))

# Import necessary libraries
import torch
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from dotenv import load_dotenv
from datetime import datetime
import json
import numpy as np

# Import project modules
from src.utils import authenticate_huggingface, get_device
from src.init_models import initialize_models, configure_generation_parameters
from src.datasets import prepare_polish_datasets, create_task_specific_prompt
from src.evaluation import evaluate_models

# Configure visualization
# IPython magic (notebook-only): render matplotlib figures inline in the cell output
%matplotlib inline
# NOTE(review): the 'seaborn-v0_8-*' style names presumably require a recent
# matplotlib release — confirm against the pinned version
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("viridis")

# Set up logging
import logging
# Root logger at INFO level; each record shows timestamp, logger name, level and message
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

## Hugging Face Authentication

In [None]:
# Load environment variables from a .env file (if one exists)
load_dotenv()

# Prefer the token from the environment. If it is missing, prompt for it
# interactively: getpass does not echo the input, so the secret is neither
# hard-coded in the notebook nor shown in the cell output.
hf_token = os.getenv("HF_TOKEN")
if not hf_token:
    import getpass
    hf_token = getpass.getpass("Enter your Hugging Face token: ").strip()

# Authenticate with Hugging Face
authenticate_huggingface(hf_token)

## Check Available Hardware

In [None]:
# Report whether PyTorch can see a CUDA device
cuda_ok = torch.cuda.is_available()
print(f"CUDA available: {cuda_ok}")

# List every visible GPU with its name and total memory
if not cuda_ok:
    print("No GPUs detected, using CPU")
else:
    device_count = torch.cuda.device_count()
    print(f"Number of GPUs: {device_count}")

    for gpu_idx in range(device_count):
        device_name = torch.cuda.get_device_name(gpu_idx)
        gpu_props = torch.cuda.get_device_properties(gpu_idx)
        total_memory = gpu_props.total_memory / 1e9  # bytes -> GB
        print(f"GPU {gpu_idx}: {device_name} ({total_memory:.2f} GB)")

# Let the project helper pick the device used by the rest of the notebook
device = get_device()
print(f"Using device: {device}")

## Load Polish GLUE Datasets

In [None]:
# Start with a small slice of KLEJ for a quick smoke test;
# raise max_samples (and widen klej_datasets) for a full evaluation run.
datasets = prepare_polish_datasets(
    tasks=["klej"],
    # Restrict to specific KLEJ tasks (comment out to load all)
    klej_datasets=["cdsc-e", "polemo2-in"],
    max_samples=10,  # Small number for testing, increase for full evaluation
)

# The translation task is loaded separately and merged into the same mapping
translation_dataset = prepare_polish_datasets(tasks=["translation"], max_samples=10)
datasets.update(translation_dataset)

# Print an overview of every loaded task
for task_name, dataset_info in datasets.items():
    print(f"\nTask: {task_name}")
    print(f"Description: {dataset_info['description']}")
    print(f"Task type: {dataset_info['task_type']}")
    print(f"Metric: {dataset_info['metric']}")
    print(f"Dataset size: {len(dataset_info['dataset'])}")

    # Preview the first record, truncating long string fields to 100 characters
    print("\nSample item:")
    sample = dataset_info['dataset'][0]
    for key in dataset_info['keys']:
        value = sample[key]
        is_long_text = isinstance(value, str) and len(value) > 100
        shown = f"{value[:100]}..." if is_long_text else value
        print(f"  {key}: {shown}")

    # Show the prompt the model would receive for this record
    prompt = create_task_specific_prompt(task_name, sample)
    print(f"\nExample prompt:\n{prompt}")

## Initialize Models

**Note**: Loading all models at once may exceed GPU memory. Consider loading them one by one as needed.

In [None]:
# For initial testing, start with the smallest model (Phi-4-mini)
models = initialize_models(
    device=device,
    load_in_8bit=True,  # Use 8-bit quantization to save memory
    token=hf_token,  # Token obtained in the authentication cell
    models_to_load=["phi"],  # Start with just one model
)

# Print basic information about each loaded (model, tokenizer) pair
for name, (model, tokenizer) in models.items():
    print(f"\nModel: {name}")
    print(f"  - Model type: {type(model).__name__}")
    print(f"  - Tokenizer type: {type(tokenizer).__name__}")
    print(f"  - Vocabulary size: {len(tokenizer)}")

## Test Simple Generation

Let's test a simple Polish text generation to verify the model is working properly:

In [None]:
# Use the first loaded model for the smoke test
model_name = list(models)[0]
model, tokenizer = models[model_name]

# A short Polish prompt for the model to continue
prompt = "Napisz krótki tekst o języku polskim. Język polski to język, który"

# Generation settings specific to this model
gen_params = configure_generation_parameters(model_name)

# Tokenize the prompt and move the tensors to the model's device
encoded = tokenizer(prompt, return_tensors="pt")
inputs = encoded.to(model.device)

# Generate without tracking gradients (inference only)
with torch.no_grad():
    outputs = model.generate(**inputs, max_new_tokens=150, **gen_params)

# Decode the first (and only) sequence back to text
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(f"Generated text:\n{generated_text}")

## Test Polish GLUE Task Example

Let's test a single example from a Polish GLUE task:

In [None]:
# Select the first loaded task and its first sample
task_name = next(iter(datasets))
dataset_info = datasets[task_name]
sample = dataset_info['dataset'][0]

# Create prompt for this task
prompt = create_task_specific_prompt(task_name, sample)
print(f"Task: {task_name}")
print(f"Prompt:\n{prompt}")

# Generate response (inference only, so no gradients are needed)
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
with torch.no_grad():
    outputs = model.generate(
        **inputs,
        max_new_tokens=50,
        **gen_params
    )

# Decode the full output (prompt + continuation)
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(f"\nModel response:\n{generated_text}")

# Extract the actual answer by decoding only the tokens that follow the
# prompt. String-replacing the prompt in the decoded text is unreliable:
# tokenization round-trips can alter whitespace/normalization, in which case
# the replace silently does nothing and the "answer" still contains the prompt.
# NOTE(review): assumes a decoder-only model whose generate() output starts
# with the input tokens — true for the causal LMs this notebook targets.
prompt_length = inputs["input_ids"].shape[1]
answer = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True).strip()
print(f"\nExtracted answer: {answer}")

# Show the ground truth answer
label_key = dataset_info['keys'][-1]  # Usually the last key is the label
label_value = sample[label_key]
print(f"Ground truth: {label_value}")

## Run Evaluation

Now let's evaluate the model on Polish GLUE tasks:

In [None]:
# Metrics to compute, keyed by task type
metrics = dict(
    classification=["accuracy", "f1"],
    regression=["mse", "spearman"],
    generation=["bleu", "rouge"],
)

# One set of generation parameters per loaded model
generation_params = {
    name: configure_generation_parameters(name) for name in models
}

# Results go into a "results" folder under the parent of the working directory
working_dir = os.path.abspath('.')
results_dir = os.path.join(os.path.dirname(working_dir), "results")
os.makedirs(results_dir, exist_ok=True)
print(f"Results will be saved to: {results_dir}")

# Evaluate every loaded model on every loaded dataset
evaluation_kwargs = dict(
    models=models,
    datasets=datasets,
    metrics=metrics,
    max_samples=None,  # Use all examples in the datasets
    generation_params=generation_params,
    save_results=True,
    results_dir=results_dir,
)
evaluation_results = evaluate_models(**evaluation_kwargs)

print("Evaluation completed!")

## Visualize Results

In [None]:
# Flatten the nested results (model -> task -> metrics) into one row per score
results_summary = []

for model_name, model_results in evaluation_results.items():
    for task_name, task_results in model_results.items():
        # Named task_metrics (not `metrics`) so the evaluation config dict
        # defined in the evaluation cell is not overwritten.
        task_metrics = task_results.get('metrics', {})

        for metric_name, score in task_metrics.items():
            results_summary.append({
                'Model': model_name,
                'Task': task_name,
                'Metric': metric_name,
                'Score': score
            })

# Convert to DataFrame; fixing the columns keeps the frame usable
# (and the code below from raising KeyError) when no scores were produced.
results_df = pd.DataFrame(results_summary, columns=['Model', 'Task', 'Metric', 'Score'])

# Display the results table
display(results_df)

if results_df.empty:
    print("No evaluation scores available to plot.")

# One bar chart per task: metrics on the x-axis, one bar colour per model
for task in results_df['Task'].unique():
    task_df = results_df[results_df['Task'] == task]

    plt.figure(figsize=(10, 6))
    sns.barplot(x='Metric', y='Score', hue='Model', data=task_df)
    plt.title(f'Performance on {task}')
    plt.ylim(0, max(1.0, task_df['Score'].max() * 1.1))  # Adjust y-axis limit based on max score
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

## Analysis of Results

In [None]:
# Analyze model performance across all tasks

# Metrics where a higher score is better. Only these are used to pick the
# best/worst task; lower-is-better metrics (e.g. mse, mae) are left out
# because their scores cannot be ranked on the same scale.
higher_better_metrics = ['accuracy', 'f1', 'f1_macro', 'bleu', 'rouge1', 'rouge2', 'rougeL', 'spearman']

model_summaries = {}

# An empty results table may lack the expected columns, so guard the lookup
model_names = [] if results_df.empty else results_df['Model'].unique()

for model_name in model_names:
    model_data = results_df[results_df['Model'] == model_name]
    higher_scores = model_data[model_data['Metric'].isin(higher_better_metrics)]
    # Drop missing scores so idxmax/idxmin only ever see real values
    higher_scores = higher_scores.dropna(subset=['Score'])

    if higher_scores.empty:
        # Nothing to rank for this model (e.g. only regression errors were computed)
        best_task = worst_task = None
        best_score = worst_score = float('nan')
    else:
        best_row = higher_scores.loc[higher_scores['Score'].idxmax()]
        worst_row = higher_scores.loc[higher_scores['Score'].idxmin()]
        best_task, best_score = best_row['Task'], best_row['Score']
        worst_task, worst_score = worst_row['Task'], worst_row['Score']

    model_summaries[model_name] = {
        'avg_accuracy': higher_scores[higher_scores['Metric'] == 'accuracy']['Score'].mean(),
        'avg_f1': higher_scores[higher_scores['Metric'].str.contains('f1')]['Score'].mean(),
        'best_task': best_task,
        'best_score': best_score,
        'worst_task': worst_task,
        'worst_score': worst_score
    }

# Convert to DataFrame for display
summary_df = pd.DataFrame.from_dict(model_summaries, orient='index')
display(summary_df)

# Compare models on accuracy and F1. Any metric whose name contains 'f1' is
# included, matching the 'avg_f1' summary above (the evaluation may report
# 'f1' rather than 'f1_macro').
if results_df.empty:
    comparison_df = results_df
else:
    metric_names = results_df['Metric'].astype(str)
    comparison_df = results_df[(metric_names == 'accuracy') | metric_names.str.contains('f1')]

if comparison_df.empty:
    print("No accuracy/F1 scores available to plot.")
else:
    plt.figure(figsize=(12, 6))
    sns.barplot(x='Model', y='Score', hue='Metric', data=comparison_df)
    plt.title('Model Performance Comparison')
    plt.ylim(0, 1.0)
    plt.xticks(rotation=0)
    plt.tight_layout()
    plt.show()

## Save Detailed Analysis

In [None]:
# Generate comprehensive analysis report

# Take a single timestamp so the date stored in the report and the one in
# the file name always refer to the same instant.
report_time = datetime.now()

analysis_report = {
    'summary': model_summaries,
    'detailed_results': evaluation_results,
    'metadata': {
        'evaluation_date': report_time.strftime("%Y-%m-%d %H:%M:%S"),
        'models_evaluated': list(models.keys()),
        'tasks_evaluated': list(datasets.keys()),
        'sample_counts': {task: len(info['dataset']) for task, info in datasets.items()}
    }
}

# Save to JSON: timestamped file name inside the results directory
timestamp = report_time.strftime("%Y%m%d_%H%M%S")
report_path = os.path.join(results_dir, f"analysis_report_{timestamp}.json")

# Fallback serializer for objects the json module cannot encode natively
def convert_for_json(obj):
    """Convert *obj* into something ``json.dump`` can serialize.

    Intended to be passed as the ``default=`` hook of ``json.dump``.

    Args:
        obj: An object the JSON encoder could not handle itself.

    Returns:
        The native Python value for any NumPy scalar, a (nested) list for a
        NumPy array, and ``str(obj)`` for everything else.
    """
    # np.generic is the base class of every NumPy scalar, so this also covers
    # types such as np.bool_, np.int16 and np.float16, not just 32/64-bit ones.
    if isinstance(obj, np.generic):
        return obj.item()
    # Keep arrays as real JSON lists rather than their printed representation
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    return str(obj)

# Serialize the report (keeping Polish characters readable) and write it as UTF-8
report_json = json.dumps(
    analysis_report,
    ensure_ascii=False,
    indent=2,
    default=convert_for_json,
)
with open(report_path, 'w', encoding='utf-8') as report_file:
    report_file.write(report_json)

print(f"Analysis report saved to: {report_path}")

## Next Steps

1. Evaluate the other models (Gemma and Bielik)
2. Test with more KLEJ benchmark tasks
3. Increase the sample size for more robust evaluation
4. Perform error analysis on model predictions
5. Explore model performance on different categories of examples
6. Generate a comprehensive report comparing all three models