In [1]:
# import the necessary libraries
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
    BitsAndBytesConfig,
)
import numpy as np
import pandas as pd
import json
import os
import warnings
from tqdm.auto import tqdm
import gc
import torch

In [2]:
# Suppress warnings
warnings.filterwarnings("ignore")

# Check device availability in priority order (optimized for GPU)
if torch.cuda.is_available():
    device = torch.device("cuda")
    print(f"Using NVIDIA GPU (CUDA): {torch.cuda.get_device_name()}")
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB")

Using NVIDIA GPU (CUDA): NVIDIA H100 80GB HBM3
GPU Memory: 79.2 GB


In [3]:
# Load the PokerBench dataset
print("Loading dataset...")

try:
    dataset = load_dataset("RZ412/PokerBench")
    print("Dataset loaded")
    print(f"Train: {len(dataset['train'])}, Test: {len(dataset['test'])}")

except Exception as e:
    print(f"Error loading dataset: {e}")

Loading dataset...
Dataset loaded
Train: 563200, Test: 11000


In [None]:
# log into Hugging Face
from huggingface_hub import login

# IMPORTANT: Remove your token before pushing to GitHub!
# Option 1: Use environment variable
# login(token=os.getenv("HF_TOKEN"))

# Option 2: Use interactive login (will prompt for token)
login()

# Option 3: If you've already logged in once, you can just skip this cell
# The token is cached in ~/.huggingface/token

In [12]:
# load two models
MODEL_NAME_1 = "google/gemma-2b"
MODEL_NAME_2 = "meta-llama/Meta-Llama-3-8B"

# Load tokenizer and model for MODEL_NAME_1
tokenizer_1 = AutoTokenizer.from_pretrained(MODEL_NAME_1)
model_1 = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME_1,
    load_in_8bit=True,
    device_map="auto",
)
# Load tokenizer and model for MODEL_NAME_2
tokenizer_2 = AutoTokenizer.from_pretrained(MODEL_NAME_2)
model_2 = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME_2,
    load_in_8bit=True,
    device_map="auto",
)

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/654 [00:00<?, ?B/s]

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/177 [00:00<?, ?B/s]

In [26]:
# Helper function to format prompts (similar to poker_finetuning.ipynb)
def format_prompt(instruction, output=""):
    """Format instruction-output pairs for inference"""
    return f"### Instruction:\n{instruction}\n\n### Response:\n{output}"

# Test on a single data point with reasoning/explanation
def generate_single_response(model, tokenizer, instruction, max_length=200, include_reasoning=True):
    """Generate response for a single poker instruction
    
    Args:
        model: The model to use for generation
        tokenizer: The tokenizer to use
        instruction: The poker scenario instruction
        max_length: Maximum length of generated response
        include_reasoning: If True, keeps explanations; if False, returns only action
    """
    
    # Format the prompt (leave output empty for generation)
    prompt = format_prompt(instruction, "")
    
    # Tokenize
    inputs = tokenizer.encode(prompt, return_tensors="pt").to(model.device)
    
    # Generate with parameters optimized for reasoning
    with torch.no_grad():
        outputs = model.generate(
            inputs,
            max_length=inputs.shape[1] + max_length,  # Add to input length
            temperature=0.7,  # Higher temperature for more natural explanations
            do_sample=True,
            top_p=0.9,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )
    
    # Decode response
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    
    # Extract only the generated part (remove the prompt)
    generated_part = response[len(prompt):].strip()
    
    # If user doesn't want reasoning, extract just the first line (action only)
    if not include_reasoning:
        lines = generated_part.split('\n')
        for line in lines:
            line = line.strip()
            if line and not line.startswith('#'):
                generated_part = line
                break
    
    return generated_part

In [20]:
# Test the first model
sample_input = dataset['train'][0]['instruction']
print(sample_input)



You are a specialist in playing 6-handed No Limit Texas Holdem. The following will be a game scenario and you need to make the optimal decision.

Here is a game summary:

The small blind is 0.5 chips and the big blind is 1 chips. Everyone started with 100 chips.
The player positions involved in this game are UTG, HJ, CO, BTN, SB, BB.
In this hand, your position is HJ, and your holding is [King of Diamond and Jack of Spade].
Before the flop, HJ raise 2.0 chips, and BB call. Assume that all other players that is not mentioned folded.
The flop comes King Of Spade, Seven Of Heart, and Two Of Diamond, then BB check, and HJ check.
The turn comes Jack Of Club, then BB check, HJ bet 3 chips, BB raise 10 chips, and HJ call.
The river comes Seven Of Club, then BB check.


Now it is your turn to make a move.
To remind you, the current pot size is 24.0 chips, and your holding is [King of Diamond and Jack of Spade].

Decide on an action based on the strength of your hand on this board, your posit

In [27]:
# Generate responses for the train set using MODEL_NAME_2
print("Generating responses for MODEL_NAME_2...")
responses_2 = generate_single_response(model_2, tokenizer_2, sample_input, include_reasoning=True)

Generating responses for MODEL_NAME_2...


In [28]:
print("True response:", dataset['train'][0]['output'])
print("Generated response:", responses_2)

True response: bet 18
Generated response: Check

### Reasoning:
There is no draw and my pair is not strong enough to call. I want to give up this hand and continue to play later.
