In [1]:
#!pip install datasets

In [2]:
from datasets import load_dataset

In [3]:
data = load_dataset('majorSeaweed/financeQA_100K')


In [4]:
data

DatasetDict({
    train: Dataset({
        features: ['question', 'context', 'answer'],
        num_rows: 90566
    })
    validation: Dataset({
        features: ['question', 'context', 'answer'],
        num_rows: 5031
    })
    test: Dataset({
        features: ['question', 'context', 'answer'],
        num_rows: 5032
    })
})

In [5]:
data_tr = load_dataset('majorSeaweed/financeQA_100K', split = 'train')
data_val = load_dataset('majorSeaweed/financeQA_100K', split = 'validation')
data_test = load_dataset('majorSeaweed/financeQA_100K', split = 'test')


In [6]:
print(data_tr)
print(data_val)
print(data_test)

Dataset({
    features: ['question', 'context', 'answer'],
    num_rows: 90566
})
Dataset({
    features: ['question', 'context', 'answer'],
    num_rows: 5031
})
Dataset({
    features: ['question', 'context', 'answer'],
    num_rows: 5032
})


In [7]:
import tensorflow as tf
from transformers import TFGPT2LMHeadModel, GPT2Tokenizer, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments
import numpy as np
import pandas as pd
import re

2025-10-09 17:11:02.512196: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2025-10-09 17:11:10.155470: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-10-09 17:11:24.717173: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.


In [8]:
# Take small subsets for fast trainingtrain_data = data['train'].select(range(5000))  # Good for substantial training
train_data = data['train'].select(range(5000))  # Good for substantial training
val_data = data['validation'].select(range(1000))  # Better validation
test_data = data['test'].select(range(1000))  # Proper testing

print(f"Training samples: {len(train_data)}")
print(f"Validation samples: {len(val_data)}")
print(f"Test samples: {len(test_data)}")

Training samples: 5000
Validation samples: 1000
Test samples: 1000


In [9]:
# STEP 2: Initialize tokenizer FIRST
from transformers import GPT2Tokenizer

print("Loading tokenizer...")
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

Loading tokenizer...


In [10]:
# Let's examine what's actually in our tokenized data
print("=== EXAMINING TOKENIZED DATA QUALITY ===")

# Check a few examples to see if they look right
print("Sample tokenized examples:")
for i in range(3):
    print(f"\n--- Example {i+1} ---")
    print(f"Original question: {train_data[i]['question'][:100]}...")
    print(f"Original answer: {train_data[i]['answer'][:100]}...")
    print(f"Tokenized input_ids length: {len(tokenized_train_list[i]['input_ids'])}")
    print(f"Tokenized attention_mask length: {len(tokenized_train_list[i]['attention_mask'])}")
    
    # Decode back to text to see if it makes sense
    decoded_text = tokenizer.decode(tokenized_train_list[i]['input_ids'], skip_special_tokens=True)
    print(f"Decoded text: {decoded_text[:150]}...")

# Check for problematic data
print("\n=== CHECKING FOR DATA ISSUES ===")

# 1. Check for empty sequences
empty_sequences = [i for i, item in enumerate(tokenized_train_list) if len(item['input_ids']) == 0]
print(f"Empty sequences found: {len(empty_sequences)}")

# 2. Check for very short sequences (might be corrupted)
short_sequences = [i for i, item in enumerate(tokenized_train_list) if len(item['input_ids']) < 10]
print(f"Very short sequences (<10 tokens): {len(short_sequences)}")

# 3. Check sequence length distribution
lengths = [len(item['input_ids']) for item in tokenized_train_list]
print(f"Sequence length stats:")
print(f"  Min: {min(lengths)}, Max: {max(lengths)}, Mean: {np.mean(lengths):.1f}")
print(f"  Standard deviation: {np.std(lengths):.1f}")

# 4. Check if all sequences have both input_ids and attention_mask
incomplete_sequences = [i for i, item in enumerate(tokenized_train_list) 
                       if 'input_ids' not in item or 'attention_mask' not in item]
print(f"Incomplete sequences: {len(incomplete_sequences)}")

=== EXAMINING TOKENIZED DATA QUALITY ===
Sample tokenized examples:

--- Example 1 ---
Original question: What is the total estimated project cost mentioned in the document?...
Original answer: The grand total of the estimated project cost is $7,594,720....


NameError: name 'tokenized_train_list' is not defined

In [11]:
def clean_text(text):
    """Clean text but KEEP stop words for natural conversation"""
    if not isinstance(text, str):
        return ""
    
    # Remove markdown patterns
    text = re.sub(r'#+\s*Document Type[:]?', '', text)
    text = re.sub(r'\*\*.*?\*\*', '', text)
    text = re.sub(r'###\s*', '', text)
    text = re.sub(r'- \*\*', '', text)
    
    # Clean extra whitespace but keep natural language
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

def create_conversation(example):
    """Format for conversational training"""
    question = clean_text(example['question'])
    answer = clean_text(example['answer'])
    
    # Simple conversational format
    formatted_text = f"User: {question} Assistant: {answer}{tokenizer.eos_token}"
    return {'text': formatted_text}

print("Cleaning and formatting data...")
train_data_clean = train_data.map(create_conversation)
val_data_clean = val_data.map(create_conversation)
# Show some examples
print("\nSample formatted conversations:")
for i in range(2):
    print(f"Example {i+1}: {train_data_clean[i]['text'][:100]}...")


Cleaning and formatting data...

Sample formatted conversations:
Example 1: User: What is the total estimated project cost mentioned in the document? Assistant: The grand total...
Example 2: User: Where should the payment be remitted to? Assistant: The payment should be remitted to Wolf Kni...


In [12]:
# STEP 4: Tokenization (now tokenizer is defined)
def tokenize_data(examples):
    return tokenizer(
        examples['text'],
        truncation=True,
        padding=True,
        max_length=256,
        return_tensors="tf"
    )

print("Tokenizing data...")
tokenized_train = train_data_clean.map(tokenize_data, batched=True, remove_columns=train_data_clean.column_names)
tokenized_val = val_data_clean.map(tokenize_data, batched=True, remove_columns=val_data_clean.column_names)


Tokenizing data...


In [13]:
def create_tf_dataset(tokenized_data):
    input_ids = []
    attention_mask = []

    for x in tokenized_data:
        if len(x['input_ids']) < 256:
            pad_length = 256 - len(x['input_ids'])
            input_ids.append(x['input_ids'] + [tokenizer.pad_token_id]*pad_length)
            attention_mask.append(x['attention_mask'] + [0]*pad_length)
        else:
            input_ids.append(x['input_ids'][:256])
            attention_mask.append(x['attention_mask'][:256])

    input_ids = tf.constant(input_ids, dtype=tf.int32)
    attention_mask = tf.constant(attention_mask, dtype=tf.int32)
    labels = tf.constant(input_ids, dtype=tf.int32)

    return tf.data.Dataset.from_tensor_slices(
        ({'input_ids': input_ids, 'attention_mask': attention_mask}, labels)
    )


In [14]:
tf_train_dataset = create_tf_dataset(tokenized_train).shuffle(1000).batch(8)
tf_val_dataset = create_tf_dataset(tokenized_val).batch(8)


2025-10-09 17:14:08.144682: E external/local_xla/xla/stream_executor/cuda/cuda_platform.cc:51] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)


<class 'datasets.arrow_dataset.Dataset'>
{'input_ids': [12982, 25, 1867, 318, 262, 2472, 6108, 1628, 1575, 4750, 287, 262, 3188, 30, 15286, 25, 383, 4490, 2472, 286, 262, 6108, 1628, 1575, 318, 720, 22, 11, 46438, 11, 23906, 13, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}


In [15]:
# Convert HF Dataset to plain list of dicts
tokenized_train_list = tokenized_train.to_dict()['input_ids']
tokenized_train_list = [{'input_ids': iid, 'attention_mask': am}
                        for iid, am in zip(tokenized_train['input_ids'], tokenized_train['attention_mask'])]

tokenized_val_list = [{'input_ids': iid, 'attention_mask': am}
                      for iid, am in zip(tokenized_val['input_ids'], tokenized_val['attention_mask'])]


In [16]:
tf_train_dataset = create_tf_dataset(tokenized_train_list).batch(8)
tf_val_dataset = create_tf_dataset(tokenized_val_list).batch(8)


In [17]:
print(type(tokenized_train))       # Should ideally be list
print(tokenized_train[0])          # Should show {'input_ids': [...], 'attention_mask': [...]}


<class 'datasets.arrow_dataset.Dataset'>
{'input_ids': [12982, 25, 1867, 318, 262, 2472, 6108, 1628, 1575, 4750, 287, 262, 3188, 30, 15286, 25, 383, 4490, 2472, 286, 262, 6108, 1628, 1575, 318, 720, 22, 11, 46438, 11, 23906, 13, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}


In [18]:
from transformers import TFGPT2LMHeadModel

print("Loading GPT-2 model with safetensors disabled...")
model = TFGPT2LMHeadModel.from_pretrained("gpt2", use_safetensors=False)

print("✅ Model loaded successfully!")

Loading GPT-2 model with safetensors disabled...


TensorFlow and JAX classes are deprecated and will be removed in Transformers v5. We recommend migrating to PyTorch classes or pinning your version of Transformers.
All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at gpt2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


✅ Model loaded successfully!


In [None]:
# MANUAL TRAINING LOOP - GUARANTEED TO WORK
print("=== SETTING UP MANUAL TRAINING ===")

# Convert to lists and prepare data
tokenized_train_list = list(tokenized_train)
tokenized_val_list = list(tokenized_val)

print(f"Training samples: {len(tokenized_train_list)}")
print(f"Validation samples: {len(tokenized_val_list)}")

# Prepare data arrays
def prepare_data_arrays(tokenized_list):
    input_ids = []
    attention_mask = []
    
    for item in tokenized_list:
        seq = item['input_ids']
        mask = item['attention_mask']
        
        # Pad to exactly 256
        if len(seq) < 256:
            pad_len = 256 - len(seq)
            input_ids.append(seq + [tokenizer.pad_token_id] * pad_len)
            attention_mask.append(mask + [0] * pad_len)
        else:
            input_ids.append(seq[:256])
            attention_mask.append(mask[:256])
    
    return (np.array(input_ids, dtype=np.int32), 
            np.array(attention_mask, dtype=np.int32))

print("Preparing data arrays...")
train_input_ids, train_attention_mask = prepare_data_arrays(tokenized_train_list)
val_input_ids, val_attention_mask = prepare_data_arrays(tokenized_val_list)

print(f"Training data shape: {train_input_ids.shape}")
print(f"Validation data shape: {val_input_ids.shape}")

# Manual training function
def train_manually(model, train_data, val_data, epochs=3):
    optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5)
    
    train_input_ids, train_attention_mask = train_data
    val_input_ids, val_attention_mask = val_data
    
    train_losses = []
    val_losses = []
    
    for epoch in range(epochs):
        print(f"\n🎯 Epoch {epoch + 1}/{epochs}")
        
        # --- TRAINING ---
        epoch_train_loss = 0
        num_train_batches = 0
        
        # Process training in batches
        for i in range(0, len(train_input_ids), 8):
            # Get batch
            batch_input_ids = train_input_ids[i:i+8]
            batch_attention_mask = train_attention_mask[i:i+8]
            
            # Forward pass with gradient tape
            with tf.GradientTape() as tape:
                outputs = model(
                    input_ids=batch_input_ids,
                    attention_mask=batch_attention_mask,
                    labels=batch_input_ids  # Labels are same as input_ids for LM
                )
                loss = outputs.loss
            
            # Backward pass
            gradients = tape.gradient(loss, model.trainable_variables)
            optimizer.apply_gradients(zip(gradients, model.trainable_variables))
            
            epoch_train_loss += loss.numpy()
            num_train_batches += 1
            
            # Print progress every 100 batches
            if num_train_batches % 100 == 0:
                print(f"  Training batch {num_train_batches}, Loss: {loss.numpy():.4f}")
        
        avg_train_loss = epoch_train_loss / num_train_batches
        train_losses.append(avg_train_loss)
        
        # --- VALIDATION ---
        epoch_val_loss = 0
        num_val_batches = 0
        
        # Process validation in batches (no gradients)
        for i in range(0, len(val_input_ids), 8):
            batch_input_ids = val_input_ids[i:i+8]
            batch_attention_mask = val_attention_mask[i:i+8]
            
            outputs = model(
                input_ids=batch_input_ids,
                attention_mask=batch_attention_mask,
                labels=batch_input_ids
            )
            epoch_val_loss += outputs.loss.numpy()
            num_val_batches += 1
        
        avg_val_loss = epoch_val_loss / num_val_batches
        val_losses.append(avg_val_loss)
        
        print(f"✅ Epoch {epoch + 1} completed:")
        print(f"   Training Loss: {avg_train_loss:.4f}")
        print(f"   Validation Loss: {avg_val_loss:.4f}")
    
    return train_losses, val_losses

print("\n🚀 STARTING MANUAL TRAINING...")
print("This will take 30-60 minutes on CPU...")
print("You'll see progress updates every 100 batches")

# Start manual training
train_losses, val_losses = train_manually(
    model,
    (train_input_ids, train_attention_mask),
    (val_input_ids, val_attention_mask),
    epochs=3
)

print("\n🎉 TRAINING COMPLETED!")
print("Final losses:")
print(f"  Training: {train_losses[-1]:.4f}")
print(f"  Validation: {val_losses[-1]:.4f}")

=== SETTING UP MANUAL TRAINING ===
Training samples: 5000
Validation samples: 1000
Preparing data arrays...
Training data shape: (5000, 256)
Validation data shape: (1000, 256)

🚀 STARTING MANUAL TRAINING...
This will take 30-60 minutes on CPU...
You'll see progress updates every 100 batches

🎯 Epoch 1/3
