<a href="https://colab.research.google.com/github/Gakwaya011/AskFinanceAI/blob/main/experiment_2000_samples.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# COLAB SPECIFIC SETUP
print("🚀 COLAB SETUP - MOUNTING DRIVE AND CHECKING GPU")
print("=" * 70)

# Mount Google Drive for saving models
from google.colab import drive
drive.mount('/content/drive')

# Check GPU
import tensorflow as tf
print(f"GPU Available: {tf.test.is_gpu_available()}")
if tf.test.is_gpu_available():
    print(f"GPU Device: {tf.test.gpu_device_name()}")
    print("✅ Perfect! Using GPU for fast training!")

# Create folder for saving
import os
os.makedirs('/content/drive/MyDrive/finance_chatbot_experiments', exist_ok=True)
print("✅ Google Drive mounted for model saving!")

🚀 COLAB SETUP - MOUNTING DRIVE AND CHECKING GPU
Mounted at /content/drive


Instructions for updating:
Use `tf.config.list_physical_devices('GPU')` instead.


GPU Available: False
✅ Google Drive mounted for model saving!


In [4]:
# Add this after W&B init cell
!wandb login

# It will give you a link to authenticate - just follow it

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit: 
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mc-gakwaya[0m ([33mc-gakwaya-african-leadership-academy[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [5]:
# CELL 1: Installations & Imports
print("🚀 SETTING UP OPTIMIZED EXPERIMENTATION ENVIRONMENT")
print("=" * 70)

!pip install transformers datasets tensorflow wandb nltk evaluate

import tensorflow as tf
import numpy as np
import pandas as pd
from transformers import (
    TFGPT2LMHeadModel, GPT2Tokenizer,
    DataCollatorForLanguageModeling,
    TrainingArguments, Trainer
)
from datasets import load_dataset, Dataset
import re
import wandb
from nltk.translate.bleu_score import sentence_bleu
import evaluate
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, f1_score
import nltk
nltk.download('punkt')

print("✅ All packages installed and imported!")


🚀 SETTING UP OPTIMIZED EXPERIMENTATION ENVIRONMENT
Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.6


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


✅ All packages installed and imported!


In [5]:
# CELL 2: Initialize Weights & Biases
print("📊 INITIALIZING WEIGHTS & BIASES FOR EXPERIMENT TRACKING")
print("=" * 70)

wandb.init(
    project="finance-chatbot-optimized",
    name="distilgpt2-2000samples-v1",
    config={
        "learning_rate": 5e-5,
        "batch_size": 8,
        "epochs": 5,
        "model": "distilgpt2",
        "dataset": "financeQA_100K",
        "samples": 2000,
        "max_length": 256
    }
)

print("✅ W&B initialized! Tracking experiment:", wandb.run.name)

📊 INITIALIZING WEIGHTS & BIASES FOR EXPERIMENT TRACKING


  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: Currently logged in as: [33mc-gakwaya[0m ([33mc-gakwaya-african-leadership-academy[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


✅ W&B initialized! Tracking experiment: distilgpt2-2000samples-v1


In [1]:
# CELL 3: Load & Prepare Dataset (PROPER FORMAT)
print("📊 LOADING AND PREPROCESSING DATASET")
print("=" * 70)

# Load dataset
dataset = load_dataset('majorSeaweed/financeQA_100K')

# Take larger samples for better training
train_data = dataset['train'].select(range(2000))
val_data = dataset['validation'].select(range(500))
test_data = dataset['test'].select(range(500))

print(f"Dataset loaded:")
print(f"  Training samples: {len(train_data)}")
print(f"  Validation samples: {len(val_data)}")
print(f"  Test samples: {len(test_data)}")

# Load tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("distilgpt2")
tokenizer.pad_token = tokenizer.eos_token

print("✅ Tokenizer loaded!")

📊 LOADING AND PREPROCESSING DATASET


NameError: name 'load_dataset' is not defined

In [7]:
# CELL 4: Data Preprocessing (OPTIMIZED)
print("🔧 DATA PREPROCESSING - OPTIMIZED CLEANING")
print("=" * 70)

def clean_text_optimized(text):
    """Enhanced cleaning function"""
    if not isinstance(text, str):
        return ""

    # Remove markdown and formatting
    text = re.sub(r'#+\s*Document Type[:]?', '', text)
    text = re.sub(r'\*\*.*?\*\*', '', text)
    text = re.sub(r'###\s*', '', text)
    text = re.sub(r'- \*\*', '', text)

    # Clean extra whitespace
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

def format_conversation_optimized(example):
    """Proper conversation formatting"""
    question = clean_text_optimized(example['question'])
    answer = clean_text_optimized(example['answer'])

    # Format that worked well for us
    formatted_text = f"User: {question} Assistant: {answer}{tokenizer.eos_token}"
    return {'text': formatted_text}

# Apply preprocessing
print("Applying preprocessing...")
train_data_clean = train_data.map(format_conversation_optimized)
val_data_clean = val_data.map(format_conversation_optimized)
test_data_clean = test_data.map(format_conversation_optimized)

print("✅ Data preprocessing completed!")
print(f"Sample formatted text: {train_data_clean[0]['text'][:100]}...")

🔧 DATA PREPROCESSING - OPTIMIZED CLEANING
Applying preprocessing...


Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

✅ Data preprocessing completed!
Sample formatted text: User: What is the total estimated project cost mentioned in the document? Assistant: The grand total...


In [8]:
# CELL 5: Tokenization (PROPER IMPLEMENTATION)
print("🔤 TOKENIZATION WITH PROPER HANDLING")
print("=" * 70)

def tokenize_function_optimized(examples):
    """Tokenization with proper error handling"""
    return tokenizer(
        examples['text'],
        truncation=True,
        padding=True,
        max_length=256,
        return_tensors="tf"
    )

print("Tokenizing datasets...")
tokenized_train = train_data_clean.map(
    tokenize_function_optimized,
    batched=True,
    remove_columns=train_data_clean.column_names
)
tokenized_val = val_data_clean.map(
    tokenize_function_optimized,
    batched=True,
    remove_columns=val_data_clean.column_names
)

print("✅ Tokenization completed!")
print(f"Tokenized training samples: {len(tokenized_train)}")

🔤 TOKENIZATION WITH PROPER HANDLING
Tokenizing datasets...


Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

TensorFlow and JAX classes are deprecated and will be removed in Transformers v5. We recommend migrating to PyTorch classes or pinning your version of Transformers.


Map:   0%|          | 0/500 [00:00<?, ? examples/s]

✅ Tokenization completed!
Tokenized training samples: 2000


In [9]:
# CELL 6: Model Loading & Setup
print("🤖 LOADING MODEL WITH OPTIMIZED SETTINGS")
print("=" * 70)

# Load DistilGPT-2 (better for our use case)
model = TFGPT2LMHeadModel.from_pretrained("distilgpt2", use_safetensors=False)

# Data collator for proper batching
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
    return_tensors="tf"
)

print("✅ Model loaded successfully!")
print(f"Model parameters: {model.num_parameters():,}")

🤖 LOADING MODEL WITH OPTIMIZED SETTINGS


tf_model.h5:   0%|          | 0.00/328M [00:00<?, ?B/s]

TensorFlow and JAX classes are deprecated and will be removed in Transformers v5. We recommend migrating to PyTorch classes or pinning your version of Transformers.
All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at distilgpt2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

✅ Model loaded successfully!
Model parameters: 81,912,576


In [13]:
# CELL 7 FIXED: TensorFlow Dataset Preparation with Padding
print("📦 PREPARING TENSORFLOW DATASETS WITH PROPER PADDING")
print("=" * 70)

def create_tf_dataset_with_padding(tokenized_data, batch_size=8):
    """Working TensorFlow dataset creation with proper padding"""

    # First, let's check the sequence lengths
    sequence_lengths = [len(x['input_ids']) for x in tokenized_data]
    print(f"Sequence lengths - Min: {min(sequence_lengths)}, Max: {max(sequence_lengths)}")

    # Pad all sequences to the same length (256 as we specified in tokenization)
    max_length = 256

    input_ids_padded = []
    attention_mask_padded = []

    for item in tokenized_data:
        input_seq = item['input_ids']
        attention_seq = item['attention_mask']

        # Pad sequences to max_length
        if len(input_seq) < max_length:
            pad_length = max_length - len(input_seq)
            input_ids_padded.append(input_seq + [tokenizer.pad_token_id] * pad_length)
            attention_mask_padded.append(attention_seq + [0] * pad_length)
        else:
            input_ids_padded.append(input_seq[:max_length])
            attention_mask_padded.append(attention_seq[:max_length])

    # Convert to numpy arrays (now they'll have consistent shapes)
    input_ids = np.array(input_ids_padded, dtype=np.int32)
    attention_mask = np.array(attention_mask_padded, dtype=np.int32)

    print(f"Dataset shapes - Input IDs: {input_ids.shape}, Attention Mask: {attention_mask.shape}")

    # Create TensorFlow dataset with proper structure
    dataset = tf.data.Dataset.from_tensor_slices({
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'labels': input_ids  # Labels are same as input_ids for language modeling
    })

    return dataset.batch(batch_size)

# Create datasets
print("Creating TensorFlow datasets with padding...")
tf_train_dataset = create_tf_dataset_with_padding(tokenized_train, batch_size=8)
tf_val_dataset = create_tf_dataset_with_padding(tokenized_val, batch_size=8)

print("✅ TensorFlow datasets created successfully!")
print(f"Training batches: {len(list(tf_train_dataset))}")
print(f"Validation batches: {len(list(tf_val_dataset))}")

# Test one batch to make sure it works
test_batch = next(iter(tf_train_dataset))
print(f"Batch keys: {test_batch.keys()}")
print(f"Input IDs shape: {test_batch['input_ids'].shape}")
print(f"Labels shape: {test_batch['labels'].shape}")
print(f"All sequences now have consistent shape: {test_batch['input_ids'].shape[1]}")

📦 PREPARING TENSORFLOW DATASETS WITH PROPER PADDING
Creating TensorFlow datasets with padding...
Sequence lengths - Min: 71, Max: 104
Dataset shapes - Input IDs: (2000, 256), Attention Mask: (2000, 256)
Sequence lengths - Min: 75, Max: 75
Dataset shapes - Input IDs: (500, 256), Attention Mask: (500, 256)
✅ TensorFlow datasets created successfully!
Training batches: 250
Validation batches: 63
Batch keys: dict_keys(['input_ids', 'attention_mask', 'labels'])
Input IDs shape: (8, 256)
Labels shape: (8, 256)
All sequences now have consistent shape: 256


In [None]:
# CELL 8 FIXED: Manual TensorFlow Training (WITH PRINT FIX)
print("🎯 STARTING MANUAL TENSORFLOW TRAINING LOOP")
print("=" * 70)

def manual_tensorflow_training(model, train_dataset, val_dataset, epochs=5):
    """Manual training loop that works with TensorFlow and Hugging Face models"""

    optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)

    train_losses = []
    val_losses = []

    # Convert to lists for manual iteration
    train_batches = list(train_dataset)
    val_batches = list(val_dataset)

    print(f"Training batches: {len(train_batches)}")
    print(f"Validation batches: {len(val_batches)}")

    for epoch in range(epochs):
        print(f"\n🎯 Epoch {epoch + 1}/{epochs}")

        # Training phase
        epoch_train_loss = 0
        num_train_batches = 0

        for batch in train_batches:
            with tf.GradientTape() as tape:
                # Forward pass
                outputs = model(
                    input_ids=batch['input_ids'],
                    attention_mask=batch['attention_mask'],
                    labels=batch['labels']  # This is crucial!
                )
                loss = outputs.loss

            # Backward pass
            gradients = tape.gradient(loss, model.trainable_variables)
            optimizer.apply_gradients(zip(gradients, model.trainable_variables))

            # FIX: Convert loss to float before printing
            loss_value = float(loss.numpy())
            epoch_train_loss += loss_value
            num_train_batches += 1

            # Print progress (FIXED)
            if num_train_batches % 50 == 0:
                print(f"  Batch {num_train_batches}, Loss: {loss_value:.4f}")

        avg_train_loss = epoch_train_loss / num_train_batches
        train_losses.append(avg_train_loss)

        # Validation phase
        epoch_val_loss = 0
        num_val_batches = 0

        for batch in val_batches:
            outputs = model(
                input_ids=batch['input_ids'],
                attention_mask=batch['attention_mask'],
                labels=batch['labels']
            )
            # FIX: Convert validation loss to float
            loss_value = float(outputs.loss.numpy())
            epoch_val_loss += loss_value
            num_val_batches += 1

        avg_val_loss = epoch_val_loss / num_val_batches
        val_losses.append(avg_val_loss)

        # Log to W&B
        wandb.log({
            'epoch': epoch + 1,
            'train_loss': avg_train_loss,
            'val_loss': avg_val_loss,
            'learning_rate': optimizer.lr.numpy()
        })

        print(f"✅ Epoch {epoch + 1} completed:")
        print(f"   Training Loss: {avg_train_loss:.4f}")
        print(f"   Validation Loss: {avg_val_loss:.4f}")

        # Generate sample responses every 2 epochs
        if epoch % 2 == 0:
            print("  Generating sample responses...")
            sample_questions = [
                "What is compound interest?",
                "How do I start investing?"
            ]

            for question in sample_questions:
                prompt = f"User: {question} Assistant:"
                inputs = tokenizer.encode(prompt, return_tensors='tf')

                outputs = model.generate(
                    inputs,
                    max_length=150,
                    num_return_sequences=1,
                    pad_token_id=tokenizer.eos_token_id,
                    do_sample=True,
                    temperature=0.7
                )

                response = tokenizer.decode(outputs[0], skip_special_tokens=True)
                wandb.log({f"sample_response_epoch_{epoch+1}": response})
                print(f"    Q: {question}")
                print(f"    A: {response[:80]}...")

    return train_losses, val_losses

print("🚀 STARTING MANUAL TENSORFLOW TRAINING...")
print("This avoids the compatibility issues and gives us full control!")
print("Training for 5 epochs with real-time monitoring...")

# Start training
train_losses, val_losses = manual_tensorflow_training(
    model,
    tf_train_dataset,
    tf_val_dataset,
    epochs=5
)

print("\n🎉 MANUAL TENSORFLOW TRAINING COMPLETED SUCCESSFULLY!")

🎯 STARTING MANUAL TENSORFLOW TRAINING LOOP
🚀 STARTING MANUAL TENSORFLOW TRAINING...
This avoids the compatibility issues and gives us full control!
Training for 5 epochs with real-time monitoring...
Training batches: 250
Validation batches: 63

🎯 Epoch 1/5


  loss_value = float(loss.numpy())


  Batch 50, Loss: 0.3485
  Batch 100, Loss: 0.3022
  Batch 150, Loss: 0.4900
  Batch 200, Loss: 0.2794
  Batch 250, Loss: 0.2987


  loss_value = float(outputs.loss.numpy())


✅ Epoch 1 completed:
   Training Loss: 0.3305
   Validation Loss: 0.3068
  Generating sample responses...
    Q: What is compound interest?
    A: User: What is compound interest? Assistant: The compounds in the tobacco are com...
    Q: How do I start investing?
    A: User: How do I start investing? Assistant: I start using the funds in the 'Menta...

🎯 Epoch 2/5
  Batch 50, Loss: 0.2712
  Batch 100, Loss: 0.2469
  Batch 150, Loss: 0.4108
  Batch 200, Loss: 0.2380
  Batch 250, Loss: 0.2634
✅ Epoch 2 completed:
   Training Loss: 0.2733
   Validation Loss: 0.3026

🎯 Epoch 3/5
  Batch 50, Loss: 0.2275
  Batch 100, Loss: 0.2120
  Batch 150, Loss: 0.3516
  Batch 200, Loss: 0.2057


In [None]:
# CELL 9 UPDATED: Loss Curves for Manual Training
print("📈 GENERATING LOSS CURVES FROM MANUAL TRAINING")
print("=" * 70)

# Create loss curves from our manual training results
plt.figure(figsize=(12, 8))

# Plot training history
epochs_range = range(1, len(train_losses) + 1)

plt.subplot(2, 2, 1)
plt.plot(epochs_range, train_losses, 'bo-', label='Training Loss', linewidth=2, markersize=6)
plt.plot(epochs_range, val_losses, 'ro-', label='Validation Loss', linewidth=2, markersize=6)
plt.title('Training Progress - Manual TensorFlow Training', fontsize=14, fontweight='bold')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.grid(True, alpha=0.3)

# Calculate improvement
if len(train_losses) > 1:
    improvement = ((train_losses[0] - train_losses[-1]) / train_losses[0]) * 100

    plt.subplot(2, 2, 2)
    plt.bar(['Start', 'End'], [train_losses[0], train_losses[-1]], color=['red', 'green'], alpha=0.7)
    plt.title(f'Loss Improvement: {improvement:.1f}%', fontweight='bold')
    plt.ylabel('Loss')

# Log to W&B
wandb.log({
    "loss_curves": wandb.Image(plt),
    "final_train_loss": train_losses[-1],
    "final_val_loss": val_losses[-1],
    "improvement_percentage": improvement
})

plt.tight_layout()
plt.savefig('manual_training_curves.png', dpi=300, bbox_inches='tight')
plt.show()

print(f"✅ Loss curves generated! Improvement: {improvement:.1f}%")
print("✅ Charts saved and logged to W&B!")

In [None]:
# CELL 10: Comprehensive Evaluation Metrics
print("📊 COMPREHENSIVE EVALUATION METRICS")
print("=" * 70)

def calculate_all_metrics():
    """Calculate all required metrics: BLEU, Perplexity, F1, etc."""

    print("Calculating evaluation metrics...")

    # 1. Perplexity
    print("1. Calculating Perplexity...")
    eval_loss = model.evaluate(tf_val_dataset, verbose=0)
    perplexity = np.exp(eval_loss)

    # 2. BLEU Score
    print("2. Calculating BLEU Scores...")
    test_questions = [
        "What is compound interest?",
        "How do I start investing?",
        "What is the difference between stocks and bonds?"
    ]

    reference_responses = {
        "What is compound interest?": [
            "Compound interest is interest calculated on both initial principal and accumulated interest",
            "It means earning interest on your interest over time",
            "Compound interest helps investments grow faster through exponential growth"
        ],
        "How do I start investing?": [
            "Start by setting financial goals and learning basic investment principles",
            "Begin with low-cost index funds and build an emergency fund first",
            "Research different investment options and consider your risk tolerance"
        ]
    }

    bleu_scores = []
    for question in test_questions:
        prompt = f"User: {question} Assistant:"
        inputs = tokenizer.encode(prompt, return_tensors='tf')

        outputs = model.generate(
            inputs,
            max_length=150,
            num_return_sequences=1,
            pad_token_id=tokenizer.eos_token_id,
            do_sample=True,
            temperature=0.7
        )

        response = tokenizer.decode(outputs[0], skip_special_tokens=True)

        if question in reference_responses:
            ref_tokens = [ref.split() for ref in reference_responses[question]]
            hyp_tokens = response.split()
            bleu_score = sentence_bleu(ref_tokens, hyp_tokens)
            bleu_scores.append(bleu_score)

    avg_bleu = np.mean(bleu_scores) if bleu_scores else 0

    # 3. Domain Specificity
    print("3. Calculating Domain Specificity...")
    finance_questions = test_questions
    non_finance_questions = [
        "Where is the best pizza?",
        "How do I fix my car?",
        "What's the weather like?"
    ]

    correct_finance = 0
    for q in finance_questions:
        prompt = f"User: {q} Assistant:"
        inputs = tokenizer.encode(prompt, return_tensors='tf')
        outputs = model.generate(inputs, max_length=150, num_return_sequences=1)
        response = tokenizer.decode(outputs[0], skip_special_tokens=True)

        # Check if response is reasonable (not just EOS token)
        if len(response.split()) > 3:
            correct_finance += 1

    domain_specificity = correct_finance / len(finance_questions)

    # Log all metrics to W&B
    metrics = {
        'perplexity': perplexity,
        'bleu_score': avg_bleu,
        'domain_specificity': domain_specificity,
        'final_train_loss': history.history['loss'][-1],
        'final_val_loss': history.history['val_loss'][-1],
        'improvement_percentage': improvement
    }

    wandb.log(metrics)

    print("\n📈 FINAL METRICS:")
    print(f"  • Perplexity: {perplexity:.2f}")
    print(f"  • BLEU Score: {avg_bleu:.4f}")
    print(f"  • Domain Specificity: {domain_specificity:.1%}")
    print(f"  • Final Training Loss: {history.history['loss'][-1]:.4f}")
    print(f"  • Final Validation Loss: {history.history['val_loss'][-1]:.4f}")
    print(f"  • Improvement: {improvement:.1f}%")

    return metrics

# Calculate all metrics
metrics = calculate_all_metrics()

In [None]:
# CELL 11: Save Model & Finalize
print("💾 SAVING MODEL AND FINALIZING EXPERIMENT")
print("=" * 70)

# Save the trained model
model.save_pretrained("./optimized_finance_chatbot")
tokenizer.save_pretrained("./optimized_finance_chatbot")

# Save to W&B
wandb.save("./optimized_finance_chatbot/*")

print("✅ Model saved to './optimized_finance_chatbot/'")
print("✅ All metrics logged to Weights & Biases")
print("🎉 EXPERIMENT COMPLETED SUCCESSFULLY!")

# Finish W&B run
wandb.finish()

print("\n📊 You can view your experiment at:")
print(f"   https://wandb.ai/{wandb.run.entity}/{wandb.run.project}/runs/{wandb.run.id}")