## 📦 Step 1: Install Dependencies and Setup

In [None]:
# Install required packages
# Versions of TensorFlow and TensorFlow Datasets are pinned; the tokenizer cell
# below relies on the `tfds.deprecated.text` API.
!pip install -q tensorflow==2.15.0
!pip install -q tensorflow-datasets==4.9.0
# pandas reads the uploaded workbook; openpyxl is the .xlsx engine it uses
!pip install -q pandas openpyxl

# Remind the user to restart the runtime so the freshly installed versions are picked up
print("‚úÖ All packages installed!")
print("\n‚ö†Ô∏è IMPORTANT: After installation completes, go to Runtime > Restart runtime")
print("Then run the cells again from the beginning.")

In [None]:
# Import libraries
import os
import re
import json
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds
from google.colab import files

# Report the runtime environment: TensorFlow version and any visible GPU devices
# (an empty list means no GPU is visible to TensorFlow)
print(f"TensorFlow version: {tf.__version__}")
print(f"GPU Available: {tf.config.list_physical_devices('GPU')}")

## 📤 Step 2: Upload Your Soros Excel Dataset

Upload `Soros_sample.xlsx` from your `data/` folder

In [None]:
# Upload the Excel file
print("Please upload your Soros_sample.xlsx file:")
uploaded = files.upload()

# The upload result is keyed by filename and is empty when the dialog is
# cancelled; fail with a clear message instead of an IndexError further down.
if not uploaded:
    raise ValueError(
        "No file was uploaded. Re-run this cell and select Soros_sample.xlsx."
    )

# Get the filename (the first uploaded file is used as the dataset)
excel_filename = next(iter(uploaded))
print(f"\n‚úÖ Uploaded: {excel_filename}")

## 🔧 Step 3: Load and Preprocess Soros Q&A Data

In [None]:
# Load Excel data
# Reads the workbook uploaded in the previous cell into a DataFrame
df = pd.read_excel(excel_filename)

# Show the shape, the column labels, and the first rows so the column
# names can be checked before the extraction cell runs
print(f"üìä Dataset shape: {df.shape}")
print(f"\nüìã Columns: {df.columns.tolist()}")
print(f"\nüîç First few rows:")
print(df.head())

In [None]:
# Extract questions and answers
# Adjust column names based on your Excel structure
# Common column names: 'Question', 'Answer', 'question', 'answer', 'Q', 'A'

# Candidate header names, checked in priority order
possible_q_cols = ['Question', 'question', 'Q', 'q', 'Questions', 'Query']
possible_a_cols = ['Answer', 'answer', 'A', 'a', 'Answers', 'Response']

# The positional fallback below needs a first and a second column
if len(df.columns) < 2:
    raise ValueError(
        f"Expected at least two columns (questions and answers), found: {df.columns.tolist()}"
    )

# The first candidate present in the sheet wins; None when nothing matches
question_col = next((col for col in possible_q_cols if col in df.columns), None)
answer_col = next((col for col in possible_a_cols if col in df.columns), None)

# If auto-detection fails, manually set:
if question_col is None:
    question_col = df.columns[0]  # Use first column
    print(f"‚ö†Ô∏è Using first column as questions: {question_col}")

if answer_col is None:
    answer_col = df.columns[1]  # Use second column
    print(f"‚ö†Ô∏è Using second column as answers: {answer_col}")

# The positional fallback can land on the column already chosen for the other role
if question_col == answer_col:
    raise ValueError(
        f"Question and answer columns resolved to the same column: {question_col!r}. "
        "Set question_col and answer_col manually."
    )

print(f"\n‚úÖ Using columns:")
print(f"   Questions: {question_col}")
print(f"   Answers: {answer_col}")

# Extract and clean data
# Drop incomplete rows as whole rows. Dropping NaNs from each column
# independently would shift one list against the other and pair questions
# with the wrong answers.
qa_pairs = df[[question_col, answer_col]].dropna()
questions = qa_pairs[question_col].astype(str).tolist()
answers = qa_pairs[answer_col].astype(str).tolist()

# Ensure equal length (already guaranteed by the row-wise drop; kept so that
# min_len stays defined for later cells)
min_len = min(len(questions), len(answers))
questions = questions[:min_len]
answers = answers[:min_len]

print(f"\nüìà Total Q&A pairs: {len(questions)}")
print(f"\nüí¨ Sample Q&A:")
for i in range(min(3, len(questions))):
    print(f"\nQ{i+1}: {questions[i][:100]}...")
    print(f"A{i+1}: {answers[i][:100]}...")

In [None]:
# Text preprocessing function (adapted from transformer code)

# Contraction rewrites, applied in this order. The specific forms
# ("won't", "can't") must run before the generic "n't" rule.
_CONTRACTION_RULES = (
    (r"i'm", "i am"),
    (r"he's", "he is"),
    (r"she's", "she is"),
    (r"it's", "it is"),
    (r"that's", "that is"),
    (r"what's", "what is"),
    (r"where's", "where is"),
    (r"how's", "how is"),
    (r"'ll", " will"),
    (r"'ve", " have"),
    (r"'re", " are"),
    (r"'d", " would"),
    (r"won't", "will not"),
    (r"can't", "cannot"),
    (r"n't", " not"),
    # Dropped-g endings only ("goin'" -> "going"). Matching every "n'" also
    # rewrote possessives such as "john's" into "johngs".
    (r"in'(?![a-z])", "ing"),
    (r"'bout", "about"),
)


def preprocess_sentence(sentence):
    """Clean and normalize text for transformer training.

    The input is converted with ``str()``, lower-cased and stripped. The
    punctuation marks ``? . ! ,`` are surrounded by spaces, common English
    contractions are expanded, and every run of characters other than
    ASCII letters, digits and ``? . ! ,`` is collapsed into one space.

    Args:
        sentence: Raw text (any object; it is passed through ``str()``).

    Returns:
        The normalized sentence, without leading or trailing whitespace.
    """
    sentence = str(sentence).lower().strip()

    # Create space between punctuation
    sentence = re.sub(r"([?.!,])", r" \1 ", sentence)
    # Collapse runs of spaces (the character class also matches double quotes)
    sentence = re.sub(r'[" "]+', " ", sentence)

    # Handle contractions
    for pattern, replacement in _CONTRACTION_RULES:
        sentence = re.sub(pattern, replacement, sentence)

    # Keep only letters, numbers, and basic punctuation
    sentence = re.sub(r"[^a-zA-Z0-9?.!,]+", " ", sentence)
    return sentence.strip()

# Run the text cleaner over every question and every answer
questions_cleaned = list(map(preprocess_sentence, questions))
answers_cleaned = list(map(preprocess_sentence, answers))

print("‚úÖ Text preprocessing complete!")
print(f"\nüìù Sample preprocessed Q&A:")
# Preview at most the first two cleaned pairs
for i, cleaned_question in enumerate(questions_cleaned[:2]):
    print(f"\nQ: {cleaned_question}")
    print(f"A: {answers_cleaned[i]}")

## 🔤 Step 4: Build Tokenizer (Vocabulary)

In [None]:
# Build vocabulary from Soros Q&A pairs
TARGET_VOCAB_SIZE = 2**13  # 8192 subwords

print("Building tokenizer from Soros dataset...")
print("This may take 2-3 minutes...\n")

# The subword vocabulary is built from the cleaned questions and answers together,
# so the encoder and decoder share one tokenizer
tokenizer = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus(
    questions_cleaned + answers_cleaned, 
    target_vocab_size=TARGET_VOCAB_SIZE
)

print(f"‚úÖ Tokenizer built!")
print(f"   Vocabulary size: {tokenizer.vocab_size}")

# Define special tokens
# START/END use the two ids just past the tokenizer's own vocabulary size,
# so the model vocabulary is two entries larger than the tokenizer's.
# Kept as single-element lists so they can be concatenated with encoded id lists.
START_TOKEN = [tokenizer.vocab_size]
END_TOKEN = [tokenizer.vocab_size + 1]
VOCAB_SIZE = tokenizer.vocab_size + 2

print(f"\nüéØ Special tokens:")
print(f"   START_TOKEN: {START_TOKEN[0]}")
print(f"   END_TOKEN: {END_TOKEN[0]}")
print(f"   Total vocab size: {VOCAB_SIZE}")

# Test tokenization
# Round-trip the first cleaned question (requires at least one Q&A pair)
sample_sentence = questions_cleaned[0]
encoded = tokenizer.encode(sample_sentence)
decoded = tokenizer.decode(encoded)

print(f"\nüß™ Tokenization test:")
print(f"   Original: {sample_sentence}")
print(f"   Encoded: {encoded[:20]}...")
print(f"   Decoded: {decoded}")

In [None]:
# Save tokenizer for later use
# The argument is a filename prefix; the message below names the resulting file
tokenizer.save_to_file('soros_tokenizer')
print("‚úÖ Tokenizer saved to 'soros_tokenizer.subwords'")

## 🔢 Step 5: Tokenize and Prepare Training Data

In [None]:
# Configuration
MAX_LENGTH = 100  # Maximum sequence length (in tokens, including START/END)

# Tokenize questions and answers
tokenized_questions = []
tokenized_answers = []

print("üìä Tokenizing data...")
for question, answer in zip(questions_cleaned, answers_cleaned):
    # Tokenize and add special tokens
    q_tokens = START_TOKEN + tokenizer.encode(question) + END_TOKEN
    a_tokens = START_TOKEN + tokenizer.encode(answer) + END_TOKEN

    # Filter by length: a pair is kept only when BOTH sides fit in MAX_LENGTH
    if len(q_tokens) <= MAX_LENGTH and len(a_tokens) <= MAX_LENGTH:
        tokenized_questions.append(q_tokens)
        tokenized_answers.append(a_tokens)

total_pairs = len(questions_cleaned)
kept_pairs = len(tokenized_questions)

# Stop here with a clear message rather than dividing by zero below or
# training on an empty dataset later.
if kept_pairs == 0:
    raise ValueError(
        f"No Q&A pairs left to train on ({total_pairs} pairs before filtering). "
        "Check the dataset or increase MAX_LENGTH."
    )

print(f"\nüìä Data statistics:")
print(f"   Original pairs: {total_pairs}")
print(f"   After filtering: {kept_pairs}")
print(f"   Filtered out: {total_pairs - kept_pairs}")
print(f"   Retention rate: {100 * kept_pairs / total_pairs:.1f}%")

# Check if we're losing too much data (more than 20% of the pairs)
if kept_pairs < total_pairs * 0.8:
    print(f"\n‚ö†Ô∏è WARNING: Losing {total_pairs - kept_pairs} pairs!")
    print(f"   Consider increasing MAX_LENGTH to retain more data")

# Pad sequences with trailing zeros up to MAX_LENGTH; id 0 is treated as
# padding by the mask helpers and the loss
tokenized_questions = tf.keras.preprocessing.sequence.pad_sequences(
    tokenized_questions, maxlen=MAX_LENGTH, padding='post'
)
tokenized_answers = tf.keras.preprocessing.sequence.pad_sequences(
    tokenized_answers, maxlen=MAX_LENGTH, padding='post'
)

print(f"\n‚úÖ Data prepared!")
print(f"   Questions shape: {tokenized_questions.shape}")
print(f"   Answers shape: {tokenized_answers.shape}")
print(f"   Training on {len(tokenized_questions)} Q&A pairs")

In [None]:
# Create TensorFlow dataset
BATCH_SIZE = 32
BUFFER_SIZE = len(tokenized_questions)  # shuffle buffer covers the whole dataset

# Teacher forcing: the decoder input drops the last position of each answer
# and the target drops the first, so position t of the decoder input is
# trained to predict answer position t + 1.
dataset = tf.data.Dataset.from_tensor_slices((
    {
        'inputs': tokenized_questions,
        'dec_inputs': tokenized_answers[:, :-1]  # Decoder input (shifted)
    },
    tokenized_answers[:, 1:]  # Target (shifted)
))

dataset = dataset.cache()
dataset = dataset.shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE)
dataset = dataset.prefetch(tf.data.AUTOTUNE)

print(f"‚úÖ TensorFlow dataset created!")
print(f"   Batch size: {BATCH_SIZE} (optimized for high accuracy)")
# Ask the pipeline for its batch count instead of materialising every batch
# in memory just to count them.
print(f"   Total batches: {int(dataset.cardinality())}")

## 🏗️ Step 6: Build Transformer Model

In [None]:
# Transformer model components

def scaled_dot_product_attention(query, key, value, mask):
    """Return softmax(Q K^T / sqrt(depth)) V.

    ``depth`` is the size of the last axis of ``key``. When ``mask`` is not
    None, ``mask * -1e9`` is added to the scores before the softmax, so
    positions where the mask is 1 receive (almost) zero weight.
    """
    key_depth = tf.cast(tf.shape(key)[-1], dtype=tf.float32)
    scores = tf.matmul(query, key, transpose_b=True) / tf.math.sqrt(key_depth)

    if mask is not None:
        scores += (mask * -1e9)

    weights = tf.nn.softmax(scores, axis=-1)
    return tf.matmul(weights, value)


class MultiHeadAttentionLayer(tf.keras.layers.Layer):
    """Multi-head attention over a dict of query/key/value/mask tensors.

    Query, key and value are each projected to ``d_model`` features, split
    into ``num_heads`` heads of ``d_model // num_heads`` features, passed
    through ``scaled_dot_product_attention``, merged back to ``d_model``
    features and sent through a final dense projection.

    Args:
        num_heads: Number of attention heads.
        d_model: Feature size of the projections; must be divisible by
            ``num_heads``.

    Raises:
        ValueError: If ``d_model`` is not divisible by ``num_heads``.
    """

    def __init__(self, num_heads, d_model, **kwargs):
        super().__init__(**kwargs)
        # Floor division would otherwise silently yield a per-head depth that
        # no longer matches the reshapes in split_heads() and call().
        if d_model % num_heads != 0:
            raise ValueError(
                f"d_model ({d_model}) must be divisible by num_heads ({num_heads})"
            )
        self.num_heads = num_heads
        self.d_model = d_model
        self.depth = d_model // num_heads

        self.query_dense = tf.keras.layers.Dense(d_model)
        self.key_dense = tf.keras.layers.Dense(d_model)
        self.value_dense = tf.keras.layers.Dense(d_model)
        self.dense = tf.keras.layers.Dense(d_model)

    def get_config(self):
        """Return the layer config including the constructor arguments."""
        config = super().get_config()
        config.update({"num_heads": self.num_heads, "d_model": self.d_model})
        return config

    def split_heads(self, inputs, batch_size):
        """Reshape to (batch, seq, heads, depth), then move heads before seq."""
        inputs = tf.reshape(inputs, (batch_size, -1, self.num_heads, self.depth))
        return tf.transpose(inputs, perm=[0, 2, 1, 3])

    def call(self, inputs):
        """Apply attention to ``inputs`` (keys: 'query', 'key', 'value', 'mask')."""
        query, key, value, mask = inputs['query'], inputs['key'], inputs['value'], inputs['mask']
        batch_size = tf.shape(query)[0]

        # Project, then split the feature axis into heads
        query = self.split_heads(self.query_dense(query), batch_size)
        key = self.split_heads(self.key_dense(key), batch_size)
        value = self.split_heads(self.value_dense(value), batch_size)

        scaled_attention = scaled_dot_product_attention(query, key, value, mask)
        # Undo the head split: heads back next to depth, then merge into d_model
        scaled_attention = tf.transpose(scaled_attention, perm=[0, 2, 1, 3])
        concat_attention = tf.reshape(scaled_attention, (batch_size, -1, self.d_model))

        return self.dense(concat_attention)


class PositionalEncoding(tf.keras.layers.Layer):
    """Add a fixed sinusoidal position signal to its input.

    A table with ``position`` rows and ``d_model`` columns is built once in
    the constructor. ``call`` adds the first ``seq_len`` rows to the input,
    where ``seq_len`` is the size of the input's second axis.
    """

    def __init__(self, position, d_model, **kwargs):
        super().__init__(**kwargs)
        self.position = position
        self.d_model = d_model
        self.pos_encoding = self.positional_encoding(position, d_model)

    def get_config(self):
        """Return the layer config including the constructor arguments."""
        base_config = super().get_config()
        base_config.update({"position": self.position, "d_model": self.d_model})
        return base_config

    def positional_encoding(self, position, d_model):
        """Build the table with a leading axis of size 1.

        The sines of the even-indexed angle columns are concatenated with
        the cosines of the odd-indexed ones (the halves are not interleaved).
        """
        row_positions = tf.cast(tf.range(position)[:, tf.newaxis], tf.float32)
        column_indices = tf.cast(tf.range(d_model)[tf.newaxis, :], tf.float32)
        angle_rads = self.get_angles(
            row_positions, column_indices, tf.cast(d_model, tf.float32)
        )
        table = tf.concat(
            [tf.math.sin(angle_rads[:, 0::2]), tf.math.cos(angle_rads[:, 1::2])],
            axis=-1,
        )
        return table[tf.newaxis, ...]

    def get_angles(self, position, i, d_model):
        """Return position / 10000^(2 * (i // 2) / d_model)."""
        rates = 1 / tf.pow(10000.0, (2 * (i // 2)) / d_model)
        return position * rates

    def call(self, inputs):
        """Add the table rows for the input's sequence length."""
        seq_len = tf.shape(inputs)[1]
        return inputs + self.pos_encoding[:, :seq_len, :]

    def compute_output_shape(self, input_shape):
        """The layer does not change the shape of its input."""
        return input_shape


def create_padding_mask(x):
    """Return a float mask that is 1.0 where ``x`` equals 0, with two axes of size 1 inserted after the first."""
    is_padding = tf.math.equal(x, 0)
    return tf.cast(is_padding, tf.float32)[:, tf.newaxis, tf.newaxis, :]


def create_look_ahead_mask(x):
    """Combine a future-position mask with the padding mask of ``x``.

    A position is masked (value 1) when it lies after the query position or
    when the corresponding entry of ``x`` is 0.
    """
    length = tf.shape(x)[1]
    # Lower triangle (diagonal included) marks the positions that may be seen
    visible = tf.linalg.band_part(tf.ones((length, length)), -1, 0)
    future_mask = 1 - visible
    return tf.maximum(future_mask, create_padding_mask(x))


print("‚úÖ Transformer components defined!")

In [None]:
# Build encoder layer
def encoder_layer(d_model, num_heads, num_units, dropout, name='encoder_layer'):
    """Build one encoder block as a Keras model.

    The block is self-attention followed by a two-layer feed-forward
    network. Each sub-layer is followed by dropout, a residual connection
    and layer normalization. Inputs are ``[inputs, padding_mask]``.
    """
    inputs = tf.keras.Input(shape=(None, d_model), name='inputs')
    padding_mask = tf.keras.Input(shape=(1, 1, None), name='padding_mask')

    # Self-attention sub-layer
    self_attention = MultiHeadAttentionLayer(num_heads, d_model, name='attention')(
        dict(query=inputs, key=inputs, value=inputs, mask=padding_mask)
    )
    self_attention = tf.keras.layers.Dropout(dropout)(self_attention)
    attended = tf.keras.layers.LayerNormalization(epsilon=1e-6)(inputs + self_attention)

    # Feed-forward sub-layer
    hidden = tf.keras.layers.Dense(num_units, activation='relu')(attended)
    projected = tf.keras.layers.Dense(d_model)(hidden)
    projected = tf.keras.layers.Dropout(dropout)(projected)
    outputs = tf.keras.layers.LayerNormalization(epsilon=1e-6)(attended + projected)

    return tf.keras.Model(inputs=[inputs, padding_mask], outputs=outputs, name=name)


# Build encoder
def encoder(vocab_size, num_layers, d_model, num_heads, num_units, dropout, name='encoder'):
    """Build the encoder stack as a Keras model.

    Token ids are embedded, scaled by sqrt(d_model), given positional
    encodings, passed through dropout and then through ``num_layers``
    encoder blocks. Inputs are ``[inputs, padding_mask]``.
    """
    inputs = tf.keras.Input(shape=(None,), name='inputs')
    padding_mask = tf.keras.Input(shape=(1, 1, None), name='padding_mask')

    # Use sparse=False to ensure dense output
    token_vectors = tf.keras.layers.Embedding(vocab_size, d_model, sparse=False)(inputs)
    token_vectors = token_vectors * tf.math.sqrt(tf.cast(d_model, tf.float32))
    # The positional table is built with vocab_size rows
    token_vectors = PositionalEncoding(vocab_size, d_model)(token_vectors)
    outputs = tf.keras.layers.Dropout(dropout)(token_vectors)

    for layer_index in range(num_layers):
        block = encoder_layer(
            d_model, num_heads, num_units, dropout, f'encoder_layer_{layer_index}'
        )
        outputs = block([outputs, padding_mask])

    return tf.keras.Model(inputs=[inputs, padding_mask], outputs=outputs, name=name)


# Build decoder layer
def decoder_layer(d_model, num_heads, num_units, dropout, name='decoder_layer'):
    """Build one decoder block as a Keras model.

    The block is masked self-attention, attention over the encoder outputs,
    and a two-layer feed-forward network, each followed by a residual
    connection and layer normalization. Dropout is applied after the
    encoder attention and after the feed-forward network. Inputs are
    ``[inputs, encoder_outputs, look_ahead_mask, padding_mask]``.
    """
    inputs = tf.keras.Input(shape=(None, d_model), name='inputs')
    enc_outputs = tf.keras.Input(shape=(None, d_model), name='encoder_outputs')
    look_ahead_mask = tf.keras.Input(shape=(1, None, None), name='look_ahead_mask')
    padding_mask = tf.keras.Input(shape=(1, 1, None), name='padding_mask')

    # Masked self-attention over the decoder's own sequence
    self_attention = MultiHeadAttentionLayer(num_heads, d_model, name='attention_1')(
        dict(query=inputs, key=inputs, value=inputs, mask=look_ahead_mask)
    )
    self_attended = tf.keras.layers.LayerNormalization(epsilon=1e-6)(self_attention + inputs)

    # Attention over the encoder outputs, queried by the decoder state
    cross_attention = MultiHeadAttentionLayer(num_heads, d_model, name='attention_2')(
        dict(query=self_attended, key=enc_outputs, value=enc_outputs, mask=padding_mask)
    )
    cross_attention = tf.keras.layers.Dropout(dropout)(cross_attention)
    cross_attended = tf.keras.layers.LayerNormalization(epsilon=1e-6)(
        cross_attention + self_attended
    )

    # Feed-forward sub-layer
    hidden = tf.keras.layers.Dense(num_units, activation='relu')(cross_attended)
    projected = tf.keras.layers.Dense(d_model)(hidden)
    projected = tf.keras.layers.Dropout(dropout)(projected)
    outputs = tf.keras.layers.LayerNormalization(epsilon=1e-6)(projected + cross_attended)

    model_inputs = [inputs, enc_outputs, look_ahead_mask, padding_mask]
    return tf.keras.Model(inputs=model_inputs, outputs=outputs, name=name)


# Build decoder
def decoder(vocab_size, num_layers, d_model, num_heads, num_units, dropout, name='decoder'):
    """Build the decoder stack as a Keras model.

    Token ids are embedded, scaled by sqrt(d_model), given positional
    encodings, passed through dropout and then through ``num_layers``
    decoder blocks. Inputs are
    ``[inputs, encoder_outputs, look_ahead_mask, padding_mask]``.
    """
    inputs = tf.keras.Input(shape=(None,), name='inputs')
    enc_outputs = tf.keras.Input(shape=(None, d_model), name='encoder_outputs')
    look_ahead_mask = tf.keras.Input(shape=(1, None, None), name='look_ahead_mask')
    padding_mask = tf.keras.Input(shape=(1, 1, None), name='padding_mask')

    # Use sparse=False to ensure dense output
    token_vectors = tf.keras.layers.Embedding(vocab_size, d_model, sparse=False)(inputs)
    token_vectors = token_vectors * tf.math.sqrt(tf.cast(d_model, tf.float32))
    # The positional table is built with vocab_size rows
    token_vectors = PositionalEncoding(vocab_size, d_model)(token_vectors)
    outputs = tf.keras.layers.Dropout(dropout)(token_vectors)

    for layer_index in range(num_layers):
        block = decoder_layer(
            d_model, num_heads, num_units, dropout, f'decoder_layer_{layer_index}'
        )
        outputs = block([outputs, enc_outputs, look_ahead_mask, padding_mask])

    model_inputs = [inputs, enc_outputs, look_ahead_mask, padding_mask]
    return tf.keras.Model(inputs=model_inputs, outputs=outputs, name=name)


print("‚úÖ Encoder and decoder builders defined!")

In [None]:
# Build complete transformer
def transformer(vocab_size, num_layers, d_model, num_heads, num_units, dropout, name='transformer'):
    """Assemble the encoder-decoder model.

    The model takes ``[inputs, dec_inputs]`` (token ids) and returns one
    row of ``vocab_size`` logits per decoder position. All masks are
    derived inside the model from the token ids.
    """
    inputs = tf.keras.Input(shape=(None,), name='inputs')
    dec_inputs = tf.keras.Input(shape=(None,), name='dec_inputs')

    # Padding mask for encoder self-attention
    enc_padding_mask = tf.keras.layers.Lambda(
        create_padding_mask, output_shape=(1, 1, None), name='enc_padding_mask'
    )(inputs)

    # Future-position and padding mask for decoder self-attention
    look_ahead_mask = tf.keras.layers.Lambda(
        create_look_ahead_mask, output_shape=(1, None, None), name='look_ahead_mask'
    )(dec_inputs)

    # Padding mask of the encoder tokens for the decoder's attention over them
    dec_padding_mask = tf.keras.layers.Lambda(
        create_padding_mask, output_shape=(1, 1, None), name='dec_padding_mask'
    )(inputs)

    encoder_model = encoder(vocab_size, num_layers, d_model, num_heads, num_units, dropout)
    enc_outputs = encoder_model([inputs, enc_padding_mask])

    decoder_model = decoder(vocab_size, num_layers, d_model, num_heads, num_units, dropout)
    dec_outputs = decoder_model([dec_inputs, enc_outputs, look_ahead_mask, dec_padding_mask])

    logits = tf.keras.layers.Dense(vocab_size, name='outputs')(dec_outputs)

    return tf.keras.Model(inputs=[inputs, dec_inputs], outputs=logits, name=name)


print("‚úÖ Transformer builder defined!")

In [None]:
# Model hyperparameters
NUM_LAYERS = 4      # Number of encoder blocks and of decoder blocks
D_MODEL = 512       # Embedding / hidden feature size
NUM_HEADS = 8       # Attention heads
NUM_UNITS = 1024    # Width of the feed-forward hidden layer in each block
DROPOUT = 0.1       # Dropout rate used throughout the model

# Build model
print("Building HIGH-CAPACITY Soros Transformer model...")
print("Target: 60-70% accuracy\n")

# VOCAB_SIZE already includes the START and END token ids
chatbot_model = transformer(
    vocab_size=VOCAB_SIZE,
    num_layers=NUM_LAYERS,
    d_model=D_MODEL,
    num_heads=NUM_HEADS,
    num_units=NUM_UNITS,
    dropout=DROPOUT,
    name='soros_transformer'
)

print("‚úÖ High-capacity model built!\n")
print(f"Architecture (optimized for 60-70% accuracy):")
print(f"  - {NUM_LAYERS} encoder/decoder layers (deep network)")
print(f"  - {D_MODEL}-dimensional embeddings (rich representations)")
print(f"  - {NUM_HEADS} attention heads")
print(f"  - {NUM_UNITS} FFN units")
print(f"  - {DROPOUT} dropout rate (balanced)\n")

# Print the layer-by-layer summary with parameter counts
chatbot_model.summary()

## 🎓 Step 7: Train the Model

**Training time:**
- CPU: 6-8 hours ❌
- GPU (Colab): 2-3 hours ✅
- TPU (Colab): 1-2 hours ⚡

**Tip:** Use `Runtime > Change runtime type > GPU` for faster training!

In [None]:
# Custom learning rate schedule
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    """Warm-up then inverse-square-root learning-rate schedule.

    The rate at a given step is
    ``d_model ** -0.5 * min(step ** -0.5, step * warmup_steps ** -1.5)``.
    """

    def __init__(self, d_model, warmup_steps=4000):
        super().__init__()
        self.d_model = d_model
        # Float copy used in the rate computation; the original value is kept for get_config
        self.d_model_float = tf.cast(d_model, tf.float32)
        self.warmup_steps = warmup_steps

    def __call__(self, step):
        step = tf.cast(step, tf.float32)
        decay_term = tf.math.rsqrt(step)
        warmup_term = step * (self.warmup_steps ** -1.5)
        scale = tf.math.rsqrt(self.d_model_float)
        return scale * tf.math.minimum(decay_term, warmup_term)

    def get_config(self):
        """Return the constructor arguments."""
        return dict(d_model=self.d_model, warmup_steps=self.warmup_steps)


# Loss function
# reduction='none' keeps one loss value per token so padding can be masked out
cross_entropy = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

def loss_function(y_true, y_pred):
    """Cross-entropy averaged over the non-padding target tokens.

    Args:
        y_true: Target token ids; reshaped to (-1, MAX_LENGTH - 1). Id 0 is
            treated as padding.
        y_pred: Logits over the vocabulary for each target position.

    Returns:
        Scalar loss: the summed per-token loss of the real tokens divided by
        the number of real tokens.
    """
    y_true = tf.reshape(y_true, shape=(-1, MAX_LENGTH - 1))
    per_token_loss = cross_entropy(y_true, y_pred)
    mask = tf.cast(tf.not_equal(y_true, 0), tf.float32)
    per_token_loss = tf.multiply(per_token_loss, mask)
    # Normalise by the number of real tokens. A plain mean over all positions
    # divides by the padded length, so the reported loss shrinks as padding
    # grows. The floor of 1 avoids 0/0 on an all-padding batch.
    return tf.reduce_sum(per_token_loss) / tf.maximum(tf.reduce_sum(mask), 1.0)


def accuracy(y_true, y_pred):
    """Token-level accuracy over the non-padding target positions.

    Padding positions (target id 0) are excluded. The loss does not train
    the model on them, so counting them would cap the metric at roughly the
    share of real tokens and mislead the checkpoint and early-stopping
    callbacks that monitor it.

    Args:
        y_true: Target token ids; reshaped to (-1, MAX_LENGTH - 1).
        y_pred: Logits over the vocabulary for each target position.

    Returns:
        Scalar fraction of real target tokens whose arg-max prediction matches.
    """
    y_true = tf.cast(tf.reshape(y_true, shape=(-1, MAX_LENGTH - 1)), tf.int64)
    predicted_ids = tf.argmax(y_pred, axis=-1)
    mask = tf.cast(tf.not_equal(y_true, 0), tf.float32)
    matches = tf.cast(tf.equal(y_true, predicted_ids), tf.float32) * mask
    # The floor of 1 avoids 0/0 on an all-padding batch
    return tf.reduce_sum(matches) / tf.maximum(tf.reduce_sum(mask), 1.0)


# Compile model
# The learning rate follows CustomSchedule (default warmup_steps) instead of a constant
learning_rate = CustomSchedule(D_MODEL)
optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98, epsilon=1e-9)

# The metric is registered under its function name, so the training history
# and the callbacks refer to it as 'accuracy'
chatbot_model.compile(
    optimizer=optimizer,
    loss=loss_function,
    metrics=[accuracy]
)

print("‚úÖ Model compiled and ready to train!")

In [None]:
# Training configuration
EPOCHS = 300  # Upper bound; early stopping below may end training sooner

# Callbacks for better training
# Both callbacks watch the training-set 'accuracy' metric (no validation
# data is passed to fit()).
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    'soros_checkpoint.keras',  # Use .keras format (modern)
    save_best_only=True,
    monitor='accuracy',  # Monitor accuracy instead of loss
    mode='max',  # We want to maximize accuracy
    verbose=1
)

early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='accuracy',  # Stop based on accuracy plateau
    patience=25,  # Epochs without improvement before stopping
    restore_best_weights=True,
    min_delta=0.005,  # Improvements smaller than 0.5 points do not count
    mode='max'
)

# Note: ReduceLROnPlateau removed - not compatible with CustomSchedule
# The CustomSchedule already handles learning rate adjustments

# Start training
print(f"\nüöÄ Starting INTENSIVE training for {EPOCHS} epochs...")
print("üéØ Target: 60-70% accuracy")
print("‚è±Ô∏è Expected time: 20-30 minutes on GPU")
print("This will achieve much higher accuracy! ‚òï\n")

history = chatbot_model.fit(
    dataset,
    epochs=EPOCHS,
    callbacks=[checkpoint_callback, early_stopping],
    verbose=1
)

print("\nüéâ Training complete!")
print(f"\nüìä Final metrics:")

# These were two print calls fused on one line, which is a syntax error;
# they are now separate statements. The values come from the last trained epoch.
print(f"   Loss: {history.history['loss'][-1]:.4f}")
print(f"   Epochs trained: {len(history.history['loss'])}")
print(f"   Accuracy: {history.history['accuracy'][-1]:.4f}")

In [None]:
# Plot training history
import matplotlib.pyplot as plt

plt.figure(figsize=(12, 4))

# One panel per tracked metric: (history key, panel title, y-axis label)
panels = [
    ('loss', 'Model Loss', 'Loss'),
    ('accuracy', 'Model Accuracy', 'Accuracy'),
]
for panel_index, (metric_key, panel_title, y_label) in enumerate(panels, start=1):
    plt.subplot(1, 2, panel_index)
    plt.plot(history.history[metric_key])
    plt.title(panel_title)
    plt.ylabel(y_label)
    plt.xlabel('Epoch')
    plt.grid(True)

plt.tight_layout()
plt.show()

# Values of the last trained epoch
print(f"Final Loss: {history.history['loss'][-1]:.4f}")
print(f"Final Accuracy: {history.history['accuracy'][-1]:.4f}")

## 🧪 Step 8: Test the Trained Model

In [None]:
# Inference function
def predict_answer(question, max_length=MAX_LENGTH):
    """Greedily decode an answer for ``question`` with the trained model.

    The question is cleaned and tokenized exactly like the training data.
    Tokens are then generated one at a time, always taking the most likely
    next token, until the END token is produced or ``max_length`` steps have
    run. START/END ids are removed before the ids are decoded to text.
    """
    cleaned_question = preprocess_sentence(question)

    # Encoder input with a batch axis; the decoder starts from START alone
    encoder_ids = START_TOKEN + tokenizer.encode(cleaned_question) + END_TOKEN
    sentence = tf.expand_dims(encoder_ids, 0)
    output = tf.expand_dims(START_TOKEN, 0)

    for _ in range(max_length):
        logits = chatbot_model(inputs=[sentence, output], training=False)
        # Only the logits of the last decoder position matter for the next token
        last_step_logits = logits[:, -1:, :]
        next_id = tf.cast(tf.argmax(last_step_logits, axis=-1), tf.int32)

        if tf.equal(next_id, END_TOKEN[0]):
            break

        output = tf.concat([output, next_id], axis=-1)

    # Ids at or above the tokenizer's vocab size are the special tokens
    generated_ids = tf.squeeze(output, axis=0).numpy()
    return tokenizer.decode(
        [i for i in generated_ids if i < tokenizer.vocab_size]
    )


print("‚úÖ Inference function ready!")

In [None]:
# Test with sample questions
# Fixed set of prompts used as a quick smoke test of the trained model
test_questions = [
    "What is George Soros' investment philosophy?",
    "How does reflexivity work in markets?",
    "What is the theory of reflexivity?",
    "Tell me about short selling.",
    "What are Soros' views on risk management?"
]

print("üß™ Testing the model:\n")
print("="*80)

# Generate and print an answer for each prompt, separated by a rule
for question in test_questions:
    answer = predict_answer(question)
    print(f"\n‚ùì Q: {question}")
    print(f"üí° A: {answer}")
    print("-"*80)

In [None]:
# Interactive testing
print("\nüéÆ Interactive Mode - Ask anything about George Soros!")
print("Type 'quit' to exit\n")

# Read questions until the user types one of the exit words
while True:
    user_input = input("\n‚ùì Your question: ").strip()

    if user_input.lower() in ('quit', 'exit', 'q'):
        print("üëã Goodbye!")
        break

    # Blank input just prompts again
    if user_input:
        answer = predict_answer(user_input)
        print(f"üí° Answer: {answer}")

## 💾 Step 9: Save the Trained Model

**Important:** You'll download these files to use in your Streamlit app!

In [None]:
# Save complete model
# The model is written twice: once as a SavedModel directory and once as a single .h5 file
MODEL_DIR = 'soros_transformer_model'

print("üíæ Saving model...\n")

# Save as SavedModel format (recommended)
chatbot_model.save(MODEL_DIR, save_format='tf')
print(f"‚úÖ Model saved to '{MODEL_DIR}/' folder")

# Also save as H5 format (backup)
chatbot_model.save('soros_transformer_model.h5')
print(f"‚úÖ Model also saved as 'soros_transformer_model.h5'")

In [None]:
# Save training configuration
# Records the hyperparameters, special token ids and last-epoch metrics so the
# model can be rebuilt and used outside this notebook. Values are converted to
# plain int/float so they are JSON-serialisable.
config = {
    'vocab_size': int(VOCAB_SIZE),
    'num_layers': NUM_LAYERS,
    'd_model': D_MODEL,
    'num_heads': NUM_HEADS,
    'num_units': NUM_UNITS,
    'dropout': DROPOUT,
    'max_length': MAX_LENGTH,
    'start_token': int(START_TOKEN[0]),
    'end_token': int(END_TOKEN[0]),
    'epochs_trained': len(history.history['loss']),
    'final_loss': float(history.history['loss'][-1]),
    'final_accuracy': float(history.history['accuracy'][-1])
}

with open('training_config.json', 'w') as f:
    json.dump(config, f, indent=2)

print("‚úÖ Configuration saved to 'training_config.json'")
print("\nüìã Model Configuration:")
print(json.dumps(config, indent=2))

## 📥 Step 10: Download Files for Your Streamlit App

**Download these files and place them in your project:**

1. **`soros_transformer_model/`** (entire folder) → Copy to `/transformer_model/` in your project
2. **`soros_tokenizer.subwords`** → Copy to project root
3. **`soros_tokenizer.subwords.txt`** (if exists) → Copy to project root
4. **`training_config.json`** → Copy to project root
5. **`soros_transformer_model.h5`** (backup) → Copy to project root

In [None]:
# Create a zip file for easier download
# Bundles the SavedModel folder, the tokenizer file(s), the config JSON and the .h5 backup
!zip -r soros_transformer_complete.zip soros_transformer_model/ soros_tokenizer.subwords* training_config.json soros_transformer_model.h5

print("\nüì¶ Created 'soros_transformer_complete.zip'")
print("\n‚¨áÔ∏è Download this file and extract it in your project folder!")

In [None]:
# Download the zip file
# Imported again here so this cell also works when run on its own
from google.colab import files

print("üì• Downloading complete package...\n")
files.download('soros_transformer_complete.zip')

# Closing summary: next steps and the contents of the downloaded archive
print("\n" + "="*80)
print("üéâ SUCCESS! Your Soros Transformer model is trained!")
print("="*80)
print("\nüìã Next steps:")
print("1. Extract 'soros_transformer_complete.zip' in your project folder")
print("2. The backend code will load these files automatically")
print("3. Toggle between Groq API and Custom Transformer in the UI")
print("\nüí° Files you downloaded:")
print("   - soros_transformer_model/ (TensorFlow SavedModel)")
print("   - soros_tokenizer.subwords (Vocabulary)")
print("   - training_config.json (Model settings)")
print("   - soros_transformer_model.h5 (Backup format)")
print("\nüöÄ Ready to build the Streamlit app!")

---

## 📊 Training Summary

Your model has been successfully trained on George Soros Q&A dataset!

**Model Architecture:**
- Transformer with encoder-decoder
- Multi-head attention mechanism
- Positional encoding

**Performance:**
- Check the plots above for loss and accuracy
- Test responses should be relevant to Soros' investment philosophy

**Next:** Build the Streamlit UI that lets users toggle between:
- **Groq API** (Fast, requires internet, uses Pinecone RAG)
- **Custom Transformer** (Your trained model, works offline)

---