###**Libraries**

In [1]:
import tensorflow as tf
from tensorflow.keras.layers import Layer, Embedding, Dense, LayerNormalization, Dropout,Input
from tensorflow.keras import Model
import json
from transformers import GPT2Tokenizer


###**Dataset Preparation**

In [2]:


# --- LOAD YOUR JSON DATA ---
file_path = 'physiotherapy_qa.json'
with open(file_path, 'r') as f:
    # This loads the data as a list of lists, e.g., [['question1', 'answer1'], ...]
    qa_pairs = json.load(f)

# --- FORMAT THE DATA ---
# We'll format each Q&A pair into a single string.
# The model learns to recognize this structure.
formatted_texts = []
for q, a in qa_pairs:
    # <|startoftext|> and <|endoftext|> are special tokens the model recognizes.
    formatted_texts.append(f"<|startoftext|>Question: {q}\nAnswer: {a}<|endoftext|>")

print(f"Loaded and formatted {len(formatted_texts)} Q&A pairs.")
print("Example:\n", formatted_texts[0])

Loaded and formatted 330 Q&A pairs.
Example:
 <|startoftext|>Question: What is spinal stenosis?
Answer: Spinal stenosis is the narrowing of the spinal canal, which can put pressure on the spinal cord and nerves, causing pain, numbness, or weakness, often in the legs and back.<|endoftext|>


###**Model Architecture**

In [3]:
class MultiHeadSelfAttention(tf.keras.layers.Layer):
    def __init__(self, embed_dim, num_heads):
        super().__init__()
        self.num_heads = num_heads
        self.embed_dim = embed_dim
        self.depth = embed_dim // num_heads
        self.wq = tf.keras.layers.Dense(embed_dim)
        self.wk = tf.keras.layers.Dense(embed_dim)
        self.wv = tf.keras.layers.Dense(embed_dim)
        self.dense = tf.keras.layers.Dense(embed_dim)
    def split_heads(self, x, batch_size):
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
        return tf.transpose(x, perm=[0, 2, 1, 3])
    def call(self, v, k, q, mask):
        batch_size = tf.shape(q)[0]
        q = self.split_heads(self.wq(q), batch_size)
        k = self.split_heads(self.wk(k), batch_size)
        v = self.split_heads(self.wv(v), batch_size)
        matmul_qk = tf.matmul(q, k, transpose_b=True)
        dk = tf.cast(tf.shape(k)[-1], tf.float32)
        scaled_attention_logits = matmul_qk / tf.math.sqrt(dk)
        if mask is not None:
            scaled_attention_logits += (mask * -1e9)
        attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1)
        output = tf.matmul(attention_weights, v)
        output = tf.transpose(output, perm=[0, 2, 1, 3])
        concat_attention = tf.reshape(output, (batch_size, -1, self.embed_dim))
        return self.dense(concat_attention)

class FeedForwardNetwork(tf.keras.layers.Layer):
    def __init__(self, embed_dim, dff):
        super().__init__()
        self.dense1 = tf.keras.layers.Dense(dff, activation='gelu')
        self.dense2 = tf.keras.layers.Dense(embed_dim)
    def call(self, x):
        return self.dense2(self.dense1(x))

class TransformerBlock(tf.keras.layers.Layer):
    def __init__(self, embed_dim, num_heads, dff, dropout_rate=0.1):
        super().__init__()
        self.att = MultiHeadSelfAttention(embed_dim, num_heads)
        self.ffn = FeedForwardNetwork(embed_dim, dff)
        self.norm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.norm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = tf.keras.layers.Dropout(dropout_rate)
        self.dropout2 = tf.keras.layers.Dropout(dropout_rate)
    def call(self, x, mask=None):
        attn_output = self.att(x, x, x, mask)
        attn_output = self.dropout1(attn_output)
        out1 = self.norm1(x + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output)
        return self.norm2(out1 + ffn_output)

class GPT2(Model):
    def __init__(self, vocab_size, max_length, embed_dim=768, num_heads=12, dff=3072, num_layers=12, dropout_rate=0.1):
        super().__init__()
        self.token_emb = tf.keras.layers.Embedding(vocab_size, embed_dim)
        self.pos_emb = tf.keras.layers.Embedding(max_length, embed_dim)
        self.transformer_blocks = [TransformerBlock(embed_dim, num_heads, dff, dropout_rate) for _ in range(num_layers)]
        self.norm = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.out = tf.keras.layers.Dense(vocab_size)
    def create_causal_mask(self, seq_len):
        mask = tf.linalg.band_part(tf.ones((seq_len, seq_len)), -1, 0)
        return 1 - mask
    def call(self, x):
        seq_len = tf.shape(x)[1]
        mask = self.create_causal_mask(seq_len)
        token_embeddings = self.token_emb(x)
        position_ids = tf.range(start=0, limit=seq_len, delta=1)
        position_embeddings = self.pos_emb(position_ids)
        x = token_embeddings + position_embeddings
        for transformer in self.transformer_blocks:
            x = transformer(x, mask)
        x = self.norm(x)
        return self.out(x)

###**Model training**

In [4]:

# 1. LOAD DATA
with open('physiotherapy_qa.json', 'r') as f:
    qa_pairs = json.load(f)
formatted_texts = [f"<|startoftext|>Question: {q}\nAnswer: {a}<|endoftext|>" for q, a in qa_pairs]

# 2. PREPARE TOKENIZER AND DATA
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
special_tokens_dict = {'bos_token': '<|startoftext|>', 'eos_token': '<|endoftext|>', 'pad_token': '<|pad|>'}
tokenizer.add_special_tokens(special_tokens_dict)

TOKENIZER_VOCAB_SIZE = len(tokenizer)
# This is the total length we pad sequences to.
MAX_SEQ_LENGTH = 1024

tokens = tokenizer(
    formatted_texts, max_length=MAX_SEQ_LENGTH, padding="max_length", truncation=True, return_tensors="tf"
)['input_ids']

# Create input x (shape: [batch, 1023]) and label y (shape: [batch, 1023])
x = tokens[:, :-1]
y = tokens[:, 1:]
dataset = tf.data.Dataset.from_tensor_slices((x, y)).shuffle(1000).batch(4)

# --- 3. BUILD THE MODEL WITH THE CORRECT SHAPE ---
# The model's input shape must match the shape of x, which is 1023.
MODEL_INPUT_SHAPE = MAX_SEQ_LENGTH - 1

inputs = Input(shape=(MODEL_INPUT_SHAPE,), dtype=tf.int32)
# Also pass this correct length to the GPT2 class for the positional embeddings
outputs = GPT2(vocab_size=TOKENIZER_VOCAB_SIZE, max_length=MODEL_INPUT_SHAPE)(inputs)
gpt2 = Model(inputs, outputs)

# --- 4. COMPILE AND TRAIN ---
optimizer = tf.keras.optimizers.AdamW(learning_rate=5e-5)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

gpt2.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])
gpt2.summary()

print("Starting training...")
gpt2.fit(dataset, epochs=5)
print("Training finished.")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

TensorFlow and JAX classes are deprecated and will be removed in Transformers v5. We recommend migrating to PyTorch classes or pinning your version of Transformers.


Starting training...
Epoch 1/5
[1m83/83[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m225s[0m 2s/step - accuracy: 0.8904 - loss: 6.1910
Epoch 2/5
[1m83/83[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m105s[0m 1s/step - accuracy: 0.9470 - loss: 1.2683
Epoch 3/5
[1m83/83[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m105s[0m 1s/step - accuracy: 0.9465 - loss: 0.5454
Epoch 4/5
[1m83/83[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m105s[0m 1s/step - accuracy: 0.9473 - loss: 0.4217
Epoch 5/5
[1m83/83[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m105s[0m 1s/step - accuracy: 0.9473 - loss: 0.3880
Training finished.


###**Save the Model**

In [5]:
# This line at the end of the training script creates the file.
gpt2.save_weights('physio_gpt2.weights.h5')

###**Model** **Inference**

In [None]:
class MultiHeadSelfAttention(Layer):
    def __init__(self, embed_dim, num_heads):
        super().__init__()
        self.num_heads = num_heads
        self.embed_dim = embed_dim
        self.depth = embed_dim // num_heads
        self.wq = Dense(embed_dim)
        self.wk = Dense(embed_dim)
        self.wv = Dense(embed_dim)
        self.dense = Dense(embed_dim)
    def split_heads(self, x, batch_size):
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
        return tf.transpose(x, perm=[0, 2, 1, 3])
    def call(self, v, k, q, mask):
        batch_size = tf.shape(q)[0]
        q = self.split_heads(self.wq(q), batch_size)
        k = self.split_heads(self.wk(k), batch_size)
        v = self.split_heads(self.wv(v), batch_size)
        matmul_qk = tf.matmul(q, k, transpose_b=True)
        dk = tf.cast(tf.shape(k)[-1], tf.float32)
        scaled_attention_logits = matmul_qk / tf.math.sqrt(dk)
        if mask is not None:
            scaled_attention_logits += (mask * -1e9)
        attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1)
        output = tf.matmul(attention_weights, v)
        output = tf.transpose(output, perm=[0, 2, 1, 3])
        concat_attention = tf.reshape(output, (batch_size, -1, self.embed_dim))
        return self.dense(concat_attention)

class FeedForwardNetwork(Layer):
    def __init__(self, embed_dim, dff):
        super().__init__()
        self.dense1 = Dense(dff, activation='gelu')
        self.dense2 = Dense(embed_dim)
    def call(self, x):
        return self.dense2(self.dense1(x))

class TransformerBlock(Layer):
    def __init__(self, embed_dim, num_heads, dff, dropout_rate=0.1):
        super().__init__()
        self.att = MultiHeadSelfAttention(embed_dim, num_heads)
        self.ffn = FeedForwardNetwork(embed_dim, dff)
        self.norm1 = LayerNormalization(epsilon=1e-6)
        self.norm2 = LayerNormalization(epsilon=1e-6)
        self.dropout1 = Dropout(dropout_rate)
        self.dropout2 = Dropout(dropout_rate)
    def call(self, x, mask=None):
        attn_output = self.att(x, x, x, mask)
        attn_output = self.dropout1(attn_output)
        out1 = self.norm1(x + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output)
        return self.norm2(out1 + ffn_output)

class GPT2(Model):
    def __init__(self, vocab_size, max_length, embed_dim=768, num_heads=12, dff=3072, num_layers=12, dropout_rate=0.1):
        super().__init__()
        self.token_emb = Embedding(vocab_size, embed_dim)
        self.pos_emb = Embedding(max_length, embed_dim)
        self.transformer_blocks = [TransformerBlock(embed_dim, num_heads, dff, dropout_rate) for _ in range(num_layers)]
        self.norm = LayerNormalization(epsilon=1e-6)
        self.out = Dense(vocab_size)
    def create_causal_mask(self, seq_len):
        mask = tf.linalg.band_part(tf.ones((seq_len, seq_len)), -1, 0)
        return 1 - mask
    def call(self, x):
        seq_len = tf.shape(x)[1]
        mask = self.create_causal_mask(seq_len)
        token_embeddings = self.token_emb(x)
        position_ids = tf.range(start=0, limit=seq_len, delta=1)
        position_embeddings = self.pos_emb(position_ids)
        x = token_embeddings + position_embeddings
        for transformer in self.transformer_blocks:
            x = transformer(x, mask)
        x = self.norm(x)
        return self.out(x)
# -----------------------------------------------

# 1. LOAD TOKENIZER AND MODEL
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
special_tokens_dict = {'bos_token': '<|startoftext|>', 'eos_token': '<|endoftext|>', 'pad_token': '<|pad|>'}
tokenizer.add_special_tokens(special_tokens_dict)

TOKENIZER_VOCAB_SIZE = len(tokenizer)
MODEL_MAX_LENGTH = 1023

inputs = Input(shape=(None,), dtype=tf.int32)
outputs = GPT2(vocab_size=TOKENIZER_VOCAB_SIZE, max_length=MODEL_MAX_LENGTH)(inputs)
inference_model = Model(inputs, outputs)

inference_model.load_weights('physio_gpt2.weights.h5')
print("Model and weights loaded successfully.")

# 2. DEFINE PROMPT AND GENERATE
prompt = "<|startoftext|>Question: Can physiotherapy help with chronic pain?"
input_ids = tokenizer.encode(prompt, return_tensors='tf')

print("Generating response...")
max_new_tokens = 150
generated_ids = input_ids

for i in range(max_new_tokens):
    predictions = inference_model(generated_ids)
    next_token_logits = predictions[:, -1, :]
    next_token_id = tf.argmax(next_token_logits, axis=-1)

    # --- THIS IS THE CORRECTED LINE ---
    # Reshape the next_token_id to a 2D tensor of shape [1, 1] before concatenating.
    next_token_id_2d = tf.reshape(next_token_id, [1, 1])
    generated_ids = tf.concat([generated_ids, tf.cast(next_token_id_2d, tf.int32)], axis=-1)

    if next_token_id == tokenizer.eos_token_id:
        print(f"End-of-text token reached after {i+1} iterations.")
        break

# 3. DECODE THE OUTPUT TO TEXT
generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=False)

print("\n--- Full Generated Text ---\n")
print(generated_text)

In [None]:
import json
import tensorflow as tf
# This is the key change: we will create the optimizer from the transformers library
from transformers import GPT2Tokenizer, TFGPT2LMHeadModel, create_optimizer

# --- 1. LOAD PRE-TRAINED MODEL AND TOKENIZER ---
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = TFGPT2LMHeadModel.from_pretrained('gpt2', from_pt=True) # Using from_pt=True for robustness

special_tokens_dict = {'bos_token': '<|startoftext|>', 'eos_token': '<|endoftext|>', 'pad_token': '<|pad|>'}
tokenizer.add_special_tokens(special_tokens_dict)

model.resize_token_embeddings(len(tokenizer))

print("Pre-trained model and tokenizer loaded.")

# --- 2. LOAD AND PREPARE YOUR DATA ---
with open('physiotherapy_qa.json', 'r') as f:
    qa_pairs = json.load(f)

formatted_texts = [f"<|startoftext|>Question: {q}\nAnswer: {a}<|endoftext|>" for q, a in qa_pairs]

max_length = 512
tokens = tokenizer(
    formatted_texts, max_length=max_length, padding="max_length", truncation=True, return_tensors="tf"
)['input_ids']

dataset = tf.data.Dataset.from_tensor_slices((tokens, tokens)).shuffle(1000).batch(2)

# --- 3. COMPILE AND FINE-TUNE THE MODEL ---

# Define training parameters
num_epochs = 3
num_train_steps = len(dataset) * num_epochs
learning_rate = 5e-5

# Use the recommended create_optimizer function from transformers
optimizer, schedule = create_optimizer(
    init_lr=learning_rate,
    num_warmup_steps=0,
    num_train_steps=num_train_steps,
    weight_decay_rate=0.01,
)

# The model has its own internal loss function, so you only need to provide the optimizer.
model.compile(optimizer=optimizer)

print("Starting fine-tuning...")
model.fit(dataset, epochs=num_epochs)
model.save_pretrained('./fine-tuned-physio-gpt2')
tokenizer.save_pretrained('./fine-tuned-physio-gpt2')
print("Fine-tuning complete and model saved.")

# --- 4. INFERENCE (This part remains the same) ---
tuned_model = TFGPT2LMHeadModel.from_pretrained('./fine-tuned-physio-gpt2')
tuned_tokenizer = GPT2Tokenizer.from_pretrained('./fine-tuned-physio-gpt2')

prompt = "<|startoftext|>Question: Can physiotherapy help with chronic pain?"
input_ids = tuned_tokenizer.encode(prompt, return_tensors='tf')

output_ids = tuned_model.generate(
    input_ids,
    max_length=150,
    num_beams=5,
    no_repeat_ngram_size=2,
    early_stopping=True
)

generated_text = tuned_tokenizer.decode(output_ids[0], skip_special_tokens=True)

print("\n--- Generated Response from Fine-Tuned Model ---\n")
print(generated_text)

pytorch_model.bin:   0%|          | 0.00/548M [00:00<?, ?B/s]

TensorFlow and JAX classes are deprecated and will be removed in Transformers v5. We recommend migrating to PyTorch classes or pinning your version of Transformers.
All PyTorch model weights were used when initializing TFGPT2LMHeadModel.

All the weights of TFGPT2LMHeadModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


Pre-trained model and tokenizer loaded.
Starting fine-tuning...
Epoch 1/3
Epoch 2/3
Epoch 3/3

In [None]:
def gpt_answers():
  while True:
    pr=input('ask me question')
    input_ids = tuned_tokenizer.encode(pr, return_tensors='tf')

    output_ids = tuned_model.generate(
    input_ids,
    max_length=150,
    num_beams=5,
    no_repeat_ngram_size=2,
    early_stopping=True)

    generated_text = tuned_tokenizer.decode(output_ids[0], skip_special_tokens=True)

    print("\n--- Generated Response from Fine-Tuned Model ---\n")
    print(generated_text)


In [10]:
gpt_answers()


ask me questionwhat is active recovery


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.



--- Generated Response from Fine-Tuned Model ---

what is active recovery?

Answer: Active recovery is a process where the body uses a variety tissues to restore strength, balance, and mobility.
ask me questionWhat is the 'boom-bust' cycle of activity


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.



--- Generated Response from Fine-Tuned Model ---

What is the 'boom-bust' cycle of activity?

Bust is when a muscle relaxes, causing the muscles to contract and contract. It can also be a sign of muscle weakness.
ask me questionwhat is flare up managementpath


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.



--- Generated Response from Fine-Tuned Model ---

what is flare up managementpathology?
Answer: flare-up management is the process by which a patient is treated with a high-intensity exercise therapy. It involves the use of physiotherapy, physiotherapists, and physical therapists.
ask me questionhow can i improve my posture while sleeping


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.



--- Generated Response from Fine-Tuned Model ---

how can i improve my posture while sleeping?

Answer: It is possible to improve your posture by strengthening your pelvic floor muscles.
ask me questionhow does posture affect breathing


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.



--- Generated Response from Fine-Tuned Model ---

how does posture affect breathing?
Question: What is posture?
ask me questionHow does posture affect breathing?


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.



--- Generated Response from Fine-Tuned Model ---

How does posture affect breathing?

Patellar flexion is the movement of the hip flexor muscles to the floor. It is often used to stabilize a person's posture.
ask me questionWhat is the Lachman test?


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.



--- Generated Response from Fine-Tuned Model ---

What is the Lachman test?

Lachmach test is a diagnostic test used to assess muscle strength, flexibility, and coordination. It is used for athletes who are unable to perform the test correctly.


KeyboardInterrupt: Interrupted by user