# **Parameters**

In [None]:
# Folder that holds the datasets, relative to the notebook's working directory.
datasetDir = "../datasets"
# Folder under which the fine-tuned model is written by the save cell.
modelDir = "saved_models"

In [None]:
# **Import Libraries**

In [None]:
# Import necessary libraries
import json
import os

import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from transformers import TFGPT2LMHeadModel, GPT2Tokenizer


In [None]:
# **Load Preprocessed Data**

In [None]:
# Load preprocessed data.
# The file is read as JSON Lines: one JSON object per line. Each object must
# provide "input_ids" and "attention_mask", since both columns are used below.
# NOTE(review): this path is relative to the working directory and does not go
# through `datasetDir` — confirm that is where the preprocessing step writes it.
preprocessed_data_path = "preprocessed_data.json"

# Parse one record per line. Blank lines are skipped so a stray empty line does
# not raise json.JSONDecodeError, and the encoding is pinned so the result does
# not depend on the platform's default text encoding.
with open(preprocessed_data_path, "r", encoding="utf-8") as f:
    data = [json.loads(line) for line in f if line.strip()]

df = pd.DataFrame(data)

# Coerce every token id / mask value to a plain Python int so the columns can
# be turned into integer tensors later on.
df["input_ids"] = df["input_ids"].apply(lambda x: list(map(int, x)))
df["attention_mask"] = df["attention_mask"].apply(lambda x: list(map(int, x)))

# Inspect the DataFrame
print("Preprocessed data preview:")
print(df.head())


In [None]:
# **Prepare Data for TensorFlow**

In [None]:
# Build the training / validation tf.data pipelines from the DataFrame.
# Assumes every row of "input_ids" / "attention_mask" has the same length
# (tf.constant needs a rectangular input) — TODO confirm against preprocessing.
X = tf.constant(df["input_ids"].to_list())
attention_masks = tf.constant(df["attention_mask"].to_list())

# Split into training and validation sets.
# train_test_split picks rows with integer index arrays, which eager tf.Tensors
# do not support, so the split is performed on their NumPy values. Both arrays
# are split in one call so ids and masks stay aligned row by row.
X_train, X_val, attn_train, attn_val = train_test_split(
    X.numpy(), attention_masks.numpy(), test_size=0.2, random_state=42
)

# Create TensorFlow datasets.
# Features are passed as a dict so the attention mask actually reaches the
# model; the labels are the input ids themselves (language-model fine-tuning).
# NOTE(review): padded positions are still present in the labels here; mask
# them only if the training cell uses a loss that ignores a sentinel label.
batch_size = 8
train_dataset = tf.data.Dataset.from_tensor_slices(
    ({"input_ids": X_train, "attention_mask": attn_train}, X_train)
)
val_dataset = tf.data.Dataset.from_tensor_slices(
    ({"input_ids": X_val, "attention_mask": attn_val}, X_val)
)

# The shuffle buffer covers the whole training split so every epoch sees a
# full shuffle rather than one limited to a fixed-size window.
train_dataset = train_dataset.shuffle(len(X_train)).batch(batch_size).prefetch(tf.data.AUTOTUNE)
val_dataset = val_dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)


In [None]:
# **Load Pre-trained GPT-2 Model**

In [None]:
# Fetch the stock "gpt2" tokenizer and the matching TensorFlow LM-head model.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = TFGPT2LMHeadModel.from_pretrained("gpt2")

# Make the embedding matrix match the tokenizer's current vocabulary size.
# NOTE(review): no special token is added to `tokenizer` in this cell, so the
# size used here is whatever the stock tokenizer reports. If preprocessing
# added a token, confirm it is also added here before resizing.
vocab_size = len(tokenizer)
model.resize_token_embeddings(vocab_size)


In [None]:
# **Compile and Fine-tune the Model**

In [None]:
# Compile the model.
# No `loss=` is passed on purpose. A Keras SparseCategoricalCrossentropy would
# compare the logits at position t with the label at position t, and because
# the labels are the input ids themselves the model would be trained to copy
# its input instead of predicting the next token. Without an explicit loss the
# Transformers TF model falls back to its built-in causal-LM loss, which shifts
# the labels by one position internally.
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
model.compile(optimizer=optimizer)

# Fine-tune the model.
# The datasets yield (features, labels) pairs; the labels are handed to the
# model so it can compute the built-in loss during fit/validation.
epochs = 3
history = model.fit(
    train_dataset, validation_data=val_dataset, epochs=epochs
)


In [None]:
# **Save the Fine-tuned Model**

In [None]:
# Save the fine-tuned model and tokenizer side by side in one directory so a
# single path can be used to reload both.
# `os` must be imported in the notebook's import cell; without it this line
# raises NameError on a fresh kernel.
output_dir = os.path.join(modelDir, "chatbot_model")
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

print(f"Model saved to {output_dir}")


In [None]:
# **Test the Model**

In [None]:
# Re-create the tokenizer and the model from the directory written by the save
# step, so inference runs against the saved artifacts rather than the objects
# still held in memory from training.
tokenizer = GPT2Tokenizer.from_pretrained(output_dir)
model = TFGPT2LMHeadModel.from_pretrained(output_dir)

# Generate a response
def generate_response(prompt, max_length=50):
    """Generate text from the module-level `model` for a single prompt.

    Args:
        prompt: Text to encode with the module-level `tokenizer`.
        max_length: Forwarded to `model.generate` as `max_length`.

    Returns:
        The first generated sequence decoded to a string with special tokens
        removed. The whole sequence returned by `generate` is decoded.
        NOTE(review): for this model that presumably includes the prompt
        tokens — confirm whether callers expect the prompt to be echoed.
    """
    encoded = tokenizer(prompt, return_tensors="tf")
    output_ids = model.generate(
        encoded.input_ids,
        # Pass the mask explicitly: the pad id is set to the EOS id below, so
        # generate() cannot reliably infer which positions are padding.
        attention_mask=encoded.attention_mask,
        max_length=max_length,
        pad_token_id=tokenizer.eos_token_id,
    )
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)

# Smoke-test the reloaded model with one fixed prompt and show both sides of
# the exchange.
user_input = "Hello, how are you?"
response = generate_response(user_input)
print(f"User: {user_input}", f"Chatbot: {response}", sep="\n")
