In [11]:
import numpy as np
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Embedding, Dense, Bidirectional, LSTM, Input
from transformers import DistilBertTokenizer, TFDistilBertModel
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pickle
import tensorflow as tf

# Load and preprocess the data
with open("que&ans.txt", "r", encoding="utf-8") as file:
    data = file.read().split('\n')

questions = []
answers = []

for line in data:
    if line.startswith('|Q|'):
        questions.append(line[3:])
    elif line.startswith('|A|'):
        answers.append(line[3:])

# Tokenize the data using DistilBERT tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
questions_tokens = tokenizer(questions, padding=True, truncation=True, return_tensors='tf')

input_ids = questions_tokens['input_ids']
attention_mask = questions_tokens['attention_mask']

# Create input sequences and pad them
input_sequences = []
for i in range(len(questions)):
    for j in range(1, len(input_ids[i])):
        n_gram_sequence = input_ids[i][:j + 1]
        input_sequences.append(n_gram_sequence)

max_sequence_length = max(len(seq) for seq in input_sequences)
input_sequences = pad_sequences(input_sequences, maxlen=max_sequence_length, padding='pre')

X, y = input_sequences[:, :-1], input_sequences[:, -1]
y = np.array(np.eye(np.max(input_sequences) + 1)[y])

# Build the model with DistilBERT layers
distilbert_model = TFDistilBertModel.from_pretrained('distilbert-base-uncased', trainable=False)
inputs = Input(shape=(max_sequence_length-1,), dtype=tf.int32)
distilbert_output = distilbert_model(inputs)[0]
bi_lstm = Bidirectional(LSTM(100))(distilbert_output)
output = Dense(np.max(input_sequences) + 1, activation='softmax')(bi_lstm)

model = Model(inputs=inputs, outputs=output)

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(X, y, epochs=20, verbose=1)

# Save the tokenizer and model
model.save('chatbot_model_distilbert.h5')
with open('tokenizer_distilbert.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)


Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertModel: ['vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias']
- This IS expected if you are initializing TFDistilBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFDistilBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


  saving_api.save_model(


In [28]:
import numpy as np
from tensorflow.keras.models import load_model, Model
from transformers import DistilBertTokenizer, TFDistilBertModel
import pickle
import tensorflow as tf

# Load the tokenizer and model with custom objects
custom_objects = {'TFDistilBertModel': TFDistilBertModel}
model = load_model('chatbot_model_distilbert.h5', custom_objects=custom_objects, compile=False)

with open('tokenizer_distilbert.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)

# Get the max sequence length used during training
if model.layers:
    max_sequence_length = model.layers[1].input_shape[1]

# Function to generate a response
def generate_response(question):
    if not question:
        return "Chatbot: Please enter a question."

    input_ids = tokenizer.encode(question, return_tensors='tf', max_length=max_sequence_length, padding='max_length', truncation=True)
    response_ids = []

    # for _ in range(max_sequence_length):
    #     logits = model.predict(input_ids)
    #     predicted_id = np.argmax(logits)
    #     response_ids.append(predicted_id)

    #     # Update input_ids for the next iteration
    #     input_ids = np.concatenate([input_ids, np.array([[predicted_id]])], axis=1)

    # # Ensure the response length does not exceed the max_sequence_length
    # response_tokens = tokenizer.decode(response_ids, skip_special_tokens=True)
    for i in range(max_sequence_length - 1):
         if i >= len(response_ids): break

         logits = model.predict(input_ids)
         predicted_id = np.argmax(logits)

         if predicted_id == tokenizer.eos_token_id:
            break

         response_ids.append(predicted_id)

         input_ids = np.concatenate([input_ids,
                              np.array([[predicted_id]])], axis=1)[:, -max_sequence_length:]

    response_tokens = tokenizer.decode(response_ids, skip_special_tokens=True)
    return "Chatbot: " + response_tokens

# Chat with the model
print("Chatbot: Hi! I'm your chatbot. You can type 'exit' to end the conversation.")
while True:
    user_input = input("You: ")
    if user_input.lower() == 'exit':
        print("Chatbot: Goodbye!")
        break
    elif user_input.lower() == 'start':
        print("Chatbot: You can start typing your questions.")
    else:
        response = generate_response(user_input)
        print(response)


Chatbot: Hi! I'm your chatbot. You can type 'exit' to end the conversation.
You: what is the difference between policy and procedure?
Chatbot: 
You: 
Chatbot: Please enter a question.
You: hi
Chatbot: 
You: exit
Chatbot: Goodbye!
