<a href="https://colab.research.google.com/github/MaryamNourii/ChatBot/blob/main/bert_chatbot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install tensorflow
!pip install transformers

In [None]:
import os
import pandas as pd
import numpy as np
import tensorflow as tf
from transformers import TFAutoModelForSeq2SeqLM, AutoTokenizer, BartConfig,BertTokenizerFast
from sklearn.model_selection import train_test_split

In [None]:
# Load the question/answer pairs used for fine-tuning.
data = pd.read_csv("percQA_CleanData.csv")

# pandas reads empty cells as NaN (a float), which is not valid tokenizer
# input, so rows missing either side of the pair are dropped up front.
data = data.dropna(subset=["question", "answer"])

# Cast to str so cells that pandas parsed as numbers are still plain text.
input_texts = data["question"].astype(str).tolist()
output_texts = data["answer"].astype(str).tolist()

In [None]:
# Hold out 10% of the question/answer pairs for validation; the fixed seed
# keeps the split reproducible between runs.
input_train, input_val, output_train, output_val = train_test_split(
    input_texts,
    output_texts,
    test_size=0.1,
    random_state=42,
)

In [None]:
# Identifier of the pretrained checkpoint that the config, tokenizer and model
# are loaded from in the cells below.
MODEL_NAME_OR_PATH = 'HooshvareLab/bert-fa-base-uncased'

In [None]:
# Token length that every question and answer is truncated/padded to by the
# tokenizer calls below.
max_length = 64

In [None]:
# Imported here so this cell is self-contained; the class ships with the
# `transformers` package the notebook already installs.
from transformers import TFEncoderDecoderModel

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME_OR_PATH)

# The checkpoint is a BERT encoder, not a BART/seq2seq model, so building a
# BartConfig from it cannot reuse the pretrained weights. Instead, warm-start
# both the encoder and the decoder of an encoder-decoder (BERT2BERT) model
# from the same checkpoint.
model = TFEncoderDecoderModel.from_encoder_decoder_pretrained(
    MODEL_NAME_OR_PATH,
    MODEL_NAME_OR_PATH,
)

# Keep `config` bound for any later cell that refers to it.
config = model.config

# BERT has no dedicated BOS/EOS tokens: decoding starts from [CLS] and ends at
# [SEP]. These ids are needed to shift the labels into decoder inputs during
# training and to start/stop generation.
config.decoder_start_token_id = tokenizer.cls_token_id
config.eos_token_id = tokenizer.sep_token_id
config.pad_token_id = tokenizer.pad_token_id

In [None]:
# Every split is tokenized with exactly the same settings, so they are kept in
# one place: fixed-length (max_length) padded/truncated TensorFlow tensors
# including token type ids and attention masks.
_tokenizer_settings = dict(
    add_special_tokens=True,
    max_length=max_length,
    truncation=True,
    padding='max_length',
    return_tensors='tf',
    return_token_type_ids=True,
    return_attention_mask=True,
    verbose=True,
)

# Questions (encoder side).
train_encodings = tokenizer(input_train, **_tokenizer_settings)
val_encodings = tokenizer(input_val, **_tokenizer_settings)

# Answers (decoder side).
train_decodings = tokenizer(output_train, **_tokenizer_settings)
val_decodings = tokenizer(output_val, **_tokenizer_settings)

In [None]:
# Bare expression: as the last statement of a notebook cell this displays the
# tokenized training questions for inspection.
train_encodings

In [None]:
def _model_inputs(encoding):
    """Return the tensors of a tokenizer output as a plain dict.

    The keys match the tokenizer's own field names, which are also the
    argument names Hugging Face models accept. The segment ids were previously
    stored under 'token_ids', a name no model argument uses.
    """
    return {
        'input_ids': encoding['input_ids'],
        'attention_mask': encoding['attention_mask'],
        'token_type_ids': encoding['token_type_ids'],
    }


# Question (encoder) side.
train_encdngs = _model_inputs(train_encodings)
val_encdngs = _model_inputs(val_encodings)

# Answer (decoder) side.
train_dcdngs = _model_inputs(train_decodings)
val_dcdngs = _model_inputs(val_decodings)

In [None]:
train_dataset = tf.data.Dataset.from_tensor_slices((
    dict(train_encdngs),
    dict(train_dcdngs)
)).shuffle(1000).batch(1)

val_dataset = tf.data.Dataset.from_tensor_slices((
    dict(val_encdngs),
    dict(val_dcdngs)
)).shuffle(1000).batch(1)

In [None]:
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
model.compile(optimizer=optimizer, loss=model.compute_loss)

In [None]:
from transformers import AutoTokenizer

# Load the tokenizer of the checkpoint used throughout the notebook. The
# placeholder name 'your-model-name' is not a real checkpoint and cannot be
# loaded.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME_OR_PATH)


def tokenize_function(examples):
    """Tokenize a batch of raw text examples.

    Args:
        examples: mapping whose 'input_text' entry holds the text(s) to encode.

    Returns:
        The tokenizer output, truncated to 64 tokens and padded to the longest
        sequence in the batch.
    """
    return tokenizer(examples['input_text'], padding=True, truncation=True, max_length=64)


# NOTE: `train_dataset` and `val_dataset` are tf.data.Dataset objects built
# from tensors that are already tokenized. `tf.data.Dataset.map` has no
# `batched` argument and the dataset elements carry no 'input_text' field, so
# the former `.map(tokenize_function, batched=True)` calls could only raise.
# The datasets are therefore left exactly as built above.

In [None]:
# Fine-tune for three passes over the training data, evaluating on the
# validation data after each one.
model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=3,
)

In [None]:
# Persist the fine-tuned weights and the tokenizer files side by side so the
# directory can be reloaded with `from_pretrained`.
model.save_pretrained(save_directory="fine_tuned_parsbert/")
tokenizer.save_pretrained(save_directory="fine_tuned_parsbert/")

In [None]:
# (tokenizer, model) pairs keyed by checkpoint directory, so the fine-tuned
# weights are read from disk once instead of on every chatbot call.
_CHATBOT_CACHE = {}


def _load_chatbot(model_dir, reload=False):
    """Return the (tokenizer, model) pair for *model_dir*, loading it on first use."""
    if reload or model_dir not in _CHATBOT_CACHE:
        _CHATBOT_CACHE[model_dir] = (
            AutoTokenizer.from_pretrained(model_dir),
            TFAutoModelForSeq2SeqLM.from_pretrained(model_dir),
        )
    return _CHATBOT_CACHE[model_dir]


def chatbot_response(text, *, reload=False):
    """Generate the chatbot's reply to *text*.

    Args:
        text: The user's message.
        reload: Pass True to re-read the checkpoint from "fine_tuned_parsbert/"
            (for example after saving a newly trained model in the same
            session) instead of reusing the cached tokenizer and model.

    Returns:
        The decoded reply with special tokens removed.
    """
    tokenizer, model = _load_chatbot("fine_tuned_parsbert/", reload=reload)
    input_ids = tokenizer.encode(text, return_tensors="tf")
    generated = model.generate(input_ids, max_length=128, num_return_sequences=1)
    # A single sequence is requested, so the reply is the first row.
    return tokenizer.decode(generated[0], skip_special_tokens=True)

# Smoke test: send a Persian greeting ("Hello, how are you?") through the
# chatbot and show the reply.
input_text = "سلام، حال شما چطور است؟"
print(response := chatbot_response(input_text))