<a href="https://colab.research.google.com/github/NdanyuzweP/air_quality_model_/blob/main/A%20chat%20bot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import pandas as pd
import re
from transformers import AutoTokenizer
import os
!pip install datasets
import tensorflow as tf
from transformers import TFGPT2LMHeadModel, GPT2Tokenizer, DataCollatorForLanguageModeling
from datasets import Dataset



In [5]:
# Load dataset
file_path = "/content/medicaldataset.csv"
df = pd.read_csv(file_path)

In [6]:
# Convert text to lowercase
df['question'] = df['question'].str.lower()
df['answer'] = df['answer'].str.lower()

# Remove special characters
def clean_text(text):
    # Check if the input is a string
    if isinstance(text, str):
        text = re.sub(r'[^a-zA-Z0-9 ?!.,]', '', text)
    else:
        # Handle non-string values
        text = str(text)  # Convert to string
    return text

df['question'] = df['question'].apply(clean_text)
df['answer'] = df['answer'].apply(clean_text)

In [7]:
# Tokenization using a Hugging Face tokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
df['question_tokens'] = df['question'].apply(lambda x: tokenizer.tokenize(x))
df['answer_tokens'] = df['answer'].apply(lambda x: tokenizer.tokenize(x))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Token indices sequence length is longer than the specified maximum sequence length for this model (1147 > 512). Running this sequence through the model will result in indexing errors


In [8]:
# Save preprocessed dataset
os.makedirs("/mnt/data", exist_ok=True)
df.to_csv("/mnt/data/preprocessed_medicaldataset.csv", index=False)

print("Preprocessing complete. Saved preprocessed dataset.")

Preprocessing complete. Saved preprocessed dataset.


In [9]:
# Load preprocessed dataset
file_path = "/mnt/data/preprocessed_medicaldataset.csv"
df = pd.read_csv(file_path)

# Combine question and answer as training data
df['input_text'] = "Question: " + df['question'] + " Answer: " + df['answer']

In [10]:
# Load GPT-2 tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token
model = TFGPT2LMHeadModel.from_pretrained("gpt2")

All PyTorch model weights were used when initializing TFGPT2LMHeadModel.

All the weights of TFGPT2LMHeadModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


In [11]:
# Tokenize dataset
def tokenize_function(examples):
    # The input is already a list of strings when batched=True
    input_texts = examples["input_text"]
    # Ensure input_texts are strings before tokenization
    input_texts = [str(text) for text in input_texts] # Convert each element to string before passing to the tokenizer
    return tokenizer(input_texts, padding="max_length", truncation=True, max_length=512)

dataset = Dataset.from_pandas(df[['input_text']])
dataset = dataset.map(tokenize_function, batched=True)

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

Map:   0%|          | 0/16412 [00:00<?, ? examples/s]

In [12]:
# Convert dataset to TensorFlow format
tf_dataset = dataset.to_tf_dataset(
    columns=["input_ids", "attention_mask"],
    label_cols=["input_ids"],
    shuffle=True,
    batch_size=8,
    collate_fn=data_collator,
)

Old behaviour: columns=['a'], labels=['labels'] -> (tf.Tensor, tf.Tensor)  
             : columns='a', labels='labels' -> (tf.Tensor, tf.Tensor)  
New behaviour: columns=['a'],labels=['labels'] -> ({'a': tf.Tensor}, {'labels': tf.Tensor})  
             : columns='a', labels='labels' -> (tf.Tensor, tf.Tensor) 


In [13]:
# Compile and train model
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=5e-5))
model.fit(tf_dataset, epochs=1)



<tf_keras.src.callbacks.History at 0x7d4810ff0c10>

In [15]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

model.save_pretrained("movie_chatbot_model")
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
tokenizer.save_pretrained("movie_chatbot_model")


('movie_chatbot_model/tokenizer_config.json',
 'movie_chatbot_model/special_tokens_map.json',
 'movie_chatbot_model/vocab.txt',
 'movie_chatbot_model/added_tokens.json',
 'movie_chatbot_model/tokenizer.json')

In [16]:
model.save("movie_chatbot_model1")