<a href="https://colab.research.google.com/github/JABU-2022/Q-A_Chat_Bot/blob/main/Chatbot_using_Bert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Import libraries


In [None]:
import json
import pickle
import re

import numpy as np
import pandas as pd
import torch
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset, DataLoader
from torch.utils.data import DataLoader, TensorDataset, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertForSequenceClassification, AdamW

Load data

In [None]:
data_path = '/content/intents.json'
# Read the intents file as UTF-8 explicitly: without `encoding`, open() falls
# back to the platform's locale encoding, which can mis-decode non-ASCII
# characters in the patterns on machines whose default is not UTF-8.
with open(data_path, 'r', encoding='utf-8') as file:
    data = json.load(file)

Prepare the dataset

In [None]:
# Flatten the intents into two parallel lists: one entry per pattern, paired
# with the tag of the intent that pattern belongs to.
questions = [
    pattern
    for intent in data['intents']
    for pattern in intent['patterns']
]
labels = [
    intent['tag']
    for intent in data['intents']
    for _ in intent['patterns']
]

df = pd.DataFrame({'question': questions, 'label': labels})

Encode the labels

In [None]:
# Learn the tag -> integer mapping, then overwrite the 'label' column with the
# integer codes. Note that the column is replaced in place, so re-running this
# cell would fit the encoder on the already-encoded integers.
label_encoder = LabelEncoder()
label_encoder.fit(df['label'])
df['label'] = label_encoder.transform(df['label'])

Save the label encoder

In [None]:
# Persist the fitted encoder so predicted class indices can be mapped back to
# intent tags later on.
with open('label_encoder.pkl', 'wb') as encoder_file:
    pickle.dump(label_encoder, encoder_file)

Split the data

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# identical across runs.
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)

Data Preprocessing

In [None]:
def clean_text(text):
    """Normalize a question string for tokenization.

    Removes every character that is not an ASCII letter, a digit, or ASCII
    whitespace, then lowercases the result and trims leading/trailing
    whitespace.

    Args:
        text: The raw question string.

    Returns:
        The cleaned, lowercased, stripped string.
    """
    # The flags must be passed by keyword: the fourth positional parameter of
    # re.sub is `count`, so passing them positionally silently limited the
    # substitution to the first 258 matches and applied no flags at all.
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text, flags=re.I | re.A)
    return text.lower().strip()

# The frames returned by train_test_split are selections of `df`; assigning a
# column on them directly can raise pandas' SettingWithCopyWarning. `.assign`
# builds new frames instead and keeps the existing column order, which the
# dataset class relies on (it reads columns by position).
train_df = train_df.assign(question=train_df['question'].apply(clean_text))
test_df = test_df.assign(question=test_df['question'].apply(clean_text))


tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

class MentalHealthDataset(Dataset):
    """Torch dataset that tokenizes one question per row of a DataFrame.

    The frame is read by position: column 0 holds the question text and
    column 1 holds the integer class label.

    Args:
        dataframe: Frame with the question in column 0 and the label in column 1.
        tokenizer: Callable tokenizer returning a mapping with 'input_ids' and
            'attention_mask' entries.
        max_len: Length every encoded question is padded or truncated to.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.max_len = max_len

    def __len__(self):
        """Return the number of rows in the wrapped frame."""
        return len(self.data)

    def __getitem__(self, index):
        """Encode the row at positional ``index``.

        Returns:
            A dict with 'input_ids', 'attention_mask' and 'label', each a
            ``torch.long`` tensor.
        """
        question = str(self.data.iloc[index, 0])
        label = self.data.iloc[index, 1]

        # `padding='max_length'` replaces the deprecated
        # `pad_to_max_length=True`; together with `truncation=True` it makes
        # every example exactly `max_len` tokens long.
        inputs = self.tokenizer(
            question,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=False,
            truncation=True
        )

        input_ids = inputs['input_ids']
        attention_mask = inputs['attention_mask']

        return {
            'input_ids': torch.tensor(input_ids, dtype=torch.long),
            'attention_mask': torch.tensor(attention_mask, dtype=torch.long),
            'label': torch.tensor(label, dtype=torch.long)
        }

MAX_LEN = 32
BATCH_SIZE = 16

# Wrap both splits with the same tokenizer and sequence length.
train_dataset, test_dataset = (
    MentalHealthDataset(frame, tokenizer, MAX_LEN)
    for frame in (train_df, test_df)
)

# Only the training batches are shuffled; the test order stays fixed.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=BATCH_SIZE)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=BATCH_SIZE)

Model Fine-tuning

In [None]:
from transformers import get_linear_schedule_with_warmup

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# One output logit per intent tag known to the label encoder.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(label_encoder.classes_))
model.to(device)

# Use PyTorch's AdamW: the implementation shipped in `transformers` is
# deprecated. weight_decay is pinned to 0.0 because that was the default of
# the transformers version, whereas torch.optim.AdamW defaults to 0.01.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-8, weight_decay=0.0)
# The factor 3 is the number of epochs and must stay equal to EPOCHS in the
# training loop, otherwise the linear decay ends too early or too late.
total_steps = len(train_loader) * 3
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

def train_epoch(model, data_loader, optimizer, device, scheduler):
    """Run one training pass over ``data_loader``.

    Args:
        model: Module called with ``input_ids``, ``attention_mask`` and
            ``labels`` keyword arguments; its output must expose ``.loss``
            and ``.logits``.
        data_loader: Iterable of dict batches with 'input_ids',
            'attention_mask' and 'label' tensors; must expose ``.dataset``.
        optimizer: Optimizer stepped once per batch.
        device: Device the batch tensors are moved to.
        scheduler: Learning-rate scheduler stepped once per batch.

    Returns:
        A ``(accuracy, mean_loss)`` tuple: accuracy is a float64 tensor
        (correct predictions divided by ``len(data_loader.dataset)``) and
        mean_loss is the average of the per-batch losses (NaN when the loader
        yields no batches).
    """
    model.train()
    losses = []
    # Accumulate in a tensor rather than a Python int so that `.double()`
    # below also works when the loader yields no batches.
    correct_predictions = torch.zeros((), dtype=torch.long, device=device)

    for batch in data_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        # Clear gradients before the forward/backward pass so that gradients
        # left behind by an interrupted earlier run cannot leak into this step.
        optimizer.zero_grad()

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )

        loss = outputs.loss
        logits = outputs.logits

        _, preds = torch.max(logits, dim=1)
        correct_predictions += torch.sum(preds == labels)
        losses.append(loss.item())

        loss.backward()
        optimizer.step()
        scheduler.step()

    # np.mean([]) would emit a RuntimeWarning; report NaN explicitly instead.
    mean_loss = np.mean(losses) if losses else float('nan')
    return correct_predictions.double() / len(data_loader.dataset), mean_loss

EPOCHS = 3

# One full pass over the training data per epoch, reporting loss and accuracy
# after each pass.
for epoch_number in range(1, EPOCHS + 1):
    train_acc, train_loss = train_epoch(model, train_loader, optimizer, device, scheduler)
    print(f'Epoch {epoch_number}/{EPOCHS}')
    print(f'Train loss: {train_loss}, Train accuracy: {train_acc}')


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3
Train loss: 4.373945474624634, Train accuracy: 0.021621621621621623
Epoch 2/3
Train loss: 4.297305623690288, Train accuracy: 0.04864864864864865
Epoch 3/3
Train loss: 4.269392490386963, Train accuracy: 0.05945945945945946


Building the Interface

In [None]:
!pip install streamlit
import streamlit as st
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=8)

model.to(device)
model.eval()

# Load the label encoder
with open('label_encoder.pkl', 'rb') as file:
    label_encoder = pickle.load(file)

# Load the tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def predict(question, max_len=32):
    """Predict the intent tag for a single question.

    Args:
        question: Raw question text.
        max_len: Length the encoded question is padded or truncated to.
            Defaults to 32, the value of MAX_LEN used to build the training
            data; keep the two in sync.

    Returns:
        The label that ``label_encoder.inverse_transform`` maps the
        highest-scoring class index to.
    """
    # Apply the same normalisation the training questions went through, so the
    # model sees text in the form it was fine-tuned on.
    question = clean_text(question)

    # `padding='max_length'` replaces the deprecated `pad_to_max_length=True`.
    inputs = tokenizer(
        question,
        add_special_tokens=True,
        max_length=max_len,
        padding='max_length',
        return_token_type_ids=False,
        truncation=True
    )

    # Add a leading batch dimension of size 1.
    input_ids = torch.tensor(inputs['input_ids']).unsqueeze(0).to(device)
    attention_mask = torch.tensor(inputs['attention_mask']).unsqueeze(0).to(device)

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
    logits = outputs.logits

    _, preds = torch.max(logits, dim=1)
    pred_label = label_encoder.inverse_transform(preds.cpu().numpy())

    return pred_label[0]


# Page header.
st.title("Mental Health Q&A Chatbot")
st.write("Ask me anything about mental health.")

# Only run a prediction once the text box holds a non-empty question.
user_query = st.text_input("Your Question:")
if user_query:
    st.write(predict(user_query))




Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation and Examples

In [None]:
def eval_model(model, data_loader, device):
    """Evaluate ``model`` on ``data_loader`` and summarise the results.

    Args:
        model: Module called with ``input_ids`` and ``attention_mask`` keyword
            arguments; its output must expose ``.logits``.
        data_loader: Iterable of batches. Each batch is either a dict with
            'input_ids', 'attention_mask' and 'label' tensors, or an
            ``(input_ids, attention_mask, labels)`` tuple.
        device: Device the batch tensors are moved to.

    Returns:
        A ``(accuracy, report)`` tuple: the value of ``accuracy_score`` and
        the text produced by ``classification_report``, with one row per
        class known to the module-level ``label_encoder``.
    """
    model.eval()
    predictions = []
    true_labels = []

    with torch.no_grad():
        for batch in data_loader:
            # Iterating a dict batch yields its key strings, so dict batches
            # have to be indexed by key rather than tuple-unpacked.
            if isinstance(batch, dict):
                input_ids = batch['input_ids']
                attention_mask = batch['attention_mask']
                labels = batch['label']
            else:
                input_ids, attention_mask, labels = batch

            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            labels = labels.to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            # The model returns an output object; the scores live in .logits.
            logits = outputs.logits

            _, preds = torch.max(logits, dim=1)

            predictions.extend(preds.tolist())
            true_labels.extend(labels.tolist())

    accuracy = accuracy_score(true_labels, predictions)
    # Pass the full list of class ids explicitly: without `labels`, the report
    # only covers classes present in the data and raises when that count does
    # not match `target_names`. zero_division=0 avoids warnings for classes
    # that were never predicted.
    class_ids = list(range(len(label_encoder.classes_)))
    report = classification_report(
        true_labels,
        predictions,
        labels=class_ids,
        target_names=[str(name) for name in label_encoder.classes_],
        zero_division=0,
    )

    return accuracy, report

# Score the model on the held-out split and print both summaries.
test_accuracy, test_report = eval_model(model, test_loader, device)
print(
    f'Test Accuracy: {test_accuracy}',
    f'Test Report: {test_report}',
    sep='\n',
)


Test Accuracy: 0.31
Test Report:               precision    recall  f1-score   support

      class1       0.24      0.26      0.25        27
      class2       0.33      0.38      0.36        34
      class3       0.34      0.28      0.31        39

    accuracy                           0.31       100
   macro avg       0.31      0.31      0.31       100
weighted avg       0.31      0.31      0.31       100

