In [15]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/chatbox-faq/chatbox.json
/kaggle/input/chatbot-dataset/dataset.json


In [16]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
import torch
import torch.nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

import json 

In [17]:
data_path = '/kaggle/input/chatbot-dataset/dataset.json'

# PREPARE DATA

In [18]:
with open(data_path, 'r') as f:
    data = json.load(f)
    
texts = [i['Question'] for i in data]
# texts = sorted(set(texts), key=texts.index)
labels = [index for index, _ in enumerate(texts)]

In [19]:
for i in range(10):  # Print the first 10 labels and corresponding texts
    print(f"Label: {labels[i]}, Text: {texts[i]}")

Label: 0, Text: How can I register for the hackathon online?
Label: 1, Text: Is there an offline registration option?
Label: 2, Text: What is the registration link?
Label: 3, Text: Is there a registration form I need to fill out?
Label: 4, Text: Do I need to create an account to register?
Label: 5, Text: Can I register using my social media account?
Label: 6, Text: Is there a minimum age requirement for registration?
Label: 7, Text: Can I register if I'm a student?
Label: 8, Text: Can I register if I'm a professional?
Label: 9, Text: Can I register if I'm from outside the city/state/country?


# PREPROCESSING

In [20]:
model_name = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizer.from_pretrained(model_name)

In [21]:
inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True)
# labels = torch.tensor(labels)

In [22]:
class SentimentDataset(torch.utils.data.Dataset):
    def __init__(self, input_ids, attention_mask, labels):
        self.input_ids = input_ids
        self.attention_mask = attention_mask
        self.labels = labels

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return {
            'input_ids': self.input_ids[idx],
            'attention_mask': self.attention_mask[idx],
            'labels': torch.tensor(self.labels[idx])
        }

# Create dataset and dataloader
dataset = SentimentDataset(
    input_ids=inputs['input_ids'],
    attention_mask=inputs['attention_mask'],
    labels=labels
)

In [23]:
dataloader = DataLoader(dataset, batch_size=16, shuffle=True)

# PREPARE MODEL

In [24]:
model = DistilBertForSequenceClassification.from_pretrained(model_name, num_labels = len(labels))

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.bias', 'pre_classifier.weight', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [25]:
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
criterion = torch.nn.CrossEntropyLoss()

# TRAIN

In [39]:
from tqdm import tqdm

num_epochs = 6

# Assuming you have a separate dataloader for validation data named 'val_dataloader'
# and a model named 'model'

for epoch in range(num_epochs):
    total_loss = 0
    total_correct = 0
    total_samples = 0

    # Training loop
    progress_bar = tqdm(enumerate(dataloader, 1), total=len(dataloader))
    for step, batch in progress_bar:
        optimizer.zero_grad()
        outputs = model(**batch)
        loss = criterion(outputs.logits, batch['labels'])
        loss.backward()
        optimizer.step()

        # Calculate accuracy
        predictions = torch.argmax(outputs.logits, dim=1)
        correct = (predictions == batch['labels']).sum().item()
        total_correct += correct
        total_samples += len(batch['labels'])

        # Accumulate loss
        total_loss += loss.item()

        progress_bar.set_description(f'Epoch {epoch+1}/{num_epochs}, Step {step}/{len(dataloader)}')
        progress_bar.set_postfix({'Loss': loss.item(), 'Accuracy': correct / len(batch['labels'])})

    # Calculate training statistics for the epoch
    epoch_loss = total_loss / len(dataloader)
    epoch_accuracy = total_correct / total_samples

    # Validation loop
    model.eval()
    val_total_loss = 0
    val_total_correct = 0
    val_total_samples = 0

    with torch.no_grad():
        for val_step, val_batch in enumerate(dataloader, 1):
            val_outputs = model(**val_batch)
            val_loss = criterion(val_outputs.logits, val_batch['labels'])

            # Calculate accuracy
            val_predictions = torch.argmax(val_outputs.logits, dim=1)
            val_correct = (val_predictions == val_batch['labels']).sum().item()
            val_total_correct += val_correct
            val_total_samples += len(val_batch['labels'])

            # Accumulate loss
            val_total_loss += val_loss.item()

    # Calculate validation statistics for the epoch
    val_epoch_loss = val_total_loss / len(dataloader)
    val_epoch_accuracy = val_total_correct / val_total_samples

    # Print statistics for the epoch
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss:.4f}, Accuracy: {epoch_accuracy:.4f}, Val Loss: {val_epoch_loss:.4f}, Val Accuracy: {val_epoch_accuracy:.4f}')


Epoch 1/6, Step 5/5: 100%|██████████| 5/5 [00:03<00:00,  1.49it/s, Loss=2.99, Accuracy=1]


Epoch 1/6, Loss: 2.9580, Accuracy: 1.0000, Val Loss: 2.9059, Val Accuracy: 1.0000


Epoch 2/6, Step 5/5: 100%|██████████| 5/5 [00:03<00:00,  1.50it/s, Loss=2.85, Accuracy=1]


Epoch 2/6, Loss: 2.8812, Accuracy: 1.0000, Val Loss: 2.8281, Val Accuracy: 1.0000


Epoch 3/6, Step 5/5: 100%|██████████| 5/5 [00:03<00:00,  1.51it/s, Loss=2.77, Accuracy=1]


Epoch 3/6, Loss: 2.8078, Accuracy: 1.0000, Val Loss: 2.7619, Val Accuracy: 1.0000


Epoch 4/6, Step 5/5: 100%|██████████| 5/5 [00:03<00:00,  1.25it/s, Loss=2.71, Accuracy=1]


Epoch 4/6, Loss: 2.7359, Accuracy: 1.0000, Val Loss: 2.6823, Val Accuracy: 1.0000


Epoch 5/6, Step 5/5: 100%|██████████| 5/5 [00:03<00:00,  1.47it/s, Loss=2.62, Accuracy=1]


Epoch 5/6, Loss: 2.6667, Accuracy: 1.0000, Val Loss: 2.6168, Val Accuracy: 1.0000


Epoch 6/6, Step 5/5: 100%|██████████| 5/5 [00:03<00:00,  1.46it/s, Loss=2.58, Accuracy=1]


Epoch 6/6, Loss: 2.6013, Accuracy: 1.0000, Val Loss: 2.5602, Val Accuracy: 1.0000


# PREDICT

In [37]:
def get_response(input_text):
    # Preprocess input text
    inputs = tokenizer(input_text, return_tensors="pt")
    
    # Step 5: Inference
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
    
    # Convert logits to probabilities and get predicted label
    probabilities = torch.softmax(logits, dim=-1)
    predicted_label = torch.argmax(probabilities, dim=-1).item()
    
    # Retrieve corresponding answer based on predicted label
    response = data[predicted_label]['Answer']
    
    return response

In [43]:
text_for_pred = 'how to register to hackathon?'
get_response(text_for_pred)

'You can register for the hackathon by visiting the Devfolio website. Simply navigate to the registration section to get started!'