In [4]:
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
!pip install transformers datasets
from datasets import load_dataset
from transformers import TFDebertaForSequenceClassification, DebertaTokenizerFast, create_optimizer, DataCollatorWithPadding



In [3]:
!pip install -q kaggle
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 /root/.kaggle/kaggle.json
!kaggle datasets download -d bitext/training-dataset-for-chatbotsvirtual-assistants
!unzip training-dataset-for-chatbotsvirtual-assistants.zip

Downloading training-dataset-for-chatbotsvirtual-assistants.zip to /content
  0% 0.00/1.16M [00:00<?, ?B/s]
100% 1.16M/1.16M [00:00<00:00, 135MB/s]
Archive:  training-dataset-for-chatbotsvirtual-assistants.zip
  inflating: 20000-Utterances-Training-dataset-for-chatbots-virtual-assistant-Bitext-sample/20000-Utterances-Training-dataset-for-chatbots-virtual-assistant-Bitext-sample/20000-Utterances-Training-dataset-for-chatbots-virtual-assistant-Bitext-sample.csv  
  inflating: 20000-Utterances-Training-dataset-for-chatbots-virtual-assistant-Bitext-sample/20000-Utterances-Training-dataset-for-chatbots-virtual-assistant-Bitext-sample/20000-Utterances-Training-dataset-for-chatbots-virtual-assistant-Bitext-sample.xlsx  
  inflating: 20000-Utterances-Training-dataset-for-chatbots-virtual-assistant-Bitext-sample/20000-Utterances-Training-dataset-for-chatbots-virtual-assistant-Bitext-sample/LICENSE.txt  
  inflating: 20000-Utterances-Training-dataset-for-chatbots-virtual-assistant-Bitext-sampl

In [5]:
dataset = load_dataset("csv", data_files="/content/20000-Utterances-Training-dataset-for-chatbots-virtual-assistant-Bitext-sample/20000-Utterances-Training-dataset-for-chatbots-virtual-assistant-Bitext-sample/20000-Utterances-Training-dataset-for-chatbots-virtual-assistant-Bitext-sample.csv")

Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-33f63414cf02978d/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-33f63414cf02978d/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [6]:
dataset

DatasetDict({
    train: Dataset({
        features: ['flags', 'utterance', 'category', 'intent'],
        num_rows: 21534
    })
})

In [7]:
dataset["train"][0]

{'flags': 'BILC',
 'utterance': "I don't have an online account, what do I have to do to register?",
 'category': 'ACCOUNT',
 'intent': 'create_account'}

In [9]:
intents = list(set(dataset["train"]["intent"]))
intents

['change_shipping_address',
 'check_refund_policy',
 'recover_password',
 'contact_customer_service',
 'delivery_options',
 'registration_problems',
 'check_payment_methods',
 'get_invoice',
 'set_up_shipping_address',
 'delivery_period',
 'switch_account',
 'review',
 'track_refund',
 'place_order',
 'check_invoices',
 'newsletter_subscription',
 'complaint',
 'cancel_order',
 'payment_issue',
 'delete_account',
 'contact_human_agent',
 'edit_account',
 'get_refund',
 'track_order',
 'create_account',
 'change_order',
 'check_cancellation_fee']

In [10]:
dict_intents = {intents[i]:i for i in range(len(intents))}
dict_intents

{'change_shipping_address': 0,
 'check_refund_policy': 1,
 'recover_password': 2,
 'contact_customer_service': 3,
 'delivery_options': 4,
 'registration_problems': 5,
 'check_payment_methods': 6,
 'get_invoice': 7,
 'set_up_shipping_address': 8,
 'delivery_period': 9,
 'switch_account': 10,
 'review': 11,
 'track_refund': 12,
 'place_order': 13,
 'check_invoices': 14,
 'newsletter_subscription': 15,
 'complaint': 16,
 'cancel_order': 17,
 'payment_issue': 18,
 'delete_account': 19,
 'contact_human_agent': 20,
 'edit_account': 21,
 'get_refund': 22,
 'track_order': 23,
 'create_account': 24,
 'change_order': 25,
 'check_cancellation_fee': 26}

In [11]:
def process(dataset):
  return {"utterance":dataset["utterance"],
          "intent":dict_intents[dataset["intent"]]}

In [12]:
prep_dataset = dataset.map(process)

Map:   0%|          | 0/21534 [00:00<?, ? examples/s]

In [14]:
prep_dataset["train"][0]

{'flags': 'BILC',
 'utterance': "I don't have an online account, what do I have to do to register?",
 'category': 'ACCOUNT',
 'intent': 24}

In [20]:
model_id = "microsoft/deberta-base"
tokenizer = DebertaTokenizerFast.from_pretrained(model_id)

In [23]:
def tokenize(dataset):
  return tokenizer(dataset["utterance"],)

In [24]:
tokenized_dataset = prep_dataset.map(tokenize)

Map:   0%|          | 0/21534 [00:00<?, ? examples/s]

In [25]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['flags', 'utterance', 'category', 'intent', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 21534
    })
})

In [26]:
tokenized_dataset["train"][0]

{'flags': 'BILC',
 'utterance': "I don't have an online account, what do I have to do to register?",
 'category': 'ACCOUNT',
 'intent': 24,
 'input_ids': [1,
  100,
  218,
  75,
  33,
  41,
  804,
  1316,
  6,
  99,
  109,
  38,
  33,
  7,
  109,
  7,
  5124,
  116,
  2],
 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [28]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")

In [51]:
tf_dataset = tokenized_dataset["train"].to_tf_dataset(
    columns=["input_ids","attention_mask","intent"],
    shuffle=True,
    batch_size=32,
    collate_fn=data_collator
)

In [52]:
tf_dataset

<_PrefetchDataset element_spec={'intent': TensorSpec(shape=(None,), dtype=tf.int64, name=None), 'input_ids': TensorSpec(shape=(None, None), dtype=tf.int64, name=None), 'attention_mask': TensorSpec(shape=(None, None), dtype=tf.int64, name=None)}>

In [53]:
def swap_position(dataset):
  return {"input_ids":dataset["input_ids"],
          "attention_mask":dataset["attention_mask"],},dataset["intent"]

In [54]:
tf_dataset = tf_dataset.map(swap_position)

In [55]:
tf_dataset

<_MapDataset element_spec=({'input_ids': TensorSpec(shape=(None, None), dtype=tf.int64, name=None), 'attention_mask': TensorSpec(shape=(None, None), dtype=tf.int64, name=None)}, TensorSpec(shape=(None,), dtype=tf.int64, name=None))>

In [56]:
tf_train_dataset = tf_dataset.take(int(0.9*len(tf_dataset)))
tf_val_dataset = tf_dataset.skip(int(0.9*len(tf_dataset)))

In [58]:
for i in tf_val_dataset.take(1):
  print(i)

({'input_ids': <tf.Tensor: shape=(32, 19), dtype=int64, numpy=
array([[    1,  8569,   109,    38,  2069,    41, 38199,   116,     2,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0],
       [    1,   118, 33976,   236,   127, 23523,  1316,     6,   141,
           64,   939, 17462,    24,   116,     2,     0,     0,     0,
            0],
       [    1,   100,   236,     7,   216,   114,    38,    64,  5124,
           80,  2349,    19,    10,   881,  1047,  1100,     2,     0,
            0],
       [    1,   118, 23126,   216,    99,     5,  1047,     7,  1511,
         2111,   544,    16,     2,     0,     0,     0,     0,     0,
            0],
       [    1,   100,  1017,   101,     7,  2069,    41, 38199,     6,
           64,    47,  1137,   162,   141,     7,   109,    24,   116,
            2],
       [    1,   100,   236,     7,   216,   141,    38,    64,  1649,
          127,  8368,     2,     0,     0,     0,     0,     0,     0,
     

In [59]:
model = TFDebertaForSequenceClassification.from_pretrained(model_id, num_labels=len(intents))

All model checkpoint layers were used when initializing TFDebertaForSequenceClassification.

Some layers of TFDebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['classifier', 'pooler', 'cls_dropout']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [60]:
model.summary()

Model: "tf_deberta_for_sequence_classification_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 deberta (TFDebertaMainLayer  multiple                 138601728 
 )                                                               
                                                                 
 pooler (TFDebertaContextPoo  multiple                 590592    
 ler)                                                            
                                                                 
 cls_dropout (TFDebertaStabl  multiple                 0         
 eDropout)                                                       
                                                                 
 classifier (Dense)          multiple                  20763     
                                                                 
Total params: 139,213,083
Trainable params: 139,213,083
Non-trainable params: 0
____________

In [61]:
num_epochs = 3
batches_per_epoch = len(tokenized_dataset["train"])//32
total_train_steps = int(batches_per_epoch*num_epochs)

In [62]:
optimizer, schedule = create_optimizer(init_lr=2e-5, num_warmup_steps=0, num_train_steps=total_train_steps)

In [63]:
model.compile(optimizer=optimizer,
              metrics=["accuracy"])

In [64]:
history = model.fit(tf_train_dataset, validation_data=tf_val_dataset, epochs=num_epochs)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [65]:
inputs = tokenizer(["Please how do i go about the account creation? ",
                    "After setting up my account, i feel like i need to change it. How do i go about that?",
                    "how do i know how much i need to pay?",
                    "purchased a product, which i now want to change"
                    ], padding=True,return_tensors="tf")

logits = model(**inputs).logits
outputs=tf.argmax(logits,axis=-1).numpy()

In [66]:
reverse_dict_intents={i:intents[i] for i in range(len(intents))}
print(reverse_dict_intents)

{0: 'change_shipping_address', 1: 'check_refund_policy', 2: 'recover_password', 3: 'contact_customer_service', 4: 'delivery_options', 5: 'registration_problems', 6: 'check_payment_methods', 7: 'get_invoice', 8: 'set_up_shipping_address', 9: 'delivery_period', 10: 'switch_account', 11: 'review', 12: 'track_refund', 13: 'place_order', 14: 'check_invoices', 15: 'newsletter_subscription', 16: 'complaint', 17: 'cancel_order', 18: 'payment_issue', 19: 'delete_account', 20: 'contact_human_agent', 21: 'edit_account', 22: 'get_refund', 23: 'track_order', 24: 'create_account', 25: 'change_order', 26: 'check_cancellation_fee'}


In [67]:
for i in outputs:
  print(reverse_dict_intents[i])

create_account
edit_account
payment_issue
change_order
