<a href="https://colab.research.google.com/github/MaryamNourii/Intent-detection-and-Slot-filling/blob/main/Joint_BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Template solution for BehsazanFardaAIChallenge2023**

In [None]:
!pip install -q transformers
!pip install -q tensorflow
!pip install nlpaug
!pip install tensorflow_addons

import tensorflow as tf
from transformers import BertConfig, TFAutoModel , BertTokenizer
from tensorflow.keras.layers import Layer
from tensorflow.keras.layers import Input, Dense, Dropout, TimeDistributed
from tensorflow.keras.models import Model
from tensorflow.keras.initializers import TruncatedNormal
from sklearn.model_selection import train_test_split
from tensorflow.keras.optimizers import Adam
import pandas as pd
import numpy as np
from tensorflow.keras.metrics import SparseCategoricalAccuracy
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from sklearn.preprocessing import LabelEncoder
import tensorflow.keras.backend as K
from pathlib import Path
import re
import tensorflow_addons as tfa
from tensorflow_addons.optimizers import AdamW



# --- Training hyper-parameters ----------------------------------------------
MAX_LENGTH = 45   # every sentence is padded / truncated to this many token ids
epochs = 10
batch_size = 16

# --- Pre-trained checkpoint shared by the config, the tokenizer and the encoder
MODEL_NAME_OR_PATH = "HooshvareLab/bert-fa-zwnj-base"
bert_cnfg = BertConfig.from_pretrained(
    MODEL_NAME_OR_PATH,
    output_hidden_states=True,
)
bert_tknzr = BertTokenizer.from_pretrained(MODEL_NAME_OR_PATH, config=bert_cnfg)
bert = TFAutoModel.from_pretrained(MODEL_NAME_OR_PATH, config=bert_cnfg)

## **Fetch data and augment the training set**

In [None]:
!unzip -qx dataset.zip -d dataset

In [None]:
from pathlib import Path

import pandas as pd
import nlpaug.augmenter.word as naw

TRAIN_CSV = Path("/content/dataset/train.csv")
# Pristine copy of the challenge data. This cell overwrites TRAIN_CSV, so
# without a backup every re-run would augment already augmented sentences.
TRAIN_ORIGINAL_CSV = Path("/content/dataset/train_original.csv")

if TRAIN_ORIGINAL_CSV.exists():
    data = pd.read_csv(TRAIN_ORIGINAL_CSV)
else:
    data = pd.read_csv(TRAIN_CSV)
    data.to_csv(TRAIN_ORIGINAL_CSV, index=False)


def augment_data(sentence, slots, intent_label, aug):
    """Return an augmented copy of one training example.

    Only the sentence is passed through ``aug.augment``; ``slots`` and
    ``intent_label`` are returned unchanged. The augmented sentence is returned
    exactly as the augmenter produced it (a string or a list of strings,
    depending on the nlpaug version).
    """
    augmented_sentence = aug.augment(sentence)
    return augmented_sentence, slots, intent_label


# "substitute" replaces words in place. The previous "insert" action added
# words while the slot labels were reused unchanged, so labels no longer lined
# up with the words they describe.
aug = naw.ContextualWordEmbsAug(model_path=MODEL_NAME_OR_PATH, action="substitute")

aug_rows = []
for sentence, slots, intent_label in zip(data['sentence'], data['slots'], data['intent_label']):
    augmented_sentence, slots, intent_label = augment_data(sentence, slots, intent_label, aug)
    augmented_sentence = ''.join(augmented_sentence)
    # Keep an augmented sentence only if it still has exactly one label per
    # whitespace-separated word; otherwise the reused labels would be wrong.
    if len(augmented_sentence.split()) != len(slots.split()):
        continue
    aug_rows.append([augmented_sentence, slots, intent_label])

aug_data = pd.DataFrame(aug_rows, columns=['sentence', 'slots', 'intent_label'])

augmented_data = pd.concat([data, aug_data], ignore_index=True)
# errors='ignore': the 'index' column is absent once the file has been rewritten.
augmented_data = augmented_data.drop(columns='index', errors='ignore')
augmented_data = augmented_data.sample(frac=1, random_state=42).reset_index(drop=True)
augmented_data.to_csv(TRAIN_CSV, index=False)

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.11M [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/473M [00:00<?, ?B/s]

# **Load datasets**

In [None]:
def _normalize_slot_labels(label_string):
    """Title-case a space-separated slot string and rename B-Cardnumber to B-Cartnumber."""
    normalized = []
    for label in label_string.title().split():
        normalized.append('B-Cartnumber' if label == 'B-Cardnumber' else label)
    return ' '.join(normalized)


# Load the training set and bring its slot labels to the spelling used by slots.txt.
df_train = pd.read_csv("/content/dataset/train.csv", sep=',')
df_train['slots'] = df_train['slots'].apply(_normalize_slot_labels)
df_train

Unnamed: 0,sentence,slots,intent_label
0,همه قبض [ZWNJ] سپرده های من اقساط را با هر تک ...,O B-Billname O O O O O O O O O O O O O O O O O,bill_payment
1,در شماره مجازی حساب 90056789 به میزان موجودی ب...,O O O B-Accountnumber O O O O O O,balance_query
2,برام یه پایا بزن,O O B-Method O,transaction_paya
3,شناسه قبض آب تون رو مستقیم از کارت ملتم پرداخت کن,B-Billname I-Billname O O O B-Bankname O O,bill_payment
4,بانک امنیت ملی من ؛ نمیدونم ، موجودیش چقدره لط...,B-Bankname I-Bankname O O O O O O O,balance_query
...,...,...,...
221,از حساب ۱۲۳۴۵۶۷ پول قبض آب رو بده,O O B-Accountnumber O B-Billname I-Billname O O,bill_payment
222,من میخوام حواله کارت پایا بزنم,O B-Method I-Method O,transaction_paya
223,توصیه من از میخوام حدود 5 800 هزار تومن حساب خ...,O O B-Amount I-Amount I-Amount O O O O B-Cartn...,transaction_card
224,جعبه موجودی آخرین حساب اینستاگرام من بهت بگو,O O O O O,balance_query


In [None]:
df_test = pd.read_csv("/content/dataset/test.csv", sep=',')

# Intent names: whitespace-separated entries of intents.txt.
intents_text = Path('/content/dataset/intents.txt').read_text('utf-8')
intent_names = intents_text.split()
num_intent_labels = len(intent_names)

# Slot names: one title-cased label per line of slots.txt, with "[PAD]" prepended
# so that the padding label comes first in the list.
slots_text = Path('/content/dataset/slots.txt').read_text('utf-8')
slot_names = ["[PAD]", *slots_text.strip().title().splitlines()]
num_slot_labels = len(slot_names)
slot_names,intent_names

(['[PAD]',
  'B-Bankname',
  'I-Bankname',
  'B-Billname',
  'I-Billname',
  'B-Cartnumber',
  'B-Accountnumber',
  'B-Username',
  'I-Username',
  'B-Iban',
  'B-Method',
  'I-Method',
  'B-Amount',
  'I-Amount',
  'B-Exprdate',
  'O'],
 ['transaction_card',
  'transaction_paya',
  'bill_payment',
  'balance_query',
  'introduce'])

# **Create intent and slot maps**

In [None]:
# Intent ids are assigned by a LabelEncoder fitted on the intent names; the
# fitted encoder is kept so predictions can be decoded with inverse_transform.
intent_le = LabelEncoder()
intent_classes_index = intent_le.fit_transform(intent_names)
intent_map = {
    name: class_id for name, class_id in zip(intent_names, intent_classes_index)
}

# Slot ids follow the order of slot_names, so "[PAD]" (the first entry) gets id 0.
slot_map = {}
for slot_label in slot_names:
    slot_map[slot_label] = len(slot_map)

slot_map, intent_map

({'[PAD]': 0,
  'B-Bankname': 1,
  'I-Bankname': 2,
  'B-Billname': 3,
  'I-Billname': 4,
  'B-Cartnumber': 5,
  'B-Accountnumber': 6,
  'B-Username': 7,
  'I-Username': 8,
  'B-Iban': 9,
  'B-Method': 10,
  'I-Method': 11,
  'B-Amount': 12,
  'I-Amount': 13,
  'B-Exprdate': 14,
  'O': 15},
 {'transaction_card': 3,
  'transaction_paya': 4,
  'bill_payment': 1,
  'balance_query': 0,
  'introduce': 2})

In [None]:
# Hold out 20% of the rows for validation. The fixed seed makes the split (and
# every number computed from it) reproducible across kernel restarts.
# NOTE(review): df_train contains augmented copies of the original sentences, so
# a sentence and its augmented variant can land on opposite sides of the split.
train, val = train_test_split(df_train, test_size=0.2, random_state=42)
train

Unnamed: 0,sentence,slots,intent_label
22,شماره موجودیمو ندید بگو کارت کارت 5022 - 2910 ...,O O O B-Cartnumber,balance_query
1,در شماره مجازی حساب 90056789 به میزان موجودی ب...,O O O B-Accountnumber O O O O O O,balance_query
162,تا چطور به 60 ملیون تومان تومن بریزم به همراه ...,O B-Amount I-Amount I-Amount O O O O B-Method ...,transaction_paya
121,دوست شخصی دارم پایا حدس بزنم,O O B-Method O,transaction_paya
202,لطفا برام حواله نامه پایا دم بزن,O O B-Method I-Method O,transaction_paya
...,...,...,...
128,میخوام یه مبلغی کارت خوان به سیم کارت انجام کد...,O O O B-Method I-Method I-Method O O,transaction_card
71,گفتم چجوری به اطلاعات شماره حساب 5321564896758...,O O O O B-Accountnumber B-Amount I-Amount I-Am...,transaction_card
57,میتونی قبض گازمو بخوان با پشتیبانی کارت [ZWNJ]...,O B-Billname I-Billname O O B-Bankname O,bill_payment
175,اسم شرکت من علی احمدی فر هست,O O B-Username I-Username O,introduce


In [None]:
# Translate the intent names of both splits into their integer class ids.
intent_train, intent_val = (
    split["intent_label"].map(intent_map).values for split in (train, val)
)
intent_train

array([0, 0, 4, 4, 4, 3, 3, 1, 0, 1, 4, 1, 4, 4, 3, 2, 1, 3, 1, 4, 1, 1,
       1, 4, 4, 1, 3, 3, 1, 0, 0, 0, 3, 0, 3, 3, 4, 1, 1, 0, 1, 3, 4, 3,
       1, 2, 3, 4, 0, 3, 3, 1, 4, 0, 0, 2, 4, 3, 1, 1, 3, 0, 3, 0, 3, 4,
       3, 0, 0, 4, 0, 1, 3, 2, 3, 0, 1, 2, 0, 1, 2, 3, 4, 1, 1, 3, 3, 3,
       0, 4, 3, 4, 1, 1, 3, 3, 1, 0, 0, 3, 0, 4, 3, 1, 4, 3, 0, 3, 1, 3,
       4, 0, 4, 1, 1, 2, 3, 0, 4, 4, 4, 4, 4, 3, 4, 3, 0, 4, 3, 4, 2, 1,
       4, 1, 4, 2, 4, 1, 4, 2, 0, 0, 0, 1, 1, 4, 4, 2, 3, 2, 3, 0, 2, 1,
       4, 3, 3, 4, 1, 4, 4, 3, 4, 3, 0, 3, 0, 0, 3, 4, 3, 1, 4, 1, 3, 3,
       3, 1, 2, 0])

# **Encode data**

In [None]:
def encode_dataset(tokenizer, text_sequences, max_length):
    """Tokenize sentences into fixed-width id and attention-mask matrices.

    Args:
        tokenizer: object exposing ``encode_plus`` that returns a mapping with
            ``input_ids`` and ``attention_mask`` entries.
        text_sequences: sized iterable of sentences.
        max_length: width every row is padded / truncated to.

    Returns:
        dict with int32 arrays ``input_ids`` and ``attention_mask``, each of
        shape ``(len(text_sequences), max_length)``.
    """
    n_rows = len(text_sequences)
    ids = np.zeros((n_rows, max_length), dtype=np.int32)
    mask = np.zeros_like(ids)
    for row, text in enumerate(text_sequences):
        enc = tokenizer.encode_plus(
            text,
            max_length=max_length,
            truncation=True,
            padding='max_length',
        )
        ids[row, :] = enc['input_ids']
        mask[row, :] = enc['attention_mask']
    return {"input_ids": ids, "attention_mask": mask}

# Encode the sentences of the training and validation splits the same way.
encoded_train, encoded_val = [
    encode_dataset(bert_tknzr, frame['sentence'], MAX_LENGTH)
    for frame in (train, val)
]

In [None]:
# Sanity check: display the padded token-id row of one encoded training sentence.
encoded_train['input_ids'][10]

array([    2,  2849,  7244, 38409,  3769, 19481,  3072,  2129,  3767,
       11020,  1923,  5988,  3305,  1932,  3348,  3791, 30047,  1114,
        4384,  3348,  5954,  1121, 12401,  1182, 38209,  1182,  9068,
       27006, 25363, 15213, 15314, 29497,  1019,  2065,     3,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0],
      dtype=int32)

# **Encode token labels**

In [None]:
def encode_token_labels(text_sequences, slot_names, tokenizer, slot_map,max_length):
    """Expand word-level slot labels to one label id per word-piece token.

    Args:
        text_sequences: sized iterable of sentences (whitespace-separated words).
        slot_names: iterable, parallel to ``text_sequences``, of
            space-separated slot-label strings (one label per word).
        tokenizer: object exposing ``tokenize(word) -> list`` of word pieces.
        slot_map: mapping from slot label to integer id; must contain ``'[PAD]'``.
        max_length: width of every output row.

    Returns:
        int32 array of shape ``(len(text_sequences), max_length)``. Column 0 is
        left as ``[PAD]`` (it lines up with the leading special token added by
        ``encode_dataset``), the word-piece labels follow, and the rest of the
        row is ``[PAD]``.

    Raises:
        KeyError: if a word label is missing from ``slot_map``.
    """
    pad_id = slot_map['[PAD]']
    # One column is reserved for the leading special token and one for the
    # trailing one, so at most max_length - 2 word-piece labels fit in a row.
    max_label_count = max(max_length - 2, 0)
    encoded = np.full((len(text_sequences), max_length), pad_id, dtype=np.int32)
    for i, (text_sequence, word_labels) in enumerate(zip(text_sequences, slot_names)):
        encoded_labels = []
        for word, word_label in zip(text_sequence.split(), word_labels.split()):
            tokens = tokenizer.tokenize(word)
            if not tokens:
                # The word contributes no token, so it must not contribute a
                # label either; otherwise every later label would be shifted.
                continue
            encoded_labels.append(slot_map[word_label])
            # Continuation pieces of a "B-" word get the matching "I-" label
            # when one exists, otherwise they repeat the word's own label.
            expand_label = word_label.replace("B-", "I-")
            if expand_label not in slot_map:
                expand_label = word_label
            encoded_labels.extend([slot_map[expand_label]] * (len(tokens) - 1))
        # Truncate like the tokenizer does; assigning an over-long label list
        # to the row slice would raise a ValueError.
        encoded_labels = encoded_labels[:max_label_count]
        encoded[i, 1:len(encoded_labels) + 1] = encoded_labels
    return encoded


# Build the per-token slot targets for the training and validation splits.
slot_train, slot_val = [
    encode_token_labels(frame["sentence"], frame["slots"], bert_tknzr, slot_map, MAX_LENGTH)
    for frame in (train, val)
]


In [None]:
# Sanity check: display the slot-id row of one training sentence.
slot_train[20]

array([ 0, 15,  3,  4, 15, 15,  1,  2, 15, 15,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0], dtype=int32)

In [None]:
# encoded_train is a dict, so len(encoded_train) is its number of keys (2), not
# the number of examples. Use the label array length so the shuffle buffer
# covers the whole training set.
train_tensor = (
    tf.data.Dataset.from_tensor_slices((encoded_train, (slot_train, intent_train)))
    .shuffle(len(slot_train))
    .batch(batch_size)
)
# Validation metrics do not depend on example order, so no shuffle is needed.
val_tensor = (
    tf.data.Dataset.from_tensor_slices((encoded_val, (slot_val, intent_val)))
    .batch(batch_size)
)

# **Define, train, and test the model**

In [None]:
from keras.api._v2.keras import activations
def JointIntentAndSlotFillingModel(num_slot_labels, num_intent_labels):
    """Build the joint intent-classification / slot-filling model on top of BERT.

    The module-level ``bert`` encoder is shared by two heads:

    * ``Slot``: a Dense layer applied to every position of the last hidden
      state, producing ``num_slot_labels`` logits per token.
    * ``Intent``: a Dense layer applied to the pooled output, producing
      ``num_intent_labels`` logits per sentence.

    Both heads return raw logits, matching the ``from_logits=True`` losses the
    model is compiled with.

    Args:
        num_slot_labels: number of slot classes (including ``[PAD]``).
        num_intent_labels: number of intent classes.

    Returns:
        A Keras ``Model`` taking ``input_ids`` and ``attention_mask`` (each of
        length ``MAX_LENGTH``) and returning ``[slot_logits, intent_logits]``.
    """
    input_ids = Input(shape=(MAX_LENGTH,), dtype=tf.int32, name="input_ids")
    attention_mask = Input(shape=(MAX_LENGTH,), dtype=tf.int32, name="attention_mask")
    inputs = {'input_ids': input_ids,'attention_mask': attention_mask}

    bert_output = bert(inputs)

    sequence_output = bert_output.last_hidden_state
    pooled_output = bert_output.pooler_output

    # No explicit `training=` argument: Keras then enables dropout during fit()
    # and disables it at inference. Forcing training=False made the layer a no-op.
    intent_dropout = Dropout(bert_cnfg.hidden_dropout_prob, name='intent_dropout')(pooled_output)
    # No softmax here: the Intent loss uses from_logits=True, so an activation
    # in the layer would apply softmax twice.
    intent_output = Dense(units=num_intent_labels, name='Intent')(intent_dropout)

    slot_dropout = Dropout(bert_cnfg.hidden_dropout_prob, name='slot_dropout')(sequence_output)
    slots_output = Dense(units=num_slot_labels, name = 'Slot')(slot_dropout)

    model = Model(inputs=[input_ids, attention_mask], outputs=[slots_output, intent_output])

    return model

joint_bert = JointIntentAndSlotFillingModel(num_slot_labels, num_intent_labels)
joint_bert.summary()


Model: "model_6"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 attention_mask (InputLayer)    [(None, 45)]         0           []                               
                                                                                                  
 input_ids (InputLayer)         [(None, 45)]         0           []                               
                                                                                                  
 tf_bert_model (TFBertModel)    TFBaseModelOutputWi  118297344   ['attention_mask[0][0]',         
                                thPoolingAndCrossAt               'input_ids[0][0]']              
                                tentions(last_hidde                                               
                                n_state=(None, 45,                                          

In [None]:
opt = Adam(learning_rate = 5e-5, epsilon=1e-8)

# One sparse cross-entropy per output head, keyed by the output layer names.
losses = {'Slot':SparseCategoricalCrossentropy(from_logits=True),
          'Intent':SparseCategoricalCrossentropy(from_logits=True)}
metrics = [SparseCategoricalAccuracy('accuracy')]

joint_bert.compile(optimizer=opt, loss=losses, metrics=metrics)

def scheduler(epoch, lr):
    """Keep the learning rate for the first two epochs, then multiply it by 0.2 each epoch.

    NOTE(review): ``lr`` is the current (already decayed) rate, so the decay
    compounds: from epoch 2 on the rate shrinks by a factor of 5 every epoch.
    Confirm this is intended rather than a single step down.
    """
    if epoch < 2:
        return lr
    else:
        return lr * 0.2

lr_callback = tf.keras.callbacks.LearningRateScheduler(scheduler)

# train_tensor / val_tensor are already batched (and shuffled) tf.data datasets,
# so `batch_size` and `shuffle` must not be passed to fit() for this input type.
history = joint_bert.fit(train_tensor,
    validation_data=val_tensor,
    epochs=epochs, callbacks=[lr_callback])


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
# Built once: id -> slot label, used to decode predictions.
reverse_slot_map = {v: k for k, v in slot_map.items()}
pad_id = slot_map['[PAD]']

results = []
for i, sntnc in enumerate(df_test["sentence"]):
    encoded_test = encode_dataset(bert_tknzr, [sntnc], MAX_LENGTH)
    outputs = joint_bert(encoded_test, training=False)

    # Slot logits of the single sentence in the batch: (MAX_LENGTH, num_slot_labels).
    slot_logits = outputs[0].numpy()[0].copy()
    # [PAD] is never a valid label for a real word, so exclude it from the argmax.
    slot_logits[:, pad_id] = -np.inf
    token_slot_ids = slot_logits.argmax(axis=-1)
    intent_id = outputs[1].numpy().argmax(axis=-1)

    # Emit exactly one label per whitespace-separated word: the prediction at
    # the word's first word piece. This mirrors encode_token_labels, where the
    # word's own label sits on its first piece. Previously one label per word
    # piece was emitted, so the slot string could be longer than the sentence.
    word_labels = []
    position = 1  # position 0 is the leading special token
    for word in sntnc.split():
        n_pieces = len(bert_tknzr.tokenize(word))
        if n_pieces and position <= MAX_LENGTH - 2:
            word_labels.append(reverse_slot_map[token_slot_ids[position]])
        else:
            # Word was truncated away (or produced no token): no prediction exists.
            word_labels.append('O')
        position += n_pieces

    slot_pred_labels = ' '.join(word_labels)
    intent_pred_labels = intent_le.inverse_transform(intent_id.ravel())

    results.append([i, slot_pred_labels , ''.join(intent_pred_labels)])

predictions = pd.DataFrame(results, columns=['index','slots','intent_label'])
predictions.to_csv('predictions.csv', index=False)