In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/customer-support-intent-dataset/Bitext_Sample_Customer_Service_Training_Dataset.csv
/kaggle/input/customer-support-intent-dataset/Bitext_Sample_Customer_Service_Validation_Dataset.csv
/kaggle/input/customer-support-intent-dataset/Bitext_Sample_Customer_Service_Testing_Dataset.csv


In [2]:
import pandas as pd
train_path="/kaggle/input/customer-support-intent-dataset/Bitext_Sample_Customer_Service_Training_Dataset.csv"
valid_path="/kaggle/input/customer-support-intent-dataset/Bitext_Sample_Customer_Service_Validation_Dataset.csv"
test_data="/kaggle/input/customer-support-intent-dataset/Bitext_Sample_Customer_Service_Testing_Dataset.csv"
df_train=pd.read_csv(train_path)
df_test=pd.read_csv(test_data)

In [13]:
import re

In [14]:
# Text Preprocessing
def preprocess_text(text):
    text = text.lower()  # Lowercase text
    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    return text

In [15]:
df_train['utterance']=df_train['utterance'].apply(preprocess_text)

In [46]:
train_texts = df_train['utterance'].tolist()
train_labels = df_train['intent'].tolist()

In [47]:
import random

In [48]:
import spacy
from spacy.tokens import DocBin
from spacy.util import minibatch, compounding

In [49]:
nlp = spacy.blank("en")

In [50]:
# Add the text classifier to the pipeline
textcat = nlp.add_pipe("textcat")

In [51]:

# Add labels to the text classifier
for label in set(train_labels):
    textcat.add_label(label)

In [52]:
from spacy.training import Example

In [53]:
# Convert the training data to spaCy's format
train_data = []
for text, label in zip(train_texts, train_labels):
    train_data.append((text,label))

In [54]:
train_data = []
for text, label in zip(train_texts, train_labels):
    doc = nlp.make_doc(text)
    example = Example.from_dict(doc, {"cats": {label: 1.0}})
    train_data.append(example)

In [55]:
print(train_data[0])

{'doc_annotation': {'cats': {'cancel_order': 1.0}, 'entities': ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], 'spans': {}, 'links': {}}, 'token_annotation': {'ORTH': ['would', 'it', 'be', 'possible', 'to', 'cancel', 'the', 'order', 'i', 'made'], 'SPACY': [True, True, True, True, True, True, True, True, True, False], 'TAG': ['', '', '', '', '', '', '', '', '', ''], 'LEMMA': ['', '', '', '', '', '', '', '', '', ''], 'POS': ['', '', '', '', '', '', '', '', '', ''], 'MORPH': ['', '', '', '', '', '', '', '', '', ''], 'HEAD': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9], 'DEP': ['', '', '', '', '', '', '', '', '', ''], 'SENT_START': [1, 0, 0, 0, 0, 0, 0, 0, 0, 0]}}


In [56]:
TRAIN_DATA = train_data

In [58]:
optimizer=nlp.initialize()

[2024-07-04 14:08:19,579] [INFO] Created vocabulary
[2024-07-04 14:08:19,581] [INFO] Finished initializing nlp object


In [60]:
n_iter = 50
batch_size = 64
for epoch in range(n_iter):
    losses = {}
    batches = minibatch(train_data, size=compounding(4., 32., 1.001))
    for batch in batches:
#         texts, annotations = zip(*batch)
        nlp.update(batch, drop=0.5, losses=losses)
    print(f"Epoch {epoch + 1}, Losses: {losses}")

Epoch 1, Losses: {'textcat': 34.74037839857976}
Epoch 2, Losses: {'textcat': 27.07610330547684}
Epoch 3, Losses: {'textcat': 18.982321805853932}
Epoch 4, Losses: {'textcat': 12.966103721329329}
Epoch 5, Losses: {'textcat': 9.336888280823862}
Epoch 6, Losses: {'textcat': 5.955783396831604}
Epoch 7, Losses: {'textcat': 4.360208956670032}
Epoch 8, Losses: {'textcat': 3.358605134563779}
Epoch 9, Losses: {'textcat': 2.9359020286799966}
Epoch 10, Losses: {'textcat': 3.079868023140445}
Epoch 11, Losses: {'textcat': 2.8616308963960675}
Epoch 12, Losses: {'textcat': 2.7651799256749685}
Epoch 13, Losses: {'textcat': 3.158621487429218}
Epoch 14, Losses: {'textcat': 3.1895270269815885}
Epoch 15, Losses: {'textcat': 3.3597233055006077}
Epoch 16, Losses: {'textcat': 2.9694055049946626}
Epoch 17, Losses: {'textcat': 2.5671316152917933}
Epoch 18, Losses: {'textcat': 2.0918553618641478}
Epoch 19, Losses: {'textcat': 1.2198981939462912}
Epoch 20, Losses: {'textcat': 0.8421309109858303}
Epoch 21, Losses:

KeyboardInterrupt: 

In [61]:
nlp.to_disk("intent_model")

In [62]:
p = spacy.load("intent_model")

In [64]:
test_texts = df_test['utterance'].apply(preprocess_text).tolist()
test_labels = df_test['intent'].tolist()

In [65]:
def predict_intent(text):
    # Process the text using the loaded model
    doc = nlp(text)
    print(doc.cats)
    # Get the highest scoring label
    pred_label = max(doc.cats, key=doc.cats.get)
    return pred_label

In [66]:
text = "I need help with my order"
predicted_intent = predict_intent(text)
print(f"Predicted intent: {predicted_intent}")

{'change_order': 0.9639458656311035, 'registration_problems': 3.583449925681492e-10, 'change_shipping_address': 7.277732549937355e-08, 'cancel_order': 0.0015451516956090927, 'newsletter_subscription': 3.0360589042821573e-10, 'check_payment_methods': 1.286542206102581e-09, 'complaint': 7.617496633471887e-10, 'track_refund': 1.4990295463257475e-11, 'get_invoice': 2.8299720900015757e-10, 'contact_customer_service': 4.960254955221899e-05, 'delete_account': 4.319828804000281e-06, 'edit_account': 1.45970293630171e-08, 'switch_account': 2.602877982593732e-11, 'track_order': 0.00015503006579820067, 'review': 1.4611874377123968e-08, 'check_invoice': 0.03373781219124794, 'delivery_options': 8.190327207557857e-06, 'get_refund': 3.7203537472940695e-10, 'payment_issue': 5.893903676223999e-07, 'contact_human_agent': 7.075805115164258e-06, 'check_cancellation_fee': 1.0515489812235046e-08, 'check_refund_policy': 1.8033649951121333e-07, 'create_account': 2.444219546759996e-07, 'place_order': 5.35642570

In [67]:
from sklearn.metrics import accuracy_score

In [69]:
# Make predictions
predictions = []
for text in test_texts:
    doc = nlp(text)
    # Get the highest scoring label
    pred_label = max(doc.cats, key=doc.cats.get)
    predictions.append(pred_label)
# print(predictions)

# Calculate accuracy
accuracy = accuracy_score(test_labels, predictions)
print(f"Test Accuracy: {accuracy}")

Test Accuracy: 0.991442542787286


In [70]:
import shutil
shutil.make_archive("intent_model", 'zip', "intent_model")

'/kaggle/working/intent_model.zip'