<a href="https://colab.research.google.com/github/Karan-j22/Chatbot-ML/blob/main/Chatbot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
# Import necessary libraries
import pandas as pd  # For data manipulation
import numpy as np   # For numerical operations
from sklearn import preprocessing  # For preprocessing tasks
from sklearn.model_selection import train_test_split  # To split the dataset into training and testing
from sklearn.feature_extraction.text import CountVectorizer  # To convert text data into bag-of-words count features
from sklearn.feature_extraction.text import TfidfVectorizer  # To convert text data into TF-IDF features
from sklearn.pipeline import Pipeline  # To create a pipeline for combining multiple steps
from sklearn.metrics import classification_report  # To evaluate model performance
from sklearn.linear_model import LogisticRegression  # To use logistic regression for classification

# Import NLTK for text processing tasks
import nltk
# Make sure the 'punkt' tokenizer resource is available locally.
# nltk.data.find raises LookupError when a resource is missing, so only that
# case triggers a download; anything else (e.g. KeyboardInterrupt or a genuine
# bug) now propagates instead of being swallowed by a bare `except:`.
try:
    nltk.data.find('tokenizers/punkt')  # Look the resource up on the NLTK data path
except LookupError:
    nltk.download('punkt')  # Fetch 'punkt' only when the lookup reports it absent


In [6]:
# Read the customer-service training CSV into a DataFrame
df = pd.read_csv(filepath_or_buffer="/Bitext_Sample_Customer_Service_Training_Dataset.csv")

# Preview the leading rows (n=5 is also the default for head) to inspect the columns
df.head(n=5)

Unnamed: 0,flags,utterance,category,intent
0,BM,I have problems with canceling an order,ORDER,cancel_order
1,BIM,how can I find information about canceling ord...,ORDER,cancel_order
2,B,I need help with canceling the last order,ORDER,cancel_order
3,BIP,could you help me cancelling the last order I ...,ORDER,cancel_order
4,B,problem with cancelling an order I made,ORDER,cancel_order


In [8]:
# Count the distinct intent labels present in the dataset (missing values are not counted)
df["intent"].nunique()

27

In [9]:
# Turn each raw intent string into an integer class id; the intent text is
# passed to the encoder exactly as it appears in the CSV (no cleaning step).
label_intent = preprocessing.LabelEncoder()
df["label_num"] = label_intent.fit_transform(df["intent"])  # Learn the classes and encode in one pass

# Preview the frame again, now carrying the numeric 'label_num' column
df.head(n=5)

Unnamed: 0,flags,utterance,category,intent,label_num
0,BM,I have problems with canceling an order,ORDER,cancel_order,0
1,BIM,how can I find information about canceling ord...,ORDER,cancel_order,0
2,B,I need help with canceling the last order,ORDER,cancel_order,0
3,BIP,could you help me cancelling the last order I ...,ORDER,cancel_order,0
4,B,problem with cancelling an order I made,ORDER,cancel_order,0


In [10]:
# Display the fitted encoder's attribute dictionary; after fitting it holds
# 'classes_', whose array position is the integer id assigned to each intent.
vars(label_intent)

{'classes_': array(['cancel_order', 'change_order', 'change_shipping_address',
        'check_cancellation_fee', 'check_invoice', 'check_payment_methods',
        'check_refund_policy', 'complaint', 'contact_customer_service',
        'contact_human_agent', 'create_account', 'delete_account',
        'delivery_options', 'delivery_period', 'edit_account',
        'get_invoice', 'get_refund', 'newsletter_subscription',
        'payment_issue', 'place_order', 'recover_password',
        'registration_problems', 'review', 'set_up_shipping_address',
        'switch_account', 'track_order', 'track_refund'], dtype=object)}

In [12]:
# Hold out 20% of the utterances for evaluation. Stratifying on the encoded
# intent keeps the class proportions the same in both partitions, and the
# fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df["utterance"],            # Features: raw user utterances
    df["label_num"],            # Targets: integer-encoded intents
    stratify=df["label_num"],
    random_state=2022,
    test_size=0.2,
)

# Report how many utterances landed in each partition
print("Shape of X_train: ", X_train.shape)
print("Shape of X_test: ", X_test.shape)

# TF-IDF features feeding a logistic-regression classifier.
# C=1.0, the L2 penalty and max_iter=100 are spelled out explicitly; they match
# scikit-learn's documented defaults, so nothing is tuned here.
clf_tfid = Pipeline(
    steps=[
        ('vectorizer_tfidf', TfidfVectorizer()),
        ('LogisticRegress', LogisticRegression(penalty='l2', C=1.0, max_iter=100)),
    ]
)

# Fit on the training partition, then label the held-out utterances
# (fit returns the fitted pipeline, so the two calls can be chained)
y_pred = clf_tfid.fit(X_train, y_train).predict(X_test)

# Per-class precision / recall / F1 on the held-out set
print(classification_report(y_true=y_test, y_pred=y_pred))


Shape of X_train:  (6540,)
Shape of X_test:  (1635,)
              precision    recall  f1-score   support

           0       0.95      0.98      0.97        61
           1       1.00      0.95      0.97        61
           2       1.00      1.00      1.00        59
           3       0.98      1.00      0.99        60
           4       1.00      1.00      1.00        65
           5       1.00      0.98      0.99        59
           6       0.98      0.98      0.98        60
           7       1.00      1.00      1.00        60
           8       0.98      1.00      0.99        60
           9       1.00      1.00      1.00        59
          10       0.97      0.93      0.95        60
          11       0.98      1.00      0.99        60
          12       0.98      1.00      0.99        60
          13       1.00      1.00      1.00        60
          14       1.00      0.97      0.98        59
          15       1.00      1.00      1.00        65
          16       1.00     

In [13]:
# Bag-of-words baseline: raw token counts (CountVectorizer) feeding a
# logistic-regression classifier. CountVectorizer is imported together with
# the other scikit-learn imports in the notebook's import cell, so every
# dependency is declared in one place.
# C=1.0, the L2 penalty and max_iter=100 are written out explicitly; they match
# scikit-learn's documented defaults, so nothing is tuned here.
clf_bow = Pipeline([
    ('vectorizer_bow', CountVectorizer()),  # Convert utterances into token-count vectors
    ('LogisticRegress', LogisticRegression(C=1.0, penalty='l2', max_iter=100))  # L2-regularised logistic regression
])

# Train the model using the training data
clf_bow.fit(X_train, y_train)

# Predict the labels for the test data
# NOTE: this rebinds y_pred, replacing any predictions previously stored under that name
y_pred = clf_bow.predict(X_test)

# Print the classification report to evaluate the model performance
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.95      0.98      0.97        61
           1       1.00      0.93      0.97        61
           2       1.00      1.00      1.00        59
           3       0.97      1.00      0.98        60
           4       1.00      1.00      1.00        65
           5       1.00      1.00      1.00        59
           6       1.00      0.98      0.99        60
           7       1.00      1.00      1.00        60
           8       0.98      1.00      0.99        60
           9       1.00      1.00      1.00        59
          10       0.97      0.95      0.96        60
          11       1.00      1.00      1.00        60
          12       1.00      1.00      1.00        60
          13       1.00      1.00      1.00        60
          14       1.00      1.00      1.00        59
          15       1.00      1.00      1.00        65
          16       1.00      0.98      0.99        59
          17       1.00    