# Chatbot for E-Commerce Website Demo 

#### Read Train Data

In [12]:
import pandas as pd
import json

# Read the JSON data from the file
with open('train_data.json', 'r') as json_file:
    data = json.load(json_file)

# Create a list to store tag and patterns
tag_patterns_list = []

# Extract "tag" and "patterns" from each JSON object
for item in data:
    tag = item["tag"]
    patterns = item["patterns"]
    tag_patterns_list.append({"patterns": patterns[0] if patterns else "", "tag": tag})

# Create a DataFrame with "patterns" as the first column and "tag" as the second column
df = pd.DataFrame(tag_patterns_list)

# Display the DataFrame
df


Unnamed: 0,patterns,tag
0,does this product come with a warranty,product_warranty
1,can i return this item if i don't like it,product_return
2,do you offer free shipping,product_free_shipping
3,is there a discount code available,product_discount
4,is google wallet accepted,payment_google
5,can i pay with apple cash,payment_apple
6,accepted methods of payment,payment_accept
7,what are the features of this product,product_features
8,what are the return policy,return_policy
9,how much does this product cost,product_price


In [20]:
# product_availability, return_initiation

#### Read Test Data

In [21]:
# import pandas as pd

# # Initialize an empty list to store tag and patterns
# tag_patterns_list = []

# # Read data from the test_data.txt file
# with open('test_data.txt', 'r') as file:
#     for line in file:
#         line = line.strip()  # Remove leading/trailing whitespace
#         tag, patterns = line.split(',', 1)  # Split into tag and patterns
#         tag = tag.strip('"')  # Remove double quotes around tag
#         tag_patterns_list.append({"patterns": patterns.strip('"'), "tag": tag})  # Swap tag and patterns

# # Create a DataFrame for the test data with the desired order
# test_df = pd.DataFrame(tag_patterns_list)

# # Display the modified DataFrame for the test data
# test_df

In [13]:
import pandas as pd

# Initialize an empty list to store tag and patterns
tag_patterns_list = []

# Read data from the test_data.txt file
with open('test_data.txt', 'r') as file:
    for line in file:
        line = line.strip()  # Remove leading/trailing whitespace
        parts = line.split(',', 1)  # Split into tag and the rest
        tag = parts[0].strip('"')  # Remove double quotes around tag
        patterns = parts[1].strip('"') if len(parts) > 1 else ""  # Get patterns, if available
        tag_patterns_list.append({"patterns": patterns, "tag": tag})

# Create a DataFrame for the test data with the desired order
test_df = pd.DataFrame(tag_patterns_list)

# Display the modified DataFrame for the test data
test_df


Unnamed: 0,patterns,tag
0,Is there a warranty for this product?,product_warranty
1,What is the return policy if I'm not satisfied...,product_return
2,Do you provide free shipping?,product_free_shipping
3,Are there any discount codes available?,product_discount
4,Are payments via Google Wallet accepted?,payment_google
...,...,...
62,What is the estimated delivery date?,delivery_date
63,What are the features of the latest model of {...,last_model_features
64,Can you provide a list of the best-selling pro...,product_list
65,What is the price range for {product_name} pro...,product_price_range


## Establish Random Forest classifier

In [14]:
import nltk
import string
import random
from nltk.util import ngrams
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from sklearn.feature_extraction import DictVectorizer
from sklearn.ensemble import RandomForestClassifier

# Define a function to preprocess text into bigrams
def preprocess_text(text):
    # Tokenize data
    tokens = nltk.word_tokenize(text)

    # Lowercase all words
    tokens = [word.lower() for word in tokens]

    # Remove stopwords and punctuation
    stop_words = set(nltk.corpus.stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words and word not in string.punctuation]

    # Generate bigrams
    bigrams_list = list(ngrams(tokens, 1))  # '1' n-gram size

    # Lemmatize words (optional)
    lemmatizer = nltk.stem.WordNetLemmatizer()
    tokens = [' '.join(bigram) for bigram in bigrams_list]

    return tokens

# Define a function to extract POS tags from text
def extract_pos_tags(text):
    tokens = word_tokenize(text)
    pos_tags = pos_tag(tokens)
    return [tag for _, tag in pos_tags]

# Define a function to preprocess a single text string and convert it to a feature dictionary
def preprocess_text_to_features(text, n=1):
    preprocessed_text = preprocess_text(text)
    feature_dict = {' '.join(bigram): True for bigram in preprocessed_text}
    
    # Extract POS tags and add them as features
    pos_tags = extract_pos_tags(text)
    for tag in pos_tags:
        feature_dict[tag] = True
    
    return feature_dict

# Prepare the data as labeled featuresets from the 'df' DataFrame
labeled_featuresets = []
for index, row in df.iterrows():
    tag = row['tag']
    pattern = row['patterns']
    feature_dict = preprocess_text_to_features(pattern)
    labeled_featuresets.append((feature_dict, tag))

# Shuffle the labeled featuresets to ensure randomness
random.shuffle(labeled_featuresets)

# Convert feature dictionaries to a feature matrix
vectorizer = DictVectorizer()
X_train = vectorizer.fit_transform([x for x, _ in labeled_featuresets])
y_train = [y for _, y in labeled_featuresets]

random.seed(42)

# Train a Random Forest classifier
classifier = RandomForestClassifier(n_estimators = 100, random_state = 42, min_samples_split = 4 )
classifier.fit(X_train, y_train)

# Define a function to get a tag for a user's question using the Random Forest classifier
def get_tag(question, classifier, df):
    # Preprocess the question with bigrams and convert it to a feature dictionary
    feature_dict = preprocess_text_to_features(question)
    
    # Extract POS tags and add them as features
    pos_tags = extract_pos_tags(question)
    for tag in pos_tags:
        feature_dict[tag] = True
    
    # Convert the feature dictionary to a feature vector
    feature_vector = vectorizer.transform([feature_dict])
    
    # Use the classifier to predict the tag
    predicted_tag = classifier.predict(feature_vector)[0]
    
    return predicted_tag

# Example usage
x = "Tell me about your products"
predicted_tag = get_tag(x, classifier, df)
print("Predicted Tag:", predicted_tag)


Predicted Tag: product_list


## Finding Accuracy

In [24]:
# Prepare the test data as labeled featuresets from the 'test_df' DataFrame
test_labeled_featuresets = []
for index, row in test_df.iterrows():
    tag = row['tag']
    pattern = row['patterns']
    feature_dict = preprocess_text_to_features(pattern)
    test_labeled_featuresets.append((feature_dict, tag))

# Convert feature dictionaries to a feature matrix for the test data
X_test = vectorizer.transform([x for x, _ in test_labeled_featuresets])
y_test = [y for _, y in test_labeled_featuresets]

# Use the trained classifier to make predictions on the test data
y_pred = classifier.predict(X_test)

# Calculate accuracy
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy*100} %")


Accuracy: 83.5820895522388 %


## Grid search on hypyerparameters 


In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import LeaveOneOut


# Define the hyperparameter grid to search over
param_grid = {
    'n_estimators': [100, 200, 300],  # Number of trees in the forest
    'max_depth': [None, 10, 20, 30],  # Maximum depth of the trees
    'min_samples_split': [2,3,4,5,10],  # Minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2, 4],  # Minimum number of samples required to be at a leaf node
}

# Create a Random Forest classifier
rf_classifier = RandomForestClassifier(random_state=42)

# Create LOOCV object
loo_cv = LeaveOneOut()

# Perform grid search with LOOCV
grid_search = GridSearchCV(estimator=rf_classifier, param_grid=param_grid, cv=loo_cv, scoring='accuracy')

# Perform the grid search on your training data
grid_search.fit(X_train, y_train)

# Print the best hyperparameters found by the grid search
print("Best Hyperparameters:", grid_search.best_params_)


Best Hyperparameters: 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 4, 'n_estimators': 100

## Custom NER model

In [6]:
import json
import csv

# Initialize the training data list
training_data_for_ner = []

# Replace 'your_input.csv' with your CSV file path
with open('sheet.csv', 'r', newline='', encoding='utf-8') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        text = row['text']
        entity_text = row['entity_text']
        start = int(row['start'])
        end = int(row['end'])

        # Create a training example dictionary
        example = (
            text,
            {
                "entities": [
                    (start, end, "PRODUCT")
                ]
            }
        )

        # Add the example to the training data list
        training_data_for_ner.append(example)

# Save the training data as a JSON file
with open('training_data.json', 'w', encoding='utf-8') as jsonfile:
    json.dump(training_data_for_ner, jsonfile, ensure_ascii=False)

print(training_data_for_ner)


[('I just got the iPhone 13 Pro Max', {'entities': [(15, 32, 'PRODUCT')]}), ('Looking for a new phone? Check out the Samsung Galaxy S21', {'entities': [(39, 57, 'PRODUCT')]}), ('The Google Pixel 6 Pro camera is impressive', {'entities': [(4, 22, 'PRODUCT')]}), ("I'm loving my OnePlus 9 Pro", {'entities': [(14, 27, 'PRODUCT')]}), ('The iPhone SE (2020) is a budget-friendly option', {'entities': [(4, 20, 'PRODUCT')]}), ("I'm considering buying the Xiaomi Mi 11", {'entities': [(27, 39, 'PRODUCT')]}), ('The Huawei P40 Pro has a great camera', {'entities': [(4, 18, 'PRODUCT')]}), ('LG V60 ThinQ is a solid choice for audio enthusiasts', {'entities': [(0, 12, 'PRODUCT')]}), ('Sony Xperia 1 III is known for its display', {'entities': [(0, 17, 'PRODUCT')]}), ('OnePlus Nord is a mid-range smartphone', {'entities': [(0, 12, 'PRODUCT')]}), ('I upgraded to the iPhone 12 recently', {'entities': [(18, 27, 'PRODUCT')]}), ('Samsung Galaxy A52 offers good value', {'entities': [(0, 18, 'PRODUCT')]}), ('C

In [31]:
# # Step 1: Collect a Dataset (Simplified)
# # In practice, you'd need a more extensive dataset with annotated product names.

# # training_data is above

# # Step 2: Prepare the Data
# # In practice, you'd typically load the data from files and preprocess it.

# # Step 3: Train a Custom NER Model
# import spacy
# import random
# from spacy.training.example import Example

# # Create a blank spaCy NER model
# nlp = spacy.blank("en")

# # Add the "ner" pipeline to the model
# ner = nlp.add_pipe("ner")

# # Add the "PRODUCT" entity label
# ner.add_label("PRODUCT")

# # Start training
# nlp.begin_training()

# # Train the model for a few iterations (epochs)
# for _ in range(20):
#     random.shuffle(training_data_for_ner)
#     for text, annotations in training_data_for_ner:
#         example = Example.from_dict(nlp.make_doc(text), annotations)
#         nlp.update([example])

# # Step 4: Evaluate and Fine-Tune
# # In practice, you would evaluate the model on a validation dataset and fine-tune as needed.



In [32]:
# # Step 5: Use the Custom NER Model
# def extract_product_name(user_input):
#     doc = nlp(user_input)
#     product_names = [ent.text for ent in doc.ents if ent.label_ == "PRODUCT"]
#     return product_names

# # Example of using the custom NER model
# user_input = "I have an Maybelline Fit Me Matte + Poreless Foundation and a Becca Shimmering Skin Perfector Highlighter"
# product_names = extract_product_name(user_input)
# print(product_names)  # Output: ['iPhone 12', 'MacBook Pro']

In [7]:
import spacy
import random
from spacy.training.example import Example

# Create a blank spaCy NER model
nlp = spacy.blank("en")

# Add the "ner" pipeline to the model
ner = nlp.add_pipe("ner")

# Add the "PRODUCT" entity label
ner.add_label("PRODUCT")

# Start training
nlp.begin_training()

# Training data

# Prepare the training examples
train_examples = []
for text, annotations in training_data_for_ner:
    doc = nlp.make_doc(text)
    example = Example.from_dict(doc, annotations)
    train_examples.append(example)

# Train the model
n_iter = 20
random.seed(1)  # For reproducibility
random.shuffle(train_examples)

losses = {}
for epoch in range(n_iter):
    random.shuffle(train_examples)
    for batch in spacy.util.minibatch(train_examples, size=2):
        nlp.update(batch, drop=0.5, losses=losses)
    print(f"Iteration {epoch+1} - Loss: {losses['ner']}")

# Save the trained NER model to a directory
output_dir = "path_to_output_directory"
nlp.to_disk(output_dir)
print(f"Model saved to {output_dir}")


Iteration 1 - Loss: 182.93721702575567
Iteration 2 - Loss: 283.48804834185285
Iteration 3 - Loss: 415.824875706031
Iteration 4 - Loss: 528.5115520656591
Iteration 5 - Loss: 600.4344324601639
Iteration 6 - Loss: 661.1841635301096
Iteration 7 - Loss: 706.9888591540044
Iteration 8 - Loss: 731.7058288913004
Iteration 9 - Loss: 760.4707653205899
Iteration 10 - Loss: 775.3030508604237
Iteration 11 - Loss: 791.7583120189842
Iteration 12 - Loss: 811.7347580742729
Iteration 13 - Loss: 829.3657267333538
Iteration 14 - Loss: 839.9549205546119
Iteration 15 - Loss: 840.2018627353995
Iteration 16 - Loss: 852.4698026230582
Iteration 17 - Loss: 854.1250807263386
Iteration 18 - Loss: 862.3571826617377
Iteration 19 - Loss: 865.6489310525133
Iteration 20 - Loss: 866.0266633193806
Model saved to path_to_output_directory


In [35]:
import spacy

# Replace 'path_to_output_directory' with the actual path to your saved model
model_path = "path_to_output_directory"
# model_path = "custom_ner_model"

# Load the trained NER model
nlp = spacy.load(model_path)

# Now you can use the loaded NER model to perform entity recognition on new text
text = "What are the available colors of iPhone 13"
doc = nlp(text)

# Access the recognized entities
for ent in doc.ents:
    print(f"Entity: {ent.text}, Label: {ent.label_}")


Entity: iPhone 13, Label: PRODUCT


In [37]:
def extract_product_name(user_input):
    doc = nlp(user_input)
    product_names = [ent.text for ent in doc.ents if ent.label_ == "PRODUCT"]
    return product_names

print(extract_product_name("What are the available colors of iPhone 11"))

['iPhone 11']


## Example of the next working with Data base

In [45]:
import spacy
import json

# Load the spaCy model with NER
# nlp = spacy.load("en_core_web_sm")

# Sample product information dictionary
product_info = {
    "iPhone 11": {"product_colors": "Red, Green, Yellow", "product_warranty": "3 months"},
    "Samsung Galaxy S21": {"product_colors": "Pink, White, Gray", "product_warranty": "6 months"},
    "Google Pixel 5": {"product_colors": "Black, White, Green", "product_warranty": "1 year"}
}

# Function to retrieve product information from the dictionary
def get_product_info(product_name, predicted_tag):
    if product_name in product_info and predicted_tag in product_info[product_name]:
        return {predicted_tag: product_info[product_name][predicted_tag]}
    return {}

# Function to read training data from a JSON file
def read_training_data(file_path):
    with open(file_path, 'r') as file:
        training_data = json.load(file)
    return training_data

# Read training data from the JSON file
training_data = read_training_data('train_data.json')

# Function to get the appropriate response based on the predicted_tag
def get_response(predicted_tag, product_name=None):
    for data in training_data:
        if data["tag"] == predicted_tag:
            # if product_name:
            response = data["responses"][0].format(product_name=product_name, **get_product_info(product_name, predicted_tag))
            # else:
            #     response = data["responses"][0]
            return response
    return "Sorry, I don't have a response for that."

# Initialize a context dictionary to store user context
user_context = {}

# Function to handle user queries and save context
def handle_user_query(user_input, predicted_tag):
    global user_context

    # Extract product names from user input
    product_names = extract_product_name(user_input)
    print(f"product_names :{product_names}")

    if not product_names:
        # Try to extract product names from the last query
        last_query = user_context.get("last_query", "")
        product_names = extract_product_name(last_query)
    # Initialize a dictionary to store product information
    product_info_dict = {}

    if predicted_tag == "product_colors" or predicted_tag == "product_warranty":
        # Retrieve product information from the dictionary for each product name
        for product_name in product_names:
            product_info = get_product_info(product_name, predicted_tag)
            if product_info:
                product_info_dict[product_name] = product_info

        # Generate responses based on the retrieved product information
        responses = []
        for product_name, product_info in product_info_dict.items():
            response = get_response(predicted_tag, product_name)
            responses.append(response)

        if responses:
            final_response = "\n".join(responses)
        else:
            final_response = f"Sorry, I couldn't find information about {predicted_tag.replace('_', ' ')} for the specified products."

        # Save context for future queries
        user_context["last_query"] = user_input
        user_context["last_response"] = final_response

        # Print the response
        print("User Input:", user_input)
        print("Response:", final_response)
    else:
        print("Invalid predicted tag:", predicted_tag)

# User input
user_input = "What are the available colors of iPhone 11"
predicted_tag = get_tag(user_input, classifier, df)
handle_user_query(user_input, predicted_tag)
print()
user_input = "Is there any warranty for the product?"
predicted_tag = get_tag(user_input, classifier, df)
handle_user_query(user_input, predicted_tag)


product_names :['iPhone 11']
User Input: What are the available colors of iPhone 11
Response: This product is available in the following colors: Red, Green, Yellow.

product_names :[]
User Input: Is there any warranty for the product?
Response: Yes, this product comes with a 3 months warranty.
