In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/customer-support-ticket-dataset/customer_support_tickets.csv


### **Library Install**

In [2]:
!pip install openai faiss-cpu sentence-transformers unstructured 

Collecting openai
  Downloading openai-1.16.2-py3-none-any.whl.metadata (21 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.6 kB)
Collecting sentence-transformers
  Downloading sentence_transformers-2.6.1-py3-none-any.whl.metadata (11 kB)
Collecting unstructured
  Downloading unstructured-0.13.2-py3-none-any.whl.metadata (30 kB)
Collecting chardet (from unstructured)
  Downloading chardet-5.2.0-py3-none-any.whl.metadata (3.4 kB)
Collecting filetype (from unstructured)
  Downloading filetype-1.2.0-py2.py3-none-any.whl.metadata (6.5 kB)
Collecting python-magic (from unstructured)
  Downloading python_magic-0.4.27-py2.py3-none-any.whl.metadata (5.8 kB)
Collecting python-iso639 (from unstructured)
  Downloading python_iso639-2024.2.7-py3-none-any.whl.metadata (13 kB)
Collecting langdetect (from unstructured)
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━

### **Data Loading and Pre-Processing**

In [3]:
df = pd.read_csv('/kaggle/input/customer-support-ticket-dataset/customer_support_tickets.csv')
print(df.columns)
# df0 = df[['issueNum','releaseName','solution']]
df.head(3)

Index(['Ticket ID', 'Customer Name', 'Customer Email', 'Customer Age',
       'Customer Gender', 'Product Purchased', 'Date of Purchase',
       'Ticket Type', 'Ticket Subject', 'Ticket Description', 'Ticket Status',
       'Resolution', 'Ticket Priority', 'Ticket Channel',
       'First Response Time', 'Time to Resolution',
       'Customer Satisfaction Rating'],
      dtype='object')


Unnamed: 0,Ticket ID,Customer Name,Customer Email,Customer Age,Customer Gender,Product Purchased,Date of Purchase,Ticket Type,Ticket Subject,Ticket Description,Ticket Status,Resolution,Ticket Priority,Ticket Channel,First Response Time,Time to Resolution,Customer Satisfaction Rating
0,1,Marisa Obrien,carrollallison@example.com,32,Other,GoPro Hero,2021-03-22,Technical issue,Product setup,I'm having an issue with the {product_purchase...,Pending Customer Response,,Critical,Social media,2023-06-01 12:15:36,,
1,2,Jessica Rios,clarkeashley@example.com,42,Female,LG Smart TV,2021-05-22,Technical issue,Peripheral compatibility,I'm having an issue with the {product_purchase...,Pending Customer Response,,Critical,Chat,2023-06-01 16:45:38,,
2,3,Christopher Robbins,gonzalestracy@example.com,48,Other,Dell XPS,2020-07-14,Technical issue,Network problem,I'm facing a problem with my {product_purchase...,Closed,Case maybe show recently my computer follow.,Low,Social media,2023-06-01 11:14:38,2023-06-01 18:05:38,3.0


In [4]:
def concatenate_text(x):
    # pattern = r'\[(\d+)-(\d+)\)'
    full_text = [
        f"Ticket Subject {x['Ticket Subject']}",
        f"\nCustomer Name {x['Customer Name']}",
        f"\nDescription {x['Ticket Description']}",
        f"\nSolution is {x['Resolution']}"
    ]
    return ' '.join(full_text)

In [5]:
df0 = df.copy()
df0['Complete description'] = df0.apply(lambda x: concatenate_text(x), axis=1)

In [6]:
df0['Complete description'][0]

"Ticket Subject Product setup \nCustomer Name Marisa Obrien \nDescription I'm having an issue with the {product_purchased}. Please assist.\n\nYour billing zip code is: 71701.\n\nWe appreciate that you have requested a website address.\n\nPlease double check your email address. I've tried troubleshooting steps mentioned in the user manual, but the issue persists. \nSolution is nan"

In [7]:
df0.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8469 entries, 0 to 8468
Data columns (total 18 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Ticket ID                     8469 non-null   int64  
 1   Customer Name                 8469 non-null   object 
 2   Customer Email                8469 non-null   object 
 3   Customer Age                  8469 non-null   int64  
 4   Customer Gender               8469 non-null   object 
 5   Product Purchased             8469 non-null   object 
 6   Date of Purchase              8469 non-null   object 
 7   Ticket Type                   8469 non-null   object 
 8   Ticket Subject                8469 non-null   object 
 9   Ticket Description            8469 non-null   object 
 10  Ticket Status                 8469 non-null   object 
 11  Resolution                    2769 non-null   object 
 12  Ticket Priority               8469 non-null   object 
 13  Tic

In [8]:
# null_mask = df.isnull().any(axis=1)
# null_rows = df[null_mask]

# print(null_rows)

In [9]:
# not_null_mask = df.notnull().all(axis=1)
# not_null_rows = df[not_null_mask]

# print(not_null_rows)

In [10]:
df0 = df0.loc[df['Resolution'].notnull()]
df0.shape

(2769, 18)

### **Tf-IDF Embeddings**

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

docs = df0['Complete description'].tolist()

tfidf_vectorizer = TfidfVectorizer(stop_words='english')

# Fit the vectorizer to the data and transform the documents into TF-IDF embeddings
tfidf_embeddings = tfidf_vectorizer.fit_transform(docs)

In [12]:
# max_features = 1000
# n_component = 10
# svd = TruncatedSVD()

In [13]:
print("Feature Names n",tfidf_vectorizer.get_feature_names_out())

Feature Names n ['00' '000' '00015735595957' ... 'สสท' '家沙' '超地理伝獣']


#### **Sparse Matrix size and embeddings**

In [14]:
print("Sparse Matrix n",tfidf_embeddings.shape,"n",tfidf_embeddings.toarray())

Sparse Matrix n (2769, 5092) n [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [15]:
def similarity_search(query_text):

    query_embeddings = tfidf_vectorizer.fit_transform([query_text])

    # Pad the query embedding with zeros to match the dimensionality of document embeddings
    padding_size = tfidf_embeddings.shape[1] - query_embeddings.shape[1]
    padded_query_embedding = np.pad(query_embeddings.toarray(), ((0, 0), (0, padding_size)), mode='constant')

    # Calculate cosine similarity between documents
    cosine_similarities = cosine_similarity(padded_query_embedding, tfidf_embeddings)
    top_similar_docs_indices = cosine_similarities.argsort()[0][-4:-1][::-1]

    for index in top_similar_docs_indices:
        print(docs[index])

In [16]:
df0.columns

Index(['Ticket ID', 'Customer Name', 'Customer Email', 'Customer Age',
       'Customer Gender', 'Product Purchased', 'Date of Purchase',
       'Ticket Type', 'Ticket Subject', 'Ticket Description', 'Ticket Status',
       'Resolution', 'Ticket Priority', 'Ticket Channel',
       'First Response Time', 'Time to Resolution',
       'Customer Satisfaction Rating', 'Complete description'],
      dtype='object')

In [17]:
df0['Product Purchased'].iloc[456]

'Autodesk AutoCAD'

In [18]:
df0['Ticket Description'].iloc[456]

"I'm having an issue with the {product_purchased}. Please assist.\n\nQ: Can I buy more than one copy of the item?\n\nA: All sales are limited to 1 (1) copy. (Except for I need assistance as soon as possible because it's affecting my work and productivity."

In [19]:
similarity_search("I'm having an issue with the {product_purchased}. Please assist.\n\nQ: Can I buy more than one copy of the item?\n\nA: All sales are limited to 1 (1) copy. (Except for I need assistance as soon as possible because it's affecting my work and productivity.")

Ticket Subject Peripheral compatibility 
Customer Name Jonathan Morris 
Description I'm having an issue with the {product_purchased}. Please assist.

[13:01:01] <gordon> that is a bug in the package <0>[10:00:02] <coble I've noticed that the issue occurs consistently when I use a specific feature or application on my {product_purchased}. 
Solution is Poor charge also quality month.
Ticket Subject Installation support 
Customer Name Luke Vega 
Description I'm having an issue with the {product_purchased}. Please assist.

$50,000 - $75,000

$80,000 - $100,000

$500,000 - $10,000 I've followed the troubleshooting steps mentioned in the user manual, but the issue persists. 
Solution is Call space water live than strong sort month.
Ticket Subject Peripheral compatibility 
Customer Name James Woods 
Description I'm having an issue with the {product_purchased}. Please assist.

0.00015735595957

0.00015735595957

0.00015735595957

0 I've recently updated the firmware of my {product_purchased}, 

### **Word2Vec Embeddings**

In [20]:
import re
import nltk
import spacy
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
from sklearn.metrics.pairwise import cosine_similarity

nltk.download('punkt')

#loading the english language small model of spacy
en = spacy.load('en_core_web_sm')
stopwords = en.Defaults.stop_words

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [21]:
df0["modified_text"] = df0['Complete description'].apply(lambda x: " ".join(word for word in x.split() if word not in stopwords ))

In [22]:
print(df0["Complete description"].iloc[1])
print('-------------------------------------------------------------------------------------------------')
print(df0["modified_text"].iloc[1])

Ticket Subject Account access 
Customer Name Christina Dillon 
Description I'm having an issue with the {product_purchased}. Please assist.

If you have a problem you're interested in and I'd love to see this happen, please check out the Feedback. I've already contacted customer support multiple times, but the issue remains unresolved. 
Solution is Try capital clearly never color toward story.
-------------------------------------------------------------------------------------------------
Ticket Subject Account access Customer Name Christina Dillon Description I'm having issue {product_purchased}. Please assist. If problem you're interested I'd love happen, check Feedback. I've contacted customer support multiple times, issue remains unresolved. Solution Try capital clearly color story.


In [23]:
#Preprocessing the text
def clean_text(text):
    # lower-case all characters
    text=text.lower()
    #removing emails
    text = re.sub(r'([a-z0-9+._-]+@[a-z0-9+._-]+\.[a-z0-9+_-]+)',"", text)
    #removing hashtag and @ words
    text= re.sub(r'@\S+', '',text)
    text= re.sub(r'#\S+', '',text)
    # remove urls
    text = re.sub(r'(http|https|ftp|ssh)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?', '' , text)
    text= re.sub(r'www.\S+', '',text)
    # regex only keeps characters
    # text=re.sub(r"[^a-zA-Z+']", ' ',text)
    #remove 's
    text=re.sub(r"['’]s\b",' ',text)
    # regex removes repeated spaces, strip removes leading and trailing spaces
    text=re.sub("\s[\s]+", " ",text).strip()

    return text

In [24]:
def clean(text):
    text = text.replace('[',"").replace("]","").replace("_"," ")
    text = re.sub(r'([a-z0-9+._-]+@[a-z0-9+._-]+\.[a-z0-9+_-]+)',"", text)
    text=re.sub("\s[\s]+", " ",text).strip()
    return text

In [25]:
df0["modified_text"] = df0.modified_text.apply(lambda x: clean(x))

In [26]:
# Tokenize the text data
documents = df0['Complete description'].tolist()
tokenized_documents = [word_tokenize(doc.lower()) for doc in documents]

In [27]:
# Train Word2Vec model
word2vec_model = Word2Vec(sentences=tokenized_documents, vector_size=100, window=5, min_count=1, workers=4)

In [28]:
# Search query
search_query = "I'm having an issue with the {product_purchased}. Please assist.\n\nQ: Can I buy more than one copy of the item?\n\nA: All sales are limited to 1 (1) copy. (Except for I need assistance as soon as possible because it's affecting my work and productivity."

In [29]:
# Tokenize the search query
tokenized_query = word_tokenize(search_query.lower())

# Perform similarity search
similar_words = word2vec_model.wv.most_similar(positive=tokenized_query, topn=3)

In [30]:
# Print the most similar words
print("Top 3 most similar words to the search query:")
for word, similarity in similar_words:
    print(word, "-", similarity)

Top 3 most similar words to the search query:
p.s - 0.9077248573303223
remove - 0.8984506726264954
1. - 0.8900465965270996


In [31]:
tokenized_documents[:1]

[['ticket',
  'subject',
  'network',
  'problem',
  'customer',
  'name',
  'christopher',
  'robbins',
  'description',
  'i',
  "'m",
  'facing',
  'a',
  'problem',
  'with',
  'my',
  '{',
  'product_purchased',
  '}',
  '.',
  'the',
  '{',
  'product_purchased',
  '}',
  'is',
  'not',
  'turning',
  'on',
  '.',
  'it',
  'was',
  'working',
  'fine',
  'until',
  'yesterday',
  ',',
  'but',
  'now',
  'it',
  'does',
  "n't",
  'respond',
  '.',
  '1.8.3',
  'i',
  'really',
  'i',
  "'m",
  'using',
  'the',
  'original',
  'charger',
  'that',
  'came',
  'with',
  'my',
  '{',
  'product_purchased',
  '}',
  ',',
  'but',
  'it',
  "'s",
  'not',
  'charging',
  'properly',
  '.',
  'solution',
  'is',
  'case',
  'maybe',
  'show',
  'recently',
  'my',
  'computer',
  'follow',
  '.']]

In [32]:
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
from sklearn.metrics.pairwise import cosine_similarity

# Sample text data
documents = df0['Complete description'].tolist()

# Train Word2Vec model
word2vec_model = Word2Vec(sentences=[word_tokenize(doc.lower()) for doc in documents], vector_size=100, window=5, min_count=1, workers=4)

# Search query
search_query = "I'm having an issue with the {product_purchased}. Please assist.\n\nQ: Can I buy more than one copy of the item?\n\nA: All sales are limited to 1 (1) copy. (Except for I need assistance as soon as possible because it's affecting my work and productivity."
# Tokenize the search query
tokenized_query = word_tokenize(search_query.lower())

# Compute average Word2Vec embedding for the search query
query_embedding = [word2vec_model.wv[word] for word in tokenized_query if word in word2vec_model.wv]
if query_embedding:
    query_embedding = sum(query_embedding) / len(query_embedding)
else:
    # Handle out-of-vocabulary words by skipping the query
    print("Search query contains out-of-vocabulary words.")
    exit()

# Compute cosine similarity between the query embedding and document embeddings
similarities = []
for doc in documents:
    tokenized_doc = word_tokenize(doc.lower())
    doc_embedding = [word2vec_model.wv[word] for word in tokenized_doc if word in word2vec_model.wv]
    if doc_embedding:
        doc_embedding = sum(doc_embedding) / len(doc_embedding)
        similarity = cosine_similarity([query_embedding], [doc_embedding])[0][0]
        similarities.append((doc, similarity))

# Sort documents by similarity score and return top N similar documents
top_similar_documents = sorted(similarities, key=lambda x: x[1], reverse=True)[:3]

# Print the top similar documents
print("Top 3 similar documents to the search query:")
for doc, similarity in top_similar_documents:
    print(doc)

Top 3 similar documents to the search query:
Ticket Subject Data loss 
Customer Name Riley Reed 
Description I'm having an issue with the {product_purchased}. Please assist. Thank you."

In response to a question about its price, the company said: "We make what we sell for the satisfaction of consumers, regardless of their I need assistance as soon as possible because it's affecting my work and productivity. 
Solution is Majority not successful understand.
Ticket Subject Product setup 
Customer Name Terri Johnson 
Description I'm having an issue with the {product_purchased}. Please assist. [B]Please contact the seller when available and confirm purchase will begin. If we could have added a different price to your cart, we'd be more likely to I need assistance as soon as possible because it's affecting my work and productivity. 
Solution is Collection commercial rise weight.
Ticket Subject Hardware issue 
Customer Name Justin Knight 
Description I'm having an issue with the {product_pur