# **NLP Analysis of Customer Support Chat Data**

In [1]:
# Import necessary libraries
import spacy  # For NLP tasks like tokenization, POS tagging
import nltk  # For NLP tasks like stemming, tokenization

In [2]:
from google.colab import drive  # For accessing files in Google Drive

# Mount Google Drive to access files stored in it
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Read the contents of the file 'Chat Log Dataset.txt' into a list of lines
with open("/content/drive/MyDrive/Chat Log Dataset.txt") as f:
    text = f.readlines()
text

['customer_id      chat_date                   message_text \n',
 '101              2024-01-10  09:00:00        "I\'m having trouble logging into my account, please help me." \n',
 '102              2024-02-04  09:15:00        "How do I change my password? I forgot the old one." \n',
 '103              2024-04-10  09:30:00        "Can I return a product that I bought last week? It\'s defective." \n',
 '104              2024-05-20  09:45:00        "When will my order be shipped? I haven\'t received any updates." \n',
 '105              2024-06-08  10:00:00        "I need to update my shipping address for my recent order." \n',
 '106              2024-06-20  10:15:00        "Is it possible to get a refund on a defective item I bought a month ago?" \n',
 '107              2024-07-15  10:30:00        "My credit card was charged incorrectly. Can you assist with that?" \n',
 '108              2024-08-18  10:45:00        "I was charged twice for the same order. Please check it." \n',
 '109   

In [4]:
# Join the list of lines into a single string, separating each line with a space
text = " ".join(text)
text

'customer_id      chat_date                   message_text \n 101              2024-01-10  09:00:00        "I\'m having trouble logging into my account, please help me." \n 102              2024-02-04  09:15:00        "How do I change my password? I forgot the old one." \n 103              2024-04-10  09:30:00        "Can I return a product that I bought last week? It\'s defective." \n 104              2024-05-20  09:45:00        "When will my order be shipped? I haven\'t received any updates." \n 105              2024-06-08  10:00:00        "I need to update my shipping address for my recent order." \n 106              2024-06-20  10:15:00        "Is it possible to get a refund on a defective item I bought a month ago?" \n 107              2024-07-15  10:30:00        "My credit card was charged incorrectly. Can you assist with that?" \n 108              2024-08-18  10:45:00        "I was charged twice for the same order. Please check it." \n 109              2024-10-05  11:00:00      

## 1. Data Preprocessing

### Tokenization

In [5]:
# Import the necessary modules
import re  # For regular expressions

# Define a regex pattern to match timestamps and capture the message after the timestamp
pattern = re.compile(r"\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}:\d{2}\s+(.*)")

# Extract the messages from the text using the regex pattern
message_texts = pattern.findall(text)

# Load a blank SpaCy NLP model for English and add the sentencizer component
nlp = spacy.blank("en")
nlp.add_pipe("sentencizer")  # Add the sentencizer to detect sentence boundaries

# Loop through each extracted message, split into sentences, and then tokenize
for message_text in message_texts:
    doc = nlp(message_text)  # Process the message with SpaCy
    sentences = [sent.text for sent in doc.sents]  # Extract sentences from the message

    print("Extracted message:", message_text)
    print("Sentences:", sentences)

    for sentence in sentences:
        sent_doc = nlp(sentence)  # Process each sentence separately
        tokens = [token.text for token in sent_doc]  # Extract tokens (words) from the sentence
        print("Sentence:", sentence)
        print("Tokens:", tokens)
        print("-" * 80)

Extracted message: "I'm having trouble logging into my account, please help me." 
Sentences: ['"I\'m having trouble logging into my account, please help me."']
Sentence: "I'm having trouble logging into my account, please help me."
Tokens: ['"', 'I', "'m", 'having', 'trouble', 'logging', 'into', 'my', 'account', ',', 'please', 'help', 'me', '.', '"']
--------------------------------------------------------------------------------
Extracted message: "How do I change my password? I forgot the old one." 
Sentences: ['"How do I change my password?', 'I forgot the old one."']
Sentence: "How do I change my password?
Tokens: ['"', 'How', 'do', 'I', 'change', 'my', 'password', '?']
--------------------------------------------------------------------------------
Sentence: I forgot the old one."
Tokens: ['I', 'forgot', 'the', 'old', 'one', '.', '"']
--------------------------------------------------------------------------------
Extracted message: "Can I return a product that I bought last week?

### Stemming and Lemmatization

In [6]:
# Import PorterStemmer from NLTK for stemming (reducing words to their root form)
from nltk.stem import PorterStemmer

# Create an instance of the PorterStemmer
stemmer = PorterStemmer()

# Load the SpaCy English language model
nlp = spacy.load("en_core_web_sm")

# Loop through each message to perform stemming and lemmatization
for message_text in message_texts:
    doc = nlp(message_text)  # Process the message with SpaCy
    tokens = [token.text for token in doc]  # Extract tokens from the message

    # Apply stemming to each token using the PorterStemmer
    stemmed_tokens = [stemmer.stem(token) for token in tokens]

    # Apply lemmatization to each token using SpaCy (returns base form of the word)
    lemmatized_tokens = [token.lemma_ for token in doc]

    print(f"Original Message: {message_text}")
    print(f"Tokens: {tokens}")
    print(f"Stemmed Tokens: {stemmed_tokens}")
    print(f"Lemmatized Tokens: {lemmatized_tokens}")
    print("-" * 80)

Original Message: "I'm having trouble logging into my account, please help me." 
Tokens: ['"', 'I', "'m", 'having', 'trouble', 'logging', 'into', 'my', 'account', ',', 'please', 'help', 'me', '.', '"']
Stemmed Tokens: ['"', 'i', "'m", 'have', 'troubl', 'log', 'into', 'my', 'account', ',', 'pleas', 'help', 'me', '.', '"']
Lemmatized Tokens: ['"', 'I', 'be', 'have', 'trouble', 'log', 'into', 'my', 'account', ',', 'please', 'help', 'I', '.', '"']
--------------------------------------------------------------------------------
Original Message: "How do I change my password? I forgot the old one." 
Tokens: ['"', 'How', 'do', 'I', 'change', 'my', 'password', '?', 'I', 'forgot', 'the', 'old', 'one', '.', '"']
Stemmed Tokens: ['"', 'how', 'do', 'i', 'chang', 'my', 'password', '?', 'i', 'forgot', 'the', 'old', 'one', '.', '"']
Lemmatized Tokens: ['"', 'how', 'do', 'I', 'change', 'my', 'password', '?', 'I', 'forget', 'the', 'old', 'one', '.', '"']
------------------------------------------------

## 2. POS Tagging

In [7]:
# Prepare to store POS tags (part-of-speech tags) for each message
results = []

# Loop through each message and extract its POS tags
for message_text in message_texts:
    doc = nlp(message_text)  # Process the message with SpaCy
    # Extract tokens and their POS tags, formatting them with " | "
    pos_tags = [f"{token.text} | {token.pos_}" for token in doc]
    results.append({"message_text": message_text, "pos_tags": pos_tags})  # Store the results

# Print the POS tags for each message
for result in results:
    print(f"Message: {result['message_text']}")
    print(f"POS Tags: {', '.join(result['pos_tags'])}")  # Join tags with a comma for readability
    print("-" * 80)

Message: "I'm having trouble logging into my account, please help me." 
POS Tags: " | PUNCT, I | PRON, 'm | AUX, having | VERB, trouble | NOUN, logging | VERB, into | ADP, my | PRON, account | NOUN, , | PUNCT, please | INTJ, help | VERB, me | PRON, . | PUNCT, " | PUNCT
--------------------------------------------------------------------------------
Message: "How do I change my password? I forgot the old one." 
POS Tags: " | PUNCT, How | SCONJ, do | AUX, I | PRON, change | VERB, my | PRON, password | NOUN, ? | PUNCT, I | PRON, forgot | VERB, the | DET, old | ADJ, one | NUM, . | PUNCT, " | PUNCT
--------------------------------------------------------------------------------
Message: "Can I return a product that I bought last week? It's defective." 
POS Tags: " | PUNCT, Can | AUX, I | PRON, return | VERB, a | DET, product | NOUN, that | PRON, I | PRON, bought | VERB, last | ADJ, week | NOUN, ? | PUNCT, It | PRON, 's | AUX, defective | ADJ, . | PUNCT, " | PUNCT
---------------------------

In [8]:
# SpaCy returns a list of the names of all the pipeline components currently loaded in the nlp object
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

**tok2vec:**

- A transformer-like layer that converts tokens into vector representations.
- It is typically used as a base layer to share word embeddings across other components.
- Useful for improving downstream tasks like tagging, parsing, and NER by providing context-aware word embeddings.

**tagger:**

- The part-of-speech (POS) tagger.
- Assigns grammatical tags (e.g., noun, verb) to each token in the text.
- Example: In "She runs fast," the tagger assigns "PRON" to "She," "VERB" to "runs," and "ADV" to "fast."

**parser:**

- The dependency parser.
- Identifies syntactic relationships between words in a sentence (e.g., subject-verb-object structure).
- Example: In "The dog chased the cat," the parser determines that "dog" is the subject and "cat" is the object of the verb "chased."

**attribute_ruler:**

- A component that modifies token attributes (e.g., lemma, POS) based on custom rules or patterns.
- Useful for tasks like normalizing text (e.g., standardizing abbreviations, handling exceptions).

**lemmatizer:**

- Generates the base form (lemma) of each word.
- Example: The word "running" is lemmatized to "run," and "better" is lemmatized to "good."

**ner:**

- The named entity recognizer.
- Identifies and labels named entities in the text, such as dates, names, locations, organizations, etc.
- Example: In "John works at Google in California," the ner component labels "John" as PERSON, "Google" as ORG, and "California" as GPE (Geopolitical Entity).

## 3. Named Entity Recognition (NER)

In [9]:
# Loop through each message to extract named entities using SpaCy's NER (Named Entity Recognition)
for message_text in message_texts:
    doc = nlp(message_text)  # Process the message with SpaCy

    print(f"Message: {message_text}")
    print("Entities:")

    # Loop through each entity recognized in the message and print its text, label, and description
    for ent in doc.ents:
        print(ent.text, " | ", ent.label_, " | ", spacy.explain(ent.label_))

    print("-" * 80)

Message: "I'm having trouble logging into my account, please help me." 
Entities:
--------------------------------------------------------------------------------
Message: "How do I change my password? I forgot the old one." 
Entities:
--------------------------------------------------------------------------------
Message: "Can I return a product that I bought last week? It's defective." 
Entities:
last week  |  DATE  |  Absolute or relative dates or periods
--------------------------------------------------------------------------------
Message: "When will my order be shipped? I haven't received any updates." 
Entities:
--------------------------------------------------------------------------------
Message: "I need to update my shipping address for my recent order." 
Entities:
--------------------------------------------------------------------------------
Message: "Is it possible to get a refund on a defective item I bought a month ago?" 
Entities:
a month ago  |  DATE  |  Absolute

In [10]:
# Import SpaCy's displacy module for visualizing entities
from spacy import displacy

# Select a few messages (at indices 2, 5, and 8) to visualize named entities
docs = [nlp(message_texts[i]) for i in [2, 5, 8]]

# Render the entities in these selected messages using displacy
displacy.render(docs, style="ent")

## 4. Analysis & Insights

### Common Issues

In [11]:
from collections import Counter

# Initialize a Counter object to count complaint-related terms
complaints = Counter()

# Process each message
for message_text in message_texts:
    # Tokenize the message (assuming `nlp` is a spaCy pipeline object)
    doc = nlp(message_text)
    for token in doc:
        if token.is_alpha and not token.is_stop:  # Consider only alphabetical tokens and exclude stop words
            complaints.update([token.lemma_])  # Update the complaint counter with the lemmatized token

print("Common complaint terms:", complaints.most_common(10))

Common complaint terms: [('order', 3), ('product', 2), ('buy', 2), ('defective', 2), ('update', 2), ('charge', 2), ('have', 1), ('trouble', 1), ('log', 1), ('account', 1)]


### Topic Discovery

In [12]:
# Import TfidfVectorizer from scikit-learn for TF-IDF vectorization
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd  # For handling data in DataFrames

# Initialize TfidfVectorizer to compute TF-IDF values, excluding English stop words
vectorizer = TfidfVectorizer(stop_words='english')

# Fit the vectorizer to the message texts and transform the texts into a TF-IDF matrix
X = vectorizer.fit_transform(message_texts)

# Convert the TF-IDF matrix into a DataFrame for easier analysis
df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())

# For each message, find the top 5 words with the highest TF-IDF scores
top_words_per_message = df.apply(lambda x: pd.Series(x.nlargest(5).index), axis=1)

print(top_words_per_message)

             0          1        2            3          4
0      account     having     help      logging    trouble
1       change     forgot      old     password    account
2      product     return     week       bought  defective
3        haven   received  shipped      updates      order
4      address       need   recent     shipping     update
5          ago       item    month     possible     refund
6       assist       card   credit  incorrectly    charged
7        check      twice  charged        order    account
8        deals  discounts  holiday          new     season
9  electronics    explain   policy     products   warranty


### Sentiment Insight

In [13]:
# Define a list of positive and negative words (for sentiment analysis)
positive_words = {"help", "update", "possible", "assist", "new", "discounts", "explain", "warranty"}
negative_words = {"trouble", "forgot", "defective", "haven't", "refund", "charged", "incorrectly", "twice", "return"}

# Define a function to classify the sentiment of a message based on predefined positive and negative words
def classify_sentiment(message):
    # Tokenize the message and lemmatize each token
    tokens = [token.lemma_ for token in nlp(message)]

    # Count occurrences of positive and negative words in the message
    pos_count = sum(1 for token in tokens if token in positive_words)
    neg_count = sum(1 for token in tokens if token in negative_words)

    # Classify the message as 'Positive', 'Negative', or 'Neutral' based on the counts
    if pos_count > neg_count:
        return 'Positive'
    elif neg_count > pos_count:
        return 'Negative'
    else:
        return 'Neutral'

# Classify the sentiment of each message and count occurrences of each sentiment
sentiment_counts = {"Positive": 0, "Negative": 0, "Neutral": 0}
sentiments = []

for message in message_texts:
    sentiment = classify_sentiment(message)
    sentiments.append(sentiment)
    sentiment_counts[sentiment] += 1

# Print the sentiments for each message
print("Sentiments for each message:")
for message, sentiment in zip(message_texts, sentiments):
    print(f"Message: {message[:50]}... Sentiment: {sentiment}")

# Print the counts of each sentiment
print("\nSentiment counts:")
for sentiment, count in sentiment_counts.items():
    print(f"{sentiment}: {count}")

Sentiments for each message:
Message: "I'm having trouble logging into my account, pleas... Sentiment: Neutral
Message: "How do I change my password? I forgot the old one... Sentiment: Neutral
Message: "Can I return a product that I bought last week? I... Sentiment: Negative
Message: "When will my order be shipped? I haven't received... Sentiment: Positive
Message: "I need to update my shipping address for my recen... Sentiment: Positive
Message: "Is it possible to get a refund on a defective ite... Sentiment: Negative
Message: "My credit card was charged incorrectly. Can you a... Sentiment: Neutral
Message: "I was charged twice for the same order. Please ch... Sentiment: Negative
Message: "Do you have any new deals or discounts for the up... Sentiment: Positive
Message: "Can you explain the warranty policy on your elect... Sentiment: Positive

Sentiment counts:
Positive: 4
Negative: 3
Neutral: 3
