In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import spacy

In [None]:
#Import dataset
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Datasets/Chat Log Dataset .csv')
df

Unnamed: 0,customer_id,chat_date,message_text
0,101,1/10/2024 9:00,"""I'm having trouble logging into my account, p..."
1,102,2/4/2024 9:15,"""How do I change my password? I forgot the old..."
2,103,4/10/2024 9:30,"""Can I return a product that I bought last wee..."
3,104,5/20/2024 9:45,"""When will my order be shipped? I haven't rece..."
4,105,6/8/2024 10:00,"""I need to update my shipping address for my r..."
5,106,6/20/2024 10:15,"""Is it possible to get a refund on a defective..."
6,107,7/15/2024 10:30,"""My credit card was charged incorrectly. Can y..."
7,108,8/18/2024 10:45,"""I was charged twice for the same order. Pleas..."
8,109,10/5/2024 11:00,"""Do you have any new deals or discounts for th..."
9,110,11/10/2024 11:15,"""Can you explain the warranty policy on your e..."


## Data Preprocessing

### Tokenization

#### Sentence Tokanization

In [None]:
#Load Spacy model
nlp = spacy.load( 'en_core_web_sm')

def sentence_tokenize(text):
    doc = nlp(text)
    return [sent.text for sent in doc.sents]  # Extract sentences as text

# Apply sentence tokenization to each message in the dataset
df['sentences'] = df['message_text'].apply(sentence_tokenize)
print(df['sentences'])

0    ["I'm having trouble logging into my account, ...
1    ["How do I change my password?, I forgot the o...
2    ["Can I return a product that I bought last we...
3    ["When will my order be shipped?, I haven't re...
4    ["I need to update my shipping address for my ...
5    ["Is it possible to get a refund on a defectiv...
6    ["My credit card was charged incorrectly., Can...
7    ["I was charged twice for the same order., Ple...
8    ["Do you have any new deals or discounts for t...
9    ["Can you explain the warranty policy on your ...
Name: sentences, dtype: object


#### Word Tokanization

In [None]:
#Tokenize Mesage text column
tokens= df['message_text'].apply(lambda x: [token.text for token in nlp(x)])
print(tokens)

0    [", I, 'm, having, trouble, logging, into, my,...
1    [", How, do, I, change, my, password, ?, I, fo...
2    [", Can, I, return, a, product, that, I, bough...
3    [", When, will, my, order, be, shipped, ?, I, ...
4    [", I, need, to, update, my, shipping, address...
5    [", Is, it, possible, to, get, a, refund, on, ...
6    [", My, credit, card, was, charged, incorrectl...
7    [", I, was, charged, twice, for, the, same, or...
8    [", Do, you, have, any, new, deals, or, discou...
9    [", Can, you, explain, the, warranty, policy, ...
Name: message_text, dtype: object


In [None]:
import spacy

# Load the spaCy language model
nlp = spacy.load('en_core_web_sm')

# Function to tokenize words
def word_tokenize(text):
    doc = nlp(text)
    return [token.text for token in doc if not token.is_punct and not token.is_space]

# Apply word tokenization to each message in the dataset
df['word_tokens'] = df['message_text'].apply(word_tokenize)

# Check if 'word_tokens' column is created successfully
print(df['word_tokens'])


0    [I, 'm, having, trouble, logging, into, my, ac...
1    [How, do, I, change, my, password, I, forgot, ...
2    [Can, I, return, a, product, that, I, bought, ...
3    [When, will, my, order, be, shipped, I, have, ...
4    [I, need, to, update, my, shipping, address, f...
5    [Is, it, possible, to, get, a, refund, on, a, ...
6    [My, credit, card, was, charged, incorrectly, ...
7    [I, was, charged, twice, for, the, same, order...
8    [Do, you, have, any, new, deals, or, discounts...
9    [Can, you, explain, the, warranty, policy, on,...
Name: word_tokens, dtype: object


### Stemming and Lemmatization

#### Stemming

In [None]:
# Initialize NLTK stemmer
from nltk import PorterStemmer

stemmer = PorterStemmer()

In [None]:
from nltk.stem import PorterStemmer

# Initialize the NLTK stemmer
stemmer = PorterStemmer()

# Function to perform stemming
def stem_words(tokens):
    return [stemmer.stem(token) for token in tokens]

# Apply stemming to each list of tokens in 'word_tokens'
df['stemmed_tokens'] = df['word_tokens'].apply(stem_words)

# Check the stemmed results
print(df[['word_tokens', 'stemmed_tokens']])

                                         word_tokens  \
0  [I, 'm, having, trouble, logging, into, my, ac...   
1  [How, do, I, change, my, password, I, forgot, ...   
2  [Can, I, return, a, product, that, I, bought, ...   
3  [When, will, my, order, be, shipped, I, have, ...   
4  [I, need, to, update, my, shipping, address, f...   
5  [Is, it, possible, to, get, a, refund, on, a, ...   
6  [My, credit, card, was, charged, incorrectly, ...   
7  [I, was, charged, twice, for, the, same, order...   
8  [Do, you, have, any, new, deals, or, discounts...   
9  [Can, you, explain, the, warranty, policy, on,...   

                                      stemmed_tokens  
0  [i, 'm, have, troubl, log, into, my, account, ...  
1  [how, do, i, chang, my, password, i, forgot, t...  
2  [can, i, return, a, product, that, i, bought, ...  
3  [when, will, my, order, be, ship, i, have, n't...  
4  [i, need, to, updat, my, ship, address, for, m...  
5  [is, it, possibl, to, get, a, refund, on, a, d... 

#### Lemmatization

In [None]:
def lemmatize_words(tokens):
  doc = nlp(' '.join(tokens))
  return [token.lemma_ for token in doc]

df['lemmatized_tokens'] = df['word_tokens'].apply(lemmatize_words)

print(df['lemmatized_tokens'])

0    [I, ', m, have, trouble, log, into, my, accoun...
1    [how, do, I, change, my, password, I, forget, ...
2    [can, I, return, a, product, that, I, buy, las...
3    [when, will, my, order, be, ship, I, have, not...
4    [I, need, to, update, my, shipping, address, f...
5    [be, it, possible, to, get, a, refund, on, a, ...
6    [my, credit, card, be, charge, incorrectly, ca...
7    [I, be, charge, twice, for, the, same, order, ...
8    [do, you, have, any, new, deal, or, discount, ...
9    [can, you, explain, the, warranty, policy, on,...
Name: lemmatized_tokens, dtype: object


## POS Tagging

In [None]:
# Function to tokenize and perform POS tagging
def pos_tagging(text):
    doc = nlp(text)  # Tokenize and perform POS tagging using spaCy
    for token in doc:
        print(f"{token.text} | {token.pos_} | {spacy.explain(token.pos_)}")

# Apply the function to each message in the dataset and print results
df['message_text'].apply(pos_tagging)

" | PUNCT | punctuation
I | PRON | pronoun
'm | AUX | auxiliary
having | VERB | verb
trouble | NOUN | noun
logging | VERB | verb
into | ADP | adposition
my | PRON | pronoun
account | NOUN | noun
, | PUNCT | punctuation
please | INTJ | interjection
help | VERB | verb
me | PRON | pronoun
. | PUNCT | punctuation
" | PUNCT | punctuation
" | PUNCT | punctuation
How | SCONJ | subordinating conjunction
do | AUX | auxiliary
I | PRON | pronoun
change | VERB | verb
my | PRON | pronoun
password | NOUN | noun
? | PUNCT | punctuation
I | PRON | pronoun
forgot | VERB | verb
the | DET | determiner
old | ADJ | adjective
one | NUM | numeral
. | PUNCT | punctuation
" | PUNCT | punctuation
" | PUNCT | punctuation
Can | AUX | auxiliary
I | PRON | pronoun
return | VERB | verb
a | DET | determiner
product | NOUN | noun
that | PRON | pronoun
I | PRON | pronoun
bought | VERB | verb
last | ADJ | adjective
week | NOUN | noun
? | PUNCT | punctuation
It | PRON | pronoun
's | AUX | auxiliary
defective | ADJ | adje

Unnamed: 0,message_text
0,
1,
2,
3,
4,
5,
6,
7,
8,
9,


In [None]:
from collections import Counter

# Function to collect POS tag counts
def get_pos_counts(text):
    doc = nlp(text)
    pos_counts = Counter([token.pos_ for token in doc if not token.is_punct and not token.is_space])
    return pos_counts

# Apply the function and aggregate POS counts across all messages
all_pos_counts = df['message_text'].apply(get_pos_counts)
total_pos_counts = sum(all_pos_counts, Counter())

print("Total POS counts across all messages:")
print(total_pos_counts)

Total POS counts across all messages:
Counter({'PRON': 25, 'NOUN': 24, 'VERB': 19, 'AUX': 13, 'DET': 10, 'ADJ': 9, 'ADP': 7, 'PART': 3, 'ADV': 3, 'INTJ': 2, 'SCONJ': 2, 'NUM': 1, 'CCONJ': 1})


In [None]:
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

## Named Entity Recognition (NER)

In [None]:
import pandas as pd
import spacy
from collections import defaultdict

# Load the spaCy model
nlp = spacy.load("en_core_web_sm")

# Load the dataset
df = pd.DataFrame({
    'customer_id': [101, 102, 103, 104, 105, 106, 107, 108, 109, 110],
    'chat_date': [
        "2024-01-10 09:00:00", "2024-02-04 09:15:00", "2024-04-10 09:30:00",
        "2024-05-20 09:45:00", "2024-06-08 10:00:00", "2024-06-20 10:15:00",
        "2024-07-15 10:30:00", "2024-08-18 10:45:00", "2024-10-05 11:00:00",
        "2024-11-10 11:15:00"
    ],
    'message_text': [
        "I'm having trouble logging into my account, please help me.",
        "How do I change my password? I forgot the old one.",
        "Can I return a product that I bought last week? It's defective.",
        "When will my order be shipped? I haven't received any updates.",
        "I need to update my shipping address for my recent order.",
        "Is it possible to get a refund on a defective item I bought a month ago?",
        "My credit card was charged incorrectly. Can you assist with that?",
        "I was charged twice for the same order. Please check it.",
        "Do you have any new deals or discounts for the upcoming holiday season?",
        "Can you explain the warranty policy on your electronics products?"
    ]
})


In [None]:
# Function to extract entities from message text
def extract_entities(text):
    doc = nlp(text)
    entities = defaultdict(list)
    for ent in doc.ents:
        if ent.label_ in ["PRODUCT", "GPE", "DATE", "PERSON", "ORG", "MONEY", "TIME"]:
            entities[ent.label_].append(ent.text)
    return entities

# Apply the entity extraction to each message
df['entities'] = df['message_text'].apply(extract_entities)

# Display entities for each query
print(df[['message_text', 'entities']])

                                        message_text  \
0  I'm having trouble logging into my account, pl...   
1  How do I change my password? I forgot the old ...   
2  Can I return a product that I bought last week...   
3  When will my order be shipped? I haven't recei...   
4  I need to update my shipping address for my re...   
5  Is it possible to get a refund on a defective ...   
6  My credit card was charged incorrectly. Can yo...   
7  I was charged twice for the same order. Please...   
8  Do you have any new deals or discounts for the...   
9  Can you explain the warranty policy on your el...   

                                    entities  
0                                         {}  
1                                         {}  
2                    {'DATE': ['last week']}  
3                                         {}  
4                                         {}  
5                  {'DATE': ['a month ago']}  
6                                         {}  
7      

In [None]:
from collections import Counter

# Aggregate counts for products and locations
product_counter = Counter()
location_counter = Counter()

for entities in df['entities']:
    product_counter.update(entities.get('PRODUCT', []))
    location_counter.update(entities.get('GPE', []))

# Display the counts
print("Product Mentions:", product_counter)
print("Location Mentions:", location_counter)

Product Mentions: Counter()
Location Mentions: Counter()


## Analysis & Insights

### Common Issues

In [None]:
from collections import Counter
from nltk.corpus import stopwords
import nltk

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Tokenize and filter stopwords
tokens = [word for message in df['message_text'] for word in message.lower().split() if word not in stop_words]
# Count frequency of each token
token_counts = Counter(tokens)
# Display the most common tokens
print("Most common complaint terms:", token_counts.most_common(10))


Most common complaint terms: [('please', 2), ('bought', 2), ('order.', 2), ('charged', 2), ("i'm", 1), ('trouble', 1), ('logging', 1), ('account,', 1), ('help', 1), ('me.', 1)]


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


###Topic Discovery

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Convert messages into a document-term matrix
vectorizer = CountVectorizer(stop_words='english')
dtm = vectorizer.fit_transform(df['message_text'])

# Apply LDA
lda = LatentDirichletAllocation(n_components=3, random_state=42)  # n_components = number of topics
lda.fit(dtm)

# Display topics and associated terms
for idx, topic in enumerate(lda.components_):
    print(f"Topic #{idx+1}:")
    print([vectorizer.get_feature_names_out()[i] for i in topic.argsort()[-10:]])


Topic #1:
['password', 'old', 'forgot', 'change', 'charged', 'return', 'product', 'week', 'defective', 'bought']
Topic #2:
['new', 'holiday', 'season', 'discounts', 'address', 'shipping', 'need', 'recent', 'update', 'order']
Topic #3:
['policy', 'warranty', 'products', 'explain', 'ago', 'item', 'refund', 'possible', 'month', 'charged']


### Sentiment Insight

In [None]:
from textblob import TextBlob

def get_sentiment(text):
    sentiment = TextBlob(text).sentiment.polarity
    if sentiment > 0:
        return "Positive"
    elif sentiment == 0:
        return "Neutral"
    else:
        return "Negative"

df['sentiment'] = df['message_text'].apply(get_sentiment)
print(df['sentiment'].value_counts())

sentiment
Neutral     7
Positive    2
Negative    1
Name: count, dtype: int64
