# Approach-2 (TF-IDF, NLTK)

# 1.1 Read input

In [1]:
# Read the raw chat log as a list of lines (readlines() keeps the trailing
# newline characters). The encoding is passed explicitly so decoding does not
# depend on the platform's default locale encoding.
# NOTE(review): assumes chatlog.txt is UTF-8 encoded — confirm for real inputs.
with open("chatlog.txt", "r", encoding="utf-8") as file:
    lines = file.readlines()

In [2]:
# Display the raw lines read from chatlog.txt
lines

['User: Hello!\n',
 'AI: Hi! How can I assist you today?\n',
 'User: Can you explain what machine learning is?\n',
 'AI: Certainly! Machine learning is a field of AI that allows systems to\n',
 'learn from data.']

# 2.1 Chat Log Parsing

> In the given input format, a single message from one speaker can span several lines, so continuation lines must be merged into the message they belong to.

In [5]:
# Merge multi-line messages for each speaker.
#
# A line that starts with a speaker prefix opens a new message; any other
# non-empty line is a continuation of the message that is currently open.

# Prefix -> speaker label. The prefixes deliberately exclude the trailing
# space so "User:hi" and "User: hi" are handled the same way, and so the
# number of characters sliced off always equals the prefix that was matched.
SPEAKER_PREFIXES = {"User:": "User", "AI:": "AI"}


def parse_chat_lines(chat_lines):
    """Group raw chat-log lines into (speaker, message) tuples.

    Args:
        chat_lines: iterable of strings, one per line of the chat log.
            Surrounding whitespace (including newlines) is stripped.

    Returns:
        A list of ``(speaker, message)`` tuples in file order, where
        ``speaker`` is "User" or "AI" and ``message`` is the text after the
        prefix joined with its continuation lines by single spaces.

    Blank lines are skipped, and lines that appear before the first speaker
    prefix are ignored because they belong to no message.
    """
    parsed = []
    speaker = None
    parts = []

    for raw_line in chat_lines:
        text = raw_line.strip()
        if not text:
            continue  # a blank line would only add stray spaces to the message

        for prefix, label in SPEAKER_PREFIXES.items():
            if text.startswith(prefix):
                # A new speaker line closes the message that was open so far.
                if speaker is not None:
                    parsed.append((speaker, " ".join(parts)))
                speaker = label
                remainder = text[len(prefix):].strip()
                parts = [remainder] if remainder else []
                break
        else:
            # No prefix matched: continuation of the currently open message.
            if speaker is not None:
                parts.append(text)

    # Don't forget the last message
    if speaker is not None:
        parsed.append((speaker, " ".join(parts)))

    return parsed


messages = parse_chat_lines(lines)

# Example: print all messages
for speaker, message in messages:
    print(f"{speaker}: {message}")

User: Hello!
AI: Hi! How can I assist you today?
User: Can you explain what machine learning is?
AI: Certainly! Machine learning is a field of AI that allows systems to learn from data.


In [6]:
# Inspect the parsed list of (speaker, message) tuples
messages

[('User', 'Hello!'),
 ('AI', 'Hi! How can I assist you today?'),
 ('User', 'Can you explain what machine learning is?'),
 ('AI',
  'Certainly! Machine learning is a field of AI that allows systems to learn from data.')]

In [7]:
messages[1] # Second message (index 1), shown as a (speaker, message) tuple

('AI', 'Hi! How can I assist you today?')

# NLTK Implementation

> NLTK will be used for:
    
- removing stopwords
- tokenizing for tf-idf
- lemmatization, which reduces each word to its root form and so gives better keyword extraction



In [9]:
!pip install -U NLTK



In [8]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [10]:
# Download NLTK resources (run once; packages already present are reported as up-to-date)
nltk.download('punkt_tab')  # tokenizer data — presumably what word_tokenize loads; confirm for the installed NLTK version
nltk.download('stopwords')  # corpus read below via stopwords.words('english')
nltk.download('wordnet')  # presumably the lexical data behind WordNetLemmatizer — confirm
# NOTE(review): the three resources below are not referenced by any cell
# visible in this notebook — confirm whether they are still needed.
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-

True

## Combine all the messages to make the corpus

In [11]:
# Build a single corpus string: the text part of every (speaker, text)
# entry, separated by one space.
all_text = " ".join(entry[1] for entry in messages)
all_text

'Hello! Hi! How can I assist you today? Can you explain what machine learning is? Certainly! Machine learning is a field of AI that allows systems to learn from data.'

## Tokenize, remove stopwords, and lemmatize

In [66]:
# Greetings and filler words that carry no topical meaning in a chat log.
# Every entry must be a single alphabetic token: lemmatize_text() only
# compares tokens that pass str.isalpha(), so entries containing an
# apostrophe or a space (the former "let's" / "let us") could never match.
# "let" covers their intent ("us" is presumably already in NLTK's English list).
CUSTOM_STOPWORDS = frozenset({
    "hi", "hello", "hey", "thanks", "thank", "please", "ok", "okay",
    "sure", "yes", "no", "maybe", "let",
})

# Built once for the whole notebook instead of on every call: the stop-word
# list and the lemmatizer do not depend on the text being processed.
_STOP_WORDS = frozenset(stopwords.words('english')) | CUSTOM_STOPWORDS
_LEMMATIZER = WordNetLemmatizer()


def lemmatize_text(text):
    """Return `text` reduced to its lemmatized, lower-cased content words.

    The text is tokenized with NLTK's word_tokenize. Tokens that are not
    purely alphabetic (punctuation, numbers, contractions fragments) and
    tokens whose lower-cased form is a stop word are dropped. The remaining
    tokens are lower-cased, lemmatized and joined with single spaces.

    Args:
        text: the message text to process.

    Returns:
        A space-separated string of lemmas; the empty string when no token
        survives the filtering.
    """
    return ' '.join(
        _LEMMATIZER.lemmatize(token.lower())
        for token in word_tokenize(text)
        if token.isalpha() and token.lower() not in _STOP_WORDS
    )

# One lemmatized document per message, in the same order as `messages`.
lemmatized_docs = [lemmatize_text(msg[1]) for msg in messages]

In [67]:
lemmatized_docs

['',
 'assist today',
 'explain machine learning',
 'certainly machine learning field ai allows system learn data']

# 2.2 Message Statistics

In [84]:
# Count total messages (each parsed (speaker, message) tuple is one message)
print("Total number of messages", len(messages))

TOtal number of messages 4


In [85]:
# Count messages from User vs. AI.
# The speaker label is the first element of each (speaker, message) tuple.
user_messages = [msg for msg in messages if msg[0] == "User"]
ai_messages = [msg for msg in messages if msg[0] == "AI"]

print("USER Message Count: ", len(user_messages))
print("AI Reply Count: ", len(ai_messages))

USER Message Count:  2
AI Replyy Count:  2


# 2.3 Keyword Analysis

> ### **TF-IDF** for Keyword Extraction

In [86]:
# Import tf-idf
from sklearn.feature_extraction.text import TfidfVectorizer

In [92]:
# Use each message as a document for better TF-IDF results.
# The raw message text is vectorized here (not the NLTK-lemmatized strings),
# so it is stored under its own name: reusing `lemmatized_docs` would silently
# replace the lemmatized list built earlier with un-lemmatized text.
TOP_KEYWORD_COUNT = 5  # how many of the highest-scoring n-grams to report

tfidf_docs = [msg[1] for msg in messages]

# Only bigrams and trigrams are extracted; scikit-learn's built-in English
# stop-word list is applied by the vectorizer.
vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(2, 3))

# Generate TF-IDF matrix (one row per message, one column per n-gram)
tfidf_matrix = vectorizer.fit_transform(tfidf_docs)

# Get feature names (the n-grams, in column order)
feature_names = vectorizer.get_feature_names_out()

# Sum TF-IDF scores across all docs, then take the highest-scoring columns
# in descending order of score.
scores = tfidf_matrix.sum(axis=0).A1
top_indices = scores.argsort()[-TOP_KEYWORD_COUNT:][::-1]
# Convert NumPy string scalars to plain str so the list prints as ordinary
# quoted strings regardless of the installed NumPy version.
top_keywords = [str(feature_names[i]) for i in top_indices]

print("Top keywords by TF-IDF:", top_keywords)

Top keywords by TF-IDF: ['machine learning', 'explain machine', 'explain machine learning', 'assist today', 'hi assist today']


# 2.4 Generate Summary

In [88]:
# Count total exchanges (User-AI Pairs Count): the number of pairs is limited
# by whichever side sent fewer messages.
user_total, ai_total = len(user_messages), len(ai_messages)
exchanges_count = user_total if user_total < ai_total else ai_total
print("Total exchanges (User-AI Pairs Count): ", exchanges_count)

Total exchanges (User-AI Pairs Count):  2


In [89]:
# Main topic: the top-ranked keyword, or a placeholder when no keyword exists
if top_keywords:
    main_topic = top_keywords[0]
else:
    main_topic = "No main topic found"

In [90]:
# Print the summary: assemble the report lines first, then emit them in a
# single print call (one line of output per entry).
summary_lines = [
    "Summary:",
    f"- The conversation had {exchanges_count} exchanges.",
    f"- The user asked mainly about {main_topic} and its uses.",
    f"- Most common keywords: {', '.join(top_keywords)}",
]
print("\n".join(summary_lines))

Summary:
- The conversation had 2 exchanges.
- The user asked mainly about machine learning and its uses.
- Most common keywords: machine learning, explain machine, explain machine learning, assist today, hi assist today
