# Approach-2 (TF-IDF, NLTK) + Multiple AI Chat Log Support

In [None]:
import os
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Download NLTK resources (run once).
# 'punkt' serves older NLTK releases; recent releases make word_tokenize load
# 'punkt_tab' instead, so both are fetched to keep tokenisation working
# regardless of the installed version. 'stopwords' and 'wordnet' back the
# stop-word list and the WordNetLemmatizer imported above.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)

In [50]:

folder_path = "chatlogs_folder"

# (speaker label, line prefix) pairs recognised by the parser. Lines are
# stripped before matching, so the prefixes deliberately carry no trailing
# space: an empty turn such as "AI:" must still start a new message.
SPEAKER_PREFIXES = (("User", "User:"), ("AI", "AI:"))

# Conversational filler removed in addition to NLTK's English stop words.
# NOTE: "let's" and "let us" can never match a token, because only purely
# alphabetic single tokens reach the stop-word check; they are kept so the
# list stays as the author wrote it.
CUSTOM_STOPWORDS = {"hi", "hello", "hey", "thanks", "thank", "please", "ok", "okay", "sure", "yes", "no", "maybe", "let's", "let us"}

# Number of highest-scoring phrases reported per chat log.
TOP_KEYWORD_COUNT = 5


def parse_chat_log(lines):
    """Group raw chat-log lines into ``(speaker, message)`` tuples.

    A line starting with ``User:`` or ``AI:`` opens a new message; every
    other line is appended to the message currently being built, separated
    by a single space. Text that appears before the first speaker line is
    discarded.

    Args:
        lines: Iterable of text lines (trailing newlines are allowed).

    Returns:
        List of ``(speaker, message)`` tuples in file order, where speaker
        is ``"User"`` or ``"AI"`` and message is whitespace-stripped.
    """
    messages = []
    current_speaker = None
    current_parts = []
    for raw_line in lines:
        line = raw_line.strip()
        for speaker, prefix in SPEAKER_PREFIXES:
            if line.startswith(prefix):
                if current_speaker is not None:
                    messages.append((current_speaker, " ".join(current_parts).strip()))
                current_speaker = speaker
                # Slice by the prefix that actually matched so that
                # "User:hi" keeps its first character.
                current_parts = [line[len(prefix):].strip()]
                break
        else:
            current_parts.append(line)
    if current_speaker is not None:
        messages.append((current_speaker, " ".join(current_parts).strip()))
    return messages


def extract_top_keywords(messages, lemmatizer, stop_words, top_n=TOP_KEYWORD_COUNT):
    """Return the highest-scoring 2-3 word phrases across all messages.

    Each message is tokenised, reduced to lower-cased alphabetic tokens that
    are not stop words, lemmatised, and then scored with TF-IDF over
    bigrams and trigrams. Scores are summed over all messages.

    Args:
        messages: List of ``(speaker, message)`` tuples.
        lemmatizer: Object with a ``lemmatize(word)`` method.
        stop_words: Set of lower-case words to drop before lemmatising.
        top_n: Maximum number of phrases to return.

    Returns:
        Up to ``top_n`` phrases ordered from highest to lowest summed score;
        an empty list when no phrase can be extracted.
    """
    lemmatized_docs = [
        ' '.join(
            lemmatizer.lemmatize(token.lower())
            for token in word_tokenize(text)
            if token.isalpha() and token.lower() not in stop_words
        )
        for _speaker, text in messages
    ]
    if not any(lemmatized_docs):
        return []

    vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(2, 3))
    try:
        tfidf_matrix = vectorizer.fit_transform(lemmatized_docs)
    except ValueError:
        # scikit-learn raises ValueError ("empty vocabulary") when no
        # 2-3 word phrase survives filtering, e.g. for one-word messages.
        return []
    feature_names = vectorizer.get_feature_names_out()
    scores = tfidf_matrix.sum(axis=0).A1
    top_indices = scores.argsort()[-top_n:][::-1]
    return [feature_names[i] for i in top_indices]


def build_summary_lines(exchanges_count, top_keywords):
    """Build the printable summary for one conversation.

    Args:
        exchanges_count: Number of user/AI exchanges.
        top_keywords: Phrases ordered from most to least important.

    Returns:
        List of four text lines: header, exchange count, main topics and
        keyword list.
    """
    if len(top_keywords) > 1:
        topic_line = f"- The user asked mainly about {top_keywords[0]} and {top_keywords[1]}."
    elif top_keywords:
        # Avoid the dangling "and ." when only one phrase was found.
        topic_line = f"- The user asked mainly about {top_keywords[0]}."
    else:
        topic_line = "- No main topic found."
    keywords_text = ', '.join(top_keywords) if top_keywords else "(none)"
    return [
        "Summary:",
        f"- The conversation had {exchanges_count} exchanges.",
        topic_line,
        f"- Most common keywords: {keywords_text}",
    ]


def summarize_chat_log(path, lemmatizer, stop_words):
    """Read one chat-log file and print its summary.

    Args:
        path: Path of a UTF-8 text file using ``User:`` / ``AI:`` prefixes.
        lemmatizer: Object with a ``lemmatize(word)`` method.
        stop_words: Set of lower-case words to ignore.
    """
    with open(path, "r", encoding="utf-8") as file:
        messages = parse_chat_log(file)

    top_keywords = extract_top_keywords(messages, lemmatizer, stop_words)

    # An exchange is one user message paired with one AI message.
    user_count = sum(1 for speaker, _text in messages if speaker == "User")
    ai_count = sum(1 for speaker, _text in messages if speaker == "AI")
    exchanges_count = min(user_count, ai_count)

    for summary_line in build_summary_lines(exchanges_count, top_keywords):
        print(summary_line)


if os.path.isdir(folder_path):
    # Built once: neither the lemmatizer nor the stop-word set depends on
    # the file being processed.
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))
    stop_words.update(CUSTOM_STOPWORDS)

    # Sorted so the report order does not depend on the file system.
    for filename in sorted(os.listdir(folder_path)):
        if filename.endswith(".txt"):
            print(f"\nProcessing {filename}...")
            summarize_chat_log(os.path.join(folder_path, filename), lemmatizer, stop_words)
else:
    print(f"Chat log folder not found: {folder_path}")


Processing chatlog_health.txt...
Summary:
- The conversation had 3 exchanges.
- The user asked mainly about healthy eating and tip healthy eating.
- Most common keywords: healthy eating, tip healthy eating, tip healthy, water drink daily, drink daily

Processing chatlog_math.txt...
Summary:
- The conversation had 3 exchanges.
- The user asked mainly about derivative used and help today.
- Most common keywords: derivative used, help today, explain calculus, used determine rate, change slope

Processing chatlog_travel.txt...
Summary:
- The conversation had 3 exchanges.
- The user asked mainly about help travel plan and travel plan.
- Most common keywords: help travel plan, travel plan, help travel, visit paris, best time
