In [None]:
# 📊 ChatGPT Personal Usage Analysis

This notebook analyzes your ChatGPT conversation data to provide insights into your usage patterns, including:

- **Work vs Non-work classification** of your messages
- **Intent analysis** (Asking, Doing, Expressing)
- **Topic categorization** across 15+ categories
- **Usage patterns** by time, day, and conversation length
- **Comparison** with research findings

## 🔬 Research Background

This analysis is based on the methodology from the research paper: *"How Do People Use ChatGPT? Analyzing User Behavior and Message Content"* by Zhang et al. (2024).

## 📈 Expected Results

- On average, ~27% of a user's messages are work-related
- ~49% of messages are "Asking" for information
- Common topics include Writing, Technical Help, and Practical Guidance

---

**Setup Instructions**

**Step 1: Get Your ChatGPT Export**

1. Go to ChatGPT.
2. Click your profile icon → Settings → Data Controls.
3. Click "Export" next to "Export data".
4. Check your email for the download link.
5. Download and extract the ZIP file.
6. Locate `conversations.json` inside.

**Step 2: Set Up Google Colab**

1. Open Google Colab.
2. Create a new notebook.
3. Copy and paste the cells below in order.

**Step 3: Set Up OpenAI API (Optional but Recommended)**

1. Get an API key from the OpenAI Platform.
2. In Colab, click the 🔑 key icon on the left sidebar.
3. Add a new secret: Name = `OPENAI_API_KEY`, Value = your API key.

## 📦 Step 1: Install Dependencies

This cell installs all required Python packages for the analysis. Run this first!

In [None]:
# Install required packages exactly once, into the interpreter that runs this
# kernel. `sys.executable -m pip` guarantees the packages land in the kernel's
# own environment; check_call raises if pip fails, so the success message below
# is only printed after a successful install.
import sys
import subprocess

subprocess.check_call(
    [sys.executable, "-m", "pip", "install", "-q",
     "openai", "pandas", "matplotlib", "seaborn", "numpy"]
)

print("✅ Dependencies installed successfully!")

✅ Dependencies installed successfully!


## 📚 Step 2: Import Libraries

This cell imports all necessary libraries for data processing, visualization, and OpenAI API integration.

In [None]:
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timezone
import re
from collections import defaultdict, Counter
import warnings
warnings.filterwarnings('ignore')

# Colab-specific imports
from google.colab import userdata, files

# Optional dependency: record whether the OpenAI SDK can be imported.
try:
    from openai import OpenAI
except ImportError:
    OPENAI_AVAILABLE = False
    print("OpenAI library not available")
else:
    OPENAI_AVAILABLE = True

# Confirmation banner followed by a short checklist of what to run next.
print(
    "✅ Libraries imported successfully!",
    "\n📋 NEXT STEPS:",
    "1. Run the ChatGPTPersonalAnalyzer class cell",
    "2. Upload your conversations.json file",
    "3. Run the analysis",
    sep="\n",
)

✅ Libraries imported successfully!

📋 NEXT STEPS:
1. Run the ChatGPTPersonalAnalyzer class cell
2. Upload your conversations.json file
3. Run the analysis


In [None]:
# Debug the role field in your messages
def debug_roles(file_path, max_conversations=5, max_messages=3):
    """Print the keys and role-like fields of the first few exported messages.

    Loads the JSON export at ``file_path`` and, for a small sample of
    conversations and messages, prints every key of each message, the value of
    any role-like field (``role``, ``author``, ``sender``, ``from``, ``type``)
    and a 100-character content preview. A summary of the distinct values seen
    per role-like field is printed at the end, in sorted order so that repeated
    runs produce identical output.

    Args:
        file_path: Path to the exported conversations JSON file.
        max_conversations: How many leading conversations to inspect.
        max_messages: How many leading messages to inspect per conversation.

    Returns:
        None. All results are printed.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        conversations = json.load(file)

    print("🔍 DEBUGGING MESSAGE ROLES")
    print("="*40)

    # Candidate field names under which an export might store the speaker.
    role_fields = ('role', 'author', 'sender', 'from', 'type')
    role_examples = {}

    for i, conv in enumerate(conversations[:max_conversations]):
        # A key that is present but null is treated like a missing one.
        chat_messages = conv.get('chat_messages', []) or []
        print(f"\nConversation {i+1} - {len(chat_messages)} messages:")

        for j, msg in enumerate(chat_messages[:max_messages]):
            print(f"  Message {j+1}:")

            if not isinstance(msg, dict):
                # Without keys there is nothing to inspect; report and move on
                # instead of failing on .keys().
                print(f"    Unexpected message type: {type(msg).__name__}")
                print()
                continue

            print(f"    All keys: {list(msg.keys())}")

            # Check different possible role fields
            for field in role_fields:
                if field in msg:
                    value = msg[field]
                    print(f"    {field}: {value}")
                    role_examples.setdefault(field, set()).add(str(value))

            # Show content preview
            if 'content' in msg:
                content = str(msg['content'])[:100]
                print(f"    Content preview: {content}...")
            print()

    print(f"\nRole field summary:")
    for field, values in role_examples.items():
        # Sets have no stable order; sort so the summary is reproducible.
        print(f"  {field}: {sorted(values)}")

# Run the debug helper against 'conversations.json' in the current working
# directory. The file must already exist there: debug_roles() opens it directly.
debug_roles('conversations.json')

## 🤖 Step 4: Analysis Class Definition

This section contains the main analyzer class with two versions:

1. **Basic version** (Cell below): Uses keyword-based heuristics
2. **Advanced version** (Next section): Uses OpenAI API for accurate classification

### Basic Analyzer (Heuristic-based)

**Main Analyzer Class**

In [None]:
class FixedCustomFormatAnalyzer:
    """Keyword-based analyzer for an exported conversations JSON file.

    The export is read as a list of conversation objects. Each conversation is
    looked up for ``uuid``, ``name``, ``created_at``, ``updated_at`` and a
    ``chat_messages`` list; each message is looked up for ``sender``,
    ``content`` and/or ``text``, and ``created_at``.

    Every pipeline step except ``generate_analysis`` returns ``self``::

        analyzer = FixedCustomFormatAnalyzer()
        analyzer.load_conversations('conversations.json')
        analyzer.extract_messages()
        analyzer.classify_messages(sample_size=500)
        analyzer.generate_analysis()

    Attributes are documented where they are assigned in ``__init__``.
    """

    # Sender labels found in the export, mapped onto the roles used downstream.
    _ROLE_ALIASES = {'human': 'user', 'assistant': 'assistant', 'ai': 'assistant'}

    # Errors that mean "this value is not a usable ISO timestamp". Anything else
    # (e.g. KeyboardInterrupt) is deliberately allowed to propagate.
    _TIMESTAMP_ERRORS = (AttributeError, TypeError, ValueError, OverflowError, OSError)

    # Shape of the "[role]: " tag that _get_message_context puts before each message.
    _ROLE_TAG = r'\[[^\]\n]+\]: '

    def __init__(self, api_key=None):
        """Initialize analyzer for your specific export format.

        Args:
            api_key: Optional OpenAI API key. When given (and the ``openai``
                package is importable) a client is stored on ``self.client``.
                The classifiers in this class are keyword-based and do not
                call the client.
        """
        self.conversations = []          # raw conversation objects from the export
        self.messages_df = None          # one row per extracted message (all roles)
        self.user_messages_df = None     # subset of messages_df with role 'user'
        self.classified_messages = None  # one row per classified user message

        try:
            from openai import OpenAI
            if api_key:
                self.client = OpenAI(api_key=api_key)
                print("✅ OpenAI client initialized")
            else:
                self.client = None
                print("⚠️ Using heuristic classifications")
        except ImportError:
            self.client = None
            print("⚠️ OpenAI library not available")

    # ------------------------------------------------------------------ #
    # Loading and extraction
    # ------------------------------------------------------------------ #

    def load_conversations(self, file_path):
        """Load conversations from your specific export format.

        Args:
            file_path: Path to the exported JSON file (read as UTF-8).

        Returns:
            self, to allow chaining.
        """
        print(f"Loading conversations from {file_path}...")

        with open(file_path, 'r', encoding='utf-8') as file:
            self.conversations = json.load(file)

        print(f"✅ Loaded {len(self.conversations)} conversations")
        return self

    @classmethod
    def _parse_timestamp(cls, value, default=0):
        """Convert an ISO-8601 string (optionally ending in 'Z') to epoch seconds.

        Returns ``default`` when ``value`` is empty or cannot be parsed.
        """
        if not value:
            return default
        try:
            return datetime.fromisoformat(value.replace('Z', '+00:00')).timestamp()
        except cls._TIMESTAMP_ERRORS:
            return default

    @staticmethod
    def _extract_text(message):
        """Return the text of one exported message (not yet stripped).

        ``content`` may be a list of parts, a single dict, or a plain value.
        Parts and dicts are read from their ``text`` key, else their
        ``content`` key. ``None`` is treated as empty text rather than being
        rendered as the literal string "None". If nothing usable is found in
        ``content``, the message's top-level ``text`` field is used.
        """
        def as_text(value):
            return "" if value is None else str(value)

        raw = message.get('content', '')

        if isinstance(raw, list):
            parts = []
            for item in raw:
                if isinstance(item, dict):
                    if 'text' in item:
                        parts.append(as_text(item['text']))
                    elif 'content' in item:
                        parts.append(as_text(item['content']))
                else:
                    parts.append(as_text(item))
            text = " ".join(parts)
        elif isinstance(raw, dict):
            if 'text' in raw:
                text = as_text(raw['text'])
            elif 'content' in raw:
                text = as_text(raw['content'])
            else:
                text = str(raw)
        else:
            text = as_text(raw)

        # Fall back to the top-level 'text' field only when content gave nothing.
        if not text.strip() and message.get('text'):
            text = str(message['text'])

        return text

    def extract_messages(self):
        """Extract messages from your chat_messages format.

        Builds ``self.messages_df`` with one row per message that has
        non-blank text, then derives time features and ``self.user_messages_df``.
        Unparseable conversation timestamps become 0; unparseable or missing
        message timestamps fall back to the conversation's creation time.

        Returns:
            self, to allow chaining.
        """
        print("Extracting messages from conversations...")

        records = []

        for conv_idx, conversation in enumerate(self.conversations):
            conv_id = conversation.get('uuid', f'conv_{conv_idx}')
            title = conversation.get('name', 'Untitled')
            created_at = conversation.get('created_at', '')
            create_time = self._parse_timestamp(created_at, default=0)
            update_time = self._parse_timestamp(conversation.get('updated_at', ''), default=0)

            # A key that is present but null is treated like a missing one.
            chat_messages = conversation.get('chat_messages', []) or []

            for msg_idx, message in enumerate(chat_messages):
                text = self._extract_text(message).strip()
                if not text:
                    # Only keep messages with actual text content
                    continue

                sender = message.get('sender', 'unknown')
                if isinstance(sender, str):
                    role = self._ROLE_ALIASES.get(sender, sender)
                else:
                    role = sender

                msg_time = self._parse_timestamp(
                    message.get('created_at', created_at), default=create_time
                )

                records.append({
                    'conversation_id': conv_id,
                    'conversation_title': title,
                    'conversation_create_time': create_time,
                    'conversation_update_time': update_time,
                    'message_id': f"{conv_id}_{msg_idx}",
                    'author_role': role,
                    'content': text,
                    'message_create_time': msg_time,
                    'content_type': 'text',
                    'word_count': len(text.split()),
                    'char_count': len(text)
                })

        self.messages_df = pd.DataFrame(records)
        print(f"✅ Extracted {len(records)} messages from {len(self.conversations)} conversations")

        if not self.messages_df.empty:
            self._add_derived_features()
        else:
            self.user_messages_df = pd.DataFrame()
            print("❌ No messages extracted")

        return self

    def _add_derived_features(self):
        """Add derived features to the messages dataframe.

        Adds UTC datetime columns and date/hour/weekday/month/year components
        to ``self.messages_df``, then stores the 'user' rows as a copy in
        ``self.user_messages_df``.
        """
        # Convert epoch seconds to timezone-aware (UTC) datetimes
        self.messages_df['create_datetime'] = pd.to_datetime(
            self.messages_df['message_create_time'], unit='s', utc=True
        )
        self.messages_df['conversation_create_datetime'] = pd.to_datetime(
            self.messages_df['conversation_create_time'], unit='s', utc=True
        )

        # Extract date components
        stamps = self.messages_df['create_datetime'].dt
        self.messages_df['date'] = stamps.date
        self.messages_df['hour'] = stamps.hour
        self.messages_df['day_of_week'] = stamps.day_name()
        self.messages_df['month'] = stamps.month
        self.messages_df['year'] = stamps.year

        # Filter to user messages
        self.user_messages_df = self.messages_df[
            self.messages_df['author_role'] == 'user'
        ].copy()

        print(f"   Total messages: {len(self.messages_df)}")
        print(f"   Your messages: {len(self.user_messages_df)}")
        print(f"   Role distribution: {dict(self.messages_df['author_role'].value_counts())}")

    # ------------------------------------------------------------------ #
    # Classification
    # ------------------------------------------------------------------ #

    def _get_message_context(self, message_row):
        """Build the transcript that ends with the given message.

        Selects up to the 10 most recent messages of the same conversation that
        are not later than ``message_row`` and renders each as
        ``"[role]: content"`` (content cut to 1000 characters), joined by
        newlines.

        Timestamps can tie (for example when they all fell back to the
        conversation's creation time). Ties are broken by the row's index
        label in ``messages_df`` - the order in which messages were
        extracted - so that messages that come after ``message_row`` never
        enter the context and ``message_row`` itself is always the last entry.
        """
        conv_id = message_row['conversation_id']
        msg_time = message_row['message_create_time']

        same_conv = self.messages_df[self.messages_df['conversation_id'] == conv_id]
        times = same_conv['message_create_time']

        row_label = getattr(message_row, 'name', None)
        if row_label is not None and row_label in same_conv.index:
            keep = (times < msg_time) | ((times == msg_time) & (same_conv.index <= row_label))
        else:
            # No usable position information: fall back to the time filter alone.
            keep = times <= msg_time

        # mergesort is stable, so rows with equal timestamps keep extraction order.
        history = same_conv[keep].sort_values('message_create_time', kind='mergesort').tail(10)

        lines = [
            f"[{ctx_msg['author_role']}]: {ctx_msg['content'][:1000]}"
            for _, ctx_msg in history.iterrows()
        ]
        return "\n".join(lines)

    @classmethod
    def _last_message(cls, context):
        """Return the full text of the last message in a context transcript.

        The transcript is split only at newlines that are followed by a
        ``"[role]: "`` tag, so a message that itself spans several lines is
        returned whole (without its tag) instead of only its final line.
        """
        if not context:
            return ""
        last_entry = re.split(r'\n(?=' + cls._ROLE_TAG + ')', context)[-1]
        return re.sub(r'^' + cls._ROLE_TAG, '', last_entry, count=1)

    def _classify_work_related(self, context):
        """Return 1 if any work keyword occurs anywhere in the context, else 0."""
        work_keywords = ['work', 'job', 'office', 'meeting', 'project', 'client', 'business',
                        'professional', 'career', 'resume', 'email', 'report', 'presentation']
        content_lower = context.lower()
        return int(any(keyword in content_lower for keyword in work_keywords))

    def _classify_intent(self, context):
        """Classify the last message as 'Asking', 'Doing' or 'Expressing'.

        Question words (or a '?') win over task verbs; anything else is
        'Expressing'. Matching is by lowercase substring.
        """
        text = self._last_message(context).lower()
        if any(word in text for word in ['what', 'how', 'why', 'when', 'where', 'who', '?']):
            return 'Asking'
        if any(word in text for word in ['write', 'create', 'make', 'generate', 'draft', 'rewrite']):
            return 'Doing'
        return 'Expressing'

    def _classify_topic(self, context):
        """Return the first topic whose keywords occur in the last message.

        Topics are tested in the order listed below; 'Other' is returned when
        none match. Matching is by lowercase substring.
        """
        topics = {
            'Writing': ['write', 'edit', 'rewrite', 'draft', 'essay', 'email', 'letter'],
            'Practical Guidance': ['how to', 'help me', 'advice', 'guide', 'tutorial', 'tips'],
            'Seeking Information': ['what is', 'tell me about', 'explain', 'define', 'information'],
            'Technical Help': ['code', 'programming', 'python', 'javascript', 'debug', 'error'],
            'Creative': ['story', 'poem', 'creative', 'fiction', 'art', 'design'],
            'Self-Expression': ['feel', 'think', 'opinion', 'personal', 'relationship'],
        }

        text = self._last_message(context).lower()
        for topic, keywords in topics.items():
            if any(keyword in text for keyword in keywords):
                return topic
        return 'Other'

    def classify_messages(self, sample_size=None):
        """Classify user messages by work-relatedness, intent and topic.

        Args:
            sample_size: If set and smaller than the number of user messages,
                a random sample of that size (fixed seed 42) is classified
                instead of every message.

        Returns:
            self, to allow chaining. Results are stored in
            ``self.classified_messages``; when there is nothing to classify
            (including when ``extract_messages`` has not been run yet) a notice
            is printed and nothing is stored.
        """
        if self.user_messages_df is None or len(self.user_messages_df) == 0:
            print("❌ No user messages to classify")
            return self

        print("Starting message classification...")

        if sample_size and len(self.user_messages_df) > sample_size:
            messages_to_classify = self.user_messages_df.sample(sample_size, random_state=42)
            print(f"📊 Sampling {sample_size} messages for classification")
        else:
            messages_to_classify = self.user_messages_df.copy()

        total = len(messages_to_classify)
        classified_results = []

        for idx, row in messages_to_classify.iterrows():
            # Progress line every 50 messages
            if len(classified_results) % 50 == 0:
                print(f"   Classifying message {len(classified_results) + 1}/{total}...")

            context = self._get_message_context(row)
            content = row['content']

            classified_results.append({
                'message_idx': idx,
                'conversation_id': row['conversation_id'],
                'message_id': row['message_id'],
                # Keep only a 200-character preview of long messages
                'content': content[:200] + '...' if len(content) > 200 else content,
                'create_datetime': row['create_datetime'],
                'word_count': row['word_count'],
                'is_work': self._classify_work_related(context),
                'intent': self._classify_intent(context),
                'topic': self._classify_topic(context)
            })

        self.classified_messages = pd.DataFrame(classified_results)
        print(f"✅ Completed classification of {len(classified_results)} messages")
        return self

    # ------------------------------------------------------------------ #
    # Reporting
    # ------------------------------------------------------------------ #

    def generate_analysis(self):
        """Generate comprehensive analysis.

        Draws a 2x3 grid of charts (work share, intent, topic, hourly and
        weekday usage, message length) and then prints the text summary.
        Prints a notice and returns early when nothing has been classified.
        """
        if self.classified_messages is None or len(self.classified_messages) == 0:
            print("❌ No classified messages available. Run classify_messages() first.")
            return

        print("📊 Generating analysis report...")

        fig, axes = plt.subplots(2, 3, figsize=(18, 12))

        # Work vs Non-work
        work_counts = self.classified_messages['is_work'].value_counts()
        axes[0,0].pie([work_counts.get(0, 0), work_counts.get(1, 0)],
                     labels=['Non-Work', 'Work'], autopct='%1.1f%%')
        axes[0,0].set_title('Work vs Non-Work Messages')

        # Intent distribution
        intent_counts = self.classified_messages['intent'].value_counts()
        axes[0,1].bar(intent_counts.index, intent_counts.values)
        axes[0,1].set_title('User Intent Distribution')
        axes[0,1].tick_params(axis='x', rotation=45)

        # Topic distribution
        topic_counts = self.classified_messages['topic'].value_counts()
        axes[0,2].bar(topic_counts.index, topic_counts.values)
        axes[0,2].set_title('Topic Distribution')
        axes[0,2].tick_params(axis='x', rotation=45)

        # Hourly usage (all user messages, not only the classified sample)
        hourly_counts = self.user_messages_df.groupby('hour').size()
        axes[1,0].bar(hourly_counts.index, hourly_counts.values)
        axes[1,0].set_title('Usage by Hour')

        # Daily usage, in calendar order with missing days shown as 0
        day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
        daily_counts = self.user_messages_df.groupby('day_of_week').size().reindex(day_order, fill_value=0)
        axes[1,1].bar(daily_counts.index, daily_counts.values)
        axes[1,1].set_title('Usage by Day of Week')
        axes[1,1].tick_params(axis='x', rotation=45)

        # Message length
        axes[1,2].hist(self.user_messages_df['word_count'], bins=30, alpha=0.7)
        axes[1,2].set_title('Message Length Distribution')
        axes[1,2].set_xlabel('Word Count')

        plt.tight_layout()
        plt.show()

        self._print_summary()

    def _print_summary(self):
        """Print summary statistics and the comparison with the paper's averages."""
        print("\n" + "="*60)
        print("CHATGPT PERSONAL USAGE ANALYSIS SUMMARY")
        print("="*60)

        print(f"\n📊 BASIC STATISTICS:")
        print(f"   Total Conversations: {len(self.conversations):,}")
        print(f"   Total Messages: {len(self.messages_df):,}")
        print(f"   Your Messages: {len(self.user_messages_df):,}")
        print(f"   Classified Messages: {len(self.classified_messages):,}")

        if len(self.user_messages_df) > 0:
            start_date = self.user_messages_df['create_datetime'].min()
            end_date = self.user_messages_df['create_datetime'].max()
            print(f"   Time Range: {start_date.strftime('%Y-%m-%d')} to {end_date.strftime('%Y-%m-%d')}")

            avg_length = self.user_messages_df['word_count'].mean()
            print(f"   Average Message Length: {avg_length:.1f} words")

        if len(self.classified_messages) > 0:
            print(f"\n🏢 WORK VS NON-WORK:")
            work_stats = self.classified_messages['is_work'].value_counts(normalize=True) * 100
            print(f"   Work-Related: {work_stats.get(1, 0):.1f}%")
            print(f"   Non-Work: {work_stats.get(0, 0):.1f}%")

            print(f"\n🎯 INTENT DISTRIBUTION:")
            intent_stats = self.classified_messages['intent'].value_counts(normalize=True) * 100
            for intent, pct in intent_stats.items():
                print(f"   {intent}: {pct:.1f}%")

            print(f"\n📚 TOPIC DISTRIBUTION:")
            topic_stats = self.classified_messages['topic'].value_counts(normalize=True) * 100
            for topic, pct in topic_stats.items():
                print(f"   {topic}: {pct:.1f}%")

            print(f"\n📈 COMPARISON TO RESEARCH PAPER:")
            your_work_pct = work_stats.get(1, 0)
            print(f"   Your work usage: {your_work_pct:.1f}% (Paper average: ~27%)")

            if 'Asking' in intent_stats:
                your_asking_pct = intent_stats.get('Asking', 0)
                print(f"   Your 'Asking' usage: {your_asking_pct:.1f}% (Paper average: ~49%)")

        print("\n" + "="*60)

# Confirmation that the class cell ran, followed by the calls to run next.
print(
    "✅ Fixed analyzer loaded!",
    "\nNow run:",
    "analyzer = FixedCustomFormatAnalyzer()",
    "analyzer.load_conversations('conversations.json')",
    "analyzer.extract_messages()",
    "analyzer.classify_messages(sample_size=500)",
    "analyzer.generate_analysis()",
    sep="\n",
)

✅ Fixed analyzer loaded!

Now run:
analyzer = FixedCustomFormatAnalyzer()
analyzer.load_conversations('conversations.json')
analyzer.extract_messages()
analyzer.classify_messages(sample_size=500)
analyzer.generate_analysis()


In [None]:
    # def __init__(self, api_key=None):
    #     """Initialize analyzer for your specific export format"""
    #     self.conversations = []
    #     self.messages_df = None
    #     self.user_messages_df = None
    #     self.classified_messages = None

    #     try:
    #         from openai import OpenAI
    #         if api_key:
    #             self.client = OpenAI(api_key=api_key)
    #             print("✅ OpenAI client initialized")
    #         else:
    #             self.client = None
    #             print("⚠️  Using heuristic classifications")
    #     except ImportError:
    #         self.client = None
    #         print("⚠️  OpenAI library not available")

## 🚀 Step 5: Run Basic Analysis (Heuristic-based)

This cell runs the analysis using keyword-based classification. It's free but less accurate than the OpenAI version.

**Note**: This uses simple keyword matching to classify your messages. For more accurate results, use the OpenAI-powered version below.

## 🎯 Step 6: Advanced Analysis with OpenAI API (Recommended)

This version uses OpenAI's GPT models for highly accurate message classification.

### 💡 Why Use OpenAI API?
- **Much more accurate** than keyword-based classification
- **Context-aware** - considers conversation history
- **Consistent** with research methodology
- **Cost**: ~$3-6 for typical analysis

### 📋 Setup Requirements:
1. Get an OpenAI API key from [platform.openai.com](https://platform.openai.com)
2. In Colab: Click 🔑 icon → Add secret: `OPENAI_API_KEY` = your key
3. Run the cell below

### Advanced Analyzer (OpenAI-powered)

## 🎉 Analysis Complete!

Your ChatGPT usage analysis is now complete. The results above show:

### 📈 Key Insights You Can Extract:
- **Work-life balance**: How much you use ChatGPT for work vs personal
- **Usage style**: Whether you mostly ask questions, request tasks, or express thoughts
- **Interest areas**: Your most common topics of discussion
- **Time patterns**: When you're most active with ChatGPT
- **Comparison**: How you compare to typical ChatGPT users

### 💾 Next Steps:
- Screenshot or save the visualizations
- Copy the summary statistics for your records
- Consider adjusting your ChatGPT usage based on insights
- Share findings (anonymized) with others interested in AI usage patterns

### 🔬 Research Context:
This analysis follows the methodology from *"How Do People Use ChatGPT? Analyzing User Behavior and Message Content"* - helping contribute to understanding of AI tool adoption and usage patterns.

---

**Questions or want to modify the analysis?** Check the GitHub repository for documentation and customization options.

In [None]:
# Bind the analyzer first so it stays available for inspection even if a later
# step fails, then run the pipeline as one chain (each step returns the analyzer).
analyzer = FixedCustomFormatAnalyzer()
(
    analyzer
    .load_conversations('conversations.json')
    .extract_messages()
    .classify_messages(sample_size=500)
    .generate_analysis()
)

In [None]:
#with openai key - FIXED VERSION
import os

class FixedCustomFormatAnalyzer:
    def __init__(self, api_key=None):
        """Initialize analyzer for your specific export format"""
        self.conversations = []
        self.messages_df = None
        self.user_messages_df = None
        self.classified_messages = None

        # Try to get API key from multiple sources
        if not api_key:
            try:
                # Try Google Colab userdata first
                from google.colab import userdata
                api_key = userdata.get('OPENAI_API_KEY')
            except:
                pass

        if not api_key:
            # Try environment variable
            api_key = os.getenv('OPENAI_API_KEY')

        try:
            from openai import OpenAI
            if api_key and api_key.strip():  # Check if key exists and is not empty
                self.client = OpenAI(api_key=api_key.strip())
                print("✅ OpenAI client initialized for accurate classifications")
                print(f"   Using API key: {api_key[:10]}...{api_key[-4:] if len(api_key) > 14 else ''}")
            else:
                self.client = None
                print("⚠️ Using heuristic classifications (no OpenAI API key found)")
        except ImportError:
            self.client = None
            print("⚠️ OpenAI library not available")
        except Exception as e:
            self.client = None
            print(f"⚠️ OpenAI client initialization failed: {e}")

    def _classify_work_related(self, context):
        """Classify if message is work-related (with OpenAI API support)"""
        if self.client:
            prompt = """You are an internal tool that classifies a message from a user to an AI chatbot, based on the context of the previous messages before it.

Does the last user message of this conversation transcript seem likely to be related to doing some work/employment? Answer with one of the following:
(1) likely part of work (e.g. "rewrite this HR complaint")
(0) likely not part of work (e.g. "does ice reduce pimples?")

In your response, only give the number and no other text. IE: the only acceptable responses are 1 and 0."""

            try:
                response = self.client.chat.completions.create(
                    model="gpt-3.5-turbo",
                    messages=[
                        {"role": "system", "content": prompt},
                        {"role": "user", "content": context[:4000]}  # Truncate context to avoid token limits
                    ],
                    max_tokens=1,
                    temperature=0
                )
                result = response.choices[0].message.content.strip()
                return int(result) if result in ['0', '1'] else 0
            except Exception as e:
                print(f"⚠️ OpenAI API error for work classification: {e}")
                # Fall through to heuristic

        # Heuristic fallback
        work_keywords = ['work', 'job', 'office', 'meeting', 'project', 'client', 'business',
                        'professional', 'career', 'resume', 'email', 'report', 'presentation',
                        'colleague', 'manager', 'deadline', 'proposal', 'company', 'corporate']
        content_lower = context.lower()
        return int(any(keyword in content_lower for keyword in work_keywords))

    def _classify_intent(self, context):
        """Classify user intent (with OpenAI API support)"""
        if self.client:
            prompt = """You are an internal tool that classifies a message from a user to an AI chatbot, based on the context of the previous messages before it.

Assign the last user message of this conversation transcript to one of the following three categories:

- Asking: Asking is seeking information or advice that will help the user be better informed or make better decisions, either at work, at school, or in their personal life. (e.g. "Who was president after Lincoln?", "How do I create a budget for this quarter?")

- Doing: Doing messages request that ChatGPT perform tasks for the user. User is drafting an email, writing code, etc. Classify messages as "doing" if they include requests for output that is created primarily by the model. (e.g. "Rewrite this email to make it more formal", "Draft a report summarizing the use cases of ChatGPT")

- Expressing: Expressing statements are neither asking for information, nor for the chatbot to perform a task.

Only reply with one word: Asking, Doing, or Expressing."""

            try:
                response = self.client.chat.completions.create(
                    model="gpt-3.5-turbo",
                    messages=[
                        {"role": "system", "content": prompt},
                        {"role": "user", "content": context[:4000]}
                    ],
                    max_tokens=10,
                    temperature=0
                )
                result = response.choices[0].message.content.strip()
                if result in ['Asking', 'Doing', 'Expressing']:
                    return result
                else:
                    print(f"⚠️ Unexpected intent result from API: {result}")
            except Exception as e:
                print(f"⚠️ OpenAI API error for intent classification: {e}")

        # Heuristic fallback
        last_message = context.split('\n')[-1] if context else ""
        if any(word in last_message.lower() for word in ['what', 'how', 'why', 'when', 'where', 'who', '?']):
            return 'Asking'
        elif any(word in last_message.lower() for word in ['write', 'create', 'make', 'generate', 'draft', 'rewrite']):
            return 'Doing'
        else:
            return 'Expressing'

    def _classify_topic(self, context):
        """Classify conversation topic (enhanced with OpenAI for better accuracy)"""
        if self.client:
            prompt = """You are an internal tool that classifies a message from a user to an AI chatbot based on conversation context.

Classify the last user message into ONE of these categories:

- Writing: Edit, critique, rewrite, draft emails, essays, documents, translations, summaries
- Practical Guidance: How-to advice, tutorials, recommendations, step-by-step help, life advice
- Seeking Information: Factual questions, explanations, definitions, research, current events
- Technical Help: Programming, coding, debugging, math, data analysis, software troubleshooting
- Creative: Stories, poems, art, design, fiction, creative projects, brainstorming ideas
- Self-Expression: Personal feelings, relationships, opinions, casual chat, greetings
- Learning: Educational content, explaining concepts, tutoring, academic help, study assistance
- Planning: Project planning, scheduling, organizing, goal setting, strategy
- Health/Wellness: Health questions, fitness advice, mental health, medical information
- Entertainment: Games, jokes, fun activities, trivia, casual entertainment
- Shopping/Products: Product recommendations, comparisons, purchasing decisions
- Travel: Travel planning, destination advice, logistics, recommendations
- Food/Cooking: Recipes, cooking advice, restaurant recommendations, nutrition
- Spiritual: Bhagavad Gita, Swami Vivekananda, meditation, philosophy
- Career: Job search, interview prep, professional development, workplace issues
- Other: Anything that doesn't clearly fit the above categories

Only reply with one category name from the list above."""

            try:
                response = self.client.chat.completions.create(
                    model="gpt-4o-mini",
                    messages=[
                        {"role": "system", "content": prompt},
                        {"role": "user", "content": context[:4000]}
                    ],
                    max_tokens=20,
                    temperature=0
                )
                result = response.choices[0].message.content.strip()
                valid_topics = ['Writing', 'Practical Guidance', 'Seeking Information', 'Technical Help',
                'Creative', 'Self-Expression', 'Learning', 'Planning', 'Health/Wellness',
                'Entertainment', 'Shopping/Products', 'Travel', 'Food/Cooking', 'Spiritual', 'Career', 'Other']
                if result in valid_topics:
                    return result
                else:
                    print(f"⚠️ Unexpected topic result from API: {result}")
            except Exception as e:
                print(f"⚠️ OpenAI API error for topic classification: {e}")

        # Heuristic fallback
        topics = {
            'Writing': ['write', 'edit', 'rewrite', 'draft', 'essay', 'email', 'letter', 'document', 'summary'],
            'Practical Guidance': ['how to', 'help me', 'advice', 'guide', 'tutorial', 'tips', 'recommend', 'should i'],
            'Seeking Information': ['what is', 'tell me about', 'explain', 'define', 'information', 'who is', 'when did', 'where is'],
            'Technical Help': ['code', 'programming', 'python', 'javascript', 'debug', 'error', 'technical', 'software', 'computer'],
            'Creative': ['story', 'poem', 'creative', 'fiction', 'art', 'design', 'imagine', 'brainstorm', 'idea'],
            'Self-Expression': ['feel', 'think', 'opinion', 'personal', 'relationship', 'emotion', 'hello', 'hi'],
            'Learning': ['learn', 'teach', 'explain', 'understand', 'study', 'education', 'school', 'homework'],
            'Planning': ['plan', 'schedule', 'organize', 'goal', 'strategy', 'timeline', 'project'],
            'Health/Wellness': ['health', 'fitness', 'exercise', 'mental health', 'therapy', 'medical', 'doctor'],
            'Entertainment': ['game', 'fun', 'joke', 'play', 'entertainment', 'movie', 'music', 'book'],
            'Shopping/Products': ['buy', 'purchase', 'product', 'review', 'compare', 'shopping', 'price'],
            'Travel': ['travel', 'trip', 'vacation', 'flight', 'hotel', 'destination', 'visit'],
            'Food/Cooking': ['recipe', 'cook', 'food', 'restaurant', 'meal', 'nutrition', 'diet'],
            'Spiritual': ['life', 'peace', 'meaning', 'spiritual', 'meditation', 'philosophy', 'bhagavad', 'vivekananda'],
            'Career': ['job', 'career', 'interview', 'resume', 'workplace', 'professional development']
        }

        last_message = context.split('\n')[-1].lower() if context else ""
        for topic, keywords in topics.items():
            if any(keyword in last_message for keyword in keywords):
                return topic
        return 'Other'

    # Data loading, message extraction, classification and reporting methods.
    def load_conversations(self, file_path):
        """Read an exported conversations JSON file into ``self.conversations``.

        Args:
            file_path: Path to the UTF-8 encoded JSON export.

        Returns:
            ``self``, so further calls can be chained.
        """
        print(f"Loading conversations from {file_path}...")

        # Read the whole document first, then decode it in one step.
        with open(file_path, 'r', encoding='utf-8') as export_file:
            raw_json = export_file.read()
        self.conversations = json.loads(raw_json)

        print(f"✅ Loaded {len(self.conversations)} conversations")
        return self

    def extract_messages(self):
        """Flatten ``self.conversations`` into ``self.messages_df``, one row per message.

        Each conversation is read as a dict with optional ``uuid``, ``name``,
        ``created_at``, ``updated_at`` and ``chat_messages`` keys. Each message may
        carry ``sender``, ``content`` (a string, a dict, or a list of parts),
        ``text`` and ``created_at``. Missing keys fall back to defaults, and
        unparseable timestamps fall back to the conversation creation time
        (or ``0`` for the conversation itself).

        ``None`` content values are treated as "no text" rather than being
        stringified to the literal ``"None"``. Messages whose text is empty after
        stripping are dropped.

        When at least one message survives, ``_add_derived_features`` is called;
        otherwise ``self.user_messages_df`` is set to an empty frame.

        Returns:
            ``self``, so further calls can be chained.
        """
        print("Extracting messages from conversations...")

        def to_epoch(iso_text, fallback):
            """Convert an ISO-8601 string to epoch seconds, or return ``fallback``."""
            if not iso_text:
                return fallback
            try:
                # A trailing 'Z' is rewritten to an explicit UTC offset before parsing.
                return datetime.fromisoformat(iso_text.replace('Z', '+00:00')).timestamp()
            except (AttributeError, TypeError, ValueError, OverflowError, OSError):
                # Non-string or malformed values: keep extraction best-effort.
                return fallback

        def flatten_content(raw):
            """Collapse a string / dict / list-of-parts content value into plain text."""
            if isinstance(raw, list):
                pieces = []
                for part in raw:
                    if isinstance(part, dict):
                        if 'text' in part:
                            value = part['text']
                        elif 'content' in part:
                            value = part['content']
                        else:
                            continue  # part carries no textual payload
                    else:
                        value = part
                    if value is not None:
                        pieces.append(str(value))
                return " ".join(pieces)
            if isinstance(raw, dict):
                if 'text' in raw:
                    value = raw['text']
                elif 'content' in raw:
                    value = raw['content']
                else:
                    value = raw
                return "" if value is None else str(value)
            return "" if raw is None else str(raw)

        messages = []

        for conv_idx, conversation in enumerate(self.conversations):
            # Conversation-level metadata
            conv_id = conversation.get('uuid', f'conv_{conv_idx}')
            title = conversation.get('name', 'Untitled')
            created_at = conversation.get('created_at', '')
            create_time = to_epoch(created_at, 0)
            update_time = to_epoch(conversation.get('updated_at', ''), 0)

            # A present-but-null chat_messages value is treated as "no messages".
            chat_messages = conversation.get('chat_messages', []) or []

            for msg_idx, message in enumerate(chat_messages):
                # Normalise sender names: 'human' -> 'user', 'ai' -> 'assistant'.
                sender = message.get('sender', 'unknown')
                if sender == 'human':
                    role = 'user'
                elif sender in ('assistant', 'ai'):
                    role = 'assistant'
                else:
                    role = sender

                text_content = flatten_content(message.get('content', '')).strip()

                # Fall back to the top-level 'text' field when 'content' had no text.
                if not text_content and message.get('text'):
                    text_content = str(message['text']).strip()

                # Only keep messages with actual text content.
                if not text_content:
                    continue

                msg_time = to_epoch(message.get('created_at', created_at), create_time)

                messages.append({
                    'conversation_id': conv_id,
                    'conversation_title': title,
                    'conversation_create_time': create_time,
                    'conversation_update_time': update_time,
                    'message_id': f"{conv_id}_{msg_idx}",
                    'author_role': role,
                    'content': text_content,
                    'message_create_time': msg_time,
                    'content_type': 'text',
                    'word_count': len(text_content.split()),
                    'char_count': len(text_content)
                })

        self.messages_df = pd.DataFrame(messages)
        print(f"✅ Extracted {len(messages)} messages from {len(self.conversations)} conversations")

        if not self.messages_df.empty:
            self._add_derived_features()
        else:
            self.user_messages_df = pd.DataFrame()
            print("❌ No messages extracted")

        return self

    def _add_derived_features(self):
        """Add derived features to the messages dataframe"""
        # Convert timestamps
        self.messages_df['create_datetime'] = pd.to_datetime(
            self.messages_df['message_create_time'], unit='s', utc=True
        )
        self.messages_df['conversation_create_datetime'] = pd.to_datetime(
            self.messages_df['conversation_create_time'], unit='s', utc=True
        )

        # Extract date components
        self.messages_df['date'] = self.messages_df['create_datetime'].dt.date
        self.messages_df['hour'] = self.messages_df['create_datetime'].dt.hour
        self.messages_df['day_of_week'] = self.messages_df['create_datetime'].dt.day_name()
        self.messages_df['month'] = self.messages_df['create_datetime'].dt.month
        self.messages_df['year'] = self.messages_df['create_datetime'].dt.year

        # Filter to user messages
        self.user_messages_df = self.messages_df[
            self.messages_df['author_role'] == 'user'
        ].copy()

        print(f"   Total messages: {len(self.messages_df)}")
        print(f"   Your messages: {len(self.user_messages_df)}")
        print(f"   Role distribution: {dict(self.messages_df['author_role'].value_counts())}")

    def _get_message_context(self, message_row):
        """Get context for a message"""
        conv_id = message_row['conversation_id']
        msg_time = message_row['message_create_time']

        conv_messages = self.messages_df[
            (self.messages_df['conversation_id'] == conv_id) &
            (self.messages_df['message_create_time'] <= msg_time)
        ].sort_values('message_create_time')

        context_messages = conv_messages.tail(10)

        context = []
        for _, ctx_msg in context_messages.iterrows():
            role = ctx_msg['author_role']
            content = ctx_msg['content'][:1000]
            context.append(f"[{role}]: {content}")

        return "\n".join(context)

    def classify_messages(self, sample_size=None):
        """Label user messages with work flag, intent and topic.

        Args:
            sample_size: Optional cap on how many user messages to classify. When
                set and smaller than the number of user messages, a random sample
                (``random_state=42``) is classified instead of every message.

        Returns:
            ``self``. Results are stored in ``self.classified_messages``; nothing
            is stored when there are no user messages.
        """
        user_df = self.user_messages_df
        if len(user_df) == 0:
            print("❌ No user messages to classify")
            return self

        print("Starting message classification...")

        # Tell the reader which classification backend is active.
        print("🤖 Using OpenAI API for accurate classifications" if self.client
              else "📝 Using heuristic keyword-based classifications")

        if sample_size and len(user_df) > sample_size:
            batch = user_df.sample(sample_size, random_state=42)
            print(f"📊 Sampling {sample_size} messages for classification")
        else:
            batch = user_df.copy()

        total = len(batch)
        records = []
        api_calls_made = 0

        for position, (idx, row) in enumerate(batch.iterrows()):
            # Progress line every 50 messages
            if position % 50 == 0:
                print(f"   Classifying message {position + 1}/{total}...")

            context = self._get_message_context(row)

            # Remember whether a client existed before the classifiers ran.
            had_client = self.client is not None

            text = row['content']
            preview = text[:200] + '...' if len(text) > 200 else text

            record = {
                'message_idx': idx,
                'conversation_id': row['conversation_id'],
                'message_id': row['message_id'],
                'content': preview,
                'create_datetime': row['create_datetime'],
                'word_count': row['word_count'],
                'is_work': self._classify_work_related(context),
                'intent': self._classify_intent(context),
                'topic': self._classify_topic(context)
            }

            if had_client and self.client:
                api_calls_made += 3  # one call per classifier

            records.append(record)

        self.classified_messages = pd.DataFrame(records)

        if api_calls_made > 0:
            print(f"✅ Completed classification of {len(records)} messages using OpenAI API ({api_calls_made} API calls)")
        else:
            print(f"✅ Completed classification of {len(records)} messages using heuristics")

        return self

    def generate_analysis(self):
        """Draw the six-panel usage dashboard, then print the text summary.

        Panels (2 x 3 grid): work vs non-work pie, intent bars, topic bars,
        messages per hour, messages per weekday, and a word-count histogram.
        The first three use ``self.classified_messages``; the rest use
        ``self.user_messages_df``.

        Returns:
            ``None``. If there are no classified messages, a hint is printed and
            nothing is plotted.
        """
        classified = self.classified_messages
        if classified is None or len(classified) == 0:
            print("❌ No classified messages available. Run classify_messages() first.")
            return

        print("📊 Generating analysis report...")

        fig, axes = plt.subplots(2, 3, figsize=(18, 12))
        (work_ax, intent_ax, topic_ax), (hour_ax, day_ax, length_ax) = axes

        # Work vs Non-work: slice order is non-work (0) then work (1)
        work_counts = classified['is_work'].value_counts()
        slice_sizes = [work_counts.get(flag, 0) for flag in (0, 1)]
        work_ax.pie(slice_sizes, labels=['Non-Work', 'Work'], autopct='%1.1f%%')
        work_ax.set_title('Work vs Non-Work Messages')

        # Intent and topic share the same bar-chart layout
        categorical_panels = (
            (intent_ax, 'intent', 'User Intent Distribution'),
            (topic_ax, 'topic', 'Topic Distribution'),
        )
        for panel, column, heading in categorical_panels:
            counts = classified[column].value_counts()
            panel.bar(counts.index, counts.values)
            panel.set_title(heading)
            panel.tick_params(axis='x', rotation=45)

        user_df = self.user_messages_df

        # Hourly usage
        per_hour = user_df.groupby('hour').size()
        hour_ax.bar(per_hour.index, per_hour.values)
        hour_ax.set_title('Usage by Hour')

        # Daily usage, Monday first; days without messages show as zero
        weekdays = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
        per_day = user_df.groupby('day_of_week').size().reindex(weekdays, fill_value=0)
        day_ax.bar(per_day.index, per_day.values)
        day_ax.set_title('Usage by Day of Week')
        day_ax.tick_params(axis='x', rotation=45)

        # Message length
        length_ax.hist(user_df['word_count'], bins=30, alpha=0.7)
        length_ax.set_title('Message Length Distribution')
        length_ax.set_xlabel('Word Count')

        plt.tight_layout()
        plt.show()

        self._print_summary()

    def _print_summary(self):
        """Print summary statistics"""
        print("\n" + "="*60)
        print("CHATGPT PERSONAL USAGE ANALYSIS SUMMARY")
        print("="*60)

        print(f"\n📊 BASIC STATISTICS:")
        print(f"   Total Conversations: {len(self.conversations):,}")
        print(f"   Total Messages: {len(self.messages_df):,}")
        print(f"   Your Messages: {len(self.user_messages_df):,}")
        print(f"   Classified Messages: {len(self.classified_messages):,}")

        if len(self.user_messages_df) > 0:
            start_date = self.user_messages_df['create_datetime'].min()
            end_date = self.user_messages_df['create_datetime'].max()
            print(f"   Time Range: {start_date.strftime('%Y-%m-%d')} to {end_date.strftime('%Y-%m-%d')}")

            avg_length = self.user_messages_df['word_count'].mean()
            print(f"   Average Message Length: {avg_length:.1f} words")

        if len(self.classified_messages) > 0:
            print(f"\n🏢 WORK VS NON-WORK:")
            work_stats = self.classified_messages['is_work'].value_counts(normalize=True) * 100
            print(f"   Work-Related: {work_stats.get(1, 0):.1f}%")
            print(f"   Non-Work: {work_stats.get(0, 0):.1f}%")

            print(f"\n🎯 INTENT DISTRIBUTION:")
            intent_stats = self.classified_messages['intent'].value_counts(normalize=True) * 100
            for intent, pct in intent_stats.items():
                print(f"   {intent}: {pct:.1f}%")

            print(f"\n📚 TOPIC DISTRIBUTION:")
            topic_stats = self.classified_messages['topic'].value_counts(normalize=True) * 100
            for topic, pct in topic_stats.items():
                print(f"   {topic}: {pct:.1f}%")

            print(f"\n📈 COMPARISON TO RESEARCH PAPER:")
            your_work_pct = work_stats.get(1, 0)
            print(f"   Your work usage: {your_work_pct:.1f}% (Paper average: ~27%)")

            if 'Asking' in intent_stats:
                your_asking_pct = intent_stats.get('Asking', 0)
                print(f"   Your 'Asking' usage: {your_asking_pct:.1f}% (Paper average: ~49%)")

        print("\n" + "="*60)

# Confirmation banner shown once the class definition cell has run.
print("\n".join((
    "✅ FIXED analyzer loaded with improved OpenAI API handling!",
    "\nKey improvements:",
    "- Better API key detection from multiple sources",
    "- Improved error handling and fallback logic",
    "- Context truncation to avoid token limits",
    "- API usage tracking and reporting",
    "- More detailed initialization messages",
    "\nNow run:",
    "analyzer = FixedCustomFormatAnalyzer()",
    "analyzer.load_conversations('conversations.json')",
    "analyzer.extract_messages()",
    "analyzer.classify_messages(sample_size=500)",
    "analyzer.generate_analysis()",
)))

In [None]:
# Run the full pipeline: load -> extract -> classify -> report.
# Upper bound on how many user messages get classified; with the API enabled
# each classified message is counted as 3 API calls.
SAMPLE_SIZE = 6000

# The OpenAI key is optional. Reading a Colab secret raises when the secret is
# missing or notebook access has not been granted, so treat that as "no key"
# instead of aborting the cell.
try:
    api_key = userdata.get('OPENAI_API_KEY')
except Exception as exc:
    print(f"⚠️ Could not read OPENAI_API_KEY from Colab secrets: {exc}")
    api_key = None

# NOTE(review): presumably the no-argument constructor selects heuristic mode
# (or does its own key detection) — confirm against __init__.
analyzer = FixedCustomFormatAnalyzer(api_key) if api_key else FixedCustomFormatAnalyzer()
analyzer.load_conversations('conversations.json')
analyzer.extract_messages()
analyzer.classify_messages(sample_size=SAMPLE_SIZE)
analyzer.generate_analysis()