# Teams Clone - Data Analysis

Analyze user behavior, message patterns, and platform usage statistics.

## Features:
- 📊 Message frequency analysis
- 👥 User activity patterns
- 📈 Channel engagement metrics
- ⏰ Temporal analysis (hourly, daily patterns)
- 🔥 Popular topics and keywords

## 1. Setup and Imports

In [None]:
# Install required packages
!pip install requests pandas matplotlib seaborn wordcloud plotly

In [None]:
import json
import string
from collections import Counter
from datetime import datetime, timedelta

import matplotlib.pyplot as plt
import pandas as pd
import requests
import seaborn as sns

# Set style
# Applies a global Matplotlib style sheet and a seaborn colour palette; every
# figure drawn after this point inherits both.
# NOTE(review): the 'seaborn-v0_8-*' style names look version-specific —
# confirm the installed Matplotlib release provides this style.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

# Backend URL
# Prefix for every request URL built by the fetch helpers in section 2.
BASE_URL = "http://localhost:3001"

print("‚úÖ Imports successful!")

## 2. Fetch Data from Backend

In [None]:
def _fetch_list(path, key, label, timeout):
    """GET ``BASE_URL + path`` and return the list stored under *key*.

    Shared implementation for the public fetch helpers. Every failure mode is
    reported with ``print`` and turned into an empty list so the notebook keeps
    running when the backend is down or misbehaving.

    Args:
        path: URL path (including any query string) appended to ``BASE_URL``.
        key: Name of the field in the JSON body that holds the list.
        label: Human-readable name used in error messages.
        timeout: Seconds to wait for the backend before giving up.

    Returns:
        The list found under *key*, or ``[]`` on any error or if *key* is absent.
    """
    try:
        # Without a timeout a stalled backend would hang the cell indefinitely.
        response = requests.get(f"{BASE_URL}{path}", timeout=timeout)
    except requests.RequestException as e:
        print(f"Error fetching {label}: {e}")
        return []

    if response.status_code != 200:
        # Surface the status instead of silently pretending there is no data.
        print(f"Error fetching {label}: HTTP {response.status_code}")
        return []

    try:
        payload = response.json()
    except ValueError as e:
        # Raised when the body is not valid JSON.
        print(f"Error fetching {label}: {e}")
        return []

    items = payload.get(key, []) if isinstance(payload, dict) else None
    if not isinstance(items, list):
        print(f"Error fetching {label}: unexpected response shape")
        return []
    return items


def fetch_messages(channel_id="general", timeout=10):
    """Fetch all messages from a channel.

    Args:
        channel_id: Channel identifier inserted into the request path.
        timeout: Seconds to wait for the backend before giving up.

    Returns:
        The ``messages`` list from the response, or ``[]`` on any error.
    """
    return _fetch_list(f"/api/messages/{channel_id}", 'messages', "messages", timeout)


def fetch_rl_history(limit=100, timeout=10):
    """Fetch RL episode history.

    Args:
        limit: Value sent as the ``limit`` query parameter.
        timeout: Seconds to wait for the backend before giving up.

    Returns:
        The ``episodes`` list from the response, or ``[]`` on any error.
    """
    return _fetch_list(f"/env/history?limit={limit}", 'episodes', "RL history", timeout)

# Load both datasets once; the analysis cells below work from these two lists.
messages, rl_episodes = fetch_messages(), fetch_rl_history()

# Report how much data came back
print(
    f"üìä Fetched {len(messages)} messages",
    f"ü§ñ Fetched {len(rl_episodes)} RL episodes",
    sep="\n",
)

## 3. Message Analysis

In [None]:
# Tabulate the fetched messages and print headline statistics
if not messages:
    print("‚ö†Ô∏è No messages found. Start the backend and send some messages first!")
else:
    df = pd.DataFrame(messages)
    divider = "=" * 50

    print(divider)
    print("üìä MESSAGE STATISTICS")
    print(divider)
    print(f"Total Messages: {len(df)}")

    # The sender column is optional; report N/A when it is absent
    sender_total = df['sender'].nunique() if 'sender' in df.columns else 'N/A'
    print(f"Unique Users: {sender_total}")

    # Length statistics need the content column; otherwise emit two blank lines
    if 'content' in df.columns:
        content_lengths = df['content'].str.len()
        print(f"Average Message Length: {content_lengths.mean():.1f} chars")
        print(f"Total Characters: {content_lengths.sum():,}")
    else:
        print("")
        print("")
    print(divider)

    # Preview the first rows of the table
    display(df.head(10))

## 4. User Activity Analysis

In [None]:
if messages and 'sender' in df.columns:
    # Count messages per sender
    user_activity = df['sender'].value_counts()

    # Bar chart of the first ten entries
    plt.figure(figsize=(12, 6))
    axes = user_activity.head(10).plot.bar(color='skyblue')
    axes.set_title('Top 10 Most Active Users', fontsize=16, fontweight='bold')
    axes.set_xlabel('User', fontsize=12)
    axes.set_ylabel('Number of Messages', fontsize=12)
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.show()

    # Ranked text listing of the first five entries
    print("\nüèÜ Top 5 Most Active Users:")
    leaderboard = user_activity.head(5)
    for rank, (name, total) in enumerate(leaderboard.items(), start=1):
        print(f"{rank}. {name}: {total} messages")

## 5. Message Length Distribution

In [None]:
if messages and 'content' in df.columns:
    # Store per-message character counts as a new column on the table
    df['message_length'] = df['content'].str.len()
    lengths = df['message_length']
    mean_length, median_length = lengths.mean(), lengths.median()

    # Histogram with dashed marker lines at the mean and the median
    plt.figure(figsize=(12, 6))
    plt.hist(lengths, bins=50, color='coral', alpha=0.7, edgecolor='black')
    for marker_value, marker_colour, marker_name in (
        (mean_length, 'red', 'Mean'),
        (median_length, 'green', 'Median'),
    ):
        plt.axvline(
            marker_value,
            color=marker_colour,
            linestyle='--',
            linewidth=2,
            label=f'{marker_name}: {marker_value:.1f}',
        )
    plt.title('Message Length Distribution', fontsize=16, fontweight='bold')
    plt.xlabel('Message Length (characters)', fontsize=12)
    plt.ylabel('Frequency', fontsize=12)
    plt.legend()
    plt.tight_layout()
    plt.show()

    # Text summary of the same distribution
    print("\nüìè Message Length Statistics:")
    print(f"Mean: {mean_length:.1f} chars")
    print(f"Median: {median_length:.1f} chars")
    print(f"Min: {lengths.min()} chars")
    print(f"Max: {lengths.max()} chars")
    print(f"Std Dev: {lengths.std():.1f} chars")

## 6. Most Common Words

In [None]:
if messages and 'content' in df.columns:
    # Join every message into one text blob. Missing contents are dropped
    # first; astype(str) would otherwise turn them into the literal word "nan".
    all_text = ' '.join(df['content'].dropna().astype(str))

    # Split on whitespace, then trim punctuation from both ends of each token
    # so that "hello", "hello," and "hello!" are counted as the same word.
    words = [token.strip(string.punctuation) for token in all_text.lower().split()]

    # Remove common stop words and very short tokens (including tokens that
    # were pure punctuation and are now empty)
    stop_words = {'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'is', 'are', 'was', 'were'}
    filtered_words = [w for w in words if w not in stop_words and len(w) > 2]

    # Count
    word_counts = Counter(filtered_words)
    top_words = word_counts.most_common(15)

    # Plot: horizontal bars, most frequent word at the top
    words_df = pd.DataFrame(top_words, columns=['Word', 'Count'])
    plt.figure(figsize=(12, 6))
    plt.barh(words_df['Word'], words_df['Count'], color='lightgreen')
    plt.title('Top 15 Most Common Words', fontsize=16, fontweight='bold')
    plt.xlabel('Frequency', fontsize=12)
    plt.ylabel('Word', fontsize=12)
    # barh draws the first row at the bottom; flip so rank 1 is on top
    plt.gca().invert_yaxis()
    plt.tight_layout()
    plt.show()

## 7. RL Agent Performance Analysis

In [None]:
if not rl_episodes:
    print("‚ö†Ô∏è No RL episodes found. Run the RL agent first!")
else:
    # Tabulate the episodes
    rl_df = pd.DataFrame(rl_episodes)
    episode_count = len(rl_df)
    divider = "=" * 50

    print(divider)
    print("ü§ñ RL AGENT STATISTICS")
    print(divider)
    print(f"Total Episodes: {episode_count}")

    # Completion figures depend on the optional 'completed' column
    has_completed = 'completed' in rl_df.columns
    completed_total = rl_df['completed'].sum() if has_completed else 'N/A'
    print(f"Completed Tasks: {completed_total}")
    if has_completed:
        print(f"Success Rate: {(completed_total / episode_count * 100):.1f}%")
    else:
        print("")

    # Each remaining metric prints a blank line when its column is missing
    has_reward = 'totalReward' in rl_df.columns
    if has_reward:
        print(f"Average Reward: {rl_df['totalReward'].mean():.3f}")
    else:
        print("")
    if 'steps' in rl_df.columns:
        print(f"Average Steps: {rl_df['steps'].mean():.1f}")
    else:
        print("")
    print(divider)

    # Line chart of total reward against episode index
    if has_reward:
        plt.figure(figsize=(12, 6))
        plt.plot(rl_df.index, rl_df['totalReward'], marker='o', linestyle='-', alpha=0.7)
        plt.title('RL Agent Rewards Over Episodes', fontsize=16, fontweight='bold')
        plt.xlabel('Episode', fontsize=12)
        plt.ylabel('Total Reward', fontsize=12)
        plt.grid(True, alpha=0.3)
        plt.tight_layout()
        plt.show()

## 8. Task Type Distribution

In [None]:
if rl_episodes and 'taskType' in rl_df.columns:
    # Number of episodes recorded for each task type
    task_counts = rl_df['taskType'].value_counts()

    # Pie chart of each task type's share of episodes
    pie_options = dict(
        labels=task_counts.index,
        autopct='%1.1f%%',
        startangle=90,
        colors=sns.color_palette('pastel'),
    )
    plt.figure(figsize=(10, 10))
    plt.pie(task_counts, **pie_options)
    plt.title('Task Type Distribution', fontsize=16, fontweight='bold')
    plt.tight_layout()
    plt.show()

    # Percentage of completed episodes per task type, drawn as a bar chart
    if 'completed' in rl_df.columns:
        success_by_task = rl_df.groupby('taskType')['completed'].mean().mul(100)

        plt.figure(figsize=(12, 6))
        bar_axes = success_by_task.plot.bar(color='lightblue')
        bar_axes.set_title('Success Rate by Task Type', fontsize=16, fontweight='bold')
        bar_axes.set_xlabel('Task Type', fontsize=12)
        bar_axes.set_ylabel('Success Rate (%)', fontsize=12)
        plt.xticks(rotation=45, ha='right')
        plt.ylim(0, 100)
        plt.tight_layout()
        plt.show()

## 9. Export Summary Report

In [None]:
# Create summary report.
# pandas reductions return NumPy scalars; NumPy integers in particular are
# rejected by the json module ("Object of type int64 is not JSON
# serializable"), so every aggregate is converted to a built-in int/float.
has_sender = bool(messages) and 'sender' in df.columns
has_content = bool(messages) and 'content' in df.columns
has_completed = bool(rl_episodes) and 'completed' in rl_df.columns
has_reward = bool(rl_episodes) and 'totalReward' in rl_df.columns

report = {
    'timestamp': datetime.now().isoformat(),
    'messages': {
        'total': len(messages),
        'unique_users': int(df['sender'].nunique()) if has_sender else 0,
        # Computed from 'content' directly so the value does not silently fall
        # back to 0 when the message-length cell was not run beforehand.
        'avg_length': float(df['content'].str.len().mean()) if has_content else 0
    },
    'rl_agent': {
        'total_episodes': len(rl_episodes),
        'completed': int(rl_df['completed'].sum()) if has_completed else 0,
        'success_rate': float(rl_df['completed'].sum() / len(rl_df) * 100) if has_completed else 0,
        'avg_reward': float(rl_df['totalReward'].mean()) if has_reward else 0
    }
}

# Serialize before opening the file so a serialization error cannot leave a
# truncated analysis_report.json behind.
report_json = json.dumps(report, indent=2)

# Save to file
with open('analysis_report.json', 'w', encoding='utf-8') as f:
    f.write(report_json)

print("‚úÖ Report saved to analysis_report.json")
print(report_json)