# 📊 Telegram Group Chat Analysis
## Understanding Group Activity and Message Trends

### 📌 Objective
This notebook analyzes Telegram group chat data to extract insights such as:
- User activity patterns 📅
- Most active members 🗣️
- Commonly used words and emojis 🔤😂
- User message trends 📈

### 🛠️ Libraries Used
We will use the following Python libraries:
- `pandas` for data manipulation
- `matplotlib` and `seaborn` for visualizations
- `wordcloud` for text analysis
- `ipywidgets` for input collection
- `calplot` for heatmaps

---


## 🔧 Installing Required Libraries
Run the following command if you haven't installed the required libraries:


In [None]:
# Install necessary libraries (Uncomment line below if packages are not already installed)
# %pip install matplotlib seaborn numpy pandas calplot wordcloud ipywidgets

## 📥 Importing Libraries
Now, let's import the necessary libraries for analysis.

In [None]:
# import libraries
import os
import json
import pytz
from itertools import islice
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import calplot
import calendar
import re
from wordcloud import WordCloud
from collections import defaultdict, Counter
from datetime import datetime, timedelta
import ipywidgets as widgets
from IPython.display import display
from IPython.display import clear_output


# 📂 Loading and Preprocessing Data  
Before proceeding, make sure to **download the Telegram chat data**: in Telegram Desktop, open *Settings → Advanced → Export Telegram data* and choose the machine-readable **JSON** format.  

## 🛠️ Steps Involved:  
- Load the exported **Telegram chat JSON file** ⏳  
- Extract relevant data from a **private group chat** based on user input (e.g., messages from a specific year) 📜  
- Filter and process messages to create a structured list, with each message as an individual element 📑  


In [None]:
# TODO: Ensure your Telegram data JSON file is placed in the current directory.
# TODO: Change the value of `file_name` below to the actual file name if different.

file_name = "test.json"  # Update this string with the name of your data download

# Load the export when it exists. Otherwise fall back to an empty dict so that
# `data` is always defined: the following cell then reports the invalid format
# instead of failing with a NameError (or silently reusing data left over from
# an earlier run of this cell).
if os.path.exists(file_name):
    with open(file_name, "r", encoding="utf-8") as file:
        data = json.load(file)
    print(f"✅ Successfully loaded '{file_name}'")
else:
    data = {}
    print(f"❌ Error: File '{file_name}' not found. Please place it in the current directory.")

In [None]:
# List every chat found in the export and let the user pick one from a dropdown
if "chats" not in data or "list" not in data["chats"]:
    print("❌ Error: Invalid data format. 'chats' key not found.")
else:
    chats_list = data["chats"]["list"]

    # Chats without a "name" key are listed under a placeholder label
    available_chats = [entry.get("name", "Unnamed Chat") for entry in chats_list]

    if not available_chats:
        print("⚠️ No private group chats available in the dataset.")
    else:
        # The first chat of the export is pre-selected
        selected_chat = widgets.Dropdown(
            description="Select Chat:",
            options=available_chats,
            value=available_chats[0],
            disabled=False,
        )
        display(selected_chat)


In [None]:
# Offer the current year and the ten years before it (oldest first)
current_year = datetime.now().year
valid_years = [current_year - years_back for years_back in range(10, -1, -1)]  # Adjust range if needed

# A dropdown (rather than free text) guarantees that only a valid year can be chosen
selected_year = widgets.Dropdown(
    description="Select Year:",
    options=valid_years,
    value=current_year,  # Pre-select the current year
    disabled=False,
)

display(selected_year)

In [None]:
# Fetch all messages sent during the selected year

# Messages of the selected chat that fall inside the selected year
chat_data_for_selected_year = []

# Unix-timestamp window for the selected year. The datetimes are naive, so
# .timestamp() interprets them in the machine's local time zone.
comparison_year_first = datetime(selected_year.value, 1, 1, 0, 0, 0)
comparison_year_last = datetime(selected_year.value + 1, 1, 1, 0, 0, 0)
comparison_timestamp_first = int(comparison_year_first.timestamp())
comparison_timestamp_last = int(comparison_year_last.timestamp())

for chat in chats_list:
    # Use the same "Unnamed Chat" fallback as the chat dropdown, so a chat without
    # a "name" key neither raises KeyError nor becomes impossible to select.
    if chat.get("name", "Unnamed Chat") == selected_chat.value:
        # Half-open interval [Jan 1 00:00:00, next Jan 1 00:00:00): a message sent
        # exactly at midnight on Jan 1 belongs to the selected year.
        chat_data_for_selected_year = [
            message for message in chat.get("messages", [])
            if comparison_timestamp_first <= int(message["date_unixtime"]) < comparison_timestamp_last
        ]

# Report how many messages were collected
print(f"✅ Found {len(chat_data_for_selected_year):,} messages for {selected_chat.value} in {selected_year.value}.")

# 📊 Exploratory Data Analysis (EDA)
We will explore:
- 📈 General chat statistics
- 🔥 Most active users
- ⏰ Chat activity trends over time
- 🌎 Word and emoji frequency

# 🎨 Data Visualization
We will create:
- 📊 Bar charts for user activity
- 📅 A heatmap of daily message frequency
- ☁️ A word cloud of the most used words


In [None]:
# Show the first and the last message of the selected year, if there are any

if not chat_data_for_selected_year:
    print("❌ No messages found for the selected year.")
else:
    # (heading, message) pairs; the leading "\n" separates the second section from the first
    boundary_messages = (
        ("📅 First message of the year:", chat_data_for_selected_year[0]),
        ("\n📅 Last message of the year:", chat_data_for_selected_year[-1]),
    )
    for heading, boundary_message in boundary_messages:
        print(heading)
        print(f"Sender: {boundary_message.get('from', 'Unknown Sender')}")
        print(f"Message: {boundary_message.get('text', 'No content available')}")


In [None]:
# Previous names of the selected group chat

# Service actions that set or change the group title
TITLE_ACTIONS = ('create_group', 'edit_group_title')

# date -> list of (title, actor) pairs recorded on that date
previous_names = defaultdict(list)

for message in chat_data_for_selected_year:
    is_title_event = message.get('type') == 'service' and message.get('action') in TITLE_ACTIONS
    if not is_title_event:
        continue

    raw_timestamp = message.get('date_unixtime')
    if not raw_timestamp:
        continue  # No usable timestamp on this service message

    # Only the calendar day (local time) is kept as the key
    changed_on = datetime.fromtimestamp(int(raw_timestamp)).date()
    previous_names[changed_on].append(
        (message.get('title', 'Unknown Title'), message.get('actor', 'Unknown Actor'))
    )

# Report the title history in chronological order
if not previous_names:
    print("No title changes found for the selected group chat.")
else:
    print("Previous names of the selected group chat:")
    print("-------------------------------------")
    for changed_on, changes in sorted(previous_names.items()):
        for title, actor in changes:
            print(f"{changed_on}: Name -> {title} | Changed by -> {actor}")

In [None]:
# Message contribution by each member

member_contribution = defaultdict(int)  # sender -> number of messages
members = set()  # every sender seen in the selected year

for message in chat_data_for_selected_year:
    sender = message.get('from')
    if not sender:
        continue  # messages without a sender are not attributed to anyone
    members.add(sender)
    member_contribution[sender] += 1

# Most talkative members first
ranked_members = sorted(member_contribution.items(), key=lambda pair: pair[1], reverse=True)
sorted_contributions = dict(ranked_members)

print("📊 Message Contribution by Each Member")
print("=======================================")
if not sorted_contributions:
    print(f"❌ No messages found in {selected_year.value}.")
else:
    for index, member in enumerate(sorted_contributions, start=1):
        print(f"{index}. {member}: {sorted_contributions[member]:,} messages")

print("\nMembers:")
print(members)


In [None]:
# Pie chart of contribution by user

# Members ordered by message count, highest first
sorted_contributions = dict(
    sorted(member_contribution.items(), key=lambda pair: pair[1], reverse=True)
)

contribution_data_labels = list(sorted_contributions)
contribution_data_values = list(sorted_contributions.values())

# Members below 2% of all messages are folded into a single "Others" slice
threshold = 0.02 * sum(contribution_data_values)
major_contributors = [
    (label, value) for label, value in sorted_contributions.items() if value >= threshold
]
consolidated_labels = [label for label, _ in major_contributors]
consolidated_values = [value for _, value in major_contributors]

other_contribution = sum(value for value in contribution_data_values if value < threshold)
if other_contribution > 0:
    consolidated_labels.append("Others")
    consolidated_values.append(other_contribution)

# One shade of blue per slice
colors_cntr = plt.get_cmap('Blues')(np.linspace(0.3, 0.8, len(consolidated_labels)))

# Pull the first (largest) slice slightly out of the pie
explode = [0] * len(consolidated_labels)
if explode:
    explode[0] = 0.1

fig, ax = plt.subplots(figsize=(8, 8))
ax.pie(
    consolidated_values,
    labels=consolidated_labels,
    autopct='%1.1f%%',
    startangle=90,
    colors=colors_cntr,
    shadow=False,
    explode=explode,
    wedgeprops={'edgecolor': 'black'},
)
ax.set_title(f"{selected_chat.value} Contribution Data ({selected_year.value})")

plt.tight_layout()
plt.show()


In [None]:
# Number of messages per day and per month

message_per_day = defaultdict(int)    # date -> message count
message_per_month = defaultdict(int)  # month name -> message count

for message in chat_data_for_selected_year:
    sent_at = datetime.fromtimestamp(int(message['date_unixtime']))
    message_per_day[sent_at.date()] += 1
    message_per_month[calendar.month_name[sent_at.month]] += 1

# Days without any message never appear in the loop above; give each of them an explicit 0
start_date = datetime(selected_year.value, 1, 1).date()  # First day of the year
end_date = datetime(selected_year.value, 12, 31).date()  # Last day of the year

current_date = start_date
for _ in range((end_date - start_date).days + 1):
    message_per_day.setdefault(current_date, 0)
    current_date += timedelta(days=1)

# Rebuild the monthly counts in calendar order, with 0 for silent months
message_per_month = dict(
    (month, message_per_month.get(month, 0)) for month in calendar.month_name[1:]
)

# Summary output
print("\n📅 Messages Per Day (Sample 10 Days):")
for day in sorted(message_per_day)[:10]:  # Only the first 10 days of the year
    print(f"{day}: {message_per_day[day]:,}")
print("... (Truncated for brevity)")

print("\n📆 Messages Per Month:")
for month in message_per_month:
    print(f"{month}: {message_per_month[month]:,}")

In [None]:
# Visualise the number of messages per day with a bar graph

# Dates and their message counts, in dictionary order
active_dates = list(message_per_day)
active_date_values = list(message_per_day.values())

# Tabular form for Seaborn: 'Date' becomes a datetime column and rows are put in chronological order
active_dates_data = {'Date': active_dates, 'Value': active_date_values}
active_dates_data_df = (
    pd.DataFrame(active_dates_data)
    .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
    .sort_values(by='Date')
)

# Very wide figure: there is one bar per day of the year
sns.set_theme(style="darkgrid", rc={'figure.figsize':(80,40)})

sns.barplot(data=active_dates_data_df, x='Date', y='Value')
plt.title('Date vs Messages sent')
plt.xticks(rotation=90, ha='right')  # Vertical tick labels so the dates stay legible
plt.show()

In [None]:
# Number of messages per day as a calendar heatmap

sns.set_theme(style="darkgrid", rc={'figure.figsize':(20,10)})

# calplot expects a Series of values indexed by date
daily_message_counts = active_dates_data_df.set_index('Date')['Value']
calplot.calplot(daily_message_counts, cmap='Blues', linewidth=0.4, colorbar=True)

plt.title(f'Activity Calendar for {selected_year.value}')
plt.show()

In [None]:
# Number of messages per month as a bar graph

month_list = list(message_per_month)
messages_by_month_values = [message_per_month[month] for month in month_list]

# Tabular form for Seaborn
monthly_message_data = {'Months': month_list, 'Messages': messages_by_month_values}
monthly_message_data_Df = pd.DataFrame(monthly_message_data)

# Bars shaded with a blue gradient
ax = sns.barplot(data=monthly_message_data_Df, x='Months', y='Messages', palette="Blues_r")

# Write each bar's value just above it
for bar in ax.patches:
    bar_height = bar.get_height()
    bar_centre = bar.get_x() + bar.get_width() / 2
    ax.annotate(f'{int(bar_height)}', (bar_centre, bar_height),
                ha='center', va='bottom', fontsize=12, color='black')

# Title and axis labels
plt.title(f'Monthly Message Activity ({selected_year.value})', fontsize=18, pad=20)
plt.xlabel("Months", fontsize=14)
plt.ylabel("Number of Messages", fontsize=14)
plt.xticks(rotation=45, ha='right')  # Slanted month names avoid overlap

plt.show()

In [None]:
# Top 5 busiest days: the days on which the most messages were sent

# Every day ranked by message count, busiest first
sorted_message_per_day = dict(
    sorted(message_per_day.items(), key=lambda day_and_count: day_and_count[1], reverse=True)
)

# Keep only the first five entries of the ranking
top_days = list(sorted_message_per_day.items())[:5]

print(f"📆 Top {len(top_days)} Busiest Days in {selected_year.value}")
print("-------------------------------------------------------")
for rank, day_and_count in enumerate(top_days, 1):
    date, count = day_and_count
    print(f"{rank}. {date} | {count} messages sent")

In [None]:
# Messages per day per person

# date -> {member -> message count}
message_per_day_per_person = defaultdict(lambda: defaultdict(int))

# Start every day of the year with a zero count for every member
for date in pd.date_range(start=start_date, end=end_date):
    message_per_day_per_person[date.date()] = dict.fromkeys(members, 0)

# Add each message to its sender's count for the day it was sent
for message in chat_data_for_selected_year:
    sender = message.get('from')
    if not sender:
        continue  # Messages from deleted accounts are ignored
    date_only = datetime.fromtimestamp(int(message['date_unixtime'])).date()
    message_per_day_per_person[date_only][sender] += 1

# Print a sample: only the first 5 days, to keep the output readable
print(f"📊 Message Per Day Per Person for {selected_year.value}") 
print("-----------------------------------------------------")  
first_five_days = islice(message_per_day_per_person.items(), 5)
for position, (date, person_counts) in enumerate(first_five_days, start=1):
    print(f'📅 Date: {date}')
    for person in person_counts:
        print(f'   👤 {person}: {person_counts[person]}')
    if position == 5:
        print("... (Output truncated for readability)")

In [None]:
# Prepare the dataframes used to plot messages per day per person

# Wide format: after transposing, dates are the rows and members are the columns
message_per_day_per_person_df = pd.DataFrame(message_per_day_per_person).T
message_per_day_per_person_df.index = pd.to_datetime(message_per_day_per_person_df.index)

# Turn the datetime index into a regular 'Date' column
message_per_day_per_person_df = (
    message_per_day_per_person_df.reset_index().rename(columns={"index": "Date"})
)

# Long format for Seaborn: one row per (date, member) pair
message_per_day_per_person_long_df = pd.melt(
    message_per_day_per_person_df,
    id_vars=["Date"],
    var_name="Person",
    value_name="Contribution",
)

message_per_day_per_person_long_df.head()

In [None]:
# Plot one activity line graph per user

sns.set_style("whitegrid")

# Figure height grows with the number of users, clamped to the range 10..40
num_users = len(members)
fig_height = max(10, min(4 * num_users, 40))
fig, axes = plt.subplots(num_users, 1, figsize=(15, fig_height), sharex=True)

# With a single user, plt.subplots returns one Axes object instead of an array
if num_users == 1:
    axes = [axes]

# One subplot per user, in the iteration order of `members`
for user_axis, user in zip(axes, members):
    user_axis.plot(
        message_per_day_per_person_df['Date'],
        message_per_day_per_person_df[user],
        label=user,
        linestyle='-',
    )
    user_axis.set_title(f"{user} Activity Over Time", fontsize=12)
    user_axis.set_ylabel("Messages Sent", fontsize=10)
    user_axis.legend()
    user_axis.grid(True)

# Only the bottom subplot carries the shared x-axis label
axes[-1].set_xlabel("Date", fontsize=12)
plt.xticks(rotation=45)

# Tighten the layout, then add extra vertical room between subplots
plt.tight_layout()
plt.subplots_adjust(hspace=0.5)

plt.show()


In [None]:
# Messages per day per person as a stacked bar graph

# Pivot the long-format frame back to wide format: one row per date, one column per member
stacked_bar_graph_df = message_per_day_per_person_long_df.pivot(index="Date", columns="Person", values="Contribution")

sns.set_style("whitegrid")

# Figure height grows with the number of members, clamped to the range 8..30
fig_height = min(max(1.5 * len(stacked_bar_graph_df.columns), 8), 30)
plt.figure(figsize=(15, fig_height))

# One colour per member (the palette repeats when there are more than ten members)
colors = sns.color_palette("tab10", n_colors=len(stacked_bar_graph_df.columns))

# Running height of the stack for each date; the next member's bars start from here
bottom = np.zeros(len(stacked_bar_graph_df))

# Draw each member's bars on top of what has been stacked so far, using the
# member's own palette colour (the palette was previously computed but never applied)
for person, color in zip(stacked_bar_graph_df.columns, colors):
    contributions = stacked_bar_graph_df[person].to_numpy()
    plt.bar(stacked_bar_graph_df.index, contributions,
            label=person, bottom=bottom, color=color, alpha=0.8)
    # Build a new array rather than adding a pandas Series into `bottom` in place
    bottom = bottom + contributions

# Title and axis labels
plt.title("Daily Contributions by Team Members (Stacked)", fontsize=16)
plt.xlabel("Date", fontsize=12)
plt.ylabel("Messages Sent", fontsize=12)

# Keep the x-axis readable: slanted labels and at most 10 ticks
plt.xticks(rotation=45, ha="right")
plt.gca().xaxis.set_major_locator(plt.MaxNLocator(10))

plt.legend(title="Members", bbox_to_anchor=(1.05, 1), loc='upper left')  # Legend outside the axes
plt.grid(axis="y", linestyle="--", alpha=0.7)

plt.tight_layout()
plt.show()

In [None]:
# The longest inactive streak and the longest active streak of each person

def initialize_streaks(members):
    """Build the per-user streak bookkeeping structure.

    Returns a dict mapping every user in ``members`` to two independent
    records, ``'inactive'`` and ``'active'``. Each record holds the streak
    currently being tracked (``start``/``end``) and the longest streak seen
    so far (``max_range``, ``max_start``, ``max_end``).
    """
    def blank_record():
        # A fresh dict on every call, so no two records share state
        return {
            'start': None,
            'end': None,
            'max_range': timedelta(0),
            'max_start': None,
            'max_end': None,
        }

    streaks = {}
    for user in members:
        streaks[user] = {'inactive': blank_record(), 'active': blank_record()}
    return streaks

def update_streak(streak, current_start, current_end):
    """Store the given streak as the longest one if it beats the recorded maximum.

    Args:
        streak: Record with the keys ``max_range``, ``max_start`` and
            ``max_end``; it is updated in place.
        current_start: First day of the streak being closed, or ``None``.
        current_end: Last day of the streak being closed, or ``None``.

    Nothing happens when either endpoint is ``None`` (no streak to close).
    """
    if current_start is None or current_end is None:
        return

    # Both endpoints are days that belong to the streak, so the length is
    # inclusive: a streak that starts and ends on the same day lasts 1 day.
    # Without the extra day, one-day streaks would measure 0 and never be recorded.
    current_range = current_end - current_start + timedelta(days=1)
    if current_range > streak['max_range']:
        streak['max_range'] = current_range
        streak['max_start'] = current_start
        streak['max_end'] = current_end

def process_message_data(message_per_day_per_person, members):
    """Determine each member's longest active and inactive streaks.

    Args:
        message_per_day_per_person: Mapping of date -> {user -> message count}.
            The days are processed in the mapping's iteration order, so the
            mapping is expected to be in chronological order.
        members: Users to track; a user missing from a day's counts is
            treated as having sent 0 messages that day.

    Returns:
        The structure created by ``initialize_streaks``, with the ``max_*``
        fields of every record filled in.
    """
    streaks = initialize_streaks(members)

    for date, user_counts in message_per_day_per_person.items():
        for user in members:
            if user_counts.get(user, 0) == 0:
                # Silent day: the inactive streak continues, any active streak ends
                ongoing, finished = streaks[user]['inactive'], streaks[user]['active']
            else:
                # Day with messages: the active streak continues, any inactive streak ends
                ongoing, finished = streaks[user]['active'], streaks[user]['inactive']

            if ongoing['start'] is None:
                ongoing['start'] = date
            ongoing['end'] = date

            # Close the streak of the opposite kind and reset it
            update_streak(finished, finished['start'], finished['end'])
            finished['start'] = None
            finished['end'] = None

    # A streak still running on the last processed day is never followed by a
    # day of the opposite kind, so it has not been compared with the maximum yet.
    for user_streaks in streaks.values():
        for streak in user_streaks.values():
            update_streak(streak, streak['start'], streak['end'])

    return streaks

def print_streaks(streaks, year):
    """Print every user's longest inactive and longest active streak.

    ``streaks`` maps each user to ``'inactive'`` and ``'active'`` records that
    provide ``max_start``, ``max_end`` and ``max_range``; ``year`` is only
    used in the heading.
    """
    print(f"\nThe longest inactive and active streaks for {year}")
    print("-------------------------------------------------")

    # Inactive is always reported before active
    report_order = (("Inactive", "inactive"), ("Active", "active"))
    for user, data in streaks.items():
        print(f"\n{user}:")
        for label, kind in report_order:
            record = data[kind]
            print(f"  {label}: {record['max_start']} to {record['max_end']} ({record['max_range'].days} days)")

# Compute every member's streaks, then report them for the selected year
streaks = process_message_data(
    message_per_day_per_person=message_per_day_per_person,
    members=members,
)
print_streaks(streaks=streaks, year=selected_year.value)


In [None]:
# Total absent days & active days per person

# user -> number of days with no messages ('inactive') and with messages ('active')
activity_absence_freq_by_user = {
    user: {'inactive': 0, 'active': 0}
    for user in members
}

# The per-day dict is deliberately bound to `user_counts` and not to `members`:
# reusing the name `members` as the loop variable would replace the global
# collection of members that other cells rely on.
for user_counts in message_per_day_per_person.values():
    for user, count in user_counts.items():
        day_kind = 'inactive' if count == 0 else 'active'
        activity_absence_freq_by_user[user][day_kind] += 1

# Print the results
print("Total days absent and present by user")
print("-------------------")   
for user, freq in activity_absence_freq_by_user.items():
    print(f"\n{user}:")
    print(f"  Active: {freq['active']:,} days")
    print(f"  Absent: {freq['inactive']:,}  days")


In [None]:
# Hashtag usage in the group chat

# lower-cased hashtag -> number of uses
hashtags_only = defaultdict(int)

for message in chat_data_for_selected_year:
    # Forwarded messages are skipped
    if 'forwarded_from' in message:
        continue

    # .get() so a message without a 'text' key is skipped instead of raising KeyError.
    # Only list-valued text is inspected: that is where the typed parts
    # (dicts with a 'type' key) are looked for.
    text_parts = message.get('text')
    if not isinstance(text_parts, list):
        continue

    for part in text_parts:
        if isinstance(part, dict) and part.get('type') == 'hashtag' and 'text' in part:
            hashtags_only[part['text'].lower()] += 1

# (hashtag, count) pairs, most used first
sorted_hashtags_only = Counter(hashtags_only).most_common()

for hashtag, uses in sorted_hashtags_only:
    print(f'{hashtag}: {uses:,}')

In [None]:
# Hashtag usage as a word cloud

# Convert the sorted list of (hashtag, count) tuples back into a dictionary
hashtag_frequencies = dict(sorted_hashtags_only)

# Optional: cap very frequent hashtags so they do not dominate the cloud
# filtered_hashtags_only = {word: count for word, count in hashtags_only.items() if count <= 16}

if not hashtag_frequencies:
    # WordCloud raises ValueError when it is given no words at all
    print("No hashtags found for the selected chat and year.")
else:
    # Generate the word cloud image; more frequent hashtags are drawn larger
    hashtag_word_cloud = WordCloud(
        colormap="plasma", width=1000, height=500, background_color='white'
        ).generate_from_frequencies(hashtag_frequencies)

    # Display the generated image
    plt.figure(figsize=(20, 10))
    plt.imshow(hashtag_word_cloud, interpolation='bilinear')
    plt.axis('off')  # Axes carry no meaning for an image
    plt.show()

In [None]:
# GIF, photo, video and voice-message usage frequency per group member

# Which counter a given 'media_type' value feeds into
MEDIA_TYPE_TO_COUNTER = {
    'animation': 'gif_usage',
    'video_file': 'vid_sent',
    'voice_message': 'voice_message',
}
COUNTER_NAMES = ('gif_usage', 'vid_sent', 'photo_sent', 'voice_message')

# user -> {counter name -> count}, all counters starting at 0
member_message_type_freq = {user: dict.fromkeys(COUNTER_NAMES, 0) for user in members}

for message in chat_data_for_selected_year:
    sender = message.get('from')
    if not sender:
        continue

    counter_name = MEDIA_TYPE_TO_COUNTER.get(message.get('media_type'))
    if counter_name is not None:
        member_message_type_freq[sender][counter_name] += 1

    # Photos are detected by the presence of a 'photo' key, independently of 'media_type'
    if 'photo' in message:
        member_message_type_freq[sender]['photo_sent'] += 1

# Print the results
print("Gifs, photos, videos and voice message usage frequency per group member")
print("-----------------------------------------------------------------------")   

for user in member_message_type_freq:
    freq = member_message_type_freq[user]
    print(f"\n{user}:")
    print(f"  gif usage: {freq['gif_usage']:,} gifs")
    print(f"  video sent: {freq['vid_sent']:,} vids")
    print(f"  photo sent: {freq['photo_sent']:,} photos")
    print(f"  voice message sent: {freq['voice_message']:,} voice msg")

In [None]:
# Active day of the week and activity through the day

# Days of the week (Monday first) and hours of the day
DAYS_OF_WEEK = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
HOURS_OF_DAY = list(range(24))  # 0 to 23 (12am - 11pm)

# Pre-populate every day and hour with 0 so that quiet days/hours still appear in the plots
active_day_of_the_week = defaultdict(int, {day: 0 for day in DAYS_OF_WEEK})
active_time_of_the_day = defaultdict(int, {hour: 0 for hour in HOURS_OF_DAY})
active_day_of_the_week_by_user = {user: defaultdict(int, {day: 0 for day in DAYS_OF_WEEK}) for user in members}
active_time_of_the_day_by_user = {user: defaultdict(int, {hour: 0 for hour in HOURS_OF_DAY}) for user in members}

# Still defined in case other cells reference it; it is intentionally not applied below
utc = pytz.UTC

for message in chat_data_for_selected_year:
    sender = message.get('from')

    if sender:  # Only messages with a sender are counted
        # fromtimestamp() without a tz argument converts the Unix timestamp to the
        # machine's local time. The value is deliberately left naive: attaching a
        # UTC tzinfo to it would mislabel local wall-clock time as UTC without
        # converting anything.
        dt_object = datetime.fromtimestamp(int(message['date_unixtime']))

        # weekday() is 0 for Monday, matching DAYS_OF_WEEK. Unlike strftime('%A'),
        # which yields the locale's day name, this always produces the English
        # names used as dictionary keys above.
        day_of_week = DAYS_OF_WEEK[dt_object.weekday()]
        hour_of_day = dt_object.hour

        # Overall counts
        active_day_of_the_week[day_of_week] += 1
        active_time_of_the_day[hour_of_day] += 1

        # Per-user counts
        active_day_of_the_week_by_user[sender][day_of_week] += 1
        active_time_of_the_day_by_user[sender][hour_of_day] += 1

In [None]:
# Overall message count for each day of the week
print("Messages by day of the week")
print("---------------------------")
for day_name in active_day_of_the_week:
    print(f'{day_name}: {active_day_of_the_week[day_name]:,} messages')

In [None]:
# Plot the active day of the week as a bar graph

# One row per day, forced into Monday-to-Sunday order
days_in_a_week_data_Df = pd.DataFrame.from_dict(
    active_day_of_the_week, orient='index', columns=['Messages sent']
).reindex(DAYS_OF_WEEK)

sns.set_theme(style="darkgrid", rc={'figure.figsize': (20, 10)})

ax = sns.barplot(
    x=days_in_a_week_data_Df.index,
    y=days_in_a_week_data_Df['Messages sent'],
    palette="Blues_r",
)

# Write each bar's value just above it
for bar in ax.patches:
    bar_height = bar.get_height()
    bar_centre = bar.get_x() + bar.get_width() / 2
    ax.annotate(f'{int(bar_height)}', (bar_centre, bar_height),
                ha='center', va='bottom', fontsize=12, color='black')

# Title, axis labels and tick styling
plt.title('Activity by Day of the Week', fontsize=20, fontweight='bold')
plt.xlabel("Day of the Week", fontsize=16)
plt.ylabel("Messages Sent", fontsize=16)
plt.xticks(rotation=45, ha='right', fontsize=14)
plt.yticks(fontsize=14)
plt.grid(axis="y", linestyle="--", alpha=0.7)  # Light horizontal guide lines

plt.show()

In [None]:
print("\n📅 Messages by Hour of the Day")
print("──────────────────────────────")

# One line per hour, from 00 to 23
for hour, message_count in sorted(active_time_of_the_day.items()):
    print(f"🕒 {hour:02d}:00 - {hour:02d}:59  |  {message_count:,} messages")


In [None]:
# plot active time of the day bar graph

# Function to convert 24-hour format to 12-hour format
def convert_to_12hr_format(hour):
    """Return a 24-hour clock hour (0-23) as a zero-padded 12-hour label, e.g. 13 -> '01 PM'."""
    parsed_hour = datetime.strptime(f"{hour}", "%H")
    return f"{parsed_hour:%I %p}"

# 12-hour labels for the hours, in dictionary order
time_of_day_key_12hr = [convert_to_12hr_format(hour) for hour in active_time_of_the_day]

# Chronological order of the 12-hour labels: '12 AM', '01 AM', ..., '11 AM', '12 PM', ..., '11 PM'
custom_order = [
    f"{((hour % 12) or 12):02d} {'AM' if hour < 12 else 'PM'}" for hour in range(24)
]

# Tabular form for Seaborn
time_of_day_data = {
    'Time of Day': time_of_day_key_12hr,
    'Number of Messages': list(active_time_of_the_day.values())
}
time_of_day_data_Df = pd.DataFrame(time_of_day_data)

# An ordered categorical makes the rows sort chronologically instead of alphabetically
time_of_day_data_Df['Time of Day'] = pd.Categorical(
    time_of_day_data_Df['Time of Day'], categories=custom_order, ordered=True
)
time_of_day_data_Df = time_of_day_data_Df.sort_values('Time of Day')

sns.set_theme(style="darkgrid", rc={'figure.figsize': (30, 15)})

ax = sns.barplot(data=time_of_day_data_Df, x='Time of Day', y='Number of Messages', palette="Blues_r")

# Write each bar's value just above it
for bar in ax.patches:
    bar_height = bar.get_height()
    bar_centre = bar.get_x() + bar.get_width() / 2
    ax.annotate(f'{int(bar_height)}', (bar_centre, bar_height),
                ha='center', va='bottom', fontsize=12, color='black')

# Title, axis labels and tick styling
plt.title("Messages Sent by Hour of the Day", fontsize=18, fontweight='bold')
plt.xlabel("Time of Day", fontsize=16)
plt.ylabel("Number of Messages", fontsize=16)
plt.xticks(fontsize=14, rotation=45, ha='right')
plt.yticks(fontsize=14)

plt.show()


In [None]:
# Active day of the week as a bar graph, one subplot per user

num_users = len(active_day_of_the_week_by_user)

# One row per user, all rows sharing the same x-axis
fig, axes = plt.subplots(nrows=num_users, ncols=1, figsize=(12, 3 * num_users), sharex=True)

# With a single user, plt.subplots returns one Axes object instead of an array
if num_users == 1:
    axes = [axes]

for ax, user in zip(axes, active_day_of_the_week_by_user):
    active_days = active_day_of_the_week_by_user[user]

    # Two-column frame: day name and the user's message count for that day
    days_in_a_week_data_Df = pd.DataFrame({
        'Days': list(active_days.keys()),
        'Messages Sent': list(active_days.values()),
    })

    sns.barplot(data=days_in_a_week_data_Df, x='Days', y='Messages Sent',
                order=DAYS_OF_WEEK, palette='coolwarm', ax=ax)

    # Per-subplot title, y label and horizontal guide lines
    ax.set_title(f'{user}', fontsize=14, fontweight='bold')
    ax.set_ylabel("Messages Sent", fontsize=12)
    ax.grid(axis='y', linestyle='--', alpha=0.6)

# x-axis label and tick styling, then final layout
plt.xlabel("Day of the Week", fontsize=14, labelpad=10)
plt.xticks(rotation=45, ha='right', fontsize=12)
plt.tight_layout()
plt.show()


In [None]:
# Active time of the day as a bar graph, one subplot per user

num_users = len(active_time_of_the_day_by_user)

# One row per user, all rows sharing the same x-axis
fig, axes = plt.subplots(nrows=num_users, ncols=1, figsize=(12, 3 * num_users), sharex=True)

# With a single user, plt.subplots returns one Axes object instead of an array
if num_users == 1:
    axes = [axes]

for ax, user in zip(axes, active_time_of_the_day_by_user):
    active_hrs = active_time_of_the_day_by_user[user]

    # Two-column frame: hour (0-23) and the user's message count for that hour
    time_of_day_data_Df = pd.DataFrame({
        'Hour': list(active_hrs.keys()),
        'Messages Sent': list(active_hrs.values()),
    })

    # 12-hour labels as an ordered categorical, so the rows sort chronologically
    time_of_day_data_Df['Hour Range'] = pd.Categorical(
        time_of_day_data_Df['Hour'].map(convert_to_12hr_format),
        categories=custom_order,
        ordered=True,
    )
    time_of_day_data_Df = time_of_day_data_Df.sort_values('Hour Range')

    sns.barplot(data=time_of_day_data_Df, x='Hour Range', y='Messages Sent',
                palette='coolwarm', ax=ax)

    # Per-subplot title, y label and horizontal guide lines
    ax.set_title(f'{user}', fontsize=14, fontweight='bold')
    ax.set_ylabel("Messages Sent", fontsize=12)
    ax.grid(axis='y', linestyle='--', alpha=0.6)

# x-axis label and tick styling, then final layout
plt.xlabel("Time of Day", fontsize=14, labelpad=10)
plt.xticks(rotation=45, ha='right', fontsize=12)
plt.tight_layout()
plt.show()


In [None]:
# Mentions of members. If members are referred to by a different name than the one
# saved in the Telegram contact details, the results may not be accurate.

# member -> number of messages that mention the member
mentions_by_members = dict.fromkeys(members, 0)

# One case-insensitive, whole-word pattern per member name, compiled up front
mention_patterns = {user: re.compile(rf'\b{re.escape(user)}\b', re.IGNORECASE) for user in members}

# Each plain-text message counts at most once per member it mentions
for message in chat_data_for_selected_year:
    if 'from' not in message or not isinstance(message['text'], str):
        continue

    text = message['text'].lower()
    for user, pattern in mention_patterns.items():
        if pattern.search(text) is not None:
            mentions_by_members[user] += 1

# Display results, most mentioned member first
print("\nHow many times were group chat members mentioned?")
print("------------------------------------------------")
ranked_mentions = sorted(mentions_by_members.items(), key=lambda pair: pair[1], reverse=True)
for user, count in ranked_mentions:
    print(f'{user}: {count}')

In [None]:
# Plot mentions as a pie chart

# Members that were never mentioned are left out of the chart
filtered_mentions = {user: count for user, count in mentions_by_members.items() if count > 0}

# Without any mention there is nothing to draw; say so instead of showing an empty pie
if not filtered_mentions:
    print("No members were mentioned in the selected time period.")
else:
    mention_name_labels = list(filtered_mentions.keys())
    mention_name_values = list(filtered_mentions.values())
    total_mentions = sum(mention_name_values)

    # One shade of blue per slice
    colors = sns.color_palette('Blues', len(mention_name_labels))

    def format_slice_label(pct):
        """Label a slice with its percentage and its absolute mention count."""
        # The count is rebuilt from a floating-point percentage, so it can come out
        # as e.g. 2.9999998; round() recovers the integer, whereas int() would
        # truncate it to one less than the real count.
        return f'{pct:.1f}%\n({round(pct * total_mentions / 100)})'

    fig, ax = plt.subplots(figsize=(8, 8))
    wedges, texts, autotexts = ax.pie(
        mention_name_values, 
        labels=mention_name_labels, 
        autopct=format_slice_label,
        startangle=140, 
        colors=colors, 
        wedgeprops={'edgecolor': 'black'}
    )

    # Same font size for the member labels and the percentage labels
    for text in texts + autotexts:
        text.set_fontsize(12)

    ax.set_title("How Many Times Was Each Member Mentioned?", fontsize=14, pad=20)

    plt.show()


In [None]:
# Mentions continued - check how many times a person or word was mentioned

# Area in which the search results and the chart are rendered
output = widgets.Output()

# word -> mention count, kept across searches
mention_counts = {}

# Search box; continuous_update=False means the value changes on Enter, not on every keystroke
selected_word_for_mention = widgets.Text(
    description='Search:',
    placeholder='Enter words (comma-separated) & press Enter:',
    value='',
    continuous_update=False
)

# Function to update mentions and visualize results
def update_mentions(change):
    """Count newly entered search terms in the chat and refresh the results.

    Parameters
    ----------
    change : dict
        Change notification delivered by the text widget's observer. Only
        ``change['new']`` (the current content of the search box) is read.

    Side effects
    ------------
    Adds one entry per new term to the module-level ``mention_counts``,
    then clears ``output`` and re-renders the full list of counts and the
    bar chart (via ``plot_mentions``). Terms that were already counted are
    not processed again; if the input contains no new term, nothing happens.
    """
    # Extract new input and split it into lower-cased, non-empty terms
    word_input = change['new'].strip().lower()
    search_words = [word.strip() for word in word_input.split(',') if word.strip()]

    # Keep only terms not counted before; dict.fromkeys drops repeats inside the
    # same input (e.g. "hi, hi") while preserving the order they were typed in.
    new_words = [word for word in dict.fromkeys(search_words) if word not in mention_counts]

    if not new_words:
        return  # Exit if no new valid input

    # Lower-case every plain-text message once instead of once per search term
    lowered_texts = [
        message['text'].lower()
        for message in chat_data_for_selected_year
        if isinstance(message.get('text'), str)
    ]

    for word in new_words:
        # (?<!\w) / (?!\w) mean "not glued to a word character". For ordinary
        # words this is the same as \b...\b, but unlike \b it also works for terms
        # that begin or end with a non-word character (emoji, "c++", ":)").
        pattern = re.compile(rf'(?<!\w){re.escape(word)}(?!\w)')
        mention_counts[word] = sum(len(pattern.findall(text)) for text in lowered_texts)

    # Display results
    with output:
        clear_output(wait=True)
        print("🔍 **Mention Counts:**")
        for word, count in mention_counts.items():
            print(f'  - "{word}" mentioned **{count}** times')

        # Plot the mentions dynamically
        plot_mentions(mention_counts)

# Function to plot mentions
def plot_mentions(data):
    """Draw a horizontal bar chart of mention counts, largest count on top.

    Parameters
    ----------
    data : dict
        Mapping of search term -> number of matches. Nothing is drawn when
        it is empty.
    """
    if data:
        # Rank the terms by how often they were found
        ranking = pd.DataFrame(
            {'Word': list(data.keys()), 'Count': list(data.values())}
        ).sort_values(by='Count', ascending=False)

        fig, ax = plt.subplots(figsize=(8, 5))
        ax.barh(ranking['Word'], ranking['Count'], color='skyblue')
        ax.set_xlabel('Mentions')
        ax.set_ylabel('Words')
        ax.set_title('Word Mention Frequency')
        # barh draws the first row at the bottom; flip so the top count is first
        ax.invert_yaxis()
        plt.show()

# Call update_mentions whenever the search box's 'value' changes
selected_word_for_mention.observe(update_mentions, names='value')

# Show the search box followed by the area the results are rendered into
display(selected_word_for_mention, output)


In [None]:
# frequency of words used per person

# TOP_N caps how many of each user's most frequent words are kept (adjust if needed)
TOP_N = 50

# One independent word -> count tally for every group member
word_frequency_by_user = {member_name: defaultdict(int) for member_name in members}

# Normalise a single word before it is counted
def clean_word(word):
    """Return *word* lower-cased, with every character that is neither a
    word character nor whitespace removed (the result may be empty)."""
    without_punctuation = re.sub(r'[^\w\s]', '', word)
    return without_punctuation.lower()

# Process messages: tally every cleaned word under the member who sent it
for message in chat_data_for_selected_year:
    # Only messages that have a sender and a plain-string text are counted.
    # .get() avoids a KeyError on a message that has 'from' but no 'text' key.
    if 'from' not in message or not isinstance(message.get('text'), str):
        continue

    sender = message['from']
    if sender not in members:  # Ensure sender is in the group
        continue

    for word in message['text'].split():
        cleaned_word = clean_word(word)
        if cleaned_word:  # Avoid empty strings after cleaning (e.g. "!!!")
            word_frequency_by_user[sender][cleaned_word] += 1

# Keep only the top N words per user
top_words_by_user = {
    member_name: dict(Counter(tally).most_common(TOP_N))
    for member_name, tally in word_frequency_by_user.items()
}

In [None]:
# One word cloud per member, sized by that member's most used words

for user, word_cloud in top_words_by_user.items():
    if word_cloud:  # members without any counted words get no cloud
        print(f'\n\n{user} - Most Used Words\n')

        # Build the cloud image from the word -> frequency mapping
        cloud_renderer = WordCloud(
            width=800, height=400, background_color='white', colormap='viridis'
        )
        user_WordCloud = cloud_renderer.generate_from_frequencies(word_cloud)

        # Show the image on its own figure, without axis ticks or labels
        fig, ax = plt.subplots(figsize=(12, 6))
        ax.imshow(user_WordCloud, interpolation='bilinear')
        ax.axis('off')
        ax.set_title(f"Word Cloud for {user}", fontsize=14)
        plt.show()

In [None]:
# Average and median length (in words) of the messages sent by each user

# One record per plain-text message sent by a group member. The length is the
# number of whitespace-separated words, which is the unit the report below prints.
# .get() avoids a KeyError on a message that has 'from' but no 'text' key.
len_of_words_by_user = [
    {"User": message['from'], "Message Length": len(message['text'].split())}
    for message in chat_data_for_selected_year
    if 'from' in message
    and isinstance(message.get('text'), str)
    and message['from'] in members
]

# Convert to DataFrame; explicit columns keep the frame usable when there are no records
length_df = pd.DataFrame(len_of_words_by_user, columns=["User", "Message Length"])

# Print the results
print("📊 Average and median length of messages sent")
print("----------------------------------------------")

if length_df.empty:
    # Nothing to aggregate: keep the result frame defined, with the usual columns
    avg_median_word_length = pd.DataFrame(columns=["User", "Average", "Median"])
    print("\nNo text messages from group members in the selected period.")
else:
    # Compute average & median for each user, with columns renamed for clarity
    avg_median_word_length = (
        length_df.groupby("User")["Message Length"]
        .agg(['mean', 'median'])
        .reset_index()
        .rename(columns={'mean': 'Average', 'median': 'Median'})
    )

    for _, row in avg_median_word_length.iterrows():
        print(f"\n{row['User']}:")
        print(f"  🟢 Average: {row['Average']:.1f} words")
        print(f"  🔵 Median: {row['Median']:.1f} words")

In [None]:
# Box plot of the per-user message length distribution

# Canvas for the chart
fig, ax = plt.subplots(figsize=(12, 6))

# One box per user; outliers are hidden for a cleaner visualization
sns.boxplot(
    data=length_df,
    x="User",
    y="Message Length",
    palette="coolwarm",
    showfliers=False,
    ax=ax,
)

# Axis labels and title
ax.set_xlabel("Users", fontsize=12)
ax.set_ylabel("Message Length (words)", fontsize=12)
ax.set_title("Message Length Distribution per User", fontsize=14, fontweight="bold")

# Slant the user names so long ones stay readable
plt.xticks(rotation=30, ha="right")

# Drop the top and right spines
sns.despine()

plt.show()


In [None]:
# Reactions: which emojis are used as reactions, and who reacts

reaction_map = defaultdict(int)  # emoji -> number of messages it was used on as a reaction
reaction_freq_by_user = {user: 0 for user in members}  # member -> reactions given

for message in chat_data_for_selected_year:
    # "or []" also covers a 'reactions' key that is present but null
    for reaction in message.get('reactions') or []:
        # Not every reaction entry necessarily has an 'emoji' key (presumably
        # custom-emoji reactions do not - confirm against your export), so a
        # missing key is skipped instead of aborting the whole cell.
        emoji = reaction.get('emoji')
        if emoji is not None:
            reaction_map[emoji] += 1

        # The list of reacting users may be absent as well
        for react in reaction.get('recent') or []:
            user = react.get('from')
            if user in members:
                reaction_freq_by_user[user] += 1  # Count reactions per user

# Sort reaction frequencies in descending order
sorted_reactions = sorted(reaction_map.items(), key=lambda item: item[1], reverse=True)

# Display results
print("📊 Amount of Reactions by User")
print("-------------------------------")
for user, count in reaction_freq_by_user.items():
    print(f'{user}: {count}')

print("\n💬 Reaction Frequency (Emoji Usage)")
print("------------------------------------")
for emoji, count in sorted_reactions:
    print(f'{emoji}: {count}')

In [None]:
# Average and median time between consecutive messages (in seconds), and who starts new conversations
# (a gap of 1 hour or more since the previous message is treated as the start of a new conversation)

differences_seconds = []  # Gap in seconds between each pair of consecutive messages
conversation_starters = {user: 0 for user in members}  # member -> conversations started
differences_seconds_with_date = []  # (gap in seconds, local time of the earlier message)

# Walk the messages as (previous, current) pairs
for prev_msg, curr_msg in zip(chat_data_for_selected_year, chat_data_for_selected_year[1:]):
    if 'from' in curr_msg and 'date_unixtime' in curr_msg and 'date_unixtime' in prev_msg:
        curr_ts = int(curr_msg['date_unixtime'])
        prev_ts = int(prev_msg['date_unixtime'])

        # Subtract the Unix timestamps themselves. Subtracting the naive local
        # datetimes returned by datetime.fromtimestamp() compares wall-clock
        # readings, which is off by the clock shift whenever a gap spans a
        # daylight-saving change.
        time_diff = curr_ts - prev_ts

        # Local time of the earlier message, kept for date/time display
        prev_time = datetime.fromtimestamp(prev_ts)

        differences_seconds.append(time_diff)
        differences_seconds_with_date.append((time_diff, prev_time))

        # Check for new conversation (1-hour gap)
        if time_diff >= 3600 and curr_msg['from'] in members:
            conversation_starters[curr_msg['from']] += 1


# Calculate average and median (integer values); 0 when there are no gaps at all
average_secs_between_messages = int(np.mean(differences_seconds)) if differences_seconds else 0
median_secs_between_messages = int(np.median(differences_seconds)) if differences_seconds else 0

# Display results
print("⏳ Average and Median Time Between Messages Sent (Seconds)")
print("----------------------------------------------------------")
print(f"📊 Average: {average_secs_between_messages} sec")
print(f"📉 Median: {median_secs_between_messages} sec")

In [None]:
# Report how many conversations each member opened

print("🚀 Estimated new conversation started by user (1hr+ since last message)")
print("--------------------------------------------------------------------")
for starter, times_started in conversation_starters.items():
    print(f"{starter}: {times_started} new conversations started")

In [None]:
# How long do conversations last

# If there is no reply or follow-up text within 10 minutes, we assume the conversation is over;
# anything after that point is considered a different conversation

# Each entry: (number of messages, duration in seconds, time of the conversation's last message)
length_of_conversations_with_date = []
number_of_msgs = 1
conversation_time = 0

for i, time_diff in enumerate(differences_seconds):
    if time_diff <= 600:  # Message was sent within 10 minutes of the previous one
        conversation_time += time_diff
        number_of_msgs += 1
    else:
        # The gap is too long: the message before the gap closed the conversation.
        # differences_seconds_with_date[i][1] is the time of that message.
        if conversation_time > 0:
            length_of_conversations_with_date.append(
                (number_of_msgs, conversation_time, differences_seconds_with_date[i][1])
            )
        number_of_msgs = 1
        conversation_time = 0

# A conversation still running when the data ends has no closing gap after it,
# so record it here; otherwise the most recent conversation would be dropped.
if conversation_time > 0:
    last_gap, time_before_last_gap = differences_seconds_with_date[-1]
    length_of_conversations_with_date.append(
        (number_of_msgs, conversation_time, time_before_last_gap + timedelta(seconds=last_gap))
    )


print("\n⏳ Longest Conversations of the Year (Reply Gap ≤ 10 mins)")
print("-----------------------------------------------------------")

# Top 5 conversations by duration
longest_conversations = sorted(
    length_of_conversations_with_date, key=lambda x: x[1], reverse=True
)[:5]

for num_msgs, duration, last_message_time in longest_conversations:
    start_time = last_message_time - timedelta(seconds=duration)
    end_time = last_message_time

    print(f"📅 {start_time.date()}: {num_msgs} messages in {duration//60} mins "
          f"({start_time.strftime('%I:%M %p')} - {end_time.strftime('%I:%M %p')})")

In [None]:
# Emoji usage

# Emoji usage counter: emoji character -> number of occurrences in message text
emoji_usage = defaultdict(int)

# Character class matching a single emoji code point.
# The symbol blocks are listed explicitly: one wide catch-all range from U+24C2 up
# to U+1F251 would also match CJK ideographs, Hangul, variation selectors and
# everything else in between, and count ordinary text as emojis.
emoji_pattern = re.compile(
    '['
    '\U0001F600-\U0001F64F'  # Emoticons
    '\U0001F300-\U0001F5FF'  # Miscellaneous Symbols and Pictographs
    '\U0001F680-\U0001F6FF'  # Transport and Map Symbols
    '\U0001F700-\U0001F77F'  # Alchemical Symbols
    '\U0001F780-\U0001F7FF'  # Geometric Shapes Extended
    '\U0001F800-\U0001F8FF'  # Supplemental Arrows-C
    '\U0001F900-\U0001F9FF'  # Supplemental Symbols and Pictographs
    '\U0001FA00-\U0001FA6F'  # Chess Symbols
    '\U0001FA70-\U0001FAFF'  # Symbols and Pictographs Extended-A
    '\U0001F000-\U0001F251'  # Tiles, playing cards, enclosed alphanumerics/ideographs, flag letters
    '\U00002600-\U000027BF'  # Miscellaneous Symbols and Dingbats
    '\U00002B00-\U00002BFF'  # Miscellaneous Symbols and Arrows
    '\U000024C2'             # Circled Latin capital letter M
    '\U00003030\U0000303D\U00003297\U00003299'  # Individual CJK-block symbols
    ']'
)

# Loop through each message in the chat
for message in chat_data_for_selected_year:
    # .get() avoids a KeyError on a message that has 'from' but no 'text' key
    if 'from' in message and isinstance(message.get('text'), str):
        # Increment the count for each emoji found in the message
        for em in emoji_pattern.findall(message['text']):
            emoji_usage[em] += 1

# Print sorted emoji usage frequency
print("Emoji usage frequency")
print("---------------------")
for emoji_char, count in sorted(emoji_usage.items(), key=lambda item: item[1], reverse=True):
    print(f'{emoji_char}: {count}')

# 📊 Overall Trends in the Selected Group Chat (All Years)
In this analysis, we will explore key messaging trends over the years:
-  📈 Yearly Message Trends: How the total number of messages has changed over time.
-  ⏰ Monthly Message Distribution: A breakdown of message activity across different months, visualized with a stacked bar chart by year.

In [None]:
# Total number of messages sent per year for the selected group chat, and per month within each year

total_messages_sent_by_year = defaultdict(int)  # year -> message count
total_messages_sent_by_year_per_month = defaultdict(lambda: defaultdict(int))  # year -> month name -> count

for chat in chats_list:
    if chat['name'] != selected_chat.value:
        continue  # not the chat picked in the selection widget

    for message in chat['messages']:
        sent_at = datetime.fromtimestamp(int(message["date_unixtime"]))
        month_label = calendar.month_name[sent_at.month]

        total_messages_sent_by_year[sent_at.year] += 1
        total_messages_sent_by_year_per_month[sent_at.year][month_label] += 1

print(f"📅 Total number of messages sent by year for {selected_chat.value}")
print("----------------------------------------------------------------")
for sent_year, message_total in total_messages_sent_by_year.items():
    print(f'{sent_year}: {message_total:,} messages')

In [None]:
# Plotting the total number of messages sent per year for the selected group chat

# Tabular form of the yearly totals, as expected by Seaborn
yearly_data = pd.DataFrame(
    list(total_messages_sent_by_year.items()),
    columns=['Year', 'Messages'],
)

# Apply the Seaborn theme before the figure is created
sns.set_theme(style="darkgrid")

# One bar per year
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x='Year', y='Messages', data=yearly_data, palette="Blues", ax=ax)

# Write each bar's value just above it, centred horizontally
for bar in ax.patches:
    bar_height = bar.get_height()
    ax.annotate(
        f'{int(bar_height):,}',
        (bar.get_x() + bar.get_width() / 2, bar_height),
        ha='center', va='bottom', fontsize=12, color='black',
    )

# Title and axis labels
ax.set_title(f"Total Number of Messages Sent by Year for {selected_chat.value}", fontsize=16)
ax.set_xlabel("Year", fontsize=12)
ax.set_ylabel("Number of Messages", fontsize=12)

plt.show()

In [None]:
# Monthly Message Distribution by Year

# Flatten the nested year -> month -> count dictionary into long-format records
message_data_monthly = [
    {"Year": year, "Month": month, "Messages": count}
    for year, months in total_messages_sent_by_year_per_month.items()
    for month, count in months.items()
]

message_data_monthly_df = pd.DataFrame(message_data_monthly)

# Make Month an ordered categorical so the bars follow calendar order, not alphabetical order
message_data_monthly_df["Month"] = pd.Categorical(
    message_data_monthly_df["Month"],
    categories=list(calendar.month_name[1:]),
    ordered=True,
)

# Pivot to one row per month and one column per year; missing combinations become 0
df_pivot = message_data_monthly_df.pivot(index="Month", columns="Year", values="Messages").fillna(0)

# Draw on an explicitly created axes. Calling plt.figure() first and then
# DataFrame.plot() without ax= leaves an extra, empty figure in the output,
# because pandas opens its own figure when no axes is passed.
fig, ax = plt.subplots(figsize=(12, 6))
df_pivot.plot(kind="bar", stacked=True, colormap="viridis", ax=ax)

# Customize labels and title
ax.set_xlabel("Month")
ax.set_ylabel("Total Messages")
ax.set_title(f"Monthly Message Distribution by Year for {selected_chat.value}")
ax.legend(title="Year")

# Show the plot
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
