In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import re
import numpy as np
import seaborn as sns
import time
from matplotlib.colors import Normalize
import time
import matplotlib.cm as cm
import time
from matplotlib.collections import CircleCollection
import math

In [2]:
def extract_email_data(filename):
    """
    Count how often each e-mail address appears in a text file.

    Parameters:
    -----------
    filename : str
        Path of the text file to scan. The whole file is read into memory.

    Returns:
    --------
    dict
        Maps each matched e-mail address (exactly as written in the file,
        so matching is case-sensitive) to its number of occurrences.
    """
    with open(filename, 'r') as file:
        data = file.read()

    # The top-level-domain class is [A-Za-z]; a "|" inside a character class
    # is a literal pipe, not alternation, and would let "co|uk" match as a TLD.
    all_emails = re.findall(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b', data)

    num_email_apps = {}
    for email in all_emails:
        num_email_apps[email] = num_email_apps.get(email, 0) + 1
    return num_email_apps
    

In [3]:
# Address -> occurrence count for every e-mail address found in the training text file.
training_emails = extract_email_data("datasets/enron/training.txt")

In [4]:
# Address -> occurrence count for every e-mail address found in the validation text file.
validation_emails = extract_email_data("datasets/enron/validation.txt")

In [5]:
def _draw_horizontal_bar(ax, keys, values, colors):
    """Draw one horizontal bar per key on ``ax`` and print each count next to its bar."""
    positions = range(len(keys))
    bars = ax.barh(positions, values, color=colors)

    # Shift every label by 1% of the longest bar so it sits just past the bar end.
    label_offset = max(values) * 0.01
    for bar, value in zip(bars, values):
        ax.text(bar.get_width() + label_offset,
                bar.get_y() + bar.get_height() / 2,
                f"{value:,}",
                va='center', fontsize=10, fontweight='bold')

    # Label the bars with their keys
    ax.set_yticks(positions)
    ax.set_yticklabels(keys)
    ax.set_xlabel('Frequency', fontsize=12)

    # Vertical grid lines for readability; drop the top/right frame lines
    ax.grid(axis='x', linestyle='--', alpha=0.7)
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)


def _draw_bubble(ax, keys, values, colors):
    """Draw the entries on ``ax`` as a single row of bubbles sized by frequency."""
    x = np.arange(len(keys))
    y = np.zeros(len(keys))

    # Scale marker areas relative to the largest value; an all-zero input would
    # otherwise divide by zero, so it collapses to zero-sized bubbles instead.
    max_value = max(values)
    sizes = [v / max_value * 3000 if max_value else 0 for v in values]

    ax.scatter(x, y, s=sizes, c=colors, alpha=0.8,
               edgecolor='white', linewidth=1.5)

    for x_pos, y_pos, key, value in zip(x, y, keys, values):
        # Key centred on the bubble
        ax.text(x_pos, y_pos, key, ha='center', va='center',
                fontsize=11, fontweight='bold', color='black')
        # Frequency in a translucent box below the bubble centre
        ax.text(x_pos, y_pos - 0.3, f"{value:,}", ha='center', va='top',
                fontsize=10, fontweight='bold', color='black',
                bbox=dict(facecolor='white', alpha=0.7, edgecolor='none',
                          boxstyle='round,pad=0.2'))

    ax.set_xlim(-0.5, len(keys) - 0.5)
    ax.set_ylim(-1, 1)

    # Positions carry no meaning here, so hide ticks and the frame
    ax.set_xticks([])
    ax.set_yticks([])
    for spine in ax.spines.values():
        spine.set_visible(False)


def _draw_pie(ax, keys, values, colors):
    """Draw the entries on ``ax`` as a ring-shaped pie with a key/count legend."""
    wedges, _, autotexts = ax.pie(
        values,
        labels=None,
        autopct='%1.1f%%',
        startangle=90,
        wedgeprops=dict(width=0.5, edgecolor='w'),
        colors=colors
    )

    for autotext in autotexts:
        autotext.set_fontsize(9)
        autotext.set_fontweight('bold')

    # Legend entries show both the key and its count
    legend_labels = [f"{k} ({v:,})" for k, v in zip(keys, values)]
    ax.legend(wedges, legend_labels, loc="center left", bbox_to_anchor=(1, 0.5))

    # Equal aspect ratio keeps the pie circular
    ax.set_aspect('equal')


# Maps each supported ``chart_type`` value to the helper that draws it.
_CHART_DRAWERS = {
    "horizontal_bar": _draw_horizontal_bar,
    "bubble": _draw_bubble,
    "pie": _draw_pie,
}


def create_top10_frequency_chart(data, output_file=None, figsize=(12, 8), 
                               chart_type="horizontal_bar", color_map='viridis', 
                               title=None):
    """
    Creates a chart showing the (up to) 10 most frequent data points.
    
    Parameters:
    -----------
    data : dict
        Dictionary with keys as data points and values as frequencies
    output_file : str, optional
        Path to save the output file, if None, the plot is shown
    figsize : tuple, optional
        Figure size in inches
    chart_type : str, optional
        Type of chart: "horizontal_bar", "bubble", or "pie"
    color_map : str, optional
        Matplotlib colormap name to use
    title : str, optional
        Title for the plot; a generic title is used when omitted
    
    Returns:
    --------
    float
        Time in seconds spent building the chart. The clock stops before
        layout and saving/showing, because the value is printed on the
        figure itself.
    
    Raises:
    -------
    ValueError
        If ``data`` is empty or ``chart_type`` is not a supported type.
    """
    start_time = time.perf_counter()
    
    # Validate before creating a figure so a bad call leaves nothing open.
    draw_chart = _CHART_DRAWERS.get(chart_type)
    if draw_chart is None:
        supported = ", ".join(f'"{name}"' for name in _CHART_DRAWERS)
        raise ValueError(f"Unknown chart_type {chart_type!r}; expected one of: {supported}")
    if not data:
        raise ValueError("data must contain at least one entry")
    
    # Ten highest frequencies; ties keep their original dict order (stable sort)
    top_items = sorted(data.items(), key=lambda item: item[1], reverse=True)[:10]
    keys = [key for key, _ in top_items]
    values = [value for _, value in top_items]
    
    # Colour each entry by its frequency. plt.get_cmap is used because
    # matplotlib.cm.get_cmap was removed in Matplotlib 3.9.
    cmap = plt.get_cmap(color_map)
    norm = Normalize(vmin=min(values), vmax=max(values))
    colors = [cmap(norm(value)) for value in values]
    
    fig, ax = plt.subplots(figsize=figsize)
    try:
        draw_chart(ax, keys, values, colors)
        
        ax.set_title(title if title else 'Top 10 Most Frequent Data Points',
                     fontsize=16, pad=20)
        
        # Generation time note
        execution_time = time.perf_counter() - start_time
        fig.text(0.02, 0.02, f'Generated in {execution_time:.2f} seconds', 
                 fontsize=8, alpha=0.7)
        
        # Note showing how much of the total frequency mass is displayed
        total_entries = sum(data.values())
        displayed_entries = sum(values)
        percentage = (displayed_entries / total_entries) * 100 if total_entries > 0 else 0
        
        fig.text(0.5, 0.02, 
                 f'Showing top {len(keys)} entries ({displayed_entries:,} out of {total_entries:,} total, {percentage:.1f}%)',
                 fontsize=10, ha='center', style='italic')
        
        fig.tight_layout()
        
        # Save or show
        if output_file:
            fig.savefig(output_file, dpi=150, bbox_inches='tight')
            print(f"Saved to {output_file}")
        else:
            plt.show()
    finally:
        # Close this specific figure even if drawing or saving failed
        plt.close(fig)
    
    return execution_time

In [6]:
# Save a horizontal bar chart of the ten most frequent training-set addresses
# to a PNG; the last line displays the chart build time (seconds) as cell output.
execution_time = create_top10_frequency_chart(
        training_emails,
        output_file="top10_frequency_bar_train.png",
        figsize=(12, 8),
        chart_type="horizontal_bar",  # Options: "horizontal_bar", "bubble", "pie"
        color_map='viridis',
        title="Top 10 Most Frequent Items")
execution_time

Saved to top10_frequency_bar_train.png


0.040287017822265625

In [7]:
# Save a horizontal bar chart of the ten most frequent validation-set addresses
# to a PNG; the last line displays the chart build time (seconds) as cell output.
execution_time = create_top10_frequency_chart(
        validation_emails,
        output_file="top10_frequency_bar_val.png",
        figsize=(12, 8),
        chart_type="horizontal_bar",  # Options: "horizontal_bar", "bubble", "pie"
        color_map='viridis',
        title="Top 10 Most Frequent Items")
execution_time

Saved to top10_frequency_bar_val.png


0.02391815185546875