In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import networkx as nx
import squarify
from datetime import datetime
import matplotlib.colors as mcolors

# Set seaborn style for professional look
sns.set_style("whitegrid")

# Load the exported event log (adjust the path to wherever the CSV lives)
df = pd.read_csv('/content/output.csv')

# event_timestamp is parsed as a microsecond count (unit='us'). errors='coerce'
# turns unparseable values into NaT instead of raising, so wrapping the call in
# a try/except that merely repeats the same call adds nothing.
rows_loaded = len(df)
df['event_timestamp'] = pd.to_datetime(df['event_timestamp'], unit='us', errors='coerce')

# Drop rows with invalid timestamps and report how many were lost, so silent
# data loss is visible in the notebook output.
df = df.dropna(subset=['event_timestamp'])
rows_dropped = rows_loaded - len(df)
if rows_dropped:
    print(f"Dropped {rows_dropped} of {rows_loaded} rows with invalid timestamps")

# 1. Bar Chart (UI Usage Statistics)
# Bars: spatial-search lookups, opacity changes (always 0 in this data set), the
# three most frequent click labels and, when there are more than three labels,
# the single least frequent one.
plt.figure(figsize=(10, 6), dpi=300)
spatial_search_count = len(df[df['event_name'].str.contains('lookup', case=False, na=False)])
opacity_change_count = 0  # No opacity change events
button_clicks = (
    df[df['event_name'].isin(['click', 'nodes_selected', 'nodes_selected_by_ids'])]
    .groupby('event_label')
    .size()
    .sort_values(ascending=False)
)

# Select rows by position with .iloc: button_clicks is indexed by label, so
# button_clicks[-1] relied on the deprecated positional fallback. Building the
# position list explicitly also copes with zero labels (no IndexError) and with
# three or fewer labels (the last one is not plotted twice).
label_count = len(button_clicks)
shown_positions = list(range(min(3, label_count)))
if label_count > 3:
    shown_positions.append(label_count - 1)
shown_clicks = button_clicks.iloc[shown_positions]

categories = ['Spatial Search (EUI)', 'Opacity Change (RUI)'] + shown_clicks.index.tolist()
values = [spatial_search_count, opacity_change_count] + shown_clicks.tolist()
# One colour per bar: blue/green for the two fixed bars, red for every click label
bar_colors = ['blue', 'green'] + ['red'] * len(shown_clicks)
plt.bar(categories, values, color=bar_colors)
plt.title('UI Usage Statistics', fontsize=14, pad=10)
plt.xlabel('Event Type', fontsize=12)
plt.ylabel('Frequency', fontsize=12)
plt.xticks(rotation=45, ha='right', fontsize=10)
# Print each bar's value just above it
for i, v in enumerate(values):
    plt.text(i, v + 5, str(v), ha='center', fontsize=10)
plt.tight_layout()
plt.savefig('ui_usage_statistics.png', bbox_inches='tight')
plt.close()

# 2. Line Chart (Event Distribution Over Time) - Improved
plt.figure(figsize=(10, 6), dpi=300)
# Count events per hour and event name (switch freq to 'D' for daily buckets if
# more data is available). The lowercase 'h' alias is used because the
# uppercase 'H' spelling is deprecated and emits a FutureWarning.
temporal_df = (
    df.groupby([pd.Grouper(key='event_timestamp', freq='h'), 'event_name'])
    .size()
    .unstack(fill_value=0)
)
# Limit to top 5 events by total frequency to reduce clutter
top_events = df['event_name'].value_counts().head(5).index
temporal_df = temporal_df[top_events]
# Plot as a line chart on the figure created above
ax = temporal_df.plot(kind='line', marker='o', linewidth=2, ax=plt.gca())
plt.title('Event Distribution Over Time (Hourly)', fontsize=14, pad=10)
plt.xlabel('Time', fontsize=12)
plt.ylabel('Event Count', fontsize=12)
# Legend sits outside the plot area, to the right
plt.legend(title='Event Name', title_fontsize=10, fontsize=8, bbox_to_anchor=(1.05, 1), loc='upper left')
plt.grid(True, linestyle='--', alpha=0.7)
# Format x-axis as datetime
plt.gcf().autofmt_xdate()
plt.xticks(fontsize=10)
plt.tight_layout()
plt.savefig('event_distribution_over_time.png', bbox_inches='tight')
plt.close()
# 3. Heat Map (UI Interaction Heat Map)
plt.figure(figsize=(8, 6), dpi=300)
# Number of 'interaction' events per event label
spatial_df = df[df['event_category'] == 'interaction'].groupby('event_label').size().reset_index(name='counts')
# Hand-assigned (row, col) grid cell for each label shown on the map; labels
# that are not listed here are ignored.
positions = {'Biomarkers': (0, 4), 'body': (1, 3), 'cell': (2, 2), 'Sex:Both': (3, 1), 'mdc-button__label': (4, 0)}
# Build the 5x5 grid as integer zeros directly. Creating an empty (object-dtype)
# frame and calling fillna(0) depends on pandas' deprecated silent downcasting,
# and the integer dtype is what the fmt='d' annotations below need.
heat_data = pd.DataFrame(0, index=range(5), columns=range(5))
for label, count in zip(spatial_df['event_label'], spatial_df['counts']):
    if label in positions:
        row, col = positions[label]
        heat_data.iloc[row, col] = count
sns.heatmap(heat_data, annot=True, fmt='d', cmap='YlOrRd', cbar_kws={'label': 'Click Frequency'}, annot_kws={"size": 10})
plt.title('UI Interaction Heat Map', fontsize=14, pad=10)
plt.xlabel('Column', fontsize=12)
plt.ylabel('Row', fontsize=12)
plt.tight_layout()
plt.savefig('ui_interaction_heat_map.png', bbox_inches='tight')
plt.close()

# 4. Directed Graph (User Event State Graph)
# Nodes are event names; an edge A -> B is weighted by how many times event B
# directly followed event A in one user's chronologically ordered events.
graph_user_id = 1038633330.0  # user whose event sequence is drawn

# Create the axes up front and hand them to networkx. When the figure has no
# axes, nx.draw adds its own full-figure axes, which tight_layout() cannot
# manage (it warns that the figure includes incompatible Axes).
fig, ax = plt.subplots(figsize=(8, 8), dpi=300)
G = nx.DiGraph()
# Stable sort so events sharing a timestamp keep their original file order
user_df = df[df['user_pseudo_id'] == graph_user_id].sort_values('event_timestamp', kind='stable')

# Walk consecutive (current, next) event pairs and accumulate edge weights
event_sequence = user_df['event_name'].tolist()
for source, target in zip(event_sequence, event_sequence[1:]):
    if G.has_edge(source, target):
        G[source][target]['weight'] += 1
    else:
        G.add_edge(source, target, weight=1)

pos = nx.circular_layout(G)
# Edge line width equals the transition count
edge_widths = [d['weight'] for (u, v, d) in G.edges(data=True)]
nx.draw(G, pos, ax=ax, with_labels=True, node_color='lightblue', node_size=2000, arrowstyle='->', arrowsize=20, width=edge_widths)
edge_labels = {(u, v): d['weight'] for (u, v, d) in G.edges(data=True)}
nx.draw_networkx_edge_labels(G, pos, edge_labels=edge_labels, font_size=10, ax=ax)
ax.set_title(f'User Event State Graph (User {graph_user_id})', fontsize=14, pad=10)
ax.axis('off')
fig.tight_layout()
fig.savefig('user_event_state_graph.png', bbox_inches='tight')
plt.close(fig)

# 5. Treemap (Improved Event Category Distribution)
# Filter to top 10 most frequent event categories/labels to avoid clutter
def _shorten(text, limit=10):
    """Return ``text`` as a string, cut to ``limit`` characters plus '...' when longer."""
    text = str(text)
    return f"{text[:limit]}..." if len(text) > limit else text


plt.figure(figsize=(12, 8), dpi=300)
# NOTE(review): groupby drops rows whose category or label is missing by
# default, so the 'Unknown' fill below is only a safety net - confirm whether
# missing values are meant to be counted.
treemap_df = df.groupby(['event_category', 'event_label']).size().reset_index(name='counts')
# Sort by counts and take top 10
treemap_df = treemap_df.sort_values('counts', ascending=False).head(10)
# One label per rectangle: category, event label and count on separate lines.
# Each string is shortened independently, so '...' only appears on text that
# was actually cut (previously a long category also added '...' to a short
# label, and vice versa).
labels = [f"{_shorten(cat)}\n{_shorten(label)}\n{count}"
          for cat, label, count in zip(treemap_df['event_category'].fillna('Unknown'),
                                       treemap_df['event_label'].fillna('Unknown'),
                                       treemap_df['counts'])]
# Use a color palette for better distinction
colors = sns.color_palette("Set2", len(treemap_df))
squarify.plot(sizes=treemap_df['counts'], label=labels, color=colors, alpha=0.7, text_kwargs={'fontsize': 10, 'wrap': True})
plt.title('Top 10 Event Category Distribution', fontsize=14, pad=10)
plt.axis('off')
plt.tight_layout()
plt.savefig('event_category_treemap.png', bbox_inches='tight')
plt.close()

# 6. Bubble Chart (Event Impact) - Fixed Color Issue
# x = mean seconds until the same user's next event, y = how often the event
# occurred, bubble area scales with the frequency.
plt.figure(figsize=(10, 6), dpi=300)
# "Next event" has to be the same user's next event in time order. Shifting the
# raw frame pairs each row with whatever row happens to follow it in the file,
# which can belong to another user or lie earlier in time (negative duration).
# The shifted values keep the original index, so they align back onto df.
chronological = df.sort_values(['user_pseudo_id', 'event_timestamp'], kind='stable')
df['next_timestamp'] = chronological.groupby('user_pseudo_id')['event_timestamp'].shift(-1)
# Rows without a following event (a user's last event) get a duration of 0
df['duration'] = (df['next_timestamp'] - df['event_timestamp']).dt.total_seconds().fillna(0)
bubble_df = df.groupby('event_name').agg({'event_name': 'count', 'duration': 'mean'}).rename(columns={'event_name': 'frequency'})
# One visually distinct colour per event name, for any number of events. A
# fixed list of named colours runs into near-white CSS names after the first ten.
num_events = len(bubble_df)
colors = sns.color_palette('husl', num_events).as_hex()
plt.scatter(bubble_df['duration'], bubble_df['frequency'], s=bubble_df['frequency']*5, c=colors, alpha=0.6)
# Label every bubble; itertuples yields the values by column name, avoiding
# integer-position lookups on the event-name index.
for point in bubble_df.itertuples():
    plt.annotate(point.Index, (point.duration, point.frequency), xytext=(5, 5), textcoords='offset points', fontsize=8)
plt.title('Event Impact Analysis', fontsize=14, pad=10)
plt.xlabel('Average Duration (seconds)', fontsize=12)
plt.ylabel('Frequency', fontsize=12)
plt.grid(True, linestyle='--', alpha=0.7)
plt.tight_layout()
plt.savefig('event_impact_bubble_chart.png', bbox_inches='tight')
plt.close()

print("Visualizations saved as PNG files in the current directory.")

  values = [spatial_search_count, opacity_change_count] + button_clicks[:3].tolist() + [button_clicks[-1]]
  temporal_df = df.groupby([pd.Grouper(key='event_timestamp', freq='H'), 'event_name']).size().unstack(fill_value=0)
  heat_data = pd.DataFrame(index=range(5), columns=range(5)).fillna(0)
  plt.tight_layout()
  plt.annotate(txt, (bubble_df['duration'][i], bubble_df['frequency'][i]), xytext=(5, 5), textcoords='offset points', fontsize=8)


Visualizations saved as PNG files in the current directory.


In [None]:
# 7. Stacked Area Chart (Event Impact Over Time)
# Create the axes explicitly and pass them to pandas. Without ax=, DataFrame.plot
# opens a figure of its own: the one created here stays empty and open (shown
# as "<Figure ... with 0 Axes>") and the saved image ignores this size and dpi.
fig, ax = plt.subplots(figsize=(10, 6), dpi=300)
# Aggregate event frequency by date and event name
stacked_df = df.groupby([df['event_timestamp'].dt.date, 'event_name']).size().unstack(fill_value=0)
# Normalize dates for plotting
stacked_df.index = pd.to_datetime(stacked_df.index)
# Plot stacked area chart
stacked_df.plot(kind='area', stacked=True, alpha=0.7, colormap='tab20', ax=ax)
ax.set_title('Event Impact Over Time', fontsize=14, pad=10)
ax.set_xlabel('Date', fontsize=12)
ax.set_ylabel('Event Frequency', fontsize=12)
# Legend sits outside the plot area, to the right
ax.legend(title='Event Name', title_fontsize=10, fontsize=8, bbox_to_anchor=(1.05, 1), loc='upper left')
ax.grid(True, linestyle='--', alpha=0.7)
ax.tick_params(axis='x', labelrotation=45, labelsize=10)
fig.tight_layout()
fig.savefig('event_impact_stacked_area.png', bbox_inches='tight')
plt.close(fig)

<Figure size 3000x1800 with 0 Axes>