# FullStory Data Exploratory Analysis

This notebook provides a starting point for analyzing exported FullStory data.

## Setup

1. Export data from FullStory (UI or API)
2. Place files in the `../../exports/` directory (the location the code cells below read from and write to)
3. Update file paths below

In [None]:
# Import libraries
import json
from datetime import datetime, timedelta
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Configure display
pd.set_option('display.max_columns', 50)
pd.set_option('display.max_rows', 100)
plt.style.use('seaborn-v0_8-whitegrid')
%matplotlib inline

## 1. Load Data

Update the file path to point to your exported data.

In [None]:
# Load session data (CSV)
# Uncomment and update path:
# df_sessions = pd.read_csv('../../exports/sessions.csv')

# Load event data (JSON lines)
# Uncomment and update path:
# df_events = pd.read_json('../../exports/events.json', lines=True)

# For demo, create sample data: one synthetic session per hour of January 2024.
# The seed makes the demo reproducible; the draws below must stay in this order
# so that the generated values do not change.
np.random.seed(42)
# Lowercase 'h' is the hourly alias; uppercase 'H' is deprecated since pandas 2.2.
dates = pd.date_range(start='2024-01-01', end='2024-01-31', freq='h')
n_sessions = len(dates)
df_sessions = pd.DataFrame({
    'session_id': range(n_sessions),
    'user_id': np.random.randint(1, 500, n_sessions),  # upper bound is exclusive: ids 1..499
    'session_start': dates,
    'duration_seconds': np.random.exponential(180, n_sessions),
    'page_count': np.random.poisson(5, n_sessions),
    'has_rage_click': np.random.choice([True, False], n_sessions, p=[0.05, 0.95]),
    'has_error': np.random.choice([True, False], n_sessions, p=[0.03, 0.97])
})

print(f"Loaded {len(df_sessions):,} sessions")
df_sessions.head()

## 2. Basic Statistics

In [None]:
# Headline numbers: volume, audience size, covered period and typical session shape
banner = "=" * 50
session_starts = df_sessions['session_start']
durations = df_sessions['duration_seconds']

print(banner)
print("OVERVIEW")
print(banner)
print(f"Total sessions: {len(df_sessions):,}")
print(f"Unique users: {df_sessions['user_id'].nunique():,}")
print(f"Date range: {session_starts.min()} to {session_starts.max()}")
print(f"\nAvg session duration: {durations.mean():.1f}s")
print(f"Median session duration: {durations.median():.1f}s")
print(f"Avg pages per session: {df_sessions['page_count'].mean():.1f}")

In [None]:
# Share of sessions carrying each frustration flag, expressed in percent.
# The mean of a boolean column is the fraction of True values.
rage_click_rate = df_sessions['has_rage_click'].mean() * 100
error_rate = df_sessions['has_error'].mean() * 100

divider = "=" * 50
print("\n" + divider)
print("FRUSTRATION SIGNALS")
print(divider)
print(f"Sessions with rage clicks: {rage_click_rate:.1f}%")
print(f"Sessions with errors: {error_rate:.1f}%")

## 3. Session Trends Over Time

In [None]:
# Sessions per calendar day.
# The 'date' column added here is also used by later cells that group by day.
df_sessions['date'] = df_sessions['session_start'].dt.date
daily_sessions = df_sessions.groupby('date').size()

fig, ax = plt.subplots(figsize=(14, 5))
daily_sessions.plot.line(ax=ax, marker='o', markersize=4)
ax.set_title('Daily Session Count', fontsize=14)
ax.set(xlabel='Date', ylabel='Sessions')
fig.tight_layout()
plt.show()

In [None]:
# Distinct users seen on each calendar day (DAU)
daily_users = df_sessions.groupby('date')['user_id'].nunique()

fig, ax = plt.subplots(figsize=(14, 5))
daily_users.plot.line(ax=ax, color='green', marker='o', markersize=4)
ax.set_title('Daily Active Users (DAU)', fontsize=14)
ax.set(xlabel='Date', ylabel='Unique Users')
fig.tight_layout()
plt.show()

## 4. Session Duration Distribution

In [None]:
# Session duration distribution: histogram with median marker, plus a box plot.
# Missing durations are dropped once up front; a NaN in the input makes the
# histogram range non-finite, which raises in Axes.hist.
durations = df_sessions['duration_seconds'].dropna()

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Histogram
axes[0].hist(durations, bins=50, edgecolor='black', alpha=0.7)
axes[0].set_title('Session Duration Distribution')
axes[0].set_xlabel('Duration (seconds)')
axes[0].set_ylabel('Count')
axes[0].axvline(durations.median(), color='red', linestyle='--', label='Median')
axes[0].legend()

# Box plot (vertical is the default; the `vert` keyword is deprecated in newer
# Matplotlib in favour of `orientation`, so it is not passed explicitly)
axes[1].boxplot(durations)
axes[1].set_title('Session Duration Box Plot')
axes[1].set_ylabel('Duration (seconds)')

plt.tight_layout()
plt.show()

## 5. User Engagement Analysis

In [None]:
# How many sessions each user generated
user_sessions = (
    df_sessions
    .groupby('user_id')
    .size()
    .reset_index(name='session_count')
)
session_counts = user_sessions['session_count']

print("Sessions per User Distribution")
print(session_counts.describe())

fig, ax = plt.subplots(figsize=(10, 5))
session_counts.hist(ax=ax, bins=30, alpha=0.7, edgecolor='black')
ax.set_title('Sessions per User')
ax.set(xlabel='Number of Sessions', ylabel='Number of Users')
fig.tight_layout()
plt.show()

In [None]:
# Power users: everyone at or above the 90th percentile of session count.
# Because ties at the threshold are included, the group can exceed 10% of users.
threshold = user_sessions['session_count'].quantile(0.9)
is_power_user = user_sessions['session_count'].ge(threshold)
power_users = user_sessions.loc[is_power_user]

power_user_share = len(power_users) / len(user_sessions) * 100

print("\nPower Users (top 10%):")
print(f"  Threshold: {threshold:.0f}+ sessions")
print(f"  Count: {len(power_users):,} users")
print(f"  % of total users: {power_user_share:.1f}%")

## 6. Frustration Analysis

In [None]:
# Per-day percentage of sessions carrying each frustration flag
signal_columns = ['has_rage_click', 'has_error']
daily_frustration = df_sessions.groupby('date')[signal_columns].mean() * 100

fig, ax = plt.subplots(figsize=(14, 5))
daily_frustration.plot(ax=ax, marker='o', markersize=3)
ax.set_title('Daily Frustration Rates', fontsize=14)
ax.set(xlabel='Date', ylabel='% of Sessions')
# Legend labels follow the column order of `signal_columns`.
ax.legend(['Rage Clicks', 'Errors'])
fig.tight_layout()
plt.show()

In [None]:
# Split sessions by whether they carry any frustration flag, then compare
# volume, average duration and average page count side by side.
is_frustrated = df_sessions['has_rage_click'] | df_sessions['has_error']
frustrated = df_sessions[is_frustrated]
not_frustrated = df_sessions[~is_frustrated]

frustrated_duration = frustrated['duration_seconds'].mean()
normal_duration = not_frustrated['duration_seconds'].mean()
frustrated_pages = frustrated['page_count'].mean()
normal_pages = not_frustrated['page_count'].mean()

print("Frustrated vs Normal Sessions Comparison")
print("=" * 50)
print(f"{'Metric':<30} {'Frustrated':<15} {'Normal':<15}")
print("-" * 50)
print(f"{'Count':<30} {len(frustrated):<15,} {len(not_frustrated):<15,}")
print(f"{'Avg Duration (s)':<30} {frustrated_duration:<15.1f} {normal_duration:<15.1f}")
print(f"{'Avg Pages':<30} {frustrated_pages:<15.1f} {normal_pages:<15.1f}")

## 7. Cohort Retention (Example)

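Note: True cohort retention requires user signup dates. The example below uses each user's first observed session as a stand-in for the signup date; adjust it based on your data.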

In [None]:
# Simple retention: share of users seen again N calendar days after their
# first observed session (the first session stands in for a signup date).
user_first_session = df_sessions.groupby('user_id')['session_start'].min().reset_index()
user_first_session.columns = ['user_id', 'first_session']
user_first_session['first_session_date'] = user_first_session['first_session'].dt.date

df_with_first = df_sessions.merge(user_first_session[['user_id', 'first_session_date']], on='user_id')

# Whole-day difference between each session's calendar date and the user's
# first calendar date. The day is derived from 'session_start' directly so this
# cell does not rely on a helper column created by an earlier plotting cell,
# and pd.to_datetime converts the whole column at once instead of row by row.
session_day = pd.to_datetime(df_with_first['session_start'].dt.date)
first_day = pd.to_datetime(df_with_first['first_session_date'])
df_with_first['days_since_first'] = (session_day - first_day).dt.days

# Retention by days since first session.
# Day 0 contains every user by construction, so it is the 100% baseline.
retention = df_with_first.groupby('days_since_first')['user_id'].nunique()
retention_rate = retention / retention.iloc[0] * 100

fig, ax = plt.subplots(figsize=(12, 5))
retention_rate.head(15).plot(kind='bar', ax=ax, color='steelblue', edgecolor='black')
ax.set_title('Retention by Days Since First Session', fontsize=14)
ax.set_xlabel('Days Since First Session')
ax.set_ylabel('% of Users')
plt.tight_layout()
plt.show()

## 8. Export Results

In [None]:
# Create summary report.
# Values are cast to built-in int/float because json cannot serialise NumPy
# scalar types. The two rates are stored as fractions (0-1), not percentages.
session_starts = df_sessions['session_start']
summary = {
    'report_date': datetime.now().isoformat(),
    'data_range': {
        'start': str(session_starts.min()),
        'end': str(session_starts.max())
    },
    'metrics': {
        'total_sessions': int(len(df_sessions)),
        'unique_users': int(df_sessions['user_id'].nunique()),
        'avg_session_duration': float(df_sessions['duration_seconds'].mean()),
        'avg_pages_per_session': float(df_sessions['page_count'].mean()),
        'rage_click_rate': float(df_sessions['has_rage_click'].mean()),
        'error_rate': float(df_sessions['has_error'].mean())
    }
}

# Save to JSON
summary_path = Path('../../exports/analysis_summary.json')
# Create the target directory if needed so a fresh checkout (demo data, no
# exports yet) does not fail with FileNotFoundError.
summary_path.parent.mkdir(parents=True, exist_ok=True)
# Explicit encoding keeps the file identical across platforms.
with summary_path.open('w', encoding='utf-8') as f:
    json.dump(summary, f, indent=2)

# Report the path that was actually written.
print(f"Summary saved to {summary_path}")
print(json.dumps(summary, indent=2))

---

## Next Steps

1. Replace sample data with actual FullStory exports
2. Add page-level analysis if URL data is available
3. Create funnel analysis for key conversion paths
4. Correlate frustration signals with specific pages/features
5. Build cohort retention analysis with signup dates