# AEI Report v3 Claude.ai Analysis

This notebook performs statistical analysis and creates visualizations from enriched Clio data.
It works directly with long format data from the preprocessing pipeline.

**Input**: `aei_enriched_claude_ai_2025-08-04_to_2025-08-11.csv`

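**Output**: Visualizations saved as PNG figures under `../data/output/figures/` (appendix figures in `../data/output/figures/appendix/`)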

## 1. Setup and Data Loading

In [None]:
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt

# Import all analysis functions
from aei_analysis_functions_claude_ai import (
    setup_plot_style,
    get_filtered_geographies,
    plot_usage_index_bars,
    plot_tier_map,
    plot_usage_share_bars,
    plot_tier_summary_table,
    plot_gdp_scatter,
    plot_request_comparison_cards,
    plot_soc_usage_scatter,
    plot_dc_task_request_cards,
    collaboration_task_regression,
    plot_usage_index_histogram,
    plot_variable_map,
    plot_soc_distribution,
    plot_automation_preference_residuals,
    plot_variable_bars,
)

In [None]:
# Apply the shared plot styling provided by aei_analysis_functions_claude_ai
# before any figure in this notebook is created.
setup_plot_style()

In [None]:
# Figure destinations: main-text figures go in output_dir, appendix figures in
# its "appendix" subfolder. Both are created up front if missing.
output_dir = Path("../data/output/figures/")
output_dir_app = output_dir / "appendix"
for figure_dir in (output_dir, output_dir_app):
    figure_dir.mkdir(parents=True, exist_ok=True)

# Enriched long-format input produced by the preprocessing pipeline
data_path = "../data/output/aei_enriched_claude_ai_2025-08-04_to_2025-08-11.csv"

# Only the empty string is treated as missing; pandas' default NA markers are
# disabled so that the literal "NA" (Namibia's code) survives as a string.
df = pd.read_csv(data_path, na_values=[""], keep_default_na=False)

In [None]:
# Build the lists of countries and US states that are passed as
# `filtered_entities` to later charts.
# NOTE(review): the minimum-observation cutoffs (reportedly 200 for countries
# and 100 for US states) are not passed here, so they presumably come from the
# defaults of get_filtered_geographies — confirm against that function.
filtered_countries, filtered_states = get_filtered_geographies(df)

## 2. Global

In [None]:
# Bar chart of the 30 countries with the largest share of global usage
plot_usage_share_bars(df, geography="country", top_n=30)

figure_path = output_dir / "usage_pct_bar_country_top30.png"
plt.savefig(figure_path, dpi=300, bbox_inches="tight")

In [None]:
# World map coloured by each country's usage tier
tier_map_options = {
    "geography": "country",
    "title": "Anthropic AI Usage Index tiers by country",
    "figsize": (16, 10),
}
plot_tier_map(df, **tier_map_options)

figure_path = output_dir / "ai_usage_index_tier_map_country_all.png"
plt.savefig(figure_path, dpi=300, bbox_inches="tight")

In [None]:
# Country-level tier summary table, written with a transparent background
plot_tier_summary_table(df, geography="country")

table_path = output_dir / "tier_summary_table_country.png"
plt.savefig(table_path, dpi=300, bbox_inches="tight", transparent=True)

In [None]:
# Top 20 countries by per-capita usage (AI Usage Index), drawn from the
# countries listed in filtered_countries
plot_usage_index_bars(
    df,
    geography="country",
    top_n=20,
    filtered_entities=filtered_countries,
)

figure_path = output_dir / "ai_usage_index_bar_country_top20.png"
plt.savefig(figure_path, dpi=300, bbox_inches="tight")

In [None]:
# GDP vs usage regression scatter for the countries in filtered_countries
# (default figure size)
plot_gdp_scatter(
    df,
    geography="country",
    filtered_entities=filtered_countries,
)

figure_path = output_dir / "ai_usage_index_gdp_reg_country_min_obs.png"
plt.savefig(figure_path, dpi=300, bbox_inches="tight")

In [None]:
# Same GDP vs usage regression for countries, rendered at a wider figure size
# (13.2 x 8.25) and saved under a separate "_wide" filename
wide_figsize = (13.2, 8.25)
plot_gdp_scatter(
    df,
    geography="country",
    filtered_entities=filtered_countries,
    figsize=wide_figsize,
)

figure_path = output_dir / "ai_usage_index_gdp_reg_country_min_obs_wide.png"
plt.savefig(figure_path, dpi=300, bbox_inches="tight")

In [None]:
# SOC diffusion scatter at country level (top 4 classified SOC groups, 2x2 grid)
plot_soc_usage_scatter(df, geography="country")

figure_path = output_dir / "soc_usage_scatter_top4_country_min.png"
plt.savefig(figure_path, dpi=300, bbox_inches="tight")

In [None]:
# Pick one representative country per usage tier: within each tier, the
# country with the largest usage count.

# Long-format rows holding country-level values for the "country" facet
is_country_row = (df["geography"] == "country") & (df["facet"] == "country")

# Tier assignment per country
tier_data = df.loc[
    is_country_row & (df["variable"] == "usage_tier"),
    ["geo_id", "value"],
].rename(columns={"value": "tier"})

# Usage count per country (geo_name kept alongside the id)
usage_data = df.loc[
    is_country_row & (df["variable"] == "usage_count"),
    ["geo_id", "geo_name", "value"],
].rename(columns={"value": "usage_count"})

# One row per country carrying both its usage count and its tier
country_data = usage_data.merge(tier_data, on="geo_id")

# Tiers are visited in the order 4, 3, 2, 1; the resulting list keeps that order
selected_countries = []
for tier in [4, 3, 2, 1]:
    tier_rows = country_data[country_data["tier"] == tier]
    ranked = tier_rows.sort_values("usage_count", ascending=False)
    selected_countries.append(ranked.iloc[0]["geo_id"])

In [None]:
# Side-by-side cards of the top 5 overrepresented requests for the countries in
# selected_countries (the highest-usage country of each tier).
# NOTE(review): the title names four countries explicitly while the list is
# computed from the data — confirm they still match if the input changes.
cards_title = (
    "Top overrepresented requests for the United States, Brazil, Vietnam and India"
)
plot_request_comparison_cards(
    df,
    geo_ids=selected_countries,
    top_n=5,
    title=cards_title,
    geography="country",
)

figure_path = output_dir / "request_comparison_cards_by_tier_country_selected4.png"
plt.savefig(figure_path, dpi=300, bbox_inches="tight")

## 3. United States

In [None]:
# US map coloured by each state's usage tier
plot_tier_map(
    df,
    geography="state_us",
    title="Anthropic AI Usage Index tier by US state",
)

figure_path = output_dir / "ai_usage_index_tier_map_state_all.png"
plt.savefig(figure_path, dpi=300, bbox_inches="tight")

In [None]:
# Usage index bar chart for the top 20 US states
plot_usage_index_bars(df, geography="state_us", top_n=20)

figure_path = output_dir / "ai_usage_index_bar_state_top20.png"
plt.savefig(figure_path, dpi=300, bbox_inches="tight")

In [None]:
# State-level tier summary table, written with a transparent background
plot_tier_summary_table(df, geography="state_us")

table_path = output_dir / "tier_summary_table_state.png"
plt.savefig(table_path, dpi=300, bbox_inches="tight", transparent=True)

In [None]:
# Pick one representative US state per usage tier: within each tier, the state
# with the largest usage count.

# Long-format rows holding state-level values for the "state_us" facet
is_state_row = (df["geography"] == "state_us") & (df["facet"] == "state_us")

# Tier assignment per state
tier_data_states = df.loc[
    is_state_row & (df["variable"] == "usage_tier"),
    ["geo_id", "value"],
].rename(columns={"value": "tier"})

# Usage count per state (geo_name kept alongside the id)
usage_data_states = df.loc[
    is_state_row & (df["variable"] == "usage_count"),
    ["geo_id", "geo_name", "value"],
].rename(columns={"value": "usage_count"})

# One row per state carrying both its usage count and its tier
state_data = usage_data_states.merge(tier_data_states, on="geo_id")


def _top_geo_id_in_tier(frame, tier):
    """Return the geo_id of the row with the largest usage_count in `tier`."""
    in_tier = frame[frame["tier"] == tier]
    ranked = in_tier.sort_values("usage_count", ascending=False)
    return ranked.iloc[0]["geo_id"]


# Tiers are visited in the order 4, 3, 2, 1; the resulting list keeps that order
selected_states = [_top_geo_id_in_tier(state_data, tier) for tier in [4, 3, 2, 1]]

In [None]:
# Side-by-side cards of the top 5 overrepresented requests for one US state per
# tier: CA (Tier 4), TX (Tier 3), FL (Tier 2), SC (Tier 1).
# NOTE(review): this list is hard-coded, whereas the appendix SOC chart uses the
# computed `selected_states` — confirm the two agree for the current data.
states_to_compare = ["CA", "TX", "FL", "SC"]

cards_title = (
    "Top overrepresented high-level requests for "
    "California, Texas, Florida and South Carolina"
)
plot_request_comparison_cards(
    df,
    geo_ids=states_to_compare,
    top_n=5,
    title=cards_title,
    geography="state_us",
)

figure_path = output_dir / "request_comparison_cards_by_tier_state_selected4.png"
plt.savefig(figure_path, dpi=300, bbox_inches="tight")

In [None]:
# Card-style figure for Washington, DC showing its top O*NET tasks and top
# request categories
dc_title = "Washington, DC: Highest Anthropic AI Usage Index in the US"
plot_dc_task_request_cards(df, title=dc_title)

figure_path = output_dir / "task_request_comparison_state_dc.png"
plt.savefig(figure_path, dpi=300, bbox_inches="tight")

In [None]:
# Country-level collaboration pattern analysis controlling for task mix:
# checks whether the relationship between the AI Usage Index (AUI) and
# automation preference persists once task composition is accounted for
collaboration_task_regression(df, geography="country")

figure_path = output_dir / "collaboration_task_control_partial_corr_country.png"
plt.savefig(figure_path, dpi=300, bbox_inches="tight")

# Appendix

## Global

In [None]:
# Histogram of the usage index across countries (appendix figure)
plot_usage_index_histogram(
    df,
    geography="country",
    title="Distribution of Anthropic AI Usage Index",
)

figure_path = output_dir_app / "ai_usage_index_histogram_country_all.png"
plt.savefig(figure_path, dpi=300, bbox_inches="tight")

In [None]:
# World map of each country's share of usage (appendix figure)
share_map_options = {
    "variable": "usage_pct",
    "geography": "country",
    "title": "Share of global Claude usage by country",
    "figsize": (14, 8),
}
plot_variable_map(df, **share_map_options)

figure_path = output_dir_app / "usage_pct_map_country_all.png"
plt.savefig(figure_path, dpi=300, bbox_inches="tight")

In [None]:
# World map of per-capita usage (the usage_per_capita_index variable), drawn
# with center_at_one=True (appendix figure)
index_map_options = {
    "variable": "usage_per_capita_index",
    "geography": "country",
    "title": "Anthropic AI Usage Index by country",
    "center_at_one": True,
    "figsize": (14, 8),
}
plot_variable_map(df, **index_map_options)

figure_path = output_dir_app / "ai_usage_index_map_country_all.png"
plt.savefig(figure_path, dpi=300, bbox_inches="tight")

In [None]:
# AI Usage Index bars for every country in filtered_countries (no top_n limit
# is passed)
plot_usage_index_bars(df, geography="country", filtered_entities=filtered_countries)

figure_path = output_dir_app / "ai_usage_index_country_all.png"
plt.savefig(figure_path, dpi=300, bbox_inches="tight")

In [None]:
# SOC (occupation group) distribution compared across the per-tier countries in
# selected_countries (appendix figure).
# NOTE(review): the title names four countries explicitly while the list is
# computed from the data — confirm they still match if the input changes.
soc_title = (
    "Occupation groups by Claude task usage in "
    "the United States, Brazil, Vietnam and India"
)
plot_soc_distribution(df, selected_countries, "country", title=soc_title)

figure_path = output_dir_app / "soc_distribution_by_tier_country_selected4.png"
plt.savefig(figure_path, dpi=300, bbox_inches="tight")

In [None]:
# Residual automation preference after controlling for task mix: highlights
# countries that lean more towards automation (or augmentation) than their
# task composition alone would predict (appendix figure)
plot_automation_preference_residuals(df)

figure_path = output_dir_app / "automation_preference_residuals.png"
plt.savefig(figure_path, dpi=300, bbox_inches="tight")

## United States

In [None]:
# Top 30 US states by share of US usage (appendix figure)
plot_usage_share_bars(
    df,
    geography="state_us",
    top_n=30,
    title="Top 30 US states by share of US Claude usage",
)

figure_path = output_dir_app / "usage_pct_bar_state_top30.png"
plt.savefig(figure_path, dpi=300, bbox_inches="tight")

In [None]:
# Histogram of the usage index across US states (appendix figure)
plot_usage_index_histogram(
    df,
    geography="state_us",
    title="Distribution of Anthropic AI Usage Index",
)

figure_path = output_dir_app / "ai_usage_index_histogram_state_all.png"
plt.savefig(figure_path, dpi=300, bbox_inches="tight")

In [None]:
# US map of each state's share of usage (usage_pct at state_us level),
# saved as an appendix figure.
# NOTE(review): the title previously read "Share of global Claude usage by US
# state"; it now says "US" to agree with the state bar chart, which labels the
# same variable as a share of US Claude usage. Confirm that state-level
# usage_pct is normalized within the US rather than globally.
plot_variable_map(
    df,
    variable="usage_pct",
    geography="state_us",
    title="Share of US Claude usage by US state",
    figsize=(14, 8),
)
plt.savefig(
    output_dir_app / "usage_pct_map_state_all.png", dpi=300, bbox_inches="tight"
)

In [None]:
# US map of per-capita usage (the usage_per_capita_index variable) by state,
# drawn with center_at_one=True (appendix figure)
index_map_options = {
    "variable": "usage_per_capita_index",
    "geography": "state_us",
    "title": "Anthropic AI Usage Index by US state",
    "center_at_one": True,
    "figsize": (14, 8),
}
plot_variable_map(df, **index_map_options)

figure_path = output_dir_app / "ai_usage_index_map_state_all.png"
plt.savefig(figure_path, dpi=300, bbox_inches="tight")

In [None]:
# AI Usage Index bars for US states; neither top_n nor filtered_entities is
# passed, so the function's defaults decide which states appear (appendix figure)
plot_usage_index_bars(df, geography="state_us")

figure_path = output_dir_app / "ai_usage_index_bar_state_all.png"
plt.savefig(figure_path, dpi=300, bbox_inches="tight")

In [None]:
# GDP vs usage regression scatter for the US states in filtered_states
# (appendix figure)
plot_gdp_scatter(
    df,
    geography="state_us",
    filtered_entities=filtered_states,
)

figure_path = output_dir_app / "ai_usage_index_gdp_reg_state_min_obs.png"
plt.savefig(figure_path, dpi=300, bbox_inches="tight")

In [None]:
# SOC (occupation group) distribution compared across the per-tier US states in
# selected_states (appendix figure).
# NOTE(review): the title names four states explicitly while the list is
# computed from the data — confirm they still match if the input changes.
soc_title = (
    "Occupation groups by Claude task usage in "
    "California, Texas, Florida and South Carolina"
)
plot_soc_distribution(df, selected_states, "state_us", title=soc_title)

figure_path = output_dir_app / "soc_distribution_by_tier_state_selected4.png"
plt.savefig(figure_path, dpi=300, bbox_inches="tight")

In [None]:
# Bar chart of soc_pct by SOC occupation group for the United States
# (country-level geo_id "USA"), with exclude_not_classified=True (appendix figure)
soc_bar_options = {
    "variable": "soc_pct",
    "geography": "country",
    "facet": "soc_occupation",
    "geo_id": "USA",
    "title": "Occupation groups in the US by Claude use for associated tasks",
    "xlabel": "Share of total usage (%)",
    "exclude_not_classified": True,
}
plot_variable_bars(df, **soc_bar_options)

figure_path = output_dir_app / "soc_bar_country_us.png"
plt.savefig(figure_path, dpi=300, bbox_inches="tight")

In [None]:
# SOC diffusion scatter at US-state level (top 4 classified SOC groups),
# saved as an appendix figure
plot_soc_usage_scatter(df, geography="state_us")

figure_path = output_dir_app / "soc_usage_scatter_top4_state_min.png"
plt.savefig(figure_path, dpi=300, bbox_inches="tight")