# 02 - Strategic Analysis

Deep strategic analysis of AI agent cost-performance data.

**Course:** DATA 230 (Data Visualization) at SJSU

## Analysis Sections:
1. **Cost-Performance Optimization**: Pareto-optimal configurations, trade-off curves, sweet spots
2. **Agent Portfolio Strategy**: Clustering, diversification metrics, capability gaps
3. **Risk Assessment**: Failure correlations, systemic risks, Value at Risk (VaR)
4. **Business Impact Modeling**: ROI, business value, optimization opportunities


In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# Read the strategic feature table and report its dimensions
df = pd.read_csv('../data/ml/strategic_agent_features.csv')
record_count, feature_count = df.shape
print(f"Loaded {record_count} records with {feature_count} features")
print(f"Columns: {df.columns.tolist()}")


ModuleNotFoundError: No module named 'plotly'

## 1. Cost-Performance Optimization


In [None]:
# Identify Pareto-optimal agent configurations
# An agent is Pareto-optimal if no other agent has both lower cost AND higher performance

def find_pareto_optimal(df, cost_col='cost_per_task_cents', perf_col='performance_index'):
    """Flag the Pareto-optimal rows of ``df`` (minimize cost, maximize performance).

    A row is dominated when some other row has cost <= its cost AND
    performance >= its performance, with at least one of the two comparisons
    strict. Rows that are not dominated are Pareto-optimal; exact duplicates
    do not dominate each other, so all of them are kept.

    Args:
        df: DataFrame containing ``cost_col`` and ``perf_col``.
        cost_col: Name of the column to minimize.
        perf_col: Name of the column to maximize.

    Returns:
        Boolean NumPy array of length ``len(df)``, positional in the row
        order of ``df``; ``True`` marks a Pareto-optimal row.
    """
    # Extract both columns once: scalar ``.iloc`` lookups inside a double
    # Python loop dominate the runtime, while one vectorized comparison per
    # row keeps the same pairwise dominance check in NumPy.
    cost = df[cost_col].to_numpy()
    perf = df[perf_col].to_numpy()
    pareto_mask = np.ones(len(df), dtype=bool)

    for i, (cost_i, perf_i) in enumerate(zip(cost, perf)):
        no_worse = (cost <= cost_i) & (perf >= perf_i)
        strictly_better = (cost < cost_i) | (perf > perf_i)
        # Row i can never dominate itself: neither strict comparison holds.
        if np.any(no_worse & strictly_better):
            pareto_mask[i] = False
    return pareto_mask

# The pairwise Pareto check is O(n²), so work on a reproducible sample of at most 500 rows
sample_size = min(500, len(df))
sample_df = df.sample(n=sample_size, random_state=42)
sample_df = sample_df.reset_index(drop=True)
sample_df['is_pareto_optimal'] = find_pareto_optimal(sample_df)

pareto_count = sample_df['is_pareto_optimal'].sum()
print(f"Pareto-optimal configurations: {pareto_count} out of {len(sample_df)} sampled")

# Preview the non-dominated agents with their identifying columns
pareto_columns = [
    'agent_id',
    'agent_type',
    'model_architecture',
    'cost_per_task_cents',
    'performance_index',
]
pareto_agents = sample_df.loc[sample_df['is_pareto_optimal'], pareto_columns]
print("\nTop Pareto-optimal agents:")
print(pareto_agents.head(10))


In [None]:
# Cost vs performance trade-off curve:
# split agents into cost deciles and average the key metrics inside each decile

df['cost_bucket'] = pd.qcut(
    df['cost_per_task_cents'],
    q=10,
    labels=False,
    duplicates='drop',  # collapse repeated bin edges instead of raising
)
tradeoff_curve = (
    df.groupby('cost_bucket')
    .agg(
        cost_per_task_cents=('cost_per_task_cents', 'mean'),
        performance_index=('performance_index', 'mean'),
        success_rate=('success_rate', 'mean'),
        count=('agent_id', 'count'),
    )
    .reset_index()
)

print("Cost-Performance Trade-off Curve:")
print(tradeoff_curve)

# Marginal gain = change in mean performance per unit change in mean cost
# between consecutive buckets (the first bucket has no predecessor, so NaN)
performance_step = tradeoff_curve['performance_index'].diff()
cost_step = tradeoff_curve['cost_per_task_cents'].diff()
tradeoff_curve['marginal_perf_gain'] = performance_step / cost_step
print("\nMarginal Performance Gain per Cost Unit:")
marginal_columns = ['cost_bucket', 'cost_per_task_cents', 'performance_index', 'marginal_perf_gain']
print(tradeoff_curve[marginal_columns])


In [None]:
# "Sweet spot" per task category:
# the single row with the highest cost_efficiency_ratio inside each task_category

best_row_labels = df.groupby('task_category')['cost_efficiency_ratio'].idxmax()
sweet_spots = df.loc[best_row_labels]
summary_columns = [
    'task_category', 'agent_type', 'model_architecture',
    'cost_per_task_cents', 'performance_index', 'cost_efficiency_ratio',
]
sweet_spots_summary = sweet_spots[summary_columns]

print("Sweet Spot Configurations by Task Category:")
ranked_sweet_spots = sweet_spots_summary.sort_values('cost_efficiency_ratio', ascending=False)
print(ranked_sweet_spots)

# Compare the sweet-spot rows against the whole dataset
sweet_perf_mean = sweet_spots['performance_index'].mean()
overall_perf_mean = df['performance_index'].mean()
sweet_cost_mean = sweet_spots['cost_per_task_cents'].mean()
overall_cost_mean = df['cost_per_task_cents'].mean()
print(f"\nSweet Spot Average Performance: {sweet_perf_mean:.4f}")
print(f"Overall Average Performance: {overall_perf_mean:.4f}")
print(f"Sweet Spot Average Cost: {sweet_cost_mean:.4f}")
print(f"Overall Average Cost: {overall_cost_mean:.4f}")


## 2. Agent Portfolio Strategy


In [None]:
# Cluster agents into strategic categories: Workhorses, Specialists, Underperformers
# Using performance_index, cost_efficiency_ratio, success_rate, and efficiency_score

clustering_features = ['performance_index', 'cost_efficiency_ratio', 'success_rate', 'efficiency_score']
X_cluster = df[clustering_features].copy()
# Mean-impute missing values so every row can be scaled and clustered
X_cluster = X_cluster.fillna(X_cluster.mean())

# Standardize so no single feature dominates the Euclidean distances
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_cluster)

# K-means with 3 clusters (one per strategic category)
kmeans = KMeans(n_clusters=3, random_state=42, n_init=10)
df['cluster'] = kmeans.fit_predict(X_scaled)

# Per-cluster feature means, used both for reporting and for labelling
cluster_stats = df.groupby('cluster')[clustering_features].mean()
print("Cluster Statistics:")
print(cluster_stats)

# Label clusters by rank rather than by a median threshold: over three
# cluster means at most one value is strictly above the median, so a
# threshold rule can never assign all three categories at once.
#   - lowest mean performance                          -> Underperformer
#   - of the rest, highest mean cost_efficiency_ratio  -> Workhorse
#   - whatever remains (high performance, less cost-efficient) -> Specialist
underperformer_id = cluster_stats['performance_index'].idxmin()
contenders = cluster_stats.drop(index=underperformer_id)

cluster_labels = {cluster_id: 'Specialist' for cluster_id in contenders.index}
if not contenders.empty:
    workhorse_id = contenders['cost_efficiency_ratio'].idxmax()
    cluster_labels[workhorse_id] = 'Workhorse'
cluster_labels[underperformer_id] = 'Underperformer'

df['strategic_category'] = df['cluster'].map(cluster_labels)
print("\nStrategic Category Distribution:")
print(df['strategic_category'].value_counts())


In [None]:
# Portfolio diversification metrics
# Concentration is measured with the Herfindahl-Hirschman Index (HHI):
# the sum of squared category shares (1.0 = everything in one category)

# Share of rows per category along each dimension
agent_type_shares = df['agent_type'].value_counts(normalize=True)
arch_shares = df['model_architecture'].value_counts(normalize=True)
category_shares = df['strategic_category'].value_counts(normalize=True)

# HHI per dimension
hhi_agent_type = agent_type_shares.pow(2).sum()
hhi_architecture = arch_shares.pow(2).sum()
hhi_category = category_shares.pow(2).sum()

print("Portfolio Diversification Metrics (HHI):")
print(f"Agent Type HHI: {hhi_agent_type:.4f} (lower = more diversified)")
print(f"Architecture HHI: {hhi_architecture:.4f}")
print(f"Strategic Category HHI: {hhi_category:.4f}")

# Diversification score = complement of the average HHI (higher = more diversified)
average_hhi = (hhi_agent_type + hhi_architecture + hhi_category) / 3
diversification_score = 1 - average_hhi
print(f"\nOverall Diversification Score: {diversification_score:.4f}")


In [None]:
# Capability coverage gaps
# For every task_category x deployment_environment pair, count the rows and
# average performance/success, then flag pairs in the bottom quartile of either

coverage_matrix = (
    df.groupby(['task_category', 'deployment_environment'])
    .agg(
        agent_count=('agent_id', 'count'),
        performance_index=('performance_index', 'mean'),
        success_rate=('success_rate', 'mean'),
    )
    .reset_index()
)

# A pair is a gap when it is thinly covered OR performs poorly (strictly below Q1)
count_floor = coverage_matrix['agent_count'].quantile(0.25)
performance_floor = coverage_matrix['performance_index'].quantile(0.25)
thinly_covered = coverage_matrix['agent_count'] < count_floor
weak_performance = coverage_matrix['performance_index'] < performance_floor
coverage_matrix['is_gap'] = thinly_covered | weak_performance

gaps = coverage_matrix[coverage_matrix['is_gap']]
print("Capability Coverage Gaps:")
print(gaps.sort_values('performance_index'))

print(f"\nTotal gaps identified: {len(gaps)} out of {len(coverage_matrix)} combinations")


## 3. Risk Assessment


In [None]:
# Failure correlation matrix
# failure_rate is the complement of success_rate; correlate it with the other risk metrics

df['failure_rate'] = 1 - df['success_rate']

# Pairwise correlations between the risk-related columns
risk_metrics = [
    'failure_rate',
    'operational_risk_index',
    'degradation_risk_score',
    'task_complexity',
    'execution_time_seconds',
    'response_latency_ms',
]
risk_correlation = df[risk_metrics].corr()

print("Failure Correlation Matrix:")
print(risk_correlation.round(3))

# Collect each unordered pair once (upper triangle only) whose |corr| exceeds 0.5;
# strongly coupled risk factors are treated as potential systemic risks
high_corr_pairs = [
    (first_metric, second_metric, risk_correlation.iloc[row, col])
    for row, first_metric in enumerate(risk_metrics)
    for col, second_metric in enumerate(risk_metrics)
    if col > row and abs(risk_correlation.iloc[row, col]) > 0.5
]

print("\nHighly Correlated Risk Factors (|corr| > 0.5):")
for first_metric, second_metric, corr_value in high_corr_pairs:
    print(f"  {first_metric} <-> {second_metric}: {corr_value:.3f}")


In [None]:
# Systemic risk factors
# For each grouping dimension, summarize failure_rate per group and measure how
# much the group means differ from one another

grouping_dimensions = ['model_architecture', 'deployment_environment', 'task_category']
failure_by_dimension = {
    dimension: df.groupby(dimension)['failure_rate'].agg(['mean', 'std', 'count'])
    for dimension in grouping_dimensions
}
arch_failure = failure_by_dimension['model_architecture']
env_failure = failure_by_dimension['deployment_environment']
task_failure = failure_by_dimension['task_category']

# Risk score per dimension = standard deviation of the per-group mean failure rates
systemic_risks = {
    dimension: group_stats['mean'].std()
    for dimension, group_stats in failure_by_dimension.items()
}

print("Systemic Risk Factors (variance in failure rate across groups):")
ranked_risks = sorted(systemic_risks.items(), key=lambda item: item[1], reverse=True)
for factor, risk in ranked_risks:
    print(f"  {factor}: {risk:.4f}")

print("\nHighest Risk Architecture:")
print(arch_failure.sort_values('mean', ascending=False).head(3))

print("\nHighest Risk Environment:")
print(env_failure.sort_values('mean', ascending=False))


In [None]:
# Value at Risk (VaR) for agent deployments
# VaR here is the 95th-percentile of the per-row potential loss

confidence_level = 0.95

# Potential loss per row = cost per task weighted by the failure rate
df['potential_loss'] = df['cost_per_task_cents'] * df['failure_rate']
loss = df['potential_loss']
var_95 = loss.quantile(confidence_level)
# Conditional VaR: mean loss among rows at or beyond the VaR threshold
tail_losses = loss[loss >= var_95]
cvar_95 = tail_losses.mean()

print("Value at Risk Analysis (95% confidence):")
print(f"  VaR (95%): {var_95:.4f} cents per task")
print(f"  CVaR (Expected Shortfall): {cvar_95:.4f} cents per task")

# Same quantile, computed separately inside each agent_type
var_by_type = df.groupby('agent_type')['potential_loss'].quantile(confidence_level)
print("\nVaR by Agent Type (95%):")
print(var_by_type.sort_values(ascending=False).head(10))

# High-risk agents = rows in the top 5% of potential loss
high_risk_threshold = loss.quantile(0.95)
is_high_risk = loss >= high_risk_threshold
high_risk_agents = df[is_high_risk]
print(f"\nHigh-Risk Agents (top 5%): {len(high_risk_agents)} agents")


## 4. Business Impact Modeling


In [None]:
# Calculate ROI for each agent type
# ROI = (Value Generated - Cost) / Cost
# Proxy value generated = success_rate * performance_index (normalized)
# NOTE(review): the value proxy looks unitless while the cost column name
# suggests cents, so ROI is presumably only meaningful for ranking agents
# against each other, not as a financial return - confirm.

df['value_generated'] = df['success_rate'] * df['performance_index']

# ROI is undefined for a zero cost; mask those rows to NaN so the division
# cannot yield +/-inf, which would turn every downstream mean into inf.
# pandas aggregations skip NaN, so such rows simply drop out of the averages.
cost = df['cost_per_task_cents']
nonzero_cost = cost.where(cost != 0)
df['roi'] = (df['value_generated'] - cost) / nonzero_cost

# ROI by agent_type, best first
roi_by_type = df.groupby('agent_type').agg({
    'roi': 'mean',
    'value_generated': 'mean',
    'cost_per_task_cents': 'mean',
    'agent_id': 'count'
}).rename(columns={'agent_id': 'count'}).sort_values('roi', ascending=False)

print("ROI by Agent Type:")
print(roi_by_type.head(15))

# ROI by model_architecture, best first
roi_by_arch = df.groupby('model_architecture').agg({
    'roi': 'mean',
    'value_generated': 'mean',
    'cost_per_task_cents': 'mean'
}).sort_values('roi', ascending=False)

print("\nROI by Model Architecture:")
print(roi_by_arch)


In [None]:
# Business value per agent
# business_value = performance * efficiency * (1 - risk), with risk capped at 1

capped_risk = df['operational_risk_index'].clip(upper=1)
df['business_value'] = df['performance_index'] * df['efficiency_score'] * (1 - capped_risk)

# Roll up per agent_type: mean, total and spread of value plus the row count,
# ordered by total value (largest first)
business_value_by_type = (
    df.groupby('agent_type')
    .agg(
        avg_value=('business_value', 'mean'),
        total_value=('business_value', 'sum'),
        value_std=('business_value', 'std'),
        count=('agent_id', 'count'),
    )
    .sort_values('total_value', ascending=False)
)

print("Business Value by Agent Type:")
print(business_value_by_type.head(15))

# Grand total across all rows
total_value = df['business_value'].sum()
print(f"\nTotal Business Value Generated: {total_value:.2f}")


In [None]:
# High-leverage optimization opportunities
# Score = scalability * headroom (1 - success_rate) * cost efficiency, so rows
# that scale well, still fail often, and are cheap to run rank highest

headroom = 1 - df['success_rate']  # room for improvement
df['optimization_potential'] = (
    df['scalability_potential'] * headroom * df['cost_efficiency_ratio']
)

# The 20 highest-scoring rows
opportunity_columns = [
    'agent_id', 'agent_type', 'model_architecture',
    'success_rate', 'scalability_potential', 'optimization_potential',
]
top_rows = df.nlargest(20, 'optimization_potential')
optimization_opportunities = top_rows[opportunity_columns]

print("Top 20 High-Leverage Optimization Opportunities:")
print(optimization_opportunities)

# Mean and total score per agent_type, ordered by total
potential_by_type = df.groupby('agent_type')['optimization_potential'].agg(['mean', 'sum'])
optimization_by_type = potential_by_type.sort_values('sum', ascending=False)
print("\nOptimization Potential by Agent Type:")
print(optimization_by_type.head(10))


## Strategic Insights Report


In [None]:
# Print the Strategic Insights Report from the results computed in the cells above

banner = "=" * 80
rule = "-" * 40
total_rows = len(df)

print(banner)
print("STRATEGIC INSIGHTS REPORT: AI Agent Performance Intelligence System")
print(banner)

print("\n### 1. COST-PERFORMANCE OPTIMIZATION")
print(rule)
# Bucket with the largest marginal gain, or 'N/A' when no gain could be computed
marginal_gain = tradeoff_curve['marginal_perf_gain']
if marginal_gain.notna().any():
    best_bucket = tradeoff_curve.loc[marginal_gain.idxmax(), 'cost_bucket']
else:
    best_bucket = 'N/A'
print(f"• Pareto-optimal configurations identified: {pareto_count} agents")
print(f"• Sweet spot configurations found for {len(sweet_spots)} task categories")
print(f"• Best marginal performance gain at cost bucket: {best_bucket}")

print("\n### 2. AGENT PORTFOLIO STRATEGY")
print(rule)
category_counts = df['strategic_category'].value_counts()
for cat, count in category_counts.items():
    share_pct = count / total_rows * 100
    print(f"• {cat}: {count} agents ({share_pct:.1f}%)")
print(f"• Portfolio Diversification Score: {diversification_score:.2f}")
print(f"• Capability gaps identified: {len(gaps)} areas")

print("\n### 3. RISK ASSESSMENT")
print(rule)
high_risk_count = len(high_risk_agents)
high_risk_pct = high_risk_count / total_rows * 100
highest_risk_factor = max(systemic_risks, key=systemic_risks.get)
print(f"• Value at Risk (95%): {var_95:.4f} cents/task")
print(f"• Conditional VaR: {cvar_95:.4f} cents/task")
print(f"• High-risk agents: {high_risk_count} ({high_risk_pct:.1f}%)")
print(f"• Highest systemic risk factor: {highest_risk_factor}")

print("\n### 4. BUSINESS IMPACT")
print(rule)
average_roi = df['roi'].mean()
# roi_by_type is sorted by ROI descending, so its first index entry is the best type
top_roi_type = roi_by_type.index[0]
top_roi_value = roi_by_type.loc[top_roi_type, 'roi']
print(f"• Total business value generated: {total_value:.2f}")
print(f"• Average ROI: {average_roi:.2f}")
print(f"• Highest ROI agent type: {top_roi_type} ({top_roi_value:.2f})")

print("\n### 5. ACTIONABLE RECOMMENDATIONS")
print(rule)
recommendations = (
    "1. OPTIMIZE: Focus on high-leverage optimization opportunities identified",
    "2. RETIRE: Consider retiring Underperformer agents with low ROI",
    "3. SCALE: Expand Workhorse agents in high-demand task categories",
    "4. DIVERSIFY: Address capability gaps in underserved task-environment combinations",
    "5. MITIGATE: Implement risk controls for high-VaR agent deployments",
)
for recommendation in recommendations:
    print(recommendation)

print("\n" + banner)


In [None]:
# Persist the per-row analysis results: identifiers, core metrics, and the
# columns derived in the cells above
analysis_columns = [
    # identifiers / grouping dimensions
    'agent_id', 'agent_type', 'model_architecture', 'deployment_environment', 'task_category',
    # source metrics
    'performance_index', 'success_rate', 'cost_per_task_cents', 'cost_efficiency_ratio',
    # derived by this notebook
    'strategic_category', 'failure_rate', 'potential_loss', 'roi', 'business_value',
    'optimization_potential',
]

results_path = '../data/ml/strategic_analysis_results.csv'
analysis_results = df[analysis_columns]
analysis_results.to_csv(results_path, index=False)
print("Analysis results saved to data/ml/strategic_analysis_results.csv")
