# Nuclear Fusion Data - Exploratory Data Analysis

This notebook performs exploratory data analysis on synthetic nuclear fusion plasma data produced by the project's `FusionDataGenerator`.

## Contents
1. Data Loading and Overview (including the statistical summary)
2. Plasma Parameter Analysis
3. Correlation Analysis
4. Feature Engineering Analysis
5. Anomaly Detection Analysis
6. Performance Analysis
7. Summary and Insights

In [None]:
# Import required libraries
import sys
import os
sys.path.append('..')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Import fusion analyzer modules
from src.data.generator import FusionDataGenerator
from src.data.processor import FusionDataProcessor
from src.visualization.plotter import FusionPlotter
from src.models.anomaly_detector import FusionAnomalyDetector

# Set style
# The 'seaborn-v0_8' style sheet name was introduced in matplotlib 3.6; earlier
# releases ship the same sheet as 'seaborn'. Pick whichever the installed
# matplotlib provides so this cell does not fail on an older environment.
seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(seaborn_style)

# The palette is set after the style sheet, as in the original cell order.
sns.set_palette("husl")

# Show every column when a DataFrame is displayed instead of truncating.
pd.set_option('display.max_columns', None)

print("Libraries imported successfully!")

## 1. Data Loading and Overview

In [None]:
# Build the synthetic fusion dataset that the rest of the notebook analyses.
# `generator` stays in scope because the anomaly section calls it again.
generator = FusionDataGenerator()
data = generator.generate_dataset(n_samples=5000)

# Structural overview: shape, column names, then per-column dtypes.
print(f"Dataset shape: {data.shape}")
print(f"\nColumns: {data.columns.tolist()}")
print("\nData types:")
print(data.dtypes)

In [None]:
# Headline numbers: row count, column count, and in-memory footprint in MB.
n_rows, n_cols = data.shape
memory_mb = data.memory_usage(deep=True).sum() / 1024**2

print("Dataset Overview:")
print(f"Number of samples: {n_rows}")
print(f"Number of features: {n_cols}")
print(f"Memory usage: {memory_mb:.2f} MB")

# Count missing entries per column and report only the columns that have any.
missing_values = data.isnull().sum()
columns_with_gaps = missing_values[missing_values > 0]
if columns_with_gaps.empty:
    print("\nNo missing values found.")
else:
    print("\nMissing values:")
    print(columns_with_gaps)

In [None]:
# Preview the first five rows (rendered as the cell's output).
data.head(n=5)

In [None]:
# Statistical summary of the dataset via DataFrame.describe(); the resulting
# table is rendered as the cell's output because it is the last expression.
data.describe()

## 2. Plasma Parameter Analysis

In [None]:
# Plot a histogram for each key plasma parameter that the dataset contains.
plasma_params = ['plasma_temperature', 'plasma_density', 'magnetic_field',
                 'pressure', 'confinement_time', 'beta_plasma', 'safety_factor']

# Filter first so the plotted panels are contiguous: a missing column must not
# leave an empty, undeleted axis in the middle of the grid.
present_params = [param for param in plasma_params if param in data.columns]

# 3x3 grid: room for up to nine panels; unused ones are removed below.
fig, axes = plt.subplots(3, 3, figsize=(15, 12))
axes = axes.flatten()

for ax, param in zip(axes, present_params):
    ax.hist(data[param], bins=50, alpha=0.7, edgecolor='black')
    ax.set_title(f'{param.replace("_", " ").title()}')
    ax.grid(True, alpha=0.3)

    # Mark the mean with a dashed line and show its value in the legend.
    mean_val = data[param].mean()
    ax.axvline(mean_val, color='red', linestyle='--',
               label=f'Mean: {mean_val:.2e}')
    ax.legend()

# Remove every panel that received no histogram.
for ax in axes[len(present_params):]:
    fig.delaxes(ax)

plt.tight_layout()
plt.suptitle('Plasma Parameter Distributions', fontsize=16, y=1.02)
plt.show()

In [None]:
# Q factor analysis (skipped entirely when the dataset has no 'q_factor' column)
if 'q_factor' in data.columns:
    q_values = data['q_factor']
    fig, axes = plt.subplots(1, 3, figsize=(15, 5))

    # Panel 1: Q factor distribution with the two reference thresholds marked
    axes[0].hist(q_values, bins=50, alpha=0.7, edgecolor='black')
    axes[0].axvline(1.0, color='red', linestyle='--', label='Breakeven (Q=1)')
    axes[0].axvline(5.0, color='orange', linestyle='--', label='Ignition (Q=5)')
    axes[0].set_xlabel('Q Factor')
    axes[0].set_ylabel('Frequency')
    axes[0].set_title('Q Factor Distribution')
    axes[0].legend()
    axes[0].grid(True, alpha=0.3)

    # Panel 2: Q factor plotted against the row index
    axes[1].plot(data.index, q_values, alpha=0.6)
    axes[1].axhline(1.0, color='red', linestyle='--', label='Breakeven')
    axes[1].set_xlabel('Sample Index')
    axes[1].set_ylabel('Q Factor')
    axes[1].set_title('Q Factor Over Time')
    axes[1].legend()
    axes[1].grid(True, alpha=0.3)

    # Panel 3: share of samples in each performance category.
    # right=False makes the bins left-closed: [0, 0.1), [0.1, 1), [1, 5), [5, inf).
    # That matches the ">= 1.0" / ">= 5.0" rates printed below, and it keeps
    # samples with Q exactly 0, which right-closed bins would turn into NaN.
    # Negative Q values (if any) still fall outside every bin and are excluded.
    q_categories = pd.cut(q_values,
                          bins=[0, 0.1, 1.0, 5.0, np.inf],
                          labels=['Very Low', 'Sub-critical', 'Breakeven+', 'Ignition+'],
                          right=False)

    q_counts = q_categories.value_counts()
    axes[2].pie(q_counts.values, labels=q_counts.index, autopct='%1.1f%%')
    axes[2].set_title('Performance Categories')

    plt.tight_layout()
    plt.show()

    # Print Q factor statistics
    print("Q Factor Statistics:")
    print(f"Mean: {q_values.mean():.3f}")
    print(f"Median: {q_values.median():.3f}")
    print(f"Max: {q_values.max():.3f}")
    print(f"Breakeven rate (Q >= 1): {(q_values >= 1.0).mean()*100:.1f}%")
    print(f"Ignition rate (Q >= 5): {(q_values >= 5.0).mean()*100:.1f}%")

## 3. Correlation Analysis

In [None]:
# Pairwise correlations between all numeric columns. Both names are kept at
# notebook scope because later cells read `correlation_matrix`.
numerical_cols = data.select_dtypes(include=[np.number]).columns
correlation_matrix = data[numerical_cols].corr()

# Boolean mask covering the upper triangle (diagonal included); masked cells
# are not drawn, so each pair of parameters appears only once.
mask = np.triu(np.ones(correlation_matrix.shape, dtype=bool))

heatmap_options = {
    'mask': mask,
    'annot': True,
    'cmap': 'RdBu_r',
    'center': 0,
    'square': True,
    'cbar_kws': {"shrink": .8},
}

fig, ax = plt.subplots(figsize=(14, 12))
sns.heatmap(correlation_matrix, ax=ax, **heatmap_options)
ax.set_title('Parameter Correlation Matrix')
plt.tight_layout()
plt.show()

In [None]:
# Find the features most strongly correlated with Q factor.
# Guard on the correlation matrix (not just `data`) because that is the object
# indexed below; a non-numeric 'q_factor' column would be absent from it.
if 'q_factor' in correlation_matrix.columns:
    q_correlations = correlation_matrix['q_factor'].abs().sort_values(ascending=False)

    # Exclude the self-correlation by label rather than by position: another
    # feature with |r| == 1 could otherwise sort ahead of 'q_factor' and be
    # dropped in its place.
    feature_ranking = q_correlations.drop(labels='q_factor')

    print("Strongest correlations with Q Factor:")
    print(feature_ranking.head(10))  # Top 10 (self-correlation excluded)

    # Visualize top correlations
    top_corr_features = feature_ranking.head(5).index  # Top 5 (fewer if unavailable)

    fig, axes = plt.subplots(2, 3, figsize=(15, 10))
    axes = axes.flatten()

    for ax, feature in zip(axes, top_corr_features):
        ax.scatter(data[feature], data['q_factor'], alpha=0.5)
        ax.set_xlabel(feature.replace('_', ' ').title())
        ax.set_ylabel('Q Factor')

        # Title shows the signed coefficient (the ranking above uses |r|).
        corr_coef = correlation_matrix.loc['q_factor', feature]
        ax.set_title(f'Correlation: {corr_coef:.3f}')
        ax.grid(True, alpha=0.3)

    # Remove every unused panel, however many features were actually plotted.
    for ax in axes[len(top_corr_features):]:
        fig.delaxes(ax)

    plt.tight_layout()
    plt.suptitle('Q Factor vs Key Parameters', fontsize=16, y=1.02)
    plt.show()

## 4. Feature Engineering Analysis

In [None]:
# Run the project's feature-engineering step on the raw dataset.
processor = FusionDataProcessor()
engineered_data = processor.engineer_features(data)

# Column counts before and after, and the difference between them.
n_original = len(data.columns)
n_engineered = len(engineered_data.columns)
print(f"Original features: {n_original}")
print(f"Engineered features: {n_engineered}")
print(f"New features added: {n_engineered - n_original}")

# Columns present only in the engineered frame, listed alphabetically.
new_features = set(engineered_data.columns).difference(data.columns)
print("\nNew engineered features:")
for feature_name in sorted(new_features):
    print(f"  - {feature_name}")

In [None]:
# Histograms for a hand-picked set of engineered features; only those that
# actually exist in `engineered_data` are drawn.
key_engineered = ['thermal_energy_density', 'magnetic_pressure', 'power_efficiency',
                  'normalized_lawson', 'troyon_beta_ratio']

available_engineered = [name for name in key_engineered
                        if name in engineered_data.columns]

if available_engineered:
    fig, axes = plt.subplots(2, 3, figsize=(15, 10))
    axes = axes.flatten()

    # zip stops at the shorter sequence, so a panel is never indexed out of range.
    for ax, feature in zip(axes, available_engineered):
        ax.hist(engineered_data[feature], bins=50, alpha=0.7, edgecolor='black')
        ax.set_title(feature.replace('_', ' ').title())
        ax.grid(True, alpha=0.3)

    # Drop the panels left over after the last plotted feature.
    for unused_ax in axes[len(available_engineered):]:
        fig.delaxes(unused_ax)

    plt.tight_layout()
    plt.suptitle('Engineered Feature Distributions', fontsize=16, y=1.02)
    plt.show()

## 5. Anomaly Detection Analysis

In [None]:
# Ask the generator for a copy of the dataset with injected anomalies
# (requested fraction: 5%). The result carries an 'is_anomaly' column.
anomaly_data = generator.generate_anomaly_data(data, anomaly_fraction=0.05)

anomaly_flags = anomaly_data['is_anomaly']
print(f"Total samples: {len(anomaly_data)}")
print(f"Anomalous samples: {anomaly_flags.sum()}")
print(f"Anomaly rate: {anomaly_flags.mean()*100:.2f}%")

In [None]:
# Split the labelled dataset into normal and anomalous rows.
# The flag is cast to bool first: if 'is_anomaly' is stored as 0/1 integers,
# `~` would be a bitwise NOT (yielding -1/-2) and indexing with the raw column
# would be treated as a column lookup instead of a row mask.
is_anomalous = anomaly_data['is_anomaly'].astype(bool)
normal_data = anomaly_data[~is_anomalous]
anomalous_data = anomaly_data[is_anomalous]

# Compare key parameters between normal and anomalous samples; parameters the
# dataset lacks are filtered out up front so the plotted panels stay contiguous.
comparison_params = ['plasma_temperature', 'q_factor', 'plasma_stability', 'disruption_probability']
present_comparison_params = [param for param in comparison_params
                             if param in anomaly_data.columns]

fig, axes = plt.subplots(2, 2, figsize=(12, 10))
axes = axes.flatten()

for ax, param in zip(axes, present_comparison_params):
    # density=True normalises each histogram so the two groups are comparable
    # despite their very different sample counts.
    ax.hist(normal_data[param], bins=30, alpha=0.7,
            label='Normal', color='blue', density=True)
    ax.hist(anomalous_data[param], bins=30, alpha=0.7,
            label='Anomalous', color='red', density=True)
    ax.set_xlabel(param.replace('_', ' ').title())
    ax.set_ylabel('Density')
    ax.legend()
    ax.grid(True, alpha=0.3)

# Remove panels that received no data instead of leaving blank axes behind.
for ax in axes[len(present_comparison_params):]:
    fig.delaxes(ax)

plt.tight_layout()
plt.suptitle('Normal vs Anomalous Sample Comparison', fontsize=16, y=1.02)
plt.show()

## 6. Performance Analysis

In [None]:
# Correlation heatmap restricted to the fusion performance metrics that the
# dataset actually provides.
performance_metrics = ['q_factor', 'fusion_power', 'lawson_criterion',
                       'triple_product', 'energy_confinement_time']

available_metrics = [metric for metric in performance_metrics
                     if metric in data.columns]

# A correlation matrix needs at least two metrics to be informative.
if len(available_metrics) > 1:
    perf_corr = data[available_metrics].corr()

    fig, ax = plt.subplots(figsize=(10, 8))
    sns.heatmap(
        perf_corr,
        annot=True,
        cmap='RdBu_r',
        center=0,
        square=True,
        cbar_kws={"shrink": .8},
        ax=ax,
    )
    ax.set_title('Performance Metrics Correlation')
    plt.tight_layout()
    plt.show()

In [None]:
# Q factor broken down by operational mode; needs both columns to be present.
required_columns = {'operational_mode', 'q_factor'}
if required_columns.issubset(data.columns):
    plt.figure(figsize=(12, 6))

    # One box per operational mode, with the breakeven level as a reference line.
    sns.boxplot(data=data, x='operational_mode', y='q_factor')
    plt.axhline(y=1.0, color='red', linestyle='--', alpha=0.7, label='Breakeven')
    plt.xticks(rotation=45)
    plt.title('Q Factor Distribution by Operational Mode')
    plt.legend()
    plt.tight_layout()
    plt.show()

    # Per-mode summary table: mean, median, standard deviation, sample count.
    q_by_mode = data.groupby('operational_mode')['q_factor']
    mode_stats = q_by_mode.agg(['mean', 'median', 'std', 'count'])
    print("\nQ Factor Statistics by Operational Mode:")
    print(mode_stats)

## 7. Summary and Insights

In [None]:
# Generate summary insights.
# This cell reads `data`, `anomaly_data`, `engineered_data` and
# `correlation_matrix`, so the earlier sections must have been run first.
print("FUSION DATA ANALYSIS SUMMARY")
print("=" * 50)

# Dataset overview
print(f"Dataset Size: {len(data):,} samples, {len(data.columns)} features")
print(f"Data Quality: {(1 - data.isnull().sum().sum() / data.size) * 100:.1f}% complete")

# Performance insights
if 'q_factor' in data.columns:
    breakeven_rate = (data['q_factor'] >= 1.0).mean() * 100
    ignition_rate = (data['q_factor'] >= 5.0).mean() * 100
    print("\nPerformance Metrics:")
    print(f"  - Breakeven Achievement Rate: {breakeven_rate:.1f}%")
    print(f"  - Ignition Achievement Rate: {ignition_rate:.1f}%")
    print(f"  - Average Q Factor: {data['q_factor'].mean():.3f}")

# Anomaly insights
if 'is_anomaly' in anomaly_data.columns:
    anomaly_rate = anomaly_data['is_anomaly'].mean() * 100
    print("\nAnomaly Analysis:")
    print(f"  - Anomaly Rate: {anomaly_rate:.2f}%")

    if 'disruption_probability' in anomaly_data.columns:
        avg_disruption_risk = anomaly_data['disruption_probability'].mean() * 100
        print(f"  - Average Disruption Risk: {avg_disruption_risk:.1f}%")

# Feature insights
n_original_features = len(data.columns)
n_engineered_features = len(engineered_data.columns)
print("\nFeature Engineering:")
print(f"  - Original Features: {n_original_features}")
print(f"  - Engineered Features: {n_engineered_features}")
# The expansion ratio is undefined for a frame with no columns; skip the line
# rather than raising ZeroDivisionError at the very end of the notebook.
if n_original_features:
    expansion_pct = (n_engineered_features / n_original_features - 1) * 100
    print(f"  - Feature Expansion: {expansion_pct:.1f}%")

# Correlation insights
if 'q_factor' in correlation_matrix.columns:
    # Drop the self-correlation by label before ranking: relying on it being
    # first after sorting breaks when another feature also has |r| == 1.
    top_corr = correlation_matrix['q_factor'].drop(labels='q_factor').abs().nlargest(3)
    print("\nStrongest Q Factor Correlations:")
    for feature, corr in top_corr.items():  # Top 3 by absolute correlation
        print(f"  - {feature}: {corr:.3f}")

print("\nAnalysis Complete!")