# TPP Data Analysis & Visualization

Comprehensive analysis notebook for Thyrotoxic Periodic Paralysis monitoring data.

**Contents:**
1. Setup and library imports
2. Data loading and preprocessing
3. Exploratory data analysis (EDA)
4. Time-series visualization
5. Statistical analysis by activity
6. Anomaly detection
7. Correlation analysis
8. Export results

## 1. Setup & Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from datetime import datetime, timedelta
import json
import warnings
# Suppress every warning category for the rest of the session so that
# cell output stays readable.
warnings.filterwarnings('ignore')

# Plot defaults: apply the seaborn grid theme first, then set the
# matplotlib figure size and font size on top of it.
sns.set_style('whitegrid')
plt.rcParams.update({
    'figure.figsize': (12, 6),
    'font.size': 10,
})

print("✓ Libraries loaded successfully")

## 2. Load Sample Data

Loading all available sample data files (CSV and JSON formats).

In [None]:
# Load the three sample files, stack them into one frame and order the
# rows chronologically. Produces df_csv, df_json, df_night and df_all.

# Load CSV data
df_csv = pd.read_csv('../sample-data/heart-rate-sample.csv')

# Load JSON data.
# The files are opened as UTF-8 explicitly: that is the encoding JSON
# interchange uses, whereas a bare open() falls back to the platform
# locale (e.g. cp1252 on Windows) and can mis-decode non-ASCII text.
with open('../sample-data/heart-rate-sample.json', 'r', encoding='utf-8') as f:
    data_json = json.load(f)
df_json = pd.DataFrame(data_json)

with open('../sample-data/night-monitoring.json', 'r', encoding='utf-8') as f:
    night_data = json.load(f)
df_night = pd.DataFrame(night_data)

# Combine all data. Columns present in only some sources are filled with
# NaN for the rows that lack them.
# NOTE(review): heart-rate-sample.csv and heart-rate-sample.json share a
# base name; if they contain the same readings this concat double-counts
# them — confirm against the sample files.
df_all = pd.concat([df_csv, df_json, df_night], ignore_index=True)

# Convert timestamp to datetime
df_all['timestamp'] = pd.to_datetime(df_all['timestamp'])

# Sort by timestamp and renumber the index to match the new order
df_all = df_all.sort_values('timestamp').reset_index(drop=True)

print(f"✓ Loaded {len(df_all)} total records")
print(f"  - CSV: {len(df_csv)} records")
print(f"  - JSON (day): {len(df_json)} records")
print(f"  - JSON (night): {len(df_night)} records")
print(f"\nDate range: {df_all['timestamp'].min()} to {df_all['timestamp'].max()}")
df_all.head()

## 3. Exploratory Data Analysis (EDA)

In [None]:
# Print a structural overview of the combined frame: shape, column names,
# dtypes, per-column missing counts and the describe() summary.
column_names = list(df_all.columns)
missing_per_column = df_all.isnull().sum()
numeric_summary = df_all.describe()

print("Dataset Info:")
print(f"Shape: {df_all.shape}")
print(f"\nColumns: {column_names}")
print(f"\nData types:\n{df_all.dtypes}")
print(f"\nMissing values:\n{missing_per_column}")
print(f"\n{numeric_summary}")

# How many records fall under each activity label
activity_counts = df_all['activity'].value_counts()
print("\n\nActivity Distribution:")
print(activity_counts)

## 4. Time-Series Visualization

In [None]:
# Three stacked panels: heart rate over time, HRV over time, and heart
# rate coloured by activity label.
fig, axes = plt.subplots(3, 1, figsize=(14, 10))

# Panels 1 and 2 share the same layout (line plot plus a dashed mean
# line), so they are described as data and drawn in one loop.
# Each entry: (column, y-axis label, title, unit for the legend, extra plot kwargs)
line_panels = [
    ('heartRate', 'Heart Rate (bpm)', 'Heart Rate Over Time', 'bpm',
     {'marker': 'o'}),
    ('hrv', 'HRV (ms)', 'Heart Rate Variability Over Time', 'ms',
     {'marker': 's', 'color': 'green'}),
]
for ax, (column, y_label, title, unit, line_kwargs) in zip(axes, line_panels):
    series = df_all[column]
    average = series.mean()
    ax.plot(df_all['timestamp'], series, linestyle='-', linewidth=1.5, **line_kwargs)
    ax.set_ylabel(y_label, fontsize=12)
    ax.set_title(title, fontsize=14, fontweight='bold')
    ax.grid(True, alpha=0.3)
    ax.axhline(y=average, color='r', linestyle='--', label=f'Mean: {average:.1f} {unit}')
    ax.legend()

# Panel 3: one scatter series per activity label; labels without an
# entry in the colour map are drawn in gray.
activity_colors = {'resting': 'blue', 'walking': 'orange', 'exercise': 'red', 'sleeping': 'purple'}
scatter_ax = axes[2]
for activity in df_all['activity'].unique():
    subset = df_all[df_all['activity'] == activity]
    point_color = activity_colors.get(activity, 'gray')
    scatter_ax.scatter(subset['timestamp'], subset['heartRate'],
                       label=activity, alpha=0.7, s=100, color=point_color)
scatter_ax.set_xlabel('Time', fontsize=12)
scatter_ax.set_ylabel('Heart Rate (bpm)', fontsize=12)
scatter_ax.set_title('Heart Rate by Activity Type', fontsize=14, fontweight='bold')
scatter_ax.legend()
scatter_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 5. Statistical Analysis by Activity

In [None]:
# Per-activity summary table (mean / std / min / max of heart rate and
# HRV, rounded to two decimals) followed by side-by-side box plots.
summary_funcs = ['mean', 'std', 'min', 'max']
activity_stats = (
    df_all.groupby('activity')
    .agg({'heartRate': summary_funcs, 'hrv': summary_funcs})
    .round(2)
)

print("Statistics by Activity Type:\n")
print(activity_stats)

# Visualize distributions: one box plot per metric, grouped by activity.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Each entry: (column, panel title, y-axis label)
box_panels = [
    ('heartRate', 'Heart Rate Distribution by Activity', 'Heart Rate (bpm)'),
    ('hrv', 'HRV Distribution by Activity', 'HRV (ms)'),
]
for ax, (metric, title, y_label) in zip(axes, box_panels):
    df_all.boxplot(column=metric, by='activity', ax=ax)
    # Set after boxplot() so these replace the title/labels pandas generated.
    ax.set_title(title, fontsize=14, fontweight='bold')
    ax.set_xlabel('Activity', fontsize=12)
    ax.set_ylabel(y_label, fontsize=12)
    # plt.xticks acts on the current axes, so select this panel first.
    plt.sca(ax)
    plt.xticks(rotation=45)

plt.suptitle('')  # Remove default title
plt.tight_layout()
plt.show()

## 6. Anomaly Detection

Flag readings whose heart rate or HRV lies more than two standard deviations from the mean (|z-score| > 2); such outliers may indicate TPP episodes.

In [None]:
# Z-score based anomaly detection.
# Adds hr_zscore, hrv_zscore and is_anomaly columns to df_all and collects
# the flagged rows in `anomalies`.
#
# nan_policy='omit' leaves missing readings out of the mean/std. With
# SciPy's default ('propagate') a single NaN in a column turns every
# z-score of that column into NaN, so nothing would ever be flagged.
df_all['hr_zscore'] = np.abs(stats.zscore(df_all['heartRate'], nan_policy='omit'))
df_all['hrv_zscore'] = np.abs(stats.zscore(df_all['hrv'], nan_policy='omit'))

# Flag anomalies (Z-score > 2) in either metric. A row whose reading is
# missing keeps a NaN z-score for that metric, which compares as False.
threshold = 2
df_all['is_anomaly'] = (df_all['hr_zscore'] > threshold) | (df_all['hrv_zscore'] > threshold)

anomalies = df_all[df_all['is_anomaly']]
normal_rows = df_all[~df_all['is_anomaly']]

# Guard the percentage against an empty frame (would divide by zero).
anomaly_pct = len(anomalies) / len(df_all) * 100 if len(df_all) else 0.0
print(f"Detected {len(anomalies)} potential anomalies ({anomaly_pct:.1f}%)\n")
print(anomalies[['timestamp', 'heartRate', 'hrv', 'activity', 'hr_zscore', 'hrv_zscore']])

# Visualize anomalies: normal readings as blue dots, flagged ones as
# large red crosses on the heart-rate timeline.
plt.figure(figsize=(14, 6))
plt.scatter(normal_rows['timestamp'],
            normal_rows['heartRate'],
            c='blue', alpha=0.6, s=50, label='Normal')
plt.scatter(anomalies['timestamp'],
            anomalies['heartRate'],
            c='red', alpha=0.8, s=150, marker='X', label='Anomaly')
plt.xlabel('Time', fontsize=12)
plt.ylabel('Heart Rate (bpm)', fontsize=12)
plt.title('Anomaly Detection: Heart Rate', fontsize=14, fontweight='bold')
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

## 7. Correlation Analysis

In [None]:
# Pairwise correlation of the numeric health metrics, shown as an
# annotated heatmap and then printed as a table.

# Temperature is included only when the column exists and holds at
# least one non-missing value.
has_temperature = (
    'temperature' in df_all.columns
    and df_all['temperature'].notna().any()
)
numeric_cols = ['heartRate', 'hrv'] + (['temperature'] if has_temperature else [])

correlation = df_all[numeric_cols].corr()

plt.figure(figsize=(8, 6))
heatmap_options = dict(
    annot=True,
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=1,
    cbar_kws={"shrink": 0.8},
)
sns.heatmap(correlation, **heatmap_options)
plt.title('Correlation Matrix: Health Metrics', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

print("\nCorrelation coefficients:")
print(correlation)

## 8. Export Results

Save processed data and summary statistics for further use.

In [None]:
# Write the analysis outputs next to the sample data: the full processed
# frame, the flagged rows only, and a plain-text summary.

# Export processed data with anomaly flags
df_all.to_csv('../sample-data/processed-data.csv', index=False)
print("✓ Exported processed data to: sample-data/processed-data.csv")

# Export anomalies only
anomalies.to_csv('../sample-data/detected-anomalies.csv', index=False)
print(f"✓ Exported {len(anomalies)} anomalies to: sample-data/detected-anomalies.csv")

# Export summary statistics.
# The pieces are assembled first and written in one call.
summary_parts = [
    "TPP Data Analysis Summary\n",
    "=" * 50 + "\n\n",
    f"Total records: {len(df_all)}\n",
    f"Date range: {df_all['timestamp'].min()} to {df_all['timestamp'].max()}\n\n",
    "Statistics by Activity:\n",
    str(activity_stats),
    "\n\nAnomalies detected: " + str(len(anomalies)),
]
# Explicit UTF-8 so activity labels with non-ASCII characters cannot fail
# to encode under a non-UTF-8 platform locale.
with open('../sample-data/summary-stats.txt', 'w', encoding='utf-8') as f:
    f.writelines(summary_parts)

print("✓ Exported summary statistics to: sample-data/summary-stats.txt")
print("\n✅ Analysis complete!")