In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

In [2]:
df = pd.read_csv('data/training_data.csv', encoding='utf-8-sig')

In [3]:
#Basic information about data

print(f"rows: {len(df):,}")
print(f" columns: {len(df.columns)}")
print()

print("columns:")
for i, col in enumerate(df.columns, 1):
    print(f"  {i}. {col}")
print()

# أول 5 صفوف
print(df.head())
print()

rows: 33,600
 columns: 10

columns:
  1. child_id
  2. timestamp
  3. age
  4. condition
  5. heart_rate
  6. activity_level
  7. location
  8. latitude
  9. longitude
  10. incident_type

    child_id            timestamp  age condition  heart_rate  activity_level  \
0  child_000  2026-01-30 07:00:00   10    normal          78              15   
1  child_000  2026-01-30 07:15:00   10    normal          80              29   
2  child_000  2026-01-30 07:30:00   10    normal          78              28   
3  child_000  2026-01-30 07:45:00   10    normal          75              25   
4  child_000  2026-01-30 08:00:00   10    normal          82              28   

    location   latitude  longitude incident_type  
0  في الطريق  30.054326  31.245748          none  
1  في الطريق  30.054471  31.245617          none  
2  في الطريق  30.054315  31.245654          none  
3  في الطريق  30.054439  31.245778          none  
4    المدرسة  30.064449  31.255787          none  



In [4]:
#Data Types
print(df.dtypes)
print()

child_id           object
timestamp          object
age                 int64
condition          object
heart_rate          int64
activity_level      int64
location           object
latitude          float64
longitude         float64
incident_type      object
dtype: object



In [5]:
#Check for missing values
missing = df.isnull().sum()
if missing.sum() == 0:
    print("No missing values")
else:
    print("missing values")
    print(missing[missing > 0])
print()

No missing values



In [6]:
print(df.describe())
print()

                age    heart_rate  activity_level      latitude     longitude
count  33600.000000  33600.000000    33600.000000  33600.000000  33600.000000
mean       8.760000     98.221935       52.468988     30.055233     31.246533
std        2.209648     16.899822       24.464470      0.012556      0.012556
min        5.000000     66.000000        0.000000     30.034300     31.225600
25%        7.000000     88.000000       32.000000     30.049350     31.240650
50%        9.000000     95.000000       50.000000     30.064329     31.255629
75%       11.000000    102.000000       71.000000     30.064414     31.255715
max       12.000000    191.000000      157.000000     30.064500     31.255800



In [7]:
# Child distribution

print("Child Distribution")
print("_" * 70)

print(f"Total children: {df['child_id'].nunique()}")
print()
print("By condition:")
print(df.groupby('condition')['child_id'].nunique())
print()
print("By age:")
print(df['age'].value_counts().sort_index())
print()

Child Distribution
______________________________________________________________________
Total children: 100

By condition:
condition
adhd      22
autism    26
normal    52
Name: child_id, dtype: int64

By age:
age
5     3696
6     4032
7     2016
8     4368
9     4368
10    6384
11    5376
12    3360
Name: count, dtype: int64



In [8]:
# Incident distribution

print("Incident Distribution")
print("_" * 70)
incident_counts = df['incident_type'].value_counts()
print(incident_counts)
print()
total_incidents = len(df[df['incident_type'] != 'none'])
print(f"Total incidents: {total_incidents:,}")
print(f"Incident rate: {(total_incidents/len(df)*100):.2f}%")
print()

if 'potential_danger' in incident_counts:
    danger_count = incident_counts['potential_danger']
    print(f"Potential danger incidents: {danger_count:,}")
    print(f"Percentage: {(danger_count/len(df)*100):.2f}%")
print()

Incident Distribution
______________________________________________________________________
incident_type
none                31309
potential_danger     1230
anxiety_attack        600
intense_exercise      347
fall                  114
Name: count, dtype: int64

Total incidents: 2,291
Incident rate: 6.82%

Potential danger incidents: 1,230
Percentage: 3.66%



In [9]:
# Save summary
print("Saving summary report...")
summary = f"""
Safe Kids Data Analysis - Summary Report
Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}

Dataset Overview:
- Total readings: {len(df):,}
- Children: {df['child_id'].nunique()}
- Date range: {df['timestamp'].min()} to {df['timestamp'].max()}

Condition Distribution:
{df['condition'].value_counts()}

Incident Distribution:
{df['incident_type'].value_counts()}

Heart Rate Statistics:
- Mean: {df['heart_rate'].mean():.1f}
- Std: {df['heart_rate'].std():.1f}
- Min: {df['heart_rate'].min()}
- Max: {df['heart_rate'].max()}

Activity Level Statistics:
- Mean: {df['activity_level'].mean():.1f}
- Std: {df['activity_level'].std():.1f}
- Min: {df['activity_level'].min()}
- Max: {df['activity_level'].max()}
"""

with open('data/data_summary.txt', 'w', encoding='utf-8') as f:
    f.write(summary)

print("Summary saved: data/data_summary.txt")
print()

Saving summary report...
Summary saved: data/data_summary.txt



In [11]:
# Visualizations
print("=" * 70)
print("Generating Visualizations")
print("=" * 70)
print()

import os
os.makedirs('visualizations', exist_ok=True)


# Plot 1: Heart rate distribution
print("Creating visualization 1: Heart rate distribution...")
plt.figure(figsize=(12, 5))

plt.subplot(1, 2, 1)
plt.hist(df['heart_rate'], bins=50, color='skyblue', edgecolor='black', alpha=0.7)
plt.axvline(df['heart_rate'].mean(), color='red', linestyle='--', 
            label=f'Mean: {df["heart_rate"].mean():.1f}')
plt.xlabel('Heart Rate (BPM)')
plt.ylabel('Frequency')
plt.title('Heart Rate Distribution')
plt.legend()
plt.grid(True, alpha=0.3)

plt.subplot(1, 2, 2)
df.boxplot(column='heart_rate', by='condition', ax=plt.gca())
plt.xlabel('Condition')
plt.ylabel('Heart Rate (BPM)')
plt.title('Heart Rate by Condition')
plt.suptitle('')

plt.tight_layout()
plt.savefig('visualizations/01_heart_rate_distribution.png', dpi=300, bbox_inches='tight')
print("Saved: visualizations/01_heart_rate_distribution.png")
plt.close()


# Plot 2: Activity distribution
print("Creating visualization 2: Activity distribution...")
plt.figure(figsize=(12, 5))

plt.subplot(1, 2, 1)
plt.hist(df['activity_level'], bins=50, color='lightgreen', edgecolor='black', alpha=0.7)
plt.axvline(df['activity_level'].mean(), color='red', linestyle='--',
            label=f'Mean: {df["activity_level"].mean():.1f}')
plt.xlabel('Activity Level (0-100)')
plt.ylabel('Frequency')
plt.title('Activity Level Distribution')
plt.legend()
plt.grid(True, alpha=0.3)

plt.subplot(1, 2, 2)
df.boxplot(column='activity_level', by='condition', ax=plt.gca())
plt.xlabel('Condition')
plt.ylabel('Activity Level')
plt.title('Activity by Condition')
plt.suptitle('')

plt.tight_layout()
plt.savefig('visualizations/02_activity_distribution.png', dpi=300, bbox_inches='tight')
print("Saved: visualizations/02_activity_distribution.png")
plt.close()


# Plot 3: HR vs Activity scatter
print("Creating visualization 3: Heart rate vs activity...")
plt.figure(figsize=(12, 6))

colors = {
    'none': 'lightblue',
    'potential_danger': 'red',
    'anxiety_attack': 'orange',
    'intense_exercise': 'green',
    'fall': 'purple'
}

for incident_type, color in colors.items():
    subset = df[df['incident_type'] == incident_type]
    plt.scatter(subset['heart_rate'], subset['activity_level'], 
                c=color, label=incident_type, alpha=0.5, s=20)

plt.xlabel('Heart Rate (BPM)')
plt.ylabel('Activity Level (0-100)')
plt.title('Heart Rate vs Activity Level by Incident Type')
plt.legend()
plt.grid(True, alpha=0.3)

plt.savefig('visualizations/03_hr_vs_activity.png', dpi=300, bbox_inches='tight')
print("Saved: visualizations/03_hr_vs_activity.png")
plt.close()


# Plot 4: Incident distribution
print("Creating visualization 4: Incident distribution...")
plt.figure(figsize=(10, 6))

incident_counts = df['incident_type'].value_counts()
colors_list = ['lightblue' if x == 'none' else 'red' if x == 'potential_danger' 
               else 'orange' if x == 'anxiety_attack' else 'green' if x == 'intense_exercise'
               else 'purple' for x in incident_counts.index]

plt.bar(incident_counts.index, incident_counts.values, color=colors_list, edgecolor='black')
plt.xlabel('Incident Type')
plt.ylabel('Count')
plt.title('Incident Type Distribution')
plt.xticks(rotation=45, ha='right')
plt.grid(True, alpha=0.3, axis='y')

for i, (idx, val) in enumerate(incident_counts.items()):
    plt.text(i, val, f'{val:,}', ha='center', va='bottom')

plt.tight_layout()
plt.savefig('visualizations/04_incidents_distribution.png', dpi=300, bbox_inches='tight')
print("Saved: visualizations/04_incidents_distribution.png")
plt.close()


# Plot 5: Time series
print("Creating visualization 5: Heart rate by time of day...")
df['timestamp'] = pd.to_datetime(df['timestamp'])
df['hour'] = df['timestamp'].dt.hour

hourly_hr = df.groupby('hour')['heart_rate'].mean()

plt.figure(figsize=(12, 5))
plt.plot(hourly_hr.index, hourly_hr.values, marker='o', linewidth=2, markersize=8)
plt.xlabel('Hour of Day')
plt.ylabel('Average Heart Rate (BPM)')
plt.title('Average Heart Rate by Time of Day')
plt.grid(True, alpha=0.3)
plt.xticks(range(0, 24, 2))

plt.axvspan(7, 8, alpha=0.2, color='yellow', label='Transit')
plt.axvspan(8, 15, alpha=0.2, color='orange', label='School')
plt.axvspan(15, 19, alpha=0.2, color='green', label='Play')

plt.legend()
plt.tight_layout()
plt.savefig('visualizations/05_hr_by_time.png', dpi=300, bbox_inches='tight')
print("Saved: visualizations/05_hr_by_time.png")
plt.close()


# Plot 6: Condition comparison
print("Creating visualization 6: Condition comparison...")
fig, axes = plt.subplots(1, 3, figsize=(15, 5))

for i, condition in enumerate(['normal', 'autism', 'adhd']):
    condition_data = df[df['condition'] == condition]
    axes[i].scatter(condition_data['heart_rate'], condition_data['activity_level'], 
                    alpha=0.3, s=10)
    axes[i].set_title(f'{condition.upper()}')
    axes[i].set_xlabel('Heart Rate')
    axes[i].set_ylabel('Activity Level')
    axes[i].grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('visualizations/06_conditions_comparison.png', dpi=300, bbox_inches='tight')
print("Saved: visualizations/06_conditions_comparison.png")
plt.close()

print()
print("All visualizations created successfully")
print()

Generating Visualizations

Creating visualization 1: Heart rate distribution...
Saved: visualizations/01_heart_rate_distribution.png
Creating visualization 2: Activity distribution...
Saved: visualizations/02_activity_distribution.png
Creating visualization 3: Heart rate vs activity...
Saved: visualizations/03_hr_vs_activity.png
Creating visualization 4: Incident distribution...
Saved: visualizations/04_incidents_distribution.png
Creating visualization 5: Heart rate by time of day...
Saved: visualizations/05_hr_by_time.png
Creating visualization 6: Condition comparison...
Saved: visualizations/06_conditions_comparison.png

All visualizations created successfully



In [12]:
# Pattern analysis

print("Pattern Analysis")
print("_" * 70)
print()

print("Comparison: Normal vs Potential Danger")
print("-" * 70)

normal_readings = df[df['incident_type'] == 'none']
danger_readings = df[df['incident_type'] == 'potential_danger']

print(f"\nNormal readings ({len(normal_readings):,}):")
print(f"  Heart rate:")
print(f"    Mean: {normal_readings['heart_rate'].mean():.1f}")
print(f"    Std: {normal_readings['heart_rate'].std():.1f}")
print(f"    Range: {normal_readings['heart_rate'].min()} - {normal_readings['heart_rate'].max()}")
print(f"  Activity level:")
print(f"    Mean: {normal_readings['activity_level'].mean():.1f}")
print(f"    Std: {normal_readings['activity_level'].std():.1f}")

print(f"\nPotential danger ({len(danger_readings):,}):")
print(f"  Heart rate:")
print(f"    Mean: {danger_readings['heart_rate'].mean():.1f}")
print(f"    Std: {danger_readings['heart_rate'].std():.1f}")
print(f"    Range: {danger_readings['heart_rate'].min()} - {danger_readings['heart_rate'].max()}")
print(f"  Activity level:")
print(f"    Mean: {danger_readings['activity_level'].mean():.1f}")
print(f"    Std: {danger_readings['activity_level'].std():.1f}")

hr_diff = danger_readings['heart_rate'].mean() - normal_readings['heart_rate'].mean()
activity_diff = danger_readings['activity_level'].mean() - normal_readings['activity_level'].mean()

print(f"\nDifference:")
print(f"  Heart rate: +{hr_diff:.1f} BPM in danger situations")
print(f"  Activity level: +{activity_diff:.1f} in danger situations")
print()


# Location analysis

print("Location Analysis")
print("_" * 70)

location_incidents = df[df['incident_type'] != 'none'].groupby('location').size().sort_values(ascending=False)

print("\nIncidents by location:")
for location, count in location_incidents.items():
    percentage = (count / len(df[df['incident_type'] != 'none'])) * 100
    print(f"  {location}: {count:,} incidents ({percentage:.1f}%)")
print()


# Condition analysis

print("Condition Analysis")
print("_" * 70)

for condition in ['normal', 'autism', 'adhd']:
    condition_data = df[df['condition'] == condition]
    condition_incidents = condition_data[condition_data['incident_type'] != 'none']
    
    print(f"\n{condition.upper()}:")
    print(f"  Children: {condition_data['child_id'].nunique()}")
    print(f"  Total readings: {len(condition_data):,}")
    print(f"  Incidents: {len(condition_incidents):,}")
    print(f"  Incident rate: {(len(condition_incidents)/len(condition_data)*100):.2f}%")
    print(f"  Mean heart rate: {condition_data['heart_rate'].mean():.1f}")
    print(f"  Mean activity: {condition_data['activity_level'].mean():.1f}")

print()

Pattern Analysis
______________________________________________________________________

Comparison: Normal vs Potential Danger
----------------------------------------------------------------------

Normal readings (31,309):
  Heart rate:
    Mean: 94.4
    Std: 8.6
    Range: 66 - 123
  Activity level:
    Mean: 49.7
    Std: 21.5

Potential danger (1,230):
  Heart rate:
    Mean: 159.9
    Std: 10.8
    Range: 128 - 191
  Activity level:
    Mean: 99.1
    Std: 22.7

Difference:
  Heart rate: +65.5 BPM in danger situations
  Activity level: +49.4 in danger situations

Location Analysis
______________________________________________________________________

Incidents by location:
  المدرسة: 1,300 incidents (56.7%)
  النادي: 607 incidents (26.5%)
  في الطريق: 384 incidents (16.8%)

Condition Analysis
______________________________________________________________________

NORMAL:
  Children: 52
  Total readings: 17,472
  Incidents: 1,206
  Incident rate: 6.90%
  Mean heart rate: 96.5
 

In [13]:
# Feature engineering
print("=" * 70)
print("Feature Engineering")
print("=" * 70)
print()

df_enhanced = df.copy()

df_enhanced['timestamp'] = pd.to_datetime(df_enhanced['timestamp'])
df_enhanced['hour'] = df_enhanced['timestamp'].dt.hour

df_enhanced['is_school_time'] = df_enhanced['hour'].apply(
    lambda x: 1 if 8 <= x < 15 else 0
)


print("Adding features:")
print("  1. is_high_hr (heart rate > 110)")
df_enhanced['is_high_hr'] = (df_enhanced['heart_rate'] > 110).astype(int)

print("  2. is_high_activity (activity > 60)")
df_enhanced['is_high_activity'] = (df_enhanced['activity_level'] > 60).astype(int)

print("  3. hr_to_activity_ratio")
df_enhanced['hr_to_activity_ratio'] = df_enhanced['heart_rate'] / (df_enhanced['activity_level'] + 1)

print("  4. is_school_time (8 AM - 3 PM)")
df_enhanced['is_school_time'] = df_enhanced['hour'].apply(lambda x: 1 if 8 <= x < 15 else 0)

print("  5. in_safe_zone")
safe_locations = ['home', 'school']
df_enhanced['in_safe_zone'] = df_enhanced['location'].apply(lambda x: 1 if x in safe_locations else 0)

print("  6. is_dangerous (label for ML)")
df_enhanced['is_dangerous'] = (df_enhanced['incident_type'] != 'none').astype(int)


print("Saving enhanced dataset...")
df_enhanced.to_csv('data/enhanced_data.csv', index=False, encoding='utf-8-sig')


Feature Engineering

Adding features:
  1. is_high_hr (heart rate > 110)
  2. is_high_activity (activity > 60)
  3. hr_to_activity_ratio
  4. is_school_time (8 AM - 3 PM)
  5. in_safe_zone
  6. is_dangerous (label for ML)
Saving enhanced dataset...


In [14]:
#Anomaly Detection
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [20]:
df = pd.read_csv('data/enhanced_data.csv', encoding='utf-8-sig')

# Method 1: Z-Score
print("Method 1: Z-Score")

def detect_anomaly_zscore(df, column='heart_rate', threshold=2.5):

    mean = df[column].mean()
    std = df[column].std()
    
    # Calculate Z-Score
    df['z_score'] = np.abs((df[column] - mean) / std)
    
    # Mark anomaly
    df['is_anomaly_zscore'] = (df['z_score'] > threshold).astype(int)
    
    return df


df = detect_anomaly_zscore(df, column='heart_rate', threshold=2.5)

anomalies_zscore = df[df['is_anomaly_zscore'] == 1]
print(f"Detected anomalies: {len(anomalies_zscore):,}")
print(f"Percentage: {(len(anomalies_zscore)/len(df)*100):.2f}%")
print()


# Method 2: IQR

print("_" * 60)
print("Method 2: IQR")


def detect_anomaly_iqr(df, column='heart_rate'):
    """
    Detect anomaly using IQR
    """
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    
    print(f"Normal range: {lower_bound:.1f} - {upper_bound:.1f}")
    
    # Mark anomaly
    df['is_anomaly_iqr'] = (
        (df[column] < lower_bound) | (df[column] > upper_bound)
    ).astype(int)
    
    return df

# Apply
df = detect_anomaly_iqr(df, column='heart_rate')

anomalies_iqr = df[df['is_anomaly_iqr'] == 1]
print(f"Detected anomalies: {len(anomalies_iqr):,}")
print(f"Percentage: {(len(anomalies_iqr)/len(df)*100):.2f}%")
print()


# Method 3: Isolation Forest (ML)

print("_" * 60)
print("Method 3: Isolation Forest (Machine Learning)")


# Prepare data for ML
features = ['heart_rate', 'activity_level', 'hr_to_activity_ratio']
X = df[features].values

# Train model

model = IsolationForest(
    contamination=0.07,  # Expected anomaly rate (7%)
    random_state=42
)
model.fit(X)


# Predict
predictions = model.predict(X)
df['is_anomaly_ml'] = (predictions == -1).astype(int)  # -1 = anomaly

anomalies_ml = df[df['is_anomaly_ml'] == 1]
print(f"Detected anomalies: {len(anomalies_ml):,}")
print(f"Percentage: {(len(anomalies_ml)/len(df)*100):.2f}%")
print()

# Compare the three methods

print("_" * 60)
print("Comparison of the Three Methods")
print("_" * 60)
print()

# Ground truth (from data)
actual_anomalies = df[df['incident_type'] != 'none']

print(f"Actual incidents (Ground Truth): {len(actual_anomalies):,}")
print()

# Comparison
methods = {
    'Z-Score': 'is_anomaly_zscore',
    'IQR': 'is_anomaly_iqr',
    'ML (Isolation Forest)': 'is_anomaly_ml'
}

for method_name, column in methods.items():
    detected = df[df[column] == 1]
    
    # How many actual incidents were detected?
    true_positives = len(df[(df[column] == 1) & (df['incident_type'] != 'none')])
    false_positives = len(df[(df[column] == 1) & (df['incident_type'] == 'none')])
    
    precision = true_positives / len(detected) if len(detected) > 0 else 0
    recall = true_positives / len(actual_anomalies) if len(actual_anomalies) > 0 else 0
    
    print(f"{method_name}:")
    print(f"  Detected: {len(detected):,} cases")
    print(f"  True Positive: {true_positives:,}")
    print(f"  False Positive: {false_positives:,}")
    print(f"  Precision: {precision:.2%}")
    print(f"  Recall: {recall:.2%}")
    print()


df.to_csv('data/data_with_anomalies.csv', index=False, encoding='utf-8-sig')


Method 1: Z-Score
Detected anomalies: 1,721
Percentage: 5.12%

____________________________________________________________
Method 2: IQR
Normal range: 67.0 - 123.0
Detected anomalies: 2,061
Percentage: 6.13%

____________________________________________________________
Method 3: Isolation Forest (Machine Learning)
Detected anomalies: 2,351
Percentage: 7.00%

____________________________________________________________
Comparison of the Three Methods
____________________________________________________________

Actual incidents (Ground Truth): 2,291

Z-Score:
  Detected: 1,721 cases
  True Positive: 1,721
  False Positive: 0
  Precision: 100.00%
  Recall: 75.12%

IQR:
  Detected: 2,061 cases
  True Positive: 2,060
  False Positive: 1
  Precision: 99.95%
  Recall: 89.92%

ML (Isolation Forest):
  Detected: 2,351 cases
  True Positive: 1,773
  False Positive: 578
  Precision: 75.41%
  Recall: 77.39%



In [23]:
#  Visualization

os.makedirs('visualizations', exist_ok=True)

fig, axes = plt.subplots(2, 2, figsize=(15, 12))

# Z-Score Plot
axes[0, 0].scatter(df[df['is_anomaly_zscore']==0]['heart_rate'],
                   df[df['is_anomaly_zscore']==0]['activity_level'],
                   alpha=0.3, s=10, label='Normal')
axes[0, 0].scatter(df[df['is_anomaly_zscore']==1]['heart_rate'],
                   df[df['is_anomaly_zscore']==1]['activity_level'],
                   alpha=0.7, s=20, label='Anomaly')
axes[0, 0].set_title('Z-Score Method')
axes[0, 0].set_xlabel('Heart Rate')
axes[0, 0].set_ylabel('Activity Level')
axes[0, 0].legend()
axes[0, 0].grid(True, alpha=0.3)

# IQR Plot
axes[0, 1].scatter(df[df['is_anomaly_iqr']==0]['heart_rate'],
                   df[df['is_anomaly_iqr']==0]['activity_level'],
                   alpha=0.3, s=10, label='Normal')
axes[0, 1].scatter(df[df['is_anomaly_iqr']==1]['heart_rate'],
                   df[df['is_anomaly_iqr']==1]['activity_level'],
                   alpha=0.7, s=20, label='Anomaly')
axes[0, 1].set_title('IQR Method')
axes[0, 1].set_xlabel('Heart Rate')
axes[0, 1].set_ylabel('Activity Level')
axes[0, 1].legend()
axes[0, 1].grid(True, alpha=0.3)

# Isolation Forest Plot
axes[1, 0].scatter(df[df['is_anomaly_ml']==0]['heart_rate'],
                   df[df['is_anomaly_ml']==0]['activity_level'],
                   alpha=0.3, s=10, label='Normal')
axes[1, 0].scatter(df[df['is_anomaly_ml']==1]['heart_rate'],
                   df[df['is_anomaly_ml']==1]['activity_level'],
                   alpha=0.7, s=20, label='Anomaly')
axes[1, 0].set_title('Isolation Forest')
axes[1, 0].set_xlabel('Heart Rate')
axes[1, 0].set_ylabel('Activity Level')
axes[1, 0].legend()
axes[1, 0].grid(True, alpha=0.3)

# Ground Truth Plot
axes[1, 1].scatter(df[df['incident_type']=='none']['heart_rate'],
                   df[df['incident_type']=='none']['activity_level'],
                   alpha=0.3, s=10, label='Normal')
axes[1, 1].scatter(df[df['incident_type']!='none']['heart_rate'],
                   df[df['incident_type']!='none']['activity_level'],
                   alpha=0.7, s=20, label='Incident')
axes[1, 1].set_title('Ground Truth')
axes[1, 1].set_xlabel('Heart Rate')
axes[1, 1].set_ylabel('Activity Level')
axes[1, 1].legend()
axes[1, 1].grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('visualizations/07_anomaly_detection_comparison.png', dpi=300)
plt.close()





In [None]:
#model

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import StandardScaler
import joblib


df = pd.read_csv("data/enhanced_data.csv")

features = [
    'heart_rate',
    'activity_level',
    'is_high_hr',
    'is_high_activity',
    'hr_to_activity_ratio',
    'is_school_time',
    'in_safe_zone'
]

X = df[features]
y = df['is_dangerous']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


model = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    random_state=42
)

model.fit(X_train_scaled, y_train)

In [24]:
y_pred = model.predict(X_test_scaled)

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")
print()

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print()

print("Classification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.9958

Confusion Matrix:
[[6254    8]
 [  20  438]]

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      6262
           1       0.98      0.96      0.97       458

    accuracy                           1.00      6720
   macro avg       0.99      0.98      0.98      6720
weighted avg       1.00      1.00      1.00      6720



In [25]:
model = RandomForestClassifier(
    n_estimators=300,
    class_weight='balanced',
    random_state=42
)
model.fit(X_train_scaled, y_train)


y_pred = model.predict(X_test_scaled)

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")
print()

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print()

print("Classification Report:")
print(classification_report(y_test, y_pred))


Accuracy: 0.9915

Confusion Matrix:
[[6225   37]
 [  20  438]]

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.99      1.00      6262
           1       0.92      0.96      0.94       458

    accuracy                           0.99      6720
   macro avg       0.96      0.98      0.97      6720
weighted avg       0.99      0.99      0.99      6720

