# Data Analysis using REST API

This notebook demonstrates how to access and analyze road accident data using the REST API instead of direct database access.

**Why use the API approach?**
- No database credentials needed
- Works with remote deployments
- Clean separation of concerns
- Built-in data validation
- Easy to share notebooks with team members

**Prerequisites:**
- FastAPI server running on `http://localhost:8000`
- Python libraries: `requests`, `pandas`, `numpy`, `matplotlib`, `plotly`, `scikit-learn` (imported as `sklearn`)

In [None]:
# Import Required Libraries
import requests
import pandas as pd
import numpy as np
from typing import Dict, List
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA, FactorAnalysis
from sklearn.cluster import KMeans

print("All libraries imported successfully")

## Configure API Client

In [None]:
# API configuration.
# API_BASE_URL targets a local server; replace it with the production URL when needed.
API_BASE_URL = 'http://localhost:8000'
API_VERSION = 'v1'
# Every versioned endpoint lives under <base>/api/<version>.
BASE_ENDPOINT = '/'.join([API_BASE_URL, 'api', API_VERSION])

print('API Endpoint: ' + BASE_ENDPOINT)

# Helper function to make requests
def api_request(endpoint: str, params: Dict = None) -> List[Dict]:
    """Send a GET request to an API endpoint and return the decoded JSON body.

    Args:
        endpoint: Path appended to ``BASE_ENDPOINT`` (for example ``'/accidents'``).
        params: Optional query-string parameters forwarded to ``requests.get``.

    Returns:
        The decoded JSON payload. An empty list is returned when the request
        fails, the server answers with an HTTP error status, or the body is
        not valid JSON; in those cases the error is printed, not raised, so
        callers should check for an empty result before using it.
    """
    url = f'{BASE_ENDPOINT}{endpoint}'
    try:
        response = requests.get(url, params=params, timeout=30)
        response.raise_for_status()
        return response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers JSON decoding failures on requests versions where
        # the decode error is not a RequestException subclass.
        print(f'Error: {e}')
        return []

print('API client ready')

## 1. Check API Health

In [None]:
# Check if API is running
try:
    response = requests.get(f'{API_BASE_URL}/health', timeout=5)
    print('API Status:', response.status_code)
    print(response.json())
except (requests.exceptions.RequestException, ValueError) as e:
    # Catch only network/HTTP failures and undecodable bodies; a bare except
    # would also swallow KeyboardInterrupt and hide genuine programming errors.
    print(f'Health check failed: {e}')
    print('API not responding - make sure it\'s running:')
    print('  python -m uvicorn src.api.main:app --reload')

## 2. Fetch Accidents Data

In [None]:
# Query the /accidents endpoint with annee=2022 and limit=1000
accident_query = dict(annee=2022, limit=1000)
data = api_request('/accidents', params=accident_query)

# Load the JSON records into a DataFrame and summarize what came back
df_accidents = pd.DataFrame(data)
print('Fetched {} records from API'.format(len(df_accidents)))
print('\nColumns: {}'.format(df_accidents.columns.tolist()))
print('\nFirst rows:')
df_accidents.head()

## 3. Get Filtered Accidents

In [None]:
# Get serious/fatal accidents for a single year.
# Only 'annee', 'gravite_min' and 'limit' are sent; no department filter is applied.
data = api_request('/accidents', params={
    'annee': 2021,
    'gravite_min': 3,  # Only serious/fatal
    'limit': 500
})

df_serious = pd.DataFrame(data)
print(f'Serious accidents (2021): {len(df_serious)}')
if df_serious.empty:
    # api_request returns [] on failure, which produces a frame with no columns;
    # selecting columns would then raise a KeyError that hides the real cause.
    print('\nNo records returned - check the API status and the filter values')
else:
    print('\nData sample:')
    print(df_serious[['date_accident', 'heure', 'gravite_max', 'nombre_personnes']].head(10))

## 4. Get Danger Scores

In [None]:
# Get danger scores for communes (at most 50 rows requested)
data = api_request('/danger-scores', params={'limit': 50})

df_danger = pd.DataFrame(data)
if df_danger.empty:
    # api_request returns [] on failure; an empty frame has none of the
    # columns selected below, so report the problem instead of raising KeyError.
    print('No danger scores returned - check the API status')
else:
    print(f'Top {len(df_danger)} dangerous communes:')
    print(df_danger[['nom_com', 'score_danger', 'nombre_accidents']].head(15))

## 5. Get Statistics

In [None]:
# Get commune statistics (at most 30 rows requested)
data = api_request('/stats/communes', params={'limit': 30})

df_communes = pd.DataFrame(data)
if df_communes.empty:
    # api_request returns [] on failure; an empty frame has none of the
    # columns selected below, so report the problem instead of raising KeyError.
    print('No commune statistics returned - check the API status')
else:
    print('Top communes by accident count:')
    print(df_communes[['nom_com', 'nombre_accidents', 'accidents_pour_100k_hab']].head(10))

In [None]:
# Fetch usager (person) statistics from /stats/usagers with limit=20
usager_query = dict(limit=20)
data = api_request('/stats/usagers', params=usager_query)

# Tabulate the response and show the first 15 rows
df_usagers = pd.DataFrame(data)
print('Accident distribution by age/gender:')
print(df_usagers.head(15))

## 6. Get Heatmap Data (for geospatial analysis)

In [None]:
# Get geolocation data for accidents (annee=2022, at most 5000 rows requested)
data = api_request('/heatmap', params={
    'annee': 2022,
    'limit': 5000
})

df_heatmap = pd.DataFrame(data)
print(f'Geospatial data: {len(df_heatmap)} records')
if df_heatmap.empty:
    # api_request returns [] on failure; an empty frame has no coordinate
    # columns, so report the problem instead of raising KeyError below.
    print('No geospatial records returned - check the API status and the filter values')
else:
    print('Coordinates available: latitude, longitude')
    print(df_heatmap[['latitude', 'longitude', 'gravite_max']].head())

## 7. Exploratory Data Analysis

In [None]:
# Work on a copy of the accidents frame so the fetched data stays untouched
df = df_accidents.copy()

print('Data Shape:', df.shape)
# Print each summary under its own heading, separated by a blank line
for heading, summary in [
    ('Data Types:', df.dtypes),
    ('Missing Values:', df.isna().sum()),
]:
    print('\n' + heading)
    print(summary)

## 8. Create Visualizations

In [None]:
# Bar chart of accident counts per 'jour_semaine' value (ordered by frequency)
day_counts = df['jour_semaine'].value_counts()

fig, ax = plt.subplots(figsize=(10, 5))
day_counts.plot(kind='bar', color='steelblue', ax=ax)
ax.set_title('Accidents by Day of Week')
ax.set_xlabel('Day')
ax.set_ylabel('Count')
ax.tick_params(axis='x', labelrotation=45)
fig.tight_layout()
plt.show()

In [None]:
# Line chart of accident counts per 'heure' value, in ascending hour order
hour_counts = df['heure'].value_counts().sort_index()

fig, ax = plt.subplots(figsize=(12, 5))
hour_counts.plot(kind='line', marker='o', color='darkred', ax=ax)
ax.set_title('Accidents by Hour of Day')
ax.set_xlabel('Hour')
ax.set_ylabel('Count')
ax.grid(True, alpha=0.3)
fig.tight_layout()
plt.show()

In [None]:
# Gravity distribution: one bar per severity level present in the data
gravity_counts = df['gravite_max'].value_counts().sort_index()
gravity_labels = {0: 'Unknown', 1: 'Uninjured', 2: 'Minor', 3: 'Serious', 4: 'Fatal'}

plt.figure(figsize=(10, 5))
gravity_counts.plot(kind='bar', color='darkred')
plt.title('Accident Severity Distribution')
plt.xlabel('Severity Level')
plt.ylabel('Count')
# Bars are drawn at positions 0..n-1 for the levels that actually occur, so the
# labels must come from the index; a fixed range(5) shifts every label as soon
# as one level (e.g. 0) is absent from the sample.
plt.xticks(
    ticks=range(len(gravity_counts)),
    labels=[gravity_labels.get(level, str(level)) for level in gravity_counts.index],
    rotation=45,
)
plt.tight_layout()
plt.show()

## 9. Feature Engineering

In [None]:
# Derive 0/1 indicator columns from hour, weekday and severity
late_or_early = df['heure'].ge(22) | df['heure'].le(5)  # 22:00 through 05:xx
df['is_night'] = late_or_early.astype(int)
# NOTE(review): assumes 'jour_semaine' holds English day names - confirm against the API payload
df['is_weekend'] = df['jour_semaine'].isin(['Saturday', 'Sunday']).astype(int)
df['is_fatal'] = df['gravite_max'].eq(4).astype(int)
df['is_serious'] = df['gravite_max'].ge(3).astype(int)

# Report how many rows carry each flag
print('New features created:')
for flag, description in [
    ('is_night', 'night accidents'),
    ('is_weekend', 'weekend accidents'),
    ('is_fatal', 'fatal accidents'),
    ('is_serious', 'serious/fatal accidents'),
]:
    print(f'  - {flag}: {df[flag].sum()} {description}')

## 10. Analysis Examples

In [None]:
# Analyze night vs day: mean severity, record count and fatalities per group
night_stats = df.groupby('is_night').agg({
    'gravite_max': ['mean', 'count'],
    'is_fatal': 'sum'
})

print('Night vs Day Statistics:')
night_stats.columns = ['avg_gravity', 'count', 'fatalities']
# Map the 0/1 flag to labels instead of assigning a fixed two-item list: the
# sample may contain only one of the two groups, and a length mismatch would raise.
night_stats = night_stats.rename(index={0: 'Day', 1: 'Night'}).rename_axis(None)
print(night_stats)

if 'Day' in night_stats.index and 'Night' in night_stats.index:
    severity_ratio = night_stats.loc['Night', 'avg_gravity'] / night_stats.loc['Day', 'avg_gravity']
    # Neutral wording: the ratio may be below 1, so "more severe" is not guaranteed.
    print(f'\nNight accidents are {severity_ratio:.2f}x as severe as day accidents on average')
else:
    print('\nCannot compare night and day: only one of the two groups is present in the sample')

In [None]:
# Analyze weekend vs weekday: mean severity, record count and fatalities per group
weekend_stats = df.groupby('is_weekend').agg({
    'gravite_max': ['mean', 'count'],
    'is_fatal': 'sum'
})

print('Weekday vs Weekend Statistics:')
weekend_stats.columns = ['avg_gravity', 'count', 'fatalities']
# Map the 0/1 flag to labels instead of assigning a fixed two-item list: if
# 'is_weekend' is constant in the sample there is only one group and a
# length mismatch would raise.
weekend_stats = weekend_stats.rename(index={0: 'Weekday', 1: 'Weekend'}).rename_axis(None)
print(weekend_stats)

print()
for group in ('Weekend', 'Weekday'):
    if group in weekend_stats.index:
        # 'count' is the number of non-null gravite_max values in the group
        fatality_rate = weekend_stats.loc[group, 'fatalities'] / weekend_stats.loc[group, 'count'] * 100
        print(f'{group} fatality rate: {fatality_rate:.2f}%')
    else:
        print(f'{group} fatality rate: n/a (no {group.lower()} records in the sample)')

## 11. Advanced Analysis with API Data

In [None]:
# Project the numeric columns of the accidents frame onto two principal components.
# StandardScaler and PCA come from the notebook's import cell.

# Select numeric columns for PCA
numeric_cols = df.select_dtypes(include=[np.number]).columns
# Drop columns that are entirely missing first; otherwise the row-wise dropna()
# would discard every record and leave nothing to fit.
X = df[numeric_cols].dropna(axis=1, how='all').dropna()

if X.shape[0] < 2 or X.shape[1] < 2:
    # PCA with 2 components needs at least 2 samples and 2 features
    print(f'Not enough complete numeric data for PCA (rows={X.shape[0]}, columns={X.shape[1]})')
else:
    # Standardize and apply PCA
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    pca = PCA(n_components=2)
    X_pca = pca.fit_transform(X_scaled)

    print(f'PCA Explained Variance: {pca.explained_variance_ratio_.sum():.2%}')
    print(f'Component 1: {pca.explained_variance_ratio_[0]:.2%}')
    print(f'Component 2: {pca.explained_variance_ratio_[1]:.2%}')

## 12. Export Results

In [None]:
# Write the fetched frames to CSV files in the working directory
exports = [
    ('accidents_from_api.csv', df_accidents),
    ('danger_scores_from_api.csv', df_danger),
    ('commune_stats_from_api.csv', df_communes),
]
for filename, frame in exports:
    frame.to_csv(filename, index=False)

# List the files that were written
print('Results saved:')
for filename, _ in exports:
    print('  - ' + filename)

## Summary

**Key Advantages of Using the API:**

1. **No Database Credentials Needed** - Just use the API endpoint
2. **Works Remotely** - API can be on production server
3. **Data Validation** - API handles validation
4. **Consistent Format** - Always get clean JSON that loads directly into DataFrames
5. **Easy Sharing** - Can share notebooks without DB access

**Available Endpoints:**
- `/accidents` - Get raw accident records
- `/danger-scores` - Get danger scores by commune
- `/stats/communes` - Aggregate by commune
- `/stats/departements` - Aggregate by department
- `/stats/usagers` - Demographics analysis
- `/heatmap` - Geospatial data
- And more in the Swagger docs at `/docs`

**Next Steps:**
- Browse API documentation at `http://localhost:8000/docs`
- Build custom analyses on top of API data
- Create production workflows using the REST API