## 2. Time Series Analysis

Analyze how accident counts change over time: daily totals, moving-average trends, and seasonal structure.

# Create daily time series.
# Group on the calendar day, not on the raw timestamp: grouping on
# 'Start_Time' directly yields one row per distinct timestamp, which is not a
# daily series and makes row-based rolling windows meaningless as "days".
accident_day = pd.to_datetime(df['Start_Time']).dt.floor('D')
daily_accidents = df.groupby(accident_day).size().reset_index()
daily_accidents.columns = ['Date', 'Count']
daily_accidents.set_index('Date', inplace=True)
# Days with no recorded accident become explicit zero rows, so every row of
# the frame is exactly one calendar day.
daily_accidents = daily_accidents.asfreq('D', fill_value=0)

# Plot time series with trend.
# DataFrame.plot() opens its own figure, so the size is passed to it directly;
# calling plt.figure(...) first would only leave an empty extra figure behind.
daily_ax = daily_accidents.plot(figsize=(15, 6))
daily_ax.set_title('Daily Accident Counts')
daily_ax.set_xlabel('Date')
daily_ax.set_ylabel('Number of Accidents')
plt.show()

# Overlay the raw counts with two moving averages in one interactive figure.
# The integer windows count rows of daily_accidents (7 rows and 30 rows).
rolling_7 = daily_accidents['Count'].rolling(window=7).mean()
rolling_30 = daily_accidents['Count'].rolling(window=30).mean()

fig = go.Figure()

# Raw series first, so it is the first entry in the legend.
fig.add_trace(
    go.Scatter(
        x=daily_accidents.index,
        y=daily_accidents['Count'],
        mode='lines',
        name='Daily Accidents',
    )
)

# One line per smoothed series: 7-day in red, then 30-day in green.
for smoothed, label, colour in (
    (rolling_7, '7-day Moving Average', 'red'),
    (rolling_30, '30-day Moving Average', 'green'),
):
    fig.add_trace(
        go.Scatter(
            x=daily_accidents.index,
            y=smoothed,
            mode='lines',
            name=label,
            line={'color': colour},
        )
    )

fig.update_layout(
    title='Accident Trends with Moving Averages',
    xaxis_title='Date',
    yaxis_title='Number of Accidents',
)
fig.show()

# Seasonal decomposition
from statsmodels.tsa.seasonal import seasonal_decompose

# Put the counts on a regular daily grid, summing within each day.
daily_series = daily_accidents['Count'].resample('D').sum()

# Decompose the series using a seasonal period of 365 observations.
decomposition = seasonal_decompose(daily_series, period=365)

# Draw the four components as stacked panels, top to bottom.
fig, (ax1, ax2, ax3, ax4) = plt.subplots(nrows=4, ncols=1, figsize=(15, 12))

panels = (
    (ax1, 'observed', 'Observed'),
    (ax2, 'trend', 'Trend'),
    (ax3, 'seasonal', 'Seasonal'),
    (ax4, 'resid', 'Residual'),
)
for panel_ax, component_name, heading in panels:
    getattr(decomposition, component_name).plot(ax=panel_ax)
    panel_ax.set_title(heading)

plt.tight_layout()
plt.show()

## 3. Clustering Analysis

Perform spatial clustering to identify accident hotspots and patterns.

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Clustering input: the start coordinates only, standardised per column.
X = df.loc[:, ['Start_Lat', 'Start_Lng']].copy()
scaler = StandardScaler()
X_scaled = scaler.fit(X).transform(X)

# Find optimal number of clusters.
# The silhouette coefficient needs pairwise distances, which grows
# quadratically with the number of rows. Scoring a fixed-seed random subsample
# (sample_size) keeps each evaluation tractable and reproducible on large
# frames; KMeans itself is still fitted on all rows.
silhouette_scores = []
K = range(2, 11)
for k in K:
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(X_scaled)
    score = silhouette_score(
        X_scaled,
        kmeans.labels_,
        sample_size=10000,
        random_state=42,
    )
    silhouette_scores.append(score)

# Show how the silhouette score varies with the candidate cluster count.
plt.figure(figsize=(10, 6))
silhouette_ax = plt.gca()
silhouette_ax.plot(K, silhouette_scores, 'bo-')
silhouette_ax.set(
    xlabel='Number of Clusters (k)',
    ylabel='Silhouette Score',
    title='Silhouette Score vs Number of Clusters',
)
plt.show()

# Refit KMeans with the k that scored highest and store each row's label.
best_position = np.argmax(silhouette_scores)
optimal_k = K[best_position]
kmeans = KMeans(n_clusters=optimal_k, random_state=42)
df['Cluster'] = kmeans.fit(X_scaled).labels_

# Visualize clusters on map
cluster_map = folium.Map(location=[37.0902, -95.7129], zoom_start=4)

# One CSS colour name per cluster (reused cyclically if there are more
# clusters than colours). 'lightred' is a folium.Icon marker colour, not a CSS
# colour name, so it is replaced by 'lightcoral' for CircleMarker.
colors = ['red', 'blue', 'green', 'purple', 'orange', 'darkred',
          'lightcoral', 'beige', 'darkblue', 'darkgreen']

# Draw at most 10,000 points. Capping n at len(df) avoids the ValueError that
# sample() raises on smaller frames; the fixed seed makes the map reproducible.
map_points = df[['Start_Lat', 'Start_Lng', 'Cluster']].sample(
    n=min(10000, len(df)), random_state=42
)

# Add points colored by cluster
for lat, lng, cluster in zip(map_points['Start_Lat'],
                             map_points['Start_Lng'],
                             map_points['Cluster']):
    folium.CircleMarker(
        location=[lat, lng],
        radius=3,
        # int(): list indexing needs a plain integer label.
        color=colors[int(cluster) % len(colors)],
        fill=True
    ).add_to(cluster_map)

display(cluster_map)

# Summarise every cluster: mean severity, duration and coordinates plus the
# number of non-null IDs, rounded to two decimals and given readable headers.
cluster_aggregations = {
    'Severity': 'mean',
    'Duration': 'mean',
    'Start_Lat': 'mean',
    'Start_Lng': 'mean',
    'ID': 'count',
}
readable_headers = {
    'Severity': 'Avg Severity',
    'Duration': 'Avg Duration',
    'Start_Lat': 'Center Lat',
    'Start_Lng': 'Center Lng',
    'ID': 'Number of Accidents',
}
cluster_stats = (
    df.groupby('Cluster')
    .agg(cluster_aggregations)
    .round(2)
    .rename(columns=readable_headers)
)
display(cluster_stats)

## 4. Predictive Modeling

Build and evaluate models to predict accident severity.

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns

# Prepare features for modeling
feature_columns = ['Hour', 'DayOfWeek', 'Month', 'IsWeekend',
                  'Temperature(F)', 'Humidity(%)', 'Pressure(in)',
                  'Visibility(mi)', 'Wind_Speed(mph)']

# Encode categorical variables.
# Each column gets its own LabelEncoder: refitting one shared encoder on the
# second column discards the first column's class mapping, so the weather codes
# could no longer be translated back to their labels.
# Missing values are turned into an explicit 'Unknown' category first so the
# encoding does not depend on how LabelEncoder treats NaN next to strings.
label_encoders = {}
for source_column, encoded_column in (
    ('Weather_Condition', 'Weather_Encoded'),
    ('Sunrise_Sunset', 'Sunrise_Sunset_Encoded'),
):
    encoder = LabelEncoder()
    categories = df[source_column].astype(object).fillna('Unknown')
    df[encoded_column] = encoder.fit_transform(categories)
    label_encoders[source_column] = encoder
    feature_columns.append(encoded_column)

# Backward compatibility: `le` used to end up fitted on 'Sunrise_Sunset'.
le = label_encoders['Sunrise_Sunset']

X = df[feature_columns]
y = df['Severity']

# Hold out 20% of the rows for evaluation (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Learn the scaling statistics from the training rows only, then apply the
# same transform to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit a 100-tree random forest on the scaled training data.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train_scaled, y_train)

# Predicted severity for the held-out rows.
y_pred = rf_model.predict(X_test_scaled)

# Print classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Plot confusion matrix.
# The label set is the sorted union of true and predicted classes (the same
# set confusion_matrix uses by default). Passing it to the heatmap makes the
# ticks show the actual severity values instead of positions 0..n-1.
severity_labels = np.unique(
    np.concatenate([np.asarray(y_test), np.asarray(y_pred)])
)
plt.figure(figsize=(10, 8))
sns.heatmap(
    confusion_matrix(y_test, y_pred, labels=severity_labels),
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=severity_labels,
    yticklabels=severity_labels,
)
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

# Pair every feature name with the forest's importance score and list the
# most important features first.
importance_df = pd.DataFrame({
    'feature': feature_columns,
    'importance': rf_model.feature_importances_,
})
importance_df = importance_df.sort_values(by='importance', ascending=False)

# Horizontal bars: one per feature, bar length is the importance.
plt.figure(figsize=(10, 6))
sns.barplot(x='importance', y='feature', data=importance_df)
plt.title('Feature Importance for Severity Prediction')
plt.show()

## 5. Interactive Dashboard

Create an interactive dashboard to explore accident patterns.

# Create an interactive dashboard using plotly
from plotly.subplots import make_subplots

# Create the dashboard layout: a 3x2 grid whose last row is a single map
# spanning both columns, i.e. five subplot cells in total.
# subplot_titles lists exactly one title per existing cell in row-major order;
# a sixth title ('Feature Importance') had no cell to attach to and was never
# rendered, so it is not listed.
dashboard = make_subplots(
    rows=3, cols=2,
    subplot_titles=('Accidents by Hour', 'Severity Distribution',
                   'Weather Impact', 'Monthly Trend',
                   'Accident Hotspots'),
    specs=[[{"type": "scatter"}, {"type": "pie"}],
           [{"type": "bar"}, {"type": "scatter"}],
           [{"type": "scattermapbox", "colspan": 2}, None]]
)

# Panel 1 (row 1, col 1): number of accidents for each value of 'Hour'.
hourly_counts = df.groupby('Hour').size()
hourly_trace = go.Scatter(
    x=hourly_counts.index,
    y=hourly_counts.values,
    mode='lines+markers',
    name='Hourly Pattern',
)
dashboard.add_trace(hourly_trace, row=1, col=1)

# Panel 2 (row 1, col 2): share of accidents per severity value.
severity_counts = df['Severity'].value_counts()
severity_trace = go.Pie(
    labels=severity_counts.index,
    values=severity_counts.values,
    name='Severity',
)
dashboard.add_trace(severity_trace, row=1, col=2)

# Panel 3 (row 2, col 1): the ten most frequent weather conditions.
weather_counts = df['Weather_Condition'].value_counts().iloc[:10]
weather_trace = go.Bar(
    x=weather_counts.index,
    y=weather_counts.values,
    name='Weather Conditions',
)
dashboard.add_trace(weather_trace, row=2, col=1)

# Panel 4 (row 2, col 2): number of accidents for each value of 'Month'.
monthly_counts = df.groupby('Month').size()
monthly_trace = go.Scatter(
    x=monthly_counts.index,
    y=monthly_counts.values,
    mode='lines+markers',
    name='Monthly Trend',
)
dashboard.add_trace(monthly_trace, row=2, col=2)

# 5. Accident Hotspots
# Sample whole rows once so every latitude stays paired with its own
# longitude; sampling the two columns independently pairs coordinates from
# different accidents and plots locations that never occurred.
# n is capped at len(df) so frames with fewer than 1000 rows do not raise.
hotspot_sample = df[['Start_Lat', 'Start_Lng']].sample(
    n=min(1000, len(df)), random_state=42
)
dashboard.add_trace(
    go.Scattermapbox(
        lat=hotspot_sample['Start_Lat'],
        lon=hotspot_sample['Start_Lng'],
        mode='markers',
        marker=dict(size=5, color='red'),
        name='Accidents'
    ),
    row=3, col=1
)

# Final figure-level settings, then render.
# The map view uses the given centre coordinates at zoom level 3.
mapbox_settings = {
    'style': "carto-positron",
    'zoom': 3,
    'center': {'lat': 37.0902, 'lon': -95.7129},
}
dashboard.update_layout(
    title_text="US Accidents Analysis Dashboard",
    height=1200,
    showlegend=True,
    mapbox=mapbox_settings,
)

dashboard.show()