In [86]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split

# Sample dataset creation (replace this with your actual data loading process)
np.random.seed(42)

# Assume original data has the following columns
data = pd.DataFrame({
    'time_of_day': np.random.choice(['Morning', 'Afternoon', 'Evening', 'Night'], size=1000),
    'weather': np.random.choice(['Clear', 'Rain', 'Fog', 'Storm'], size=1000),
    'road_type': np.random.choice(['Highway', 'City', 'Rural'], size=1000),
    'vehicle_type': np.random.choice(['Car', 'Bike', 'Bus', 'Truck'], size=1000),
    'district': np.random.choice(['Chennai', 'Coimbatore', 'Madurai', 'Tiruchirappalli', 'Tirunelveli'], size=1000),
    'accident_occurred': np.random.choice([0, 1], size=1000)  # Target variable
})

# Step 1: Prepare the dataset
# Save the original 'district' column before encoding
original_districts = data['district']

# One-hot encoding for categorical features excluding 'district'
categorical_columns = ['time_of_day', 'weather', 'road_type', 'vehicle_type']  # Exclude 'district'
data = pd.get_dummies(data, columns=categorical_columns, drop_first=True)

# Define features and target variable
X = data.drop('accident_occurred', axis=1)  # Features
y = data['accident_occurred']               # Target

# Step 2: Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 3: Scale the feature data
scaler = StandardScaler()

# Convert X_train and X_test into DataFrame after scaling
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)
X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X_train.columns)

# Step 4: Train RandomForestClassifier
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train_scaled, y_train)

# Step 5: Make predictions on the test set
y_pred = model.predict(X_test_scaled)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')
print('Confusion Matrix:')
print(confusion_matrix(y_test, y_pred))
print('Classification Report:')
print(classification_report(y_test, y_pred, zero_division=0))

# Step 6: Prepare 2024 data for predictions
data_2024 = pd.DataFrame()
data_2024['time_of_day'] = np.random.choice(['Morning', 'Afternoon', 'Evening', 'Night'], size=100)
data_2024['weather'] = np.random.choice(['Clear', 'Rain', 'Fog', 'Storm'], size=100)
data_2024['road_type'] = np.random.choice(['Highway', 'City', 'Rural'], size=100)
data_2024['vehicle_type'] = np.random.choice(['Car', 'Bike', 'Bus', 'Truck'], size=100)
data_2024['district'] = np.random.choice(['Chennai', 'Coimbatore', 'Madurai', 'Tiruchirappalli', 'Tirunelveli'], size=100)

# Save the original 'district' column for 2024 data before encoding
original_districts_2024 = data_2024['district']

# One-hot encoding for categorical features in 2024 data (excluding 'district')
data_2024 = pd.get_dummies(data_2024, columns=categorical_columns, drop_first=True)

# Handle missing columns for model input
missing_cols = set(X_train.columns) - set(data_2024.columns)
for col in missing_cols:
    data_2024[col] = 0  # Add missing columns

# Ensure the column order matches X_train
data_2024 = data_2024[X_train.columns]

# Scale the 2024 data
data_2024_scaled = pd.DataFrame(scaler.transform(data_2024), columns=X_train.columns)

# Step 7: Make predictions for 2024
y_pred_2024 = model.predict(data_2024_scaled)

# Step 8: Create predictions DataFrame
# Use the original 'district' column from 2024 data
predictions_df = pd.DataFrame({'district': original_districts_2024, 'predictions': y_pred_2024})

# Merge with accident counts
merged_predictions = predictions_df.groupby('district')['predictions'].sum().reset_index()
merged_predictions.rename(columns={'predictions': 'total_accidents_2024'}, inplace=True)

# Display total accidents by district
print(merged_predictions)


ValueError: could not convert string to float: 'Tiruchirappalli'

ValueError: could not convert string to float: 'Tiruchirappalli'