In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler

# Load the dataset
try:
    data = pd.read_csv('roadacc.csv')
    print("Data loaded successfully.")
except FileNotFoundError:
    print("Error: The file 'roadacc.csv' was not found.")
    exit()

# Add a 'year' column (for demonstration)
np.random.seed(42)
data['year'] = np.random.choice([2020, 2021, 2022, 2023], size=len(data))

# Add a 'district' column with Tamil Nadu districts
tamil_nadu_districts = [
    'Chennai', 'Coimbatore', 'Madurai', 'Tiruchirappalli', 'Salem', 'Tirunelveli', 
    'Thoothukudii', 'Vellore', 'Erode', 'Tiruppur', 'Dindigul', 'Kanyakumari', 
    'Thanjavur', 'Virudhunagar', 'Sivaganga', 'Ramanathapuram', 'Karur', 
    'Nagapattinam', 'Theni', 'Cuddalore', 'Krishnagiri', 'Perambalur', 'Namakkal', 
    'Dharmapuri', 'Tiruvarur', 'Pudukkottai', 'Nilgiris', 'Ariyalur', 'Tirupattur', 
    'Kallakurichi', 'Tenkasi', 'Ranipet', 'Chengalpattu', 'Villupuram', 'Kanchipuram', 
    'Tiruvannamalai', 'Mayiladuthurai', 'Thiruvallur'
]

# Assigning districts randomly to the dataset
data['district'] = np.random.choice(tamil_nadu_districts, size=len(data))

# Display the first few rows of the dataset
print(data.head())

# Fill missing values using forward fill
data.ffill(inplace=True)

# Ensure all required categorical columns are present
categorical_columns = ['time_of_day', 'weather', 'road_type', 'vehicle_type', 'district']
for col in categorical_columns:
    if col not in data.columns:
        print(f"Error: Column '{col}' not found in dataset.")
        exit()

# Ensure the target column exists
if 'is_accident' not in data.columns:
    print("Error: Target column 'is_accident' not found in dataset.")
    exit()

# Split the dataset by year to simulate time-based prediction
train_data = data[data['year'] < 2023]  # Data up to 2022
test_data = data[data['year'] == 2023]  # For 2023 prediction

# Check if test_data is empty
if test_data.empty:
    print("Warning: No data available for the year 2023. Exiting...")
    exit()

# Combine train and test data for one-hot encoding
combined_data = pd.concat([train_data, test_data], keys=['train', 'test'], names=['set', 'index'])

# One-hot encode categorical columns
combined_data = pd.get_dummies(combined_data, columns=categorical_columns, drop_first=True)

# Split back into train and test datasets
train_data = combined_data.xs('train')
test_data = combined_data.xs('test')

# Separate features and target variable for both train and test sets
X_train = train_data.drop(columns=['is_accident', 'year'])  # Features (up to 2022)
y_train = train_data['is_accident']  # Target variable (up to 2022)

X_test = test_data.drop(columns=['is_accident', 'year'])  # Features (for 2023)
y_test = test_data['is_accident']  # True accident data for 2023

# Check data types of X_train
print("Data types of features in X_train before conversion:")
print(X_train)

# Convert boolean columns to integers (0 and 1)
bool_columns = X_train.select_dtypes(include=['bool']).columns
X_train[bool_columns] = X_train[bool_columns].astype(int)

# Ensure all columns are numeric
X_train = X_train.select_dtypes(include=[np.number])  # Keep only numeric columns

# Check for NaN values in X_train
if X_train.isnull().values.any():
    print("Warning: NaN values detected in X_train. Filling with zeros.")
    X_train.fillna(0, inplace=True)  # Fill NaN values with zero (or consider imputation strategies)

# Remove any constant columns
X_train = X_train.loc[:, (X_train != X_train.iloc[0]).any()]  # Keep only non-constant columns

# Check for any remaining non-numeric columns
if X_train.empty:
    print("Error: No numeric features available in the training set. Exiting...")
    exit()

# Scale the feature data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)

# Only scale X_test if it's not empty
if not X_test.empty:
    # Convert boolean columns in X_test to integers
    bool_columns_test = X_test.select_dtypes(include=['bool']).columns
    X_test[bool_columns_test] = X_test[bool_columns_test].astype(int)

    # Ensure all columns are numeric
    X_test = X_test.select_dtypes(include=[np.number])  # Keep only numeric columns

    # Check for NaN values in X_test
    if X_test.isnull().values.any():
        print("Warning: NaN values detected in X_test. Filling with zeros.")
        X_test.fillna(0, inplace=True)  # Fill NaN values with zero

    # Remove any constant columns
    X_test = X_test.loc[:, (X_test != X_test.iloc[0]).any()]  # Keep only non-constant columns

    # Check for any remaining non-numeric columns in X_test
    if X_test.empty:
        print("Error: No numeric features available in the test set. Exiting...")
        exit()

    # Ensure X_test has the same features as X_train
    missing_cols = set(X_train.columns) - set(X_test.columns)
    for col in missing_cols:
        X_test[col] = 0  # Add missing columns with value 0

    X_test = X_test[X_train.columns]  # Reorder X_test to match the feature order in X_train

    # Scale the test data with the same scaler
    X_test = scaler.transform(X_test)

# Create a RandomForestClassifier model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Make predictions for 2023
if not X_test.empty:
    y_pred = model.predict(X_test)

    # Calculate the accuracy
    accuracy = accuracy_score(y_test, y_pred)
    print(f'Accuracy: {accuracy * 100:.2f}%')

    # Confusion Matrix
    cm = confusion_matrix(y_test, y_pred)
    print('Confusion Matrix:')
    print(cm)

    # Classification Report
    cr = classification_report(y_test, y_pred)
    print('Classification Report:')
    print(cr)

else:
    print("No test data available for prediction.")

# Now we will simulate data for the year 2024
np.random.seed(42)  # Set seed for reproducibility
data_2024 = pd.DataFrame()

# Generate random features for 2024 predictions
data_2024['time_of_day'] = np.random.choice(['Morning', 'Afternoon', 'Evening', 'Night'], size=100)
data_2024['weather'] = np.random.choice(['Clear', 'Rain', 'Fog', 'Storm'], size=100)
data_2024['road_type'] = np.random.choice(['Highway', 'City', 'Rural'], size=100)
data_2024['vehicle_type'] = np.random.choice(['Car', 'Bike', 'Bus', 'Truck'], size=100)
data_2024['district'] = np.random.choice(tamil_nadu_districts, size=100)

# Add a 'year' column for 2024
data_2024['year'] = 2024

# Perform one-hot encoding for the 2024 dataset
data_2024 = pd.get_dummies(data_2024, columns=categorical_columns, drop_first=True)

# Align the columns of data_2024 with X_train
missing_cols = set(X_train.columns) - set(data_2024.columns)
for col in missing_cols:
    data_2024[col] = 0  # Add missing columns with value 0

data_2024 = data_2024[X_train.columns]  # Reorder to match X_train's column order

# Scale the 2024 data using the same scaler
data_2024_scaled = scaler.transform(data_2024)

# Make predictions for 2024
y_pred_2024 = model.predict(data_2024_scaled)

# Display predictions
print(f'Predictions for 2024: {y_pred_2024}')


Data loaded successfully.
   Sl No      District  Total Accidents 2021  Total Deaths 2021  \
0    1.0      ARIYALUR                 545.0              174.0   
1    2.0  CHENGALPATTU                1614.0              472.0   
2    3.0  CHENNAI CITY                5035.0              999.0   
3    4.0    COIMBATORE                2792.0              841.0   
4    5.0     CUDDALORE                2927.0              550.0   

   Total Accidents 2022  Total Deaths 2022  Total Accidents 2023  \
0                   562                184                   584   
1                  3044                937                  3402   
2                  3453                507                  3654   
3                  3544               1057                  3657   
4                  3426                478                  3121   

   Total Deaths 2023  Unnamed: 8 time_of_day weather road_type vehicle_type  \
0                190         NaN     morning   sunny   highway          car   
1   

AttributeError: 'numpy.ndarray' object has no attribute 'columns'