In [1]:
import pandas as pd

chunksize = 100000
cleaned_file_path = 'cleaned_dataset.csv'
is_first_chunk = True
for chunk in pd.read_csv('US_Accidents_March23.csv', chunksize=chunksize):
    chunk.drop(columns=['End_Lat', 'End_Lng'], inplace=True)
    num_cols = ['Wind_Chill(F)', 'Precipitation(in)']
    for col in num_cols:
        if col in chunk.columns:
            chunk[col].fillna(chunk[col].mean(), inplace=True)
    chunk['Start_Time'] = pd.to_datetime(chunk['Start_Time'], errors='coerce')
    chunk['End_Time'] = pd.to_datetime(chunk['End_Time'], errors='coerce')
    chunk['Duration_minutes'] = (chunk['End_Time'] - chunk['Start_Time']).dt.total_seconds() / 60
    numerical_cols = ['Temperature(F)', 'Distance(mi)', 'Visibility(mi)', 'Wind_Speed(mph)']
    for col in numerical_cols:
        if col in chunk.columns:
            chunk = chunk[chunk[col].between(chunk[col].quantile(0.01), chunk[col].quantile(0.99))]
    chunk['Weather_Condition'] = chunk['Weather_Condition'].astype('category').cat.codes
    chunk['Sunrise_Sunset'] = chunk['Sunrise_Sunset'].map({'Day': 1, 'Night': 0})
    chunk['Civil_Twilight'] = chunk['Civil_Twilight'].map({'Day': 1, 'Night': 0})
    chunk['Nautical_Twilight'] = chunk['Nautical_Twilight'].map({'Day': 1, 'Night': 0})
    chunk['Astronomical_Twilight'] = chunk['Astronomical_Twilight'].map({'Day': 1, 'Night': 0})
    chunk.dropna(inplace=True)
    for col in chunk.select_dtypes(include='object').columns:
        chunk[col] = chunk[col].astype('category')
    chunk.to_csv(cleaned_file_path, mode='w' if is_first_chunk else 'a', index=False, header=is_first_chunk)
    is_first_chunk = False


In [7]:
cleaned_file_path = 'cleaned_dataset.csv'

# Load the cleaned dataset for splitting
data = pd.read_csv(cleaned_file_path)

# Define the target column and features
target = 'Severity'
features = data.drop(columns=[target]).columns

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the cleaned dataset for splitting
data = pd.read_csv(cleaned_file_path)

# Define the target column and features
target = 'Severity'
features = data.drop(columns=[target]).columns

# Split the data into train, validation, and test sets
train_data, temp_data = train_test_split(data, test_size=0.25, random_state=42, stratify=data[target])
validation_data, test_data = train_test_split(temp_data, test_size=0.4, random_state=42, stratify=temp_data[target])

# Save the splits to separate CSV files
train_data.to_csv('train_dataset.csv', index=False)
validation_data.to_csv('validation_dataset.csv', index=False)
test_data.to_csv('test_dataset.csv', index=False)

print("Data splitting completed. Train, validation, and test datasets saved.")


NameError: name 'cleaned_file_path' is not defined

In [4]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
import joblib

# Select only numeric columns for features
features = data.select_dtypes(include=['float64', 'int64']).columns.drop(target)

X_train = train_data[features]
y_train = train_data[target]
X_validation = validation_data[features]
y_validation = validation_data[target]
X_test = test_data[features]
y_test = test_data[target]

model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

validation_predictions = model.predict(X_validation)
accuracy = accuracy_score(y_validation, validation_predictions)
precision = precision_score(y_validation, validation_predictions, average='weighted')
recall = recall_score(y_validation, validation_predictions, average='weighted')
f1 = f1_score(y_validation, validation_predictions, average='weighted')

print("Validation Metrics:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

print("Classification Report:")
print(classification_report(y_validation, validation_predictions))

joblib.dump(model, 'random_forest_model.joblib')
print("Model saved as random_forest_model.joblib")



Validation Metrics:
Accuracy: 0.8708
Precision: 0.8599
Recall: 0.8708
F1 Score: 0.8591
Classification Report:
              precision    recall  f1-score   support

           1       0.81      0.37      0.51      9307
           2       0.89      0.96      0.92    811052
           3       0.76      0.53      0.63    160878
           4       0.54      0.27      0.36     24488

    accuracy                           0.87   1005725
   macro avg       0.75      0.53      0.60   1005725
weighted avg       0.86      0.87      0.86   1005725

Model saved as random_forest_model.joblib


In [5]:
test_predictions = model.predict(X_test)
test_accuracy = accuracy_score(y_test, test_predictions)
test_precision = precision_score(y_test, test_predictions, average='weighted')
test_recall = recall_score(y_test, test_predictions, average='weighted')
test_f1 = f1_score(y_test, test_predictions, average='weighted')

print("Test Metrics:")
print(f"Accuracy: {test_accuracy:.4f}")
print(f"Precision: {test_precision:.4f}")
print(f"Recall: {test_recall:.4f}")
print(f"F1 Score: {test_f1:.4f}")

print("Test Classification Report:")
print(classification_report(y_test, test_predictions))


Test Metrics:
Accuracy: 0.8708
Precision: 0.8600
Recall: 0.8708
F1 Score: 0.8592
Test Classification Report:
              precision    recall  f1-score   support

           1       0.81      0.36      0.50      6204
           2       0.89      0.96      0.92    540702
           3       0.76      0.53      0.63    107252
           4       0.54      0.27      0.36     16326

    accuracy                           0.87    670484
   macro avg       0.75      0.53      0.60    670484
weighted avg       0.86      0.87      0.86    670484



In [1]:
import matplotlib.pyplot as plt

feature_importances = model.feature_importances_
feature_names = features
sorted_idx = feature_importances.argsort()

plt.figure(figsize=(10, 6))
plt.barh(range(len(sorted_idx)), feature_importances[sorted_idx], align='center')
plt.yticks(range(len(sorted_idx)), [feature_names[i] for i in sorted_idx])
plt.xlabel("Feature Importance")
plt.title("Random Forest Feature Importances")
plt.show()

NameError: name 'model' is not defined