
### Step 1: Import Libraries and Load Data

In [29]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import CategoricalNB
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import LabelEncoder

# Load the data
df = pd.read_csv('accidentsFull.csv') 

# Create the INJURY column
df['INJURY'] = df['MAX_SEV_IR'].apply(lambda x: 'Yes' if x in [1, 2] else 'No')


### Step 2: Initial Prediction with No Further Information

In [30]:
most_common_class = df['INJURY'].value_counts().idxmax()
print(f"The prediction should be {most_common_class} because it is the most frequent class.")

The prediction should be Yes because it is the most frequent class.



### Step 3: Analysis with First 12 Records

In [31]:
pivot_table = pd.pivot_table(df.head(12), values='INJURY', index=['WEATHER_R', 'TRAF_CON_R'], aggfunc=lambda x: (x == 'Yes').mean())
print(pivot_table)

                        INJURY
WEATHER_R TRAF_CON_R          
1         0           0.666667
          1           0.000000
          2           0.000000
2         0           0.166667
          1           0.000000


In [32]:
# Encoding categorical variables for the first 12 records
le = LabelEncoder()
X = df[['WEATHER_R', 'TRAF_CON_R']].head(12).apply(le.fit_transform)
y = df['INJURY'].head(12).apply(lambda x: 1 if x == 'Yes' else 0)

# Naive Bayes classifier
model = CategoricalNB()
model.fit(X, y)
predictions = model.predict(X)
probabilities = model.predict_proba(X)

print("Predictions:", predictions)
print("Probabilities:", probabilities)

Predictions: [0 0 0 0 0 0 0 0 0 0 0 0]
Probabilities: [[0.61403509 0.38596491]
 [0.80680571 0.19319429]
 [0.87743733 0.12256267]
 [0.73170732 0.26829268]
 [0.61403509 0.38596491]
 [0.80680571 0.19319429]
 [0.80680571 0.19319429]
 [0.61403509 0.38596491]
 [0.80680571 0.19319429]
 [0.80680571 0.19319429]
 [0.80680571 0.19319429]
 [0.64516129 0.35483871]]



### Step 4: Entire Dataset Analysis

In [35]:
# Drop 'INJURY' and 'MAX_SEV_IR', and encode categorical variables
X = pd.get_dummies(df.drop(['INJURY', 'MAX_SEV_IR'], axis=1), drop_first=True)
y = df['INJURY'].apply(lambda x: 1 if x == 'Yes' else 0)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

# Ensure the number of columns in X_train and X_test match
if X_train.shape[1] != X_test.shape[1]:
    X_train = X_train.iloc[:, :X_test.shape[1]]

# Naive Bayes classifier
model = CategoricalNB()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Confusion matrix
print(confusion_matrix(y_test, y_pred))

IndexError: index 14 is out of bounds for axis 1 with size 13

In [None]:
print(X_test.shape)
print(X_train.shape)

(16874, 23)
(25309, 23)
