
### Step 1: Import Libraries and Load Data

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import CategoricalNB
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import LabelEncoder

# Load the data
df = pd.read_csv('accidentsFull.csv') 

# Create the INJURY column
df['INJURY'] = df['MAX_SEV_IR'].apply(lambda x: 'Yes' if x in [1, 2] else 'No')


### Step 2: Initial Prediction with No Further Information

In [3]:
most_common_class = df['INJURY'].value_counts().idxmax()
print(f"The prediction should be {most_common_class} because it is the most frequent class.")

The prediction should be Yes because it is the most frequent class.



### Step 3: Analysis with First 12 Records

In [4]:
for predictor in ['WEATHER_R', 'TRAF_CON_R']:
    df_head = df.head(12)
    pivot_table = df_head.pivot_table(index='INJURY', columns=predictor, aggfunc='size', fill_value=0)
    propTable = pivot_table.apply(lambda x: x / sum(x), axis=1)
    print(propTable)


WEATHER_R         1         2
INJURY                       
No         0.333333  0.666667
Yes        0.666667  0.333333
TRAF_CON_R         0         1         2
INJURY                                  
No          0.666667  0.222222  0.111111
Yes         1.000000  0.000000  0.000000


In [5]:
# Encoding categorical variables for the first 12 records
le = LabelEncoder()
X = df[['WEATHER_R', 'TRAF_CON_R']].head(12).apply(le.fit_transform)
y = df['INJURY'].head(12).apply(lambda x: 1 if x == 'Yes' else 0)

# Naive Bayes classifier
model = CategoricalNB()
model.fit(X, y)
predictions = model.predict(X)
probabilities = model.predict_proba(X)

print("Predictions:", predictions)
print("Probabilities:", probabilities)

Predictions: [0 0 0 0 0 0 0 0 0 0 0 0]
Probabilities: [[0.61403509 0.38596491]
 [0.80680571 0.19319429]
 [0.87743733 0.12256267]
 [0.73170732 0.26829268]
 [0.61403509 0.38596491]
 [0.80680571 0.19319429]
 [0.80680571 0.19319429]
 [0.61403509 0.38596491]
 [0.80680571 0.19319429]
 [0.80680571 0.19319429]
 [0.80680571 0.19319429]
 [0.64516129 0.35483871]]



### Step 4: Entire Dataset Analysis

In [6]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder

# Assuming 'categorical_columns' is a list of your categorical column names
categorical_columns = ['WEATHER_R', 'TRAF_CON_R']  # Add all your categorical columns here

# Create the ColumnTransformer with OneHotEncoder for categorical variables
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_columns)
    ])

# Create a pipeline that encodes then fits the model
pipeline = make_pipeline(preprocessor, CategoricalNB())

# Split your dataset first (ensure 'df' does not include the target variable 'INJURY')
X_train, X_test, y_train, y_test = train_test_split(df.drop(['INJURY', 'MAX_SEV_IR'], axis=1), df['INJURY'].apply(lambda x: 1 if x == 'Yes' else 0), test_size=0.4, random_state=42)

# Fit the pipeline
pipeline.fit(X_train, y_train)

# Predict using the pipeline to ensure consistent preprocessing
y_pred = pipeline.predict(X_test)

# Confusion matrix
print(confusion_matrix(y_test, y_pred))

[[1290 6983]
 [1107 7494]]


In [7]:
# Calculate the overall error rate from the confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
total_predictions = conf_matrix.sum()
correct_predictions = np.trace(conf_matrix)
error_rate = 1 - (correct_predictions / total_predictions)
print(f"Overall error rate: {error_rate:.2f}")

Overall error rate: 0.48


In [8]:
# Assuming 'most_common_class_error_rate' is the error rate of always predicting the most common class
most_common_class_error_rate = max(df['INJURY'].value_counts(normalize=True))  # This needs to be calculated based on your dataset

percentage_improvement = ((most_common_class_error_rate - error_rate) / most_common_class_error_rate) * 100
print(f"Percentage improvement over the naive rule: {percentage_improvement:.2f}%")

Percentage improvement over the naive rule: 5.77%


### Zero Probability in Conditional Probabilities

**Issue:** `P(INJURY = No | SPD_LIM = 5)` is zero because no data shows accidents at 5 mph without injuries.

**Solution:** Use **Laplace Smoothing** to adjust counts, preventing zero probabilities and allowing for predictions on rare conditions.