In [5]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.utils import resample
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

# Load the data
df = pd.read_csv('Data.csv')

# Define predictors and target
exclude_columns = ["Model Year", "No Injuries Reported", "Minor", "Moderate", "Serious", "Severity"]
predictors = [col for col in df.columns if col not in exclude_columns]
X = df[predictors]
y = df["Severity"]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Number of bootstrap samples
n_iterations = 10
n_size = len(X_train)

# Lists to store the results
logistic_accuracies = []
decision_tree_accuracies = []
random_forest_accuracies = []
xgboost_accuracies = []

for i in range(n_iterations):
    # Create a bootstrap sample
    X_sample, y_sample = resample(X_train_scaled, y_train, n_samples=n_size, replace=True)
    
    # Train Logistic Regression
    logistic_model = LogisticRegression(penalty='l1', solver='saga', max_iter=5000, random_state=42)
    logistic_model.fit(X_sample, y_sample)
    y_pred_logistic = logistic_model.predict(X_test_scaled)
    logistic_accuracies.append(accuracy_score(y_test, y_pred_logistic))
    
    # Train Decision Tree
    decision_tree_model = DecisionTreeClassifier(random_state=42)
    decision_tree_model.fit(X_sample, y_sample)
    y_pred_decision_tree = decision_tree_model.predict(X_test_scaled)
    decision_tree_accuracies.append(accuracy_score(y_test, y_pred_decision_tree))
    
    # Train Random Forest
    random_forest_model = RandomForestClassifier(n_estimators=100, random_state=42)
    random_forest_model.fit(X_sample, y_sample)
    y_pred_random_forest = random_forest_model.predict(X_test_scaled)
    random_forest_accuracies.append(accuracy_score(y_test, y_pred_random_forest))
    
    # Train XGBoost
    xgboost_model = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss')
    xgboost_model.fit(X_sample, y_sample)
    y_pred_xgboost = xgboost_model.predict(X_test_scaled)
    xgboost_accuracies.append(accuracy_score(y_test, y_pred_xgboost))

# Calculate the mean and standard deviation of the accuracies
logistic_mean_accuracy = np.mean(logistic_accuracies)
logistic_std_accuracy = np.std(logistic_accuracies)
decision_tree_mean_accuracy = np.mean(decision_tree_accuracies)
decision_tree_std_accuracy = np.std(decision_tree_accuracies)
random_forest_mean_accuracy = np.mean(random_forest_accuracies)
random_forest_std_accuracy = np.std(random_forest_accuracies)
xgboost_mean_accuracy = np.mean(xgboost_accuracies)
xgboost_std_accuracy = np.std(xgboost_accuracies)

print(f"Logistic Regression Mean Accuracy: {logistic_mean_accuracy:.4f} ± {logistic_std_accuracy:.4f}")
print(f"Decision Tree Mean Accuracy: {decision_tree_mean_accuracy:.4f} ± {decision_tree_std_accuracy:.4f}")
print(f"Random Forest Mean Accuracy: {random_forest_mean_accuracy:.4f} ± {random_forest_std_accuracy:.4f}")
print(f"XGBoost Mean Accuracy: {xgboost_mean_accuracy:.4f} ± {xgboost_std_accuracy:.4f}")

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Logistic Regression Mean Accuracy: 0.8050 ± 0.0066
Decision Tree Mean Accuracy: 0.8763 ± 0.0118
Random Forest Mean Accuracy: 0.9158 ± 0.0125
XGBoost Mean Accuracy: 0.9029 ± 0.0118
