In [1]:
# Import Required Modules
import pandas as pd
import numpy as np
from statsmodels.formula.api import logit

In [3]:
# Load and Clean Dataset
# Load dataset
cars = pd.read_csv("data/car_insurance.csv")

# Display basic info
cars.info()

# Fill missing values with the mean
cars["credit_score"].fillna(cars["credit_score"].mean(), inplace=True)
cars["annual_mileage"].fillna(cars["annual_mileage"].mean(), inplace=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 18 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   id                   10000 non-null  int64  
 1   age                  10000 non-null  int64  
 2   gender               10000 non-null  int64  
 3   driving_experience   10000 non-null  object 
 4   education            10000 non-null  object 
 5   income               10000 non-null  object 
 6   credit_score         9018 non-null   float64
 7   vehicle_ownership    10000 non-null  float64
 8   vehicle_year         10000 non-null  object 
 9   married              10000 non-null  float64
 10  children             10000 non-null  float64
 11  postal_code          10000 non-null  int64  
 12  annual_mileage       9043 non-null   float64
 13  vehicle_type         10000 non-null  object 
 14  speeding_violations  10000 non-null  int64  
 15  duis                 10000 non-null  

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  cars["credit_score"].fillna(cars["credit_score"].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  cars["annual_mileage"].fillna(cars["annual_mileage"].mean(), inplace=True)


In [4]:
# Fit Logit Models for Each Feature
# Drop 'id' and 'outcome' to get feature columns
features = cars.drop(columns=["id", "outcome"]).columns

# Initialize list to store models
models = []

# Fit a logistic regression model for each feature
for col in features:
    model = logit(f"outcome ~ {col}", data=cars).fit(disp=0)
    models.append(model)


In [5]:
# Calculate Accuracy for Each Model
# List to store accuracies
accuracies = []

# Calculate confusion matrix and accuracy
for model in models:
    conf_matrix = model.pred_table()
    tn, fp = conf_matrix[0]
    fn, tp = conf_matrix[1]
    accuracy = (tn + tp) / (tn + fp + fn + tp)
    accuracies.append(accuracy)


In [6]:
# Identify Best Feature Based on Accuracy
best_feature = features[accuracies.index(max(accuracies))]
best_accuracy = max(accuracies)

# Store results in a DataFrame
best_feature_df = pd.DataFrame({
    "best_feature": [best_feature],
    "best_accuracy": [best_accuracy]
})

best_feature_df


Unnamed: 0,best_feature,best_accuracy
0,driving_experience,0.7771
