### Imports


In [26]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, accuracy_score
from sklearn.impute import KNNImputer


### Data Collection


In [27]:
# Load the dataset
file_path = "data/insurance.csv"  # Update the file path if needed
df = pd.read_csv(file_path)

### Data Preprocessing

1. Explore the data


In [28]:

# Step 1: Exploring the dataset
print("Initial Data Info:")
df.info()
print("\nMissing Values:")
print(df.isnull().sum())
print("\nSummary Statistics:")
print(df.describe())
print("\nUnique Values in Categorical Columns:")
categorical_columns = df.select_dtypes(include=['object']).columns
for col in categorical_columns:
    print(f"{col}: {df[col].nunique()} unique values")



Initial Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 382154 entries, 0 to 382153
Data columns (total 12 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   id                    382154 non-null  int64  
 1   Gender                382154 non-null  object 
 2   Age                   382144 non-null  float64
 3   Driving_License       382154 non-null  int64  
 4   Region_Code           382154 non-null  float64
 5   Previously_Insured    382154 non-null  int64  
 6   Vehicle_Age           382154 non-null  object 
 7   Vehicle_Damage        382151 non-null  object 
 8   Annual_Premium        382154 non-null  float64
 9   Policy_Sales_Channel  382154 non-null  float64
 10  Vintage               382154 non-null  int64  
 11  Response              382154 non-null  int64  
dtypes: float64(4), int64(5), object(3)
memory usage: 35.0+ MB

Missing Values:
id                       0
Gender                   0
Age     

2. Handle Missing Data and Naive Feature Removal


In [30]:
# Step 2: Handle Missing Values
# Fill missing Age with the median
#df['Age'].fillna(df['Age'].median(), inplace=True)

# Select numerical features for KNN imputation
num_features = ['Age', 'Annual_Premium', 'Vintage']

knn_imputer = KNNImputer(n_neighbors=3)
df[num_features] = knn_imputer.fit_transform(df[num_features])

# Fill missing Vehicle_Damage with the mode (most common value)
#df['Vehicle_Damage'].fillna(df['Vehicle_Damage'].mode()[0], inplace=True)

# Fill missing Vehicle_Damage with a new category 'Unknown'
df['Vehicle_Damage'] = df['Vehicle_Damage'].fillna('Unknown')

print("\nMissing Values:")
print(df.isnull().sum())

# Drop Driving_License column
df.drop(columns=['Driving_License'], inplace=True)

# Verify its gone
print("\nData after dropping Driving_License:")
print(df.info())


Missing Values:
id                      0
Gender                  0
Age                     0
Driving_License         0
Region_Code             0
Previously_Insured      0
Vehicle_Age             0
Vehicle_Damage          0
Annual_Premium          0
Policy_Sales_Channel    0
Vintage                 0
Response                0
dtype: int64

Data after dropping Driving_License:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 382154 entries, 0 to 382153
Data columns (total 11 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   id                    382154 non-null  int64  
 1   Gender                382154 non-null  object 
 2   Age                   382154 non-null  float64
 3   Region_Code           382154 non-null  float64
 4   Previously_Insured    382154 non-null  int64  
 5   Vehicle_Age           382154 non-null  object 
 6   Vehicle_Damage        382154 non-null  object 
 7   Annual_Premium        382154 non-nul

In [None]:
# Step 3: Encoding Categorical Variables
label_encoders = {}
for col in categorical_columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le  # Store encoders for reference



In [None]:
# Step 4: Feature Scaling
scaler = StandardScaler()
numerical_features = ['Age', 'Region_Code', 'Annual_Premium', 'Policy_Sales_Channel', 'Vintage']
df[numerical_features] = scaler.fit_transform(df[numerical_features])



In [None]:
# Step 5: Handling Imbalanced Data using SMOTE
target_column = 'Response'
print("\nTarget Variable Distribution:")
print(df[target_column].value_counts())

X = df.drop(columns=['id', target_column])  # Dropping ID column
y = df[target_column]

smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

df_resampled = pd.DataFrame(X_resampled, columns=X.columns)
df_resampled[target_column] = y_resampled

print("\nBalanced Target Variable Distribution After SMOTE:")
print(df_resampled[target_column].value_counts())




Target Variable Distribution:
Response
0    319553
1     62601
Name: count, dtype: int64





Balanced Target Variable Distribution After SMOTE:
Response
0    319553
1    319553
Name: count, dtype: int64


In [None]:
# Step 6: Model Training and Evaluation with Naive Bayes
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

nb = GaussianNB()
nb.fit(X_train, y_train)

y_pred = nb.predict(X_test)

print("\nModel Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))



Model Accuracy: 0.8172771510381625

Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.65      0.78     63772
           1       0.74      0.99      0.84     64050

    accuracy                           0.82    127822
   macro avg       0.86      0.82      0.81    127822
weighted avg       0.86      0.82      0.81    127822



In [16]:
import pandas as pd
import numpy as np

# Load the dataset
df = pd.read_csv('data/insurance_orig.csv')

# 1. Set all values in 'Driving_License' to 1
df['Driving_License'] = 1

# 2. Introduce missing data into 'Age' and 'Vehicle_Damage'
np.random.seed(42)  # for reproducibility

# Introduce missing values into 10 entries of 'Age'
age_missing_indices = np.random.choice(df.index, size=10, replace=False)
df.loc[age_missing_indices, 'Age'] = np.nan

# Introduce missing values into 3 entries of 'Vehicle_Damage'
vehicle_damage_missing_indices = np.random.choice(df.index, size=3, replace=False)
df.loc[vehicle_damage_missing_indices, 'Vehicle_Damage'] = np.nan

# Save the modified dataset
df.to_csv('data/insurance.csv', index=False)

print("Data modifications completed. Modified dataset saved as 'insurance.csv'.")


Data modifications completed. Modified dataset saved as 'insurance.csv'.
