In [2]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, accuracy_score

# Load the dataset
file_path = "insurance.csv"  # Update the file path if needed
df = pd.read_csv(file_path)

# Step 1: Exploring the dataset
print("Initial Data Info:")
df.info()
print("\nMissing Values:")
print(df.isnull().sum())
print("\nSummary Statistics:")
print(df.describe())
print("\nUnique Values in Categorical Columns:")
categorical_columns = df.select_dtypes(include=['object']).columns
for col in categorical_columns:
    print(f"{col}: {df[col].nunique()} unique values")

# Step 2: Encoding Categorical Variables
label_encoders = {}
for col in categorical_columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le  # Store encoders for reference

# Step 3: Feature Scaling
scaler = StandardScaler()
numerical_features = ['Age', 'Region_Code', 'Annual_Premium', 'Policy_Sales_Channel', 'Vintage']
df[numerical_features] = scaler.fit_transform(df[numerical_features])

# Step 4: Handling Imbalanced Data using SMOTE
target_column = 'Response'
print("\nTarget Variable Distribution:")
print(df[target_column].value_counts())

X = df.drop(columns=['id', target_column])  # Dropping ID column
y = df[target_column]

smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

df_resampled = pd.DataFrame(X_resampled, columns=X.columns)
df_resampled[target_column] = y_resampled

print("\nBalanced Target Variable Distribution After SMOTE:")
print(df_resampled[target_column].value_counts())

# Step 5: Model Training and Evaluation with Naive Bayes
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

nb = GaussianNB()
nb.fit(X_train, y_train)

y_pred = nb.predict(X_test)

print("\nModel Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Initial Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 382154 entries, 0 to 382153
Data columns (total 12 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   id                    382154 non-null  int64  
 1   Gender                382154 non-null  object 
 2   Age                   382154 non-null  int64  
 3   Driving_License       382154 non-null  int64  
 4   Region_Code           382154 non-null  float64
 5   Previously_Insured    382154 non-null  int64  
 6   Vehicle_Age           382154 non-null  object 
 7   Vehicle_Damage        382154 non-null  object 
 8   Annual_Premium        382154 non-null  float64
 9   Policy_Sales_Channel  382154 non-null  float64
 10  Vintage               382154 non-null  int64  
 11  Response              382154 non-null  int64  
dtypes: float64(3), int64(6), object(3)
memory usage: 35.0+ MB

Missing Values:
id                      0
Gender                  0
Age       




Balanced Target Variable Distribution After SMOTE:
Response
0    319553
1    319553
Name: count, dtype: int64

Model Accuracy: 0.8172771510381625

Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.65      0.78     63772
           1       0.74      0.99      0.84     64050

    accuracy                           0.82    127822
   macro avg       0.86      0.82      0.81    127822
weighted avg       0.86      0.82      0.81    127822

