##**Importing Libraries**

In [58]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import  OrdinalEncoder , StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.compose import ColumnTransformer
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, confusion_matrix,precision_recall_curve



##**Load dataset and perform Data Wrangling**

In [40]:
sales_df=pd.read_csv("/content/Dataset.csv")
sales_df.head()

Unnamed: 0,id,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
0,1,Male,44,1,28.0,0,> 2 Years,Yes,40454.0,26.0,217,1
1,2,Male,76,1,3.0,0,1-2 Year,No,33536.0,26.0,183,0
2,3,Male,47,1,28.0,0,> 2 Years,Yes,38294.0,26.0,27,1
3,4,Male,21,1,11.0,1,< 1 Year,No,28619.0,152.0,203,0
4,5,Female,29,1,41.0,1,< 1 Year,No,27496.0,152.0,39,0


In [41]:
sales_df.shape

(381109, 12)

In [42]:
sales_df=sales_df.drop(columns=['id'])

In [43]:
sales_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 381109 entries, 0 to 381108
Data columns (total 11 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   Gender                381109 non-null  object 
 1   Age                   381109 non-null  int64  
 2   Driving_License       381109 non-null  int64  
 3   Region_Code           381109 non-null  float64
 4   Previously_Insured    381109 non-null  int64  
 5   Vehicle_Age           381109 non-null  object 
 6   Vehicle_Damage        381109 non-null  object 
 7   Annual_Premium        381109 non-null  float64
 8   Policy_Sales_Channel  381109 non-null  float64
 9   Vintage               381109 non-null  int64  
 10  Response              381109 non-null  int64  
dtypes: float64(3), int64(5), object(3)
memory usage: 32.0+ MB


In [16]:
sales_df.duplicated().sum()

np.int64(269)

In [17]:
sales_df = sales_df.drop_duplicates()


In [44]:
sales_df.duplicated().sum()

np.int64(269)

In [45]:
sales_df.shape

(381109, 11)

## **Feature engineering for Vehicle_Age**

In [46]:

vehicle_age_map = {
    "< 1 Year": 0,
    "1-2 Year": 1,
    "> 2 Years": 2
}

sales_df["Vehicle_Age"] = sales_df["Vehicle_Age"].map(vehicle_age_map)


In [47]:
sales_df.head()

Unnamed: 0,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
0,Male,44,1,28.0,0,2,Yes,40454.0,26.0,217,1
1,Male,76,1,3.0,0,1,No,33536.0,26.0,183,0
2,Male,47,1,28.0,0,2,Yes,38294.0,26.0,27,1
3,Male,21,1,11.0,1,0,No,28619.0,152.0,203,0
4,Female,29,1,41.0,1,0,No,27496.0,152.0,39,0


##**Split into Train_Test Data**

In [48]:

X=sales_df.drop(columns=['Response'])
y=sales_df['Response']

In [49]:
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.30, stratify=y, random_state=42
)

# Second: split the 30% temp into 15% test and 15% validation
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.50, stratify=y_temp, random_state=42
)

## **defin Catagorical columns**

In [50]:
print(y_train.value_counts(normalize=True))


Response
0    0.877437
1    0.122563
Name: proportion, dtype: float64


In [51]:
categorical_cols = ["Gender", "Vehicle_Damage"]
numeric_cols = [col for col in sales_df.columns if col not in categorical_cols + ["Response"]]

## **Perform Encoding+Scalling on data**

In [52]:
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OrdinalEncoder(), categorical_cols),
        ("num", StandardScaler(), numeric_cols)
    ]
)

##**🔹 MODEL 1: Logistic Regression**

In [27]:
clf = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("smote", SMOTE(random_state=42)),
    ("classifier", LogisticRegression(max_iter=1000, random_state=42))
])


In [29]:
clf.fit(X_train, y_train)


## **Evaluate(Predict Using Optimal Threshold +Confusion Matrix**

In [30]:


# Get predicted probabilities of positive class
y_prob = clf.predict_proba(X_test)[:, 1]

# Use precision-recall curve to find threshold
precision, recall, thresholds = precision_recall_curve(y_test, y_prob)

# Compute F1-score for each threshold
f1_scores = 2 * (precision * recall) / (precision + recall + 1e-6)  # small epsilon to avoid division by zero

# Find threshold with max F1-score
best_index = np.argmax(f1_scores)
best_thresh = thresholds[best_index]
best_f1 = f1_scores[best_index]

print(f"Optimal threshold: {best_thresh:.3f}")
print(f"F1-score at optimal threshold: {best_f1:.4f}")

# Make predictions using optimal threshold
y_pred_best = (y_prob >= best_thresh).astype(int)

# Evaluate
print("Accuracy:", accuracy_score(y_test, y_pred_best))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_best))
print("Classification Report:\n", classification_report(y_test, y_pred_best))


Optimal threshold: 0.655
F1-score at optimal threshold: 0.4265
Accuracy: 0.749115989216819
Confusion Matrix:
 [[37464 12660]
 [ 1672  5330]]
Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.75      0.84     50124
           1       0.30      0.76      0.43      7002

    accuracy                           0.75     57126
   macro avg       0.63      0.75      0.63     57126
weighted avg       0.88      0.75      0.79     57126



## **🔹 MODEL 2: Random Forest**

In [38]:

clf = Pipeline(steps=[
    ("preprocessor", preprocessor),
    # Keep SMOTE to handle imbalance
    ("smote", SMOTE(random_state=42, sampling_strategy=0.4)),
    ("classifier", RandomForestClassifier(
        n_estimators=300,
        max_depth=None,
        min_samples_split=5,
        class_weight='balanced',
        random_state=42,
        n_jobs=-1
    ))
])


clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

print("Random Forest Accuracy:", accuracy_score(y_test, y_pred))


Random Forest Accuracy: 0.8440990092077163


## **Evaluate(Predict Using Optimal Threshold +Confusion Matrix)**

In [54]:
y_prob = clf.predict_proba(X_test)[:, 1]

# Step 1: Compute precision, recall for different thresholds
precision, recall, thresholds = precision_recall_curve(y_test, y_prob)

# Step 2: Compute F1 for each threshold
f1_scores = 2 * (precision * recall) / (precision + recall + 1e-6)

# Step 3: Find the threshold with maximum F1
best_index = np.argmax(f1_scores)
best_thresh = thresholds[best_index]
best_f1 = f1_scores[best_index]

print(f"🔹 Optimal Threshold: {best_thresh:.3f}")
print(f"🔹 Best F1-score: {best_f1:.4f}")

# Step 4: Make predictions using the best threshold
y_pred_best = (y_prob >= best_thresh).astype(int)

# Step 5: Evaluate model performance
print("\n=== Classification Report ===")
print(classification_report(y_test, y_pred_best))

print("ROC-AUC:", roc_auc_score(y_test, y_prob))


🔹 Optimal Threshold: 0.533
🔹 Best F1-score: 0.7446

=== Classification Report ===
              precision    recall  f1-score   support

           0       0.96      0.97      0.97     50160
           1       0.80      0.70      0.74      7007

    accuracy                           0.94     57167
   macro avg       0.88      0.84      0.86     57167
weighted avg       0.94      0.94      0.94     57167

ROC-AUC: 0.9550994929540727


##**Test Model on Custom Data**

In [55]:
# Example new data point
new_data = pd.DataFrame({
    "Gender": ["Male"],
    "Age": [32],
    "Driving_License": [1],
    "Region_Code": [28],
    "Previously_Insured": [0],
    "Vehicle_Age": [1],
    "Vehicle_Damage": ["Yes"],
    "Annual_Premium": [42000],
    "Policy_Sales_Channel": [152],
    "Vintage": [250]
})

# Predict probability
prob = clf.predict_proba(new_data)[:, 1][0]
print(f"Predicted Probability: {prob:.2f}")

# Apply your best threshold
result = "Will Buy Insurance" if prob >= 0.533 else "Will Not Buy Insurance"
print("Prediction:", result)


Predicted Probability: 0.42
Prediction: Will Not Buy Insurance
