In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, classification_report
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Toy stand-in for the real SBI Life dataset: ten hand-written customers, one
# list per column. Replace this dict with the actual data source.
data = {
    'Age': [25, 40, 30, 60, 22, 45, 35, 55, 28, 50],
    'Occupation': [
        'Salaried', 'Business', 'Salaried', 'Retired', 'Student',
        'Business', 'Salaried', 'Retired', 'Salaried', 'Business',
    ],
    'Website_Activity': [10, 5, 15, 2, 8, 7, 12, 3, 9, 6],
    'Existing_Policies': [0, 2, 1, 3, 0, 1, 2, 2, 1, 1],
    'Income': [50000, 100000, 60000, 80000, 30000, 120000, 70000, 90000, 55000, 110000],
    # Target variable: 1 = purchased a policy, 0 = did not purchase.
    'Purchased_Policy': [0, 1, 0, 1, 0, 1, 0, 1, 0, 1],
}
df = pd.DataFrame(data)

# The label column, and every remaining column (in frame order) as a model input.
target = 'Purchased_Policy'
features = [column for column in df.columns if column != target]

# Feature matrix and label vector used by the rest of the cell.
X = df.loc[:, features]
y = df[target]

# Column groups: each group is routed to its own preprocessing branch below.
numerical_features = ['Age', 'Website_Activity', 'Existing_Policies', 'Income']
categorical_features = ['Occupation']

# Numeric branch: fill missing values with the column mean, then standardise.
numerical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode. handle_unknown='ignore' lets categories that were not
# seen during fit pass through transform without raising.
categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Apply each branch to its own column group and concatenate the results.
preprocessor = ColumnTransformer(transformers=[
    ('numerical', numerical_pipeline, numerical_features),
    ('categorical', categorical_pipeline, categorical_features),
])

# Hold out 20% of the rows for evaluation. Stratifying on y keeps the class
# ratio the same in both parts; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Preprocessing and the logistic-regression classifier are chained in one
# pipeline so the same transformations are applied at fit and predict time.
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(random_state=42)),
])
model.fit(X_train, y_train)

# Score the held-out rows: hard labels for the classification report, and the
# class-1 column of predict_proba (Purchased_Policy = 1) for AUC.
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:, 1]

print("Model Evaluation on Test Set:")
print(classification_report(y_test, y_pred))
print(f"AUC-ROC Score: {roc_auc_score(y_test, y_prob)}")

# --- Real-time Propensity Score Prediction for New User ---
def predict_propensity_score(user_data, estimator=None):
    """
    Predicts the propensity score for a new user.

    Args:
        user_data (dict): Dictionary containing user features (keys should match
            the feature names the estimator was trained on).
        estimator: Optional fitted classifier exposing ``predict_proba``. When
            omitted, the notebook-level ``model`` is looked up at call time, so
            pass the estimator explicitly whenever ``model`` may have been
            rebound by another cell.

    Returns:
        float: Propensity score (probability of purchasing a policy).

    Raises:
        ValueError: If ``user_data`` is empty, or lacks a feature that the
            estimator lists in ``feature_names_in_``.
    """
    if not user_data:
        raise ValueError("user_data must contain at least one feature")

    # Late-bound fallback keeps the original one-argument call working.
    scorer = model if estimator is None else estimator

    user_df = pd.DataFrame([user_data])  # single-row frame, one column per key

    # When the estimator records the columns it was fitted on, fail early with
    # a message that names exactly which inputs are absent.
    expected = getattr(scorer, "feature_names_in_", None)
    if expected is not None:
        missing = [str(name) for name in expected if name not in user_df.columns]
        if missing:
            raise ValueError(f"user_data is missing features: {missing}")

    # Row 0 is the only user; column 1 is the probability of class 1.
    return float(scorer.predict_proba(user_df)[0, 1])

# Demo: score one hypothetical visitor with predict_propensity_score.
new_user_data = dict(
    Age=76,
    Occupation='Business',
    Website_Activity=18,
    Existing_Policies=4,
    Income=7500,
)

propensity = predict_propensity_score(new_user_data)
print(f"\nPropensity Score for New User: {propensity:.4f}")

Model Evaluation on Test Set:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         1
           1       1.00      1.00      1.00         1

    accuracy                           1.00         2
   macro avg       1.00      1.00      1.00         2
weighted avg       1.00      1.00      1.00         2

AUC-ROC Score: 1.0

Propensity Score for New User: 0.5198


In [8]:
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report

# Load dataset
df = pd.read_csv("modified_propensity_score_dataset.csv")

# Ensure required columns exist, so a schema problem surfaces here rather than
# as a KeyError further down.
required_columns = ['Age', 'Occupation', 'Website_Visits', 'Annual_Income', 'Expenses', 'Credit_Score', 'Propensity_Score']
missing_columns = [col for col in required_columns if col not in df.columns]
if missing_columns:
    raise ValueError(f"Missing columns: {missing_columns}")

# Rows without a score have no ground truth. `NaN >= 0.5` evaluates to False,
# so such rows would otherwise be silently labelled 0 (low propensity) and
# train the model on invented labels. Drop them before deriving the label.
df = df.dropna(subset=['Propensity_Score'])

# Convert continuous scores to binary labels (1 = High Propensity, 0 = Low Propensity).
# assign() returns a new frame instead of inserting a column into the filtered one.
df = df.assign(Propensity_Label=(df['Propensity_Score'] >= 0.5).astype(int))

# Define features and target
features = ['Age', 'Occupation', 'Website_Visits', 'Annual_Income', 'Expenses', 'Credit_Score']
target = 'Propensity_Label'  # Now binary (0 or 1)

# Identify categorical and numerical features
categorical_features = ['Occupation']
numerical_features = ['Age', 'Website_Visits', 'Annual_Income', 'Expenses', 'Credit_Score']

# Numeric branch: fill missing values with the column mean, then standardise.
numerical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode; categories unseen during fit do not raise at transform.
categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its branch and concatenate the outputs.
preprocessor = ColumnTransformer(transformers=[
    ('numerical', numerical_pipeline, numerical_features),
    ('categorical', categorical_pipeline, categorical_features),
])

# Stratified 80/20 split on the binary label, with a fixed seed so the split
# is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df[features],
    df[target],
    test_size=0.2,
    stratify=df[target],
    random_state=42,
)

# Preprocessing + random forest in a single pipeline. This assignment rebinds
# the notebook-level name `model`.
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42)),
])
model.fit(X_train, y_train)

# Held-out predictions: hard labels, plus the class-1 probability column.
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:, 1]

# Summary metrics for the report below.
accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_prob)

print("\n🔹 Model Evaluation Results 🔹")
print(f"✅ Accuracy Score: {accuracy:.4f}")
print(f"✅ ROC-AUC Score: {roc_auc:.4f}")
print("\n🔹 Classification Report 🔹")
print(classification_report(y_test, y_pred))

# Persist the whole fitted pipeline (preprocessing included) to disk.
joblib.dump(model, "propensity_model.pkl")
print("\n✅ Model trained and saved successfully as 'propensity_model.pkl'")



🔹 Model Evaluation Results 🔹
✅ Accuracy Score: 0.9100
✅ ROC-AUC Score: 0.9398

🔹 Classification Report 🔹
              precision    recall  f1-score   support

           0       0.96      0.61      0.75        44
           1       0.90      0.99      0.95       156

    accuracy                           0.91       200
   macro avg       0.93      0.80      0.85       200
weighted avg       0.92      0.91      0.90       200


✅ Model trained and saved successfully as 'propensity_model.pkl'
