# Insurance Cross-Selling Business Analysis

This notebook analyzes customer behavior and characteristics to identify opportunities for cross-selling vehicle insurance to health insurance customers.

## 1. Setup and Data Loading

In [None]:
# Install required packages
!pip install -q kaggle pandas numpy matplotlib seaborn scikit-learn xgboost lightgbm

In [None]:
import os
import tempfile
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import roc_auc_score, roc_curve, confusion_matrix, classification_report
from sklearn.metrics import precision_recall_curve, average_precision_score
import xgboost as xgb
import lightgbm as lgb

# Plot styling shared by every chart in this notebook
sns.set_style('whitegrid')
plt.rcParams.update({
    'figure.figsize': (12, 6),
    'font.size': 10,
})

# Each chart cell saves its PNG into this folder; create it up front
os.makedirs('charts', exist_ok=True)

print("Libraries loaded successfully")

In [None]:
# Download the competition archive with the Kaggle API and load its CSVs into
# DataFrames. The zip has to touch disk briefly (the API writes a file), so it
# is placed in a throwaway temporary directory that is removed automatically,
# even when the download, extraction or CSV parsing raises.
import zipfile
from io import BytesIO
import kaggle

# Download the competition data
competition_name = 'playground-series-s4e7'

print("Downloading data from Kaggle API...")
with tempfile.TemporaryDirectory() as download_dir:
    kaggle.api.competition_download_files(competition_name, path=download_dir)
    archive_path = os.path.join(download_dir, f'{competition_name}.zip')

    # Read each CSV straight out of the archive; nothing is extracted to disk
    with zipfile.ZipFile(archive_path, 'r') as zip_ref:
        with zip_ref.open('train.csv') as f:
            train_df = pd.read_csv(f)

        with zip_ref.open('test.csv') as f:
            test_df = pd.read_csv(f)

        with zip_ref.open('sample_submission.csv') as f:
            sample_submission = pd.read_csv(f)
# Leaving the TemporaryDirectory block deletes the downloaded zip

print(f"✓ Data loaded into memory")
print(f"  Training samples: {len(train_df):,}")
print(f"  Test samples: {len(test_df):,}")

## 2. Initial Data Exploration

In [None]:
# Shape and column dtypes of the loaded data, framed by a banner
banner = "=" * 80
print(banner)
print("DATASET OVERVIEW")
print(banner)

train_shape, test_shape = train_df.shape, test_df.shape
print(f"\nTraining Set Shape: {train_shape}")
print(f"Test Set Shape: {test_shape}")

print("\nColumn Names and Types:")
print(train_df.dtypes)
print("\n" + banner)

In [None]:
# Preview the top of the training data (rendered as the cell's rich output)
preview_rows = 10
print(f"First {preview_rows} rows of training data:")
train_df.head(n=preview_rows)

In [None]:
# Descriptive statistics of the training columns (rendered as the cell output)
print("Statistical Summary:")
summary_stats = train_df.describe()
summary_stats

In [None]:
# Count nulls per training column and list only the columns that have any
print("Missing Values Analysis:")
missing = train_df.isnull().sum()
has_missing = missing.sum() != 0
if has_missing:
    print(missing[missing > 0])
else:
    print("✓ No missing values found in the dataset")

In [None]:
# Class balance of the target column: absolute counts and percentage shares
print("Target Variable Distribution (Response):")
response_dist = train_df['Response'].value_counts()
response_pct = train_df['Response'].value_counts(normalize=True) * 100

# Looked up by class label (0 / 1), not by position
not_interested, interested = response_dist[0], response_dist[1]

print(f"\nNot Interested (0): {not_interested:,} ({response_pct[0]:.2f}%)")
print(f"Interested (1): {interested:,} ({response_pct[1]:.2f}%)")
print(f"\nClass Imbalance Ratio: {not_interested / interested:.2f}:1")

## 3. Business-Focused Analysis: Customer Demographics & Behavior

In [None]:
# Chart 1: Overall Response Rate
fig, ax = plt.subplots(figsize=(10, 6))

# value_counts() orders by frequency, not by class label. Reindexing to
# [0, 1] pins the order so the hard-coded bar labels below always sit on the
# right class, even if class 1 were the majority. A class that is absent from
# the data is drawn as 0%.
response_pct = (
    train_df['Response'].value_counts(normalize=True) * 100
).reindex([0, 1], fill_value=0.0)
bars = ax.bar(['Not Interested', 'Interested'], response_pct.values,
              color=['#d62728', '#2ca02c'], alpha=0.7, edgecolor='black')

ax.set_ylabel('Percentage of Customers (%)', fontsize=12, fontweight='bold')
ax.set_title('Customer Interest in Vehicle Insurance Cross-Sell',
             fontsize=14, fontweight='bold', pad=20)
ax.set_ylim(0, 100)

# Add value labels on top of each bar
for bar in bars:
    height = bar.get_height()
    ax.text(bar.get_x() + bar.get_width()/2., height,
            f'{height:.1f}%',
            ha='center', va='bottom', fontsize=11, fontweight='bold')

plt.tight_layout()
plt.savefig('charts/01_overall_response_rate.png', dpi=300, bbox_inches='tight')
plt.show()

print("✓ Chart saved: charts/01_overall_response_rate.png")

In [None]:
# Chart 2: Response Rate by Age Group
# Bin the ages into a standalone Series instead of copying the whole training
# frame just to add one column. pd.cut intervals are right-inclusive, so the
# bins are (0, 25], (25, 35], (35, 45], (45, 55] and (55, inf): age 55 belongs
# to '46-55', which is why the last label is '56+'. The open-ended top edge
# keeps that "+" bucket from silently dropping anyone.
age_group = pd.cut(train_df['Age'],
                   bins=[0, 25, 35, 45, 55, np.inf],
                   labels=['18-25', '26-35', '36-45', '46-55', '56+']).rename('Age_Group')

# observed=False keeps all five groups on the axis even if one is empty
age_response = train_df['Response'].groupby(age_group, observed=False).agg(['mean', 'count'])
age_response['mean'] = age_response['mean'] * 100

fig, ax1 = plt.subplots(figsize=(12, 6))

# Bar chart for response rate
ax1.bar(age_response.index, age_response['mean'],
        color='steelblue', alpha=0.7, edgecolor='black', label='Interest Rate')
ax1.set_xlabel('Age Group', fontsize=12, fontweight='bold')
ax1.set_ylabel('Interest Rate (%)', fontsize=12, fontweight='bold', color='steelblue')
ax1.tick_params(axis='y', labelcolor='steelblue')
# Series.max() skips NaN (an empty group), unlike the built-in max()
ax1.set_ylim(0, age_response['mean'].max() * 1.2)

# Line chart for customer count on a secondary y-axis
ax2 = ax1.twinx()
ax2.plot(age_response.index, age_response['count'],
         color='red', marker='o', linewidth=2, markersize=8, label='Customer Count')
ax2.set_ylabel('Number of Customers', fontsize=12, fontweight='bold', color='red')
ax2.tick_params(axis='y', labelcolor='red')

plt.title('Customer Interest by Age Segment', fontsize=14, fontweight='bold', pad=20)

# Add value labels on bars (categorical bars sit at x = 0, 1, 2, ...)
for i, rate in enumerate(age_response['mean']):
    ax1.text(i, rate, f"{rate:.1f}%",
             ha='center', va='bottom', fontsize=10, fontweight='bold')

fig.tight_layout()
plt.savefig('charts/02_age_group_analysis.png', dpi=300, bbox_inches='tight')
plt.show()

print("✓ Chart saved: charts/02_age_group_analysis.png")

In [None]:
# Chart 3: Response Rate by Gender
gender_response = train_df.groupby('Gender')['Response'].agg(['mean', 'count'])
gender_response['mean'] *= 100  # fraction -> percent

gender_palette = ['#ff9999', '#66b3ff']
label_style = dict(ha='center', va='bottom', fontsize=11, fontweight='bold')

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))

# Left panel: share of each gender that responded positively
bars = ax1.bar(gender_response.index, gender_response['mean'],
               color=gender_palette, alpha=0.7, edgecolor='black')
ax1.set_xlabel('Gender', fontsize=12, fontweight='bold')
ax1.set_ylabel('Interest Rate (%)', fontsize=12, fontweight='bold')
ax1.set_title('Interest Rate by Gender', fontsize=13, fontweight='bold')
for rect in bars:
    rate = rect.get_height()
    ax1.text(rect.get_x() + rect.get_width() / 2., rate, f'{rate:.1f}%', **label_style)

# Right panel: number of customers of each gender
bars2 = ax2.bar(gender_response.index, gender_response['count'],
                color=gender_palette, alpha=0.7, edgecolor='black')
ax2.set_xlabel('Gender', fontsize=12, fontweight='bold')
ax2.set_ylabel('Number of Customers', fontsize=12, fontweight='bold')
ax2.set_title('Customer Distribution by Gender', fontsize=13, fontweight='bold')
for rect in bars2:
    n_customers = rect.get_height()
    ax2.text(rect.get_x() + rect.get_width() / 2., n_customers,
             f'{int(n_customers):,}', **label_style)

plt.tight_layout()
plt.savefig('charts/03_gender_analysis.png', dpi=300, bbox_inches='tight')
plt.show()

print("✓ Chart saved: charts/03_gender_analysis.png")

In [None]:
# Chart 4: Impact of Previous Vehicle Damage on Interest
damage_response = (
    train_df.groupby('Vehicle_Damage')['Response']
    .agg(['mean', 'count'])
)
damage_response['mean'] *= 100  # fraction -> percent

fig, ax1 = plt.subplots(figsize=(10, 6))

bars = ax1.bar(damage_response.index, damage_response['mean'],
               color=['#90ee90', '#ff6b6b'], alpha=0.7, edgecolor='black')
ax1.set_xlabel('Previous Vehicle Damage', fontsize=12, fontweight='bold')
ax1.set_ylabel('Interest Rate (%)', fontsize=12, fontweight='bold')
ax1.set_title('Impact of Vehicle Damage History on Cross-Sell Interest',
              fontsize=14, fontweight='bold', pad=20)

# Annotate each bar with its rate and the number of customers behind it
for rect, count in zip(bars, damage_response['count']):
    rate = rect.get_height()
    ax1.text(rect.get_x() + rect.get_width() / 2., rate,
             f'{rate:.1f}%\n({int(count):,} customers)',
             ha='center', va='bottom', fontsize=11, fontweight='bold')

plt.tight_layout()
plt.savefig('charts/04_vehicle_damage_impact.png', dpi=300, bbox_inches='tight')
plt.show()

print("✓ Chart saved: charts/04_vehicle_damage_impact.png")

In [None]:
# Chart 5: Previously Insured Status Impact
insured_response = (
    train_df.groupby('Previously_Insured')['Response']
    .agg(['mean', 'count'])
)
insured_response['mean'] *= 100  # fraction -> percent
# groupby sorts its keys, so the two labels are assigned in ascending key order
insured_response.index = ['Not Previously Insured', 'Previously Insured']

fig, ax = plt.subplots(figsize=(10, 6))

bars = ax.bar(insured_response.index, insured_response['mean'],
              color=['#ffa500', '#4169e1'], alpha=0.7, edgecolor='black')
ax.set_ylabel('Interest Rate (%)', fontsize=12, fontweight='bold')
ax.set_title('Cross-Sell Interest: New vs. Previously Insured Customers',
             fontsize=14, fontweight='bold', pad=20)

# Annotate each bar with its rate and the segment size
for rect, count in zip(bars, insured_response['count']):
    rate = rect.get_height()
    ax.text(rect.get_x() + rect.get_width() / 2., rate,
            f'{rate:.1f}%\n({int(count):,} customers)',
            ha='center', va='bottom', fontsize=11, fontweight='bold')

plt.xticks(rotation=15, ha='right')
plt.tight_layout()
plt.savefig('charts/05_previous_insurance_impact.png', dpi=300, bbox_inches='tight')
plt.show()

print("✓ Chart saved: charts/05_previous_insurance_impact.png")

In [None]:
# Chart 6: Vehicle Age Analysis
# Aggregate per vehicle-age category, then put the rows into chronological
# order (groupby would otherwise sort the category strings alphabetically).
age_order = ['< 1 Year', '1-2 Year', '> 2 Years']
vehicle_age_response = (
    train_df.groupby('Vehicle_Age')['Response']
    .agg(['mean', 'count'])
    .reindex(age_order)
)
vehicle_age_response['mean'] *= 100  # fraction -> percent

positions = range(len(vehicle_age_response))
fig, ax1 = plt.subplots(figsize=(12, 6))

# Bars: interest rate per vehicle-age category
bars = ax1.bar(positions, vehicle_age_response['mean'],
               color=['#98d8c8', '#6ab7a8', '#3d9688'], alpha=0.7, edgecolor='black')
ax1.set_xlabel('Vehicle Age', fontsize=12, fontweight='bold')
ax1.set_ylabel('Interest Rate (%)', fontsize=12, fontweight='bold', color='steelblue')
ax1.tick_params(axis='y', labelcolor='steelblue')
ax1.set_xticks(positions)
ax1.set_xticklabels(vehicle_age_response.index)

# Line on a secondary axis: number of customers per category
ax2 = ax1.twinx()
ax2.plot(positions, vehicle_age_response['count'],
         color='red', marker='o', linewidth=2.5, markersize=10, label='Customer Count')
ax2.set_ylabel('Number of Customers', fontsize=12, fontweight='bold', color='red')
ax2.tick_params(axis='y', labelcolor='red')

plt.title('Customer Interest by Vehicle Age', fontsize=14, fontweight='bold', pad=20)

for pos, rate in enumerate(vehicle_age_response['mean']):
    ax1.text(pos, rate, f"{rate:.1f}%",
             ha='center', va='bottom', fontsize=11, fontweight='bold')

fig.tight_layout()
plt.savefig('charts/06_vehicle_age_analysis.png', dpi=300, bbox_inches='tight')
plt.show()

print("✓ Chart saved: charts/06_vehicle_age_analysis.png")

In [None]:
# Chart 7: Premium Amount vs Response
# Work on a copy so the quintile column never leaks into train_df
segment_names = ['Very Low', 'Low', 'Medium', 'High', 'Very High']
train_copy = train_df.copy()
train_copy['Premium_Segment'] = pd.qcut(train_copy['Annual_Premium'],
                                        q=5, labels=segment_names)

premium_response = train_copy.groupby('Premium_Segment')['Response'].agg(['mean', 'count'])
premium_response['mean'] *= 100  # fraction -> percent

positions = range(len(premium_response))
fig, ax1 = plt.subplots(figsize=(12, 6))

bars = ax1.bar(positions, premium_response['mean'],
               color=['#d4f1d4', '#a8e6a8', '#7cdb7c', '#50d050', '#24c524'],
               alpha=0.7, edgecolor='black')
ax1.set_xlabel('Annual Premium Segment', fontsize=12, fontweight='bold')
ax1.set_ylabel('Interest Rate (%)', fontsize=12, fontweight='bold')
ax1.set_xticks(positions)
ax1.set_xticklabels(premium_response.index)
ax1.set_title('Cross-Sell Interest by Customer Premium Segment',
              fontsize=14, fontweight='bold', pad=20)

# Print the rate above each bar
for pos, rate in enumerate(premium_response['mean']):
    ax1.text(pos, rate, f"{rate:.1f}%",
             ha='center', va='bottom', fontsize=11, fontweight='bold')

plt.tight_layout()
plt.savefig('charts/07_premium_segment_analysis.png', dpi=300, bbox_inches='tight')
plt.show()

print("✓ Chart saved: charts/07_premium_segment_analysis.png")

In [None]:
# Chart 8: Sales Channel Performance
# Keep the ten channels with the highest interest rate.
# NOTE(review): the ranking uses the rate alone, so a channel with very few
# customers can top the list - consider a minimum-volume filter.
channel_response = train_df.groupby('Policy_Sales_Channel')['Response'].agg(['mean', 'count'])
channel_response['mean'] *= 100  # fraction -> percent
channel_response = channel_response.sort_values('mean', ascending=False).head(10)

positions = range(len(channel_response))
fig, ax = plt.subplots(figsize=(12, 7))

bars = ax.barh(positions, channel_response['mean'],
               color='coral', alpha=0.7, edgecolor='black')
ax.set_yticks(positions)
ax.set_yticklabels([f'Channel {idx}' for idx in channel_response.index])
ax.set_xlabel('Interest Rate (%)', fontsize=12, fontweight='bold')
ax.set_ylabel('Sales Channel', fontsize=12, fontweight='bold')
ax.set_title('Top 10 Sales Channels by Cross-Sell Interest Rate',
             fontsize=14, fontweight='bold', pad=20)

# Label the end of each bar with its rate and customer count
for pos, (rate, count) in enumerate(zip(channel_response['mean'], channel_response['count'])):
    ax.text(rate, pos, f"  {rate:.1f}% ({int(count):,} customers)",
            va='center', fontsize=10, fontweight='bold')

plt.tight_layout()
plt.savefig('charts/08_top_sales_channels.png', dpi=300, bbox_inches='tight')
plt.show()

print("✓ Chart saved: charts/08_top_sales_channels.png")

In [None]:
# Chart 9: Customer Relationship Duration (Vintage) Analysis
# Bin the tenure into a standalone Series instead of copying the whole
# training frame just to add one column. pd.cut intervals are right-inclusive:
#   * include_lowest=True makes the first bin [0, 50], so a Vintage of 0 is
#     counted under '0-50 days' instead of being dropped as NaN;
#   * a Vintage of exactly 200 falls in '151-200 days', so the last bucket
#     starts at 201, and its open-ended upper edge means "+" really has no cap.
vintage_segment = pd.cut(train_df['Vintage'],
                         bins=[0, 50, 100, 150, 200, np.inf],
                         include_lowest=True,
                         labels=['0-50 days', '51-100 days', '101-150 days',
                                 '151-200 days', '201+ days']).rename('Vintage_Segment')

# observed=False keeps all five segments on the axis even if one is empty
vintage_response = train_df['Response'].groupby(vintage_segment, observed=False).agg(['mean', 'count'])
vintage_response['mean'] = vintage_response['mean'] * 100

fig, ax = plt.subplots(figsize=(12, 6))

bars = ax.bar(range(len(vintage_response)), vintage_response['mean'],
              color='mediumpurple', alpha=0.7, edgecolor='black')
ax.set_xlabel('Customer Relationship Duration', fontsize=12, fontweight='bold')
ax.set_ylabel('Interest Rate (%)', fontsize=12, fontweight='bold')
ax.set_xticks(range(len(vintage_response)))
ax.set_xticklabels(vintage_response.index, rotation=30, ha='right')
ax.set_title('Cross-Sell Interest by Customer Tenure',
             fontsize=14, fontweight='bold', pad=20)

# Label each bar with its rate and the number of customers in the segment
for i, (rate, count) in enumerate(zip(vintage_response['mean'], vintage_response['count'])):
    ax.text(i, rate, f"{rate:.1f}%\n({int(count):,})",
            ha='center', va='bottom', fontsize=10, fontweight='bold')

plt.tight_layout()
plt.savefig('charts/09_customer_tenure_analysis.png', dpi=300, bbox_inches='tight')
plt.show()

print("✓ Chart saved: charts/09_customer_tenure_analysis.png")

In [None]:
# Chart 10: Driving License Impact
license_response = (
    train_df.groupby('Driving_License')['Response']
    .agg(['mean', 'count'])
)
license_response['mean'] *= 100  # fraction -> percent
# groupby sorts its keys, so the two labels are assigned in ascending key order
license_response.index = ['No License', 'Has License']

fig, ax = plt.subplots(figsize=(10, 6))

bars = ax.bar(license_response.index, license_response['mean'],
              color=['#ff7f0e', '#2ca02c'], alpha=0.7, edgecolor='black')
ax.set_ylabel('Interest Rate (%)', fontsize=12, fontweight='bold')
ax.set_title('Impact of Driving License on Cross-Sell Interest',
             fontsize=14, fontweight='bold', pad=20)

# Annotate each bar with its rate and the segment size
for rect, count in zip(bars, license_response['count']):
    rate = rect.get_height()
    ax.text(rect.get_x() + rect.get_width() / 2., rate,
            f'{rate:.1f}%\n({int(count):,} customers)',
            ha='center', va='bottom', fontsize=11, fontweight='bold')

plt.tight_layout()
plt.savefig('charts/10_driving_license_impact.png', dpi=300, bbox_inches='tight')
plt.show()

print("✓ Chart saved: charts/10_driving_license_impact.png")

In [None]:
# Chart 11: Combined High-Impact Factors
# Interest rate for every (vehicle damage, previously insured) combination
segment_analysis = (
    train_df.groupby(['Vehicle_Damage', 'Previously_Insured'])['Response']
    .agg(['mean', 'count'])
)
segment_analysis['mean'] *= 100  # fraction -> percent

fig, ax = plt.subplots(figsize=(12, 6))

x = np.arange(len(segment_analysis))

# Two-line tick label per segment: damage flag on top, insurance status below
labels = []
for damage, ins in segment_analysis.index:
    insured_text = 'Prev. Insured' if ins == 1 else 'Not Insured'
    labels.append(f"{damage}\n{insured_text}")

colors = ['#ff9999', '#66b3ff', '#99ff99', '#ffcc99']
bars = ax.bar(x, segment_analysis['mean'], color=colors, alpha=0.7, edgecolor='black')

ax.set_ylabel('Interest Rate (%)', fontsize=12, fontweight='bold')
ax.set_xlabel('Customer Segment', fontsize=12, fontweight='bold')
ax.set_title('Cross-Sell Interest by Customer Risk Profile',
             fontsize=14, fontweight='bold', pad=20)
ax.set_xticks(x)
ax.set_xticklabels(labels, fontsize=10)

# Annotate each bar with its rate and the segment size
for rect, count in zip(bars, segment_analysis['count']):
    rate = rect.get_height()
    ax.text(rect.get_x() + rect.get_width() / 2., rate,
            f'{rate:.1f}%\n({int(count):,})',
            ha='center', va='bottom', fontsize=10, fontweight='bold')

plt.tight_layout()
plt.savefig('charts/11_customer_risk_segments.png', dpi=300, bbox_inches='tight')
plt.show()

print("✓ Chart saved: charts/11_customer_risk_segments.png")

## 4. Model Development & Performance Analysis

In [None]:
# Prepare data for modeling
print("Preparing data for predictive modeling...")

# Keep the test ids for the submission, then build id-free working frames.
# drop() returns a new DataFrame, so train_df / test_df are left untouched.
test_ids = test_df['id'].copy()
train_processed = train_df.drop(columns='id')
test_processed = test_df.drop(columns='id')

# Integer-encode the string columns; each encoder is fitted on the training
# data only and then reused for the test data.
categorical_cols = ['Gender', 'Vehicle_Age', 'Vehicle_Damage']
label_encoders = {}
for col in categorical_cols:
    encoder = LabelEncoder().fit(train_processed[col])
    train_processed[col] = encoder.transform(train_processed[col])
    test_processed[col] = encoder.transform(test_processed[col])
    label_encoders[col] = encoder

# Features / target
y = train_processed['Response']
X = train_processed.drop(columns='Response')
X_test_final = test_processed.copy()

# Stratified 80/20 hold-out split for validation
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

for split_name, frame in (("Training", X_train), ("Validation", X_val), ("Test", X_test_final)):
    print(f"✓ {split_name} set: {frame.shape[0]:,} samples")

In [None]:
# Train multiple models
# Every candidate goes through the same fit -> predict -> score steps, so the
# estimators are declared once and a single loop does the work. Results land
# in the four dictionaries below, keyed by display name.
print("\nTraining predictive models...\n")

models = {}
predictions_val = {}
predictions_test = {}
model_scores = {}

# Logistic regression is sensitive to feature scale, so it is wrapped in a
# pipeline that standardizes the inputs first. The scaler is fitted on X_train
# only (inside .fit) and reused for validation/test predictions.
lr_model = make_pipeline(
    StandardScaler(),
    LogisticRegression(random_state=42, max_iter=1000),
)
rf_model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
# The deprecated `use_label_encoder` argument is no longer passed; recent
# xgboost releases do not use it.
xgb_model = xgb.XGBClassifier(n_estimators=100, random_state=42,
                              eval_metric='logloss')
lgb_model = lgb.LGBMClassifier(n_estimators=100, random_state=42)

candidates = {
    'Logistic Regression': lr_model,
    'Random Forest': rf_model,
    'XGBoost': xgb_model,
    'LightGBM': lgb_model,
}

for step, (name, estimator) in enumerate(candidates.items(), start=1):
    lead = "" if step == 1 else "\n"  # blank line between models in the log
    print(f"{lead}[{step}/{len(candidates)}] Training {name}...")
    estimator.fit(X_train, y_train)
    models[name] = estimator
    # Probability of the positive class (column 1 of predict_proba)
    predictions_val[name] = estimator.predict_proba(X_val)[:, 1]
    predictions_test[name] = estimator.predict_proba(X_test_final)[:, 1]
    model_scores[name] = roc_auc_score(y_val, predictions_val[name])
    print(f"  ROC-AUC: {model_scores[name]:.4f}")

print("\n" + "="*80)
print("MODEL TRAINING COMPLETE")
print("="*80)

In [None]:
# Chart 12: Model Performance Comparison
# One bar per trained model showing its validation ROC-AUC
model_names = list(model_scores)
scores = [model_scores[name] for name in model_names]
colors_gradient = ['#e74c3c', '#e67e22', '#f39c12', '#27ae60']

fig, ax = plt.subplots(figsize=(12, 6))
bars = ax.bar(model_names, scores, color=colors_gradient, alpha=0.7, edgecolor='black')

ax.set_ylabel('ROC-AUC Score', fontsize=12, fontweight='bold')
ax.set_xlabel('Prediction Model', fontsize=12, fontweight='bold')
ax.set_title('Model Performance Comparison: Accuracy of Customer Interest Prediction',
             fontsize=14, fontweight='bold', pad=20)
# The y-axis starts at 0.5, the score of a random classifier
ax.set_ylim(0.5, 1.0)
ax.axhline(y=0.5, color='red', linestyle='--', alpha=0.3, label='Random Guess (0.5)')

for rect in bars:
    score = rect.get_height()
    ax.text(rect.get_x() + rect.get_width() / 2., score,
            f'{score:.4f}',
            ha='center', va='bottom', fontsize=12, fontweight='bold')

plt.xticks(rotation=15, ha='right')
plt.legend()
plt.tight_layout()
plt.savefig('charts/12_model_performance_comparison.png', dpi=300, bbox_inches='tight')
plt.show()

print("✓ Chart saved: charts/12_model_performance_comparison.png")

In [None]:
# Chart 13: ROC Curves for All Models
colors_roc = ['#e74c3c', '#3498db', '#2ecc71', '#f39c12']

fig, ax = plt.subplots(figsize=(10, 8))

# One curve per model, computed from its validation-set probabilities
for i, model_name in enumerate(predictions_val):
    fpr, tpr, _ = roc_curve(y_val, predictions_val[model_name])
    curve_label = f'{model_name} (AUC = {model_scores[model_name]:.4f})'
    ax.plot(fpr, tpr, label=curve_label, linewidth=2.5, color=colors_roc[i])

# Diagonal reference line: performance of random guessing
ax.plot([0, 1], [0, 1], 'k--', linewidth=2, label='Random Guess', alpha=0.3)

ax.set_xlabel('False Positive Rate (Incorrect Predictions)', fontsize=12, fontweight='bold')
ax.set_ylabel('True Positive Rate (Correct Predictions)', fontsize=12, fontweight='bold')
ax.set_title('Model Prediction Accuracy: ROC Curves',
             fontsize=14, fontweight='bold', pad=20)
ax.legend(loc='lower right', fontsize=11)
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('charts/13_roc_curves_all_models.png', dpi=300, bbox_inches='tight')
plt.show()

print("✓ Chart saved: charts/13_roc_curves_all_models.png")

In [None]:
# Chart 14: Feature Importance of the best model (highest validation ROC-AUC)
best_model_name = max(model_scores, key=model_scores.get)
best_model = models[best_model_name]

print(f"Best performing model: {best_model_name}")

# Only estimators exposing feature_importances_ can be charted; for any other
# winner this cell produces no figure.
if hasattr(best_model, 'feature_importances_'):
    feature_importance = (
        pd.DataFrame({'feature': X.columns,
                      'importance': best_model.feature_importances_})
        .sort_values('importance', ascending=False)
    )
    positions = range(len(feature_importance))

    fig, ax = plt.subplots(figsize=(12, 8))

    bars = ax.barh(positions, feature_importance['importance'],
                   color='teal', alpha=0.7, edgecolor='black')
    ax.set_yticks(positions)
    ax.set_yticklabels(feature_importance['feature'])
    ax.set_xlabel('Importance Score', fontsize=12, fontweight='bold')
    ax.set_ylabel('Customer Characteristic', fontsize=12, fontweight='bold')
    ax.set_title(f'Key Factors Driving Cross-Sell Interest ({best_model_name})',
                 fontsize=14, fontweight='bold', pad=20)

    # Print the score at the end of each bar
    for pos, score in enumerate(feature_importance['importance']):
        ax.text(score, pos, f"  {score:.4f}",
                va='center', fontsize=10, fontweight='bold')

    plt.tight_layout()
    plt.savefig('charts/14_feature_importance.png', dpi=300, bbox_inches='tight')
    plt.show()

    print("✓ Chart saved: charts/14_feature_importance.png")

In [None]:
# Chart 15: Prediction Confidence Distribution
# Buckets the best model's validation probabilities into five equal-width
# bands and plots how many customers fall into each.
best_predictions = predictions_val[best_model_name]

fig, ax = plt.subplots(figsize=(12, 6))

# Create bins for prediction probabilities
bins = [0, 0.2, 0.4, 0.6, 0.8, 1.0]
labels = ['Very Low\n(0-20%)', 'Low\n(20-40%)', 'Medium\n(40-60%)',
          'High\n(60-80%)', 'Very High\n(80-100%)']

pred_df = pd.DataFrame({
    'prediction': best_predictions,
    'actual': y_val
})
# pd.cut bins are right-inclusive, so without include_lowest a probability of
# exactly 0.0 would match no bin, become NaN and vanish from the counts
# (making the percentages below sum to less than 100%).
pred_df['confidence_bin'] = pd.cut(pred_df['prediction'], bins=bins, labels=labels,
                                   include_lowest=True)

# observed=False keeps empty bands so the five bar colours stay aligned
confidence_dist = pred_df.groupby('confidence_bin', observed=False).size()

bars = ax.bar(range(len(confidence_dist)), confidence_dist.values,
              color=['#ff6b6b', '#ffa500', '#ffcc00', '#90ee90', '#2ecc71'],
              alpha=0.7, edgecolor='black')
ax.set_xticks(range(len(confidence_dist)))
ax.set_xticklabels(confidence_dist.index)
ax.set_ylabel('Number of Customers', fontsize=12, fontweight='bold')
ax.set_xlabel('Predicted Interest Level', fontsize=12, fontweight='bold')
ax.set_title('Distribution of Predicted Customer Interest Levels',
             fontsize=14, fontweight='bold', pad=20)

# Label each bar with its count and its share of the validation set
for i, bar in enumerate(bars):
    height = bar.get_height()
    percentage = (height / len(pred_df)) * 100
    ax.text(bar.get_x() + bar.get_width()/2., height,
            f'{int(height):,}\n({percentage:.1f}%)',
            ha='center', va='bottom', fontsize=10, fontweight='bold')

plt.tight_layout()
plt.savefig('charts/15_prediction_confidence_distribution.png', dpi=300, bbox_inches='tight')
plt.show()

print("✓ Chart saved: charts/15_prediction_confidence_distribution.png")

In [None]:
# Chart 16: Business Impact Analysis - Targeting Efficiency
# For each probability cut-off, measure how many validation customers would be
# contacted, what share of them is actually interested, and what share of all
# interested customers that captures.

thresholds = [0.3, 0.4, 0.5, 0.6, 0.7, 0.8]
results = []

for threshold in thresholds:
    selected = best_predictions >= threshold  # boolean mask over the validation set
    targeted = selected.sum()
    selected_actuals = y_val[selected]
    if targeted > 0:
        conversion_rate = selected_actuals.mean() * 100
    else:
        conversion_rate = 0
    coverage = (selected_actuals.sum() / y_val.sum()) * 100
    results.append({
        'threshold': f'{int(threshold*100)}%',
        'customers_targeted': targeted,
        'conversion_rate': conversion_rate,
        'interested_captured': coverage
    })

results_df = pd.DataFrame(results)
positions = range(len(results_df))

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))

# Left panel: targeted volume (bars) against conversion rate (line, right axis)
ax1_twin = ax1.twinx()
bars = ax1.bar(positions, results_df['customers_targeted'],
               color='skyblue', alpha=0.7, edgecolor='black', label='Customers Targeted')
line = ax1_twin.plot(positions, results_df['conversion_rate'],
                     color='red', marker='o', linewidth=2.5, markersize=10,
                     label='Conversion Rate')

ax1.set_title('Targeting Strategy: Volume vs. Precision', fontsize=13, fontweight='bold')
ax1.set_xlabel('Minimum Confidence Threshold', fontsize=12, fontweight='bold')
ax1.set_ylabel('Number of Customers Targeted', fontsize=12, fontweight='bold', color='skyblue')
ax1_twin.set_ylabel('Conversion Rate (%)', fontsize=12, fontweight='bold', color='red')
ax1.set_xticks(positions)
ax1.set_xticklabels(results_df['threshold'])
ax1.tick_params(axis='y', labelcolor='skyblue')
ax1_twin.tick_params(axis='y', labelcolor='red')

# Right panel: share of all interested customers reached at each cut-off
bars2 = ax2.bar(positions, results_df['interested_captured'],
                color='green', alpha=0.7, edgecolor='black')
ax2.set_title('Market Coverage by Threshold', fontsize=13, fontweight='bold')
ax2.set_xlabel('Minimum Confidence Threshold', fontsize=12, fontweight='bold')
ax2.set_ylabel('% of Interested Customers Reached', fontsize=12, fontweight='bold')
ax2.set_xticks(positions)
ax2.set_xticklabels(results_df['threshold'])
ax2.set_ylim(0, 100)

for rect in bars2:
    reached = rect.get_height()
    ax2.text(rect.get_x() + rect.get_width() / 2., reached,
             f'{reached:.1f}%', ha='center', va='bottom', fontsize=10, fontweight='bold')

plt.tight_layout()
plt.savefig('charts/16_targeting_strategy_analysis.png', dpi=300, bbox_inches='tight')
plt.show()

print("✓ Chart saved: charts/16_targeting_strategy_analysis.png")

# Plain-text version of the same numbers
print("\nTargeting Strategy Summary:")
print(results_df.to_string(index=False))

## 5. Generate Submission File

In [None]:
# Assemble the submission frame: one predicted probability per test id, taken
# from the model with the best validation ROC-AUC
best_test_predictions = predictions_test[best_model_name]
submission = pd.DataFrame({'id': test_ids, 'Response': best_test_predictions})

# The frame is kept in memory only; nothing is written to disk here
print(f"Submission prepared using {best_model_name}")
print(f"Submission shape: {submission.shape}")
print("\nFirst few predictions:")
print(submission.head(10))

# To write the file, uncomment:
# submission.to_csv('submission.csv', index=False)
# print("\n✓ Submission saved to submission.csv")

## 6. Summary

This analysis has generated comprehensive business insights:

1. **Customer Demographics** - Age and gender impact on cross-sell interest
2. **Vehicle History** - Strong correlation between damage history and interest
3. **Insurance History** - Previous insurance status is a key factor
4. **Vehicle Characteristics** - Age of vehicle influences purchase intent
5. **Financial Segments** - Premium levels and their relationship to cross-sell
6. **Channel Performance** - Which sales channels drive best results
7. **Customer Tenure** - Relationship duration impact
8. **Predictive Models** - Four classifiers compared by validation ROC-AUC
9. **Targeting Strategy** - Optimal thresholds for campaign efficiency

All charts have been saved to the `charts/` directory for business presentation.