In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report, roc_auc_score, roc_curve
import xgboost as xgb
import warnings
warnings.filterwarnings('ignore')

# Seed NumPy's global RNG so NumPy-driven randomness is repeatable
np.random.seed(42)

# Read the training and test tables from the working directory
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ('train_storming_round.csv', 'test_storming_round.csv')
)

print("Train data shape:", train_data.shape)
print("Test data shape:", test_data.shape)

# Show the first rows of the training table
training_head = train_data.head()
print("\nTraining data preview:")
print(training_head)

# Count missing values per column in the training table
missing_per_column = train_data.isnull().sum()
print("\nMissing values in training data:")
print(missing_per_column)

# Create target column first, before any feature engineering
# to prevent target leakage
def create_target(df):
    """Build the binary label from the ``new_policy_count`` column.

    Returns a NumPy array holding 0 where ``new_policy_count`` equals 0 and
    1 everywhere else, or ``None`` when ``df`` has no ``new_policy_count``
    column.
    """
    if 'new_policy_count' not in df.columns:
        return None

    sold_nothing = df['new_policy_count'] == 0
    return np.where(sold_nothing, 0, 1)

# Build the binary label from the raw training frame only. This runs before
# preprocess_data(), which drops new_policy_count from the feature set.
target = create_target(train_data)

# Data preprocessing - modified to avoid using target-related information
def _months_between(later, earlier):
    """Return the calendar-month difference ``later - earlier`` for two datetime Series."""
    return ((later.dt.year - earlier.dt.year) * 12 +
            (later.dt.month - earlier.dt.month))


def preprocess_data(df, is_training=True):
    """Derive model features from a raw agent/month table.

    The input frame is copied, never modified. The three date columns are
    parsed, turned into numeric features, and then dropped together with
    ``new_policy_count`` (the column the target is built from), so that the
    returned frame cannot leak the label.

    Args:
        df: Raw DataFrame. Must contain ``agent_join_month``,
            ``first_policy_sold_month``, ``year_month``, ``unique_proposal``,
            ``unique_quotations``, ``unique_customers`` and the
            ``unique_{proposals,quotations,customers}_last_{7,15,21}_days``
            columns.
        is_training: Currently unused; kept so existing callers that pass it
            keep working.

    Returns:
        A new DataFrame with the engineered features added and the date
        columns and ``new_policy_count`` removed.

    Raises:
        KeyError: If one of the required columns listed above is missing.
    """
    # Work on a copy so the caller's frame is left untouched
    processed_df = df.copy()

    # Parse the date columns that are present
    date_columns = ['agent_join_month', 'first_policy_sold_month', 'year_month']
    for col in date_columns:
        if col in processed_df.columns:
            processed_df[col] = pd.to_datetime(processed_df[col])

    snapshot = processed_df['year_month']
    joined = processed_df['agent_join_month']
    first_sale = processed_df['first_policy_sold_month']

    # Tenure of the agent at the snapshot month
    processed_df['agent_experience_months'] = _months_between(snapshot, joined)

    # Months since the first sale; -1 marks rows with no first-sale date
    processed_df['months_since_first_policy'] = np.where(
        first_sale.notna(), _months_between(snapshot, first_sale), -1)

    # Calendar components of the snapshot and join dates
    processed_df['current_month'] = snapshot.dt.month
    processed_df['current_year'] = snapshot.dt.year
    processed_df['join_month'] = joined.dt.month
    processed_df['join_year'] = joined.dt.year

    # Funnel ratios; the +1 in the denominator avoids division by zero.
    # None of these use new_policy_count.
    processed_df['proposal_to_quotation_ratio'] = (
        processed_df['unique_proposal'] / (processed_df['unique_quotations'] + 1))
    processed_df['quotation_to_customer_ratio'] = (
        processed_df['unique_quotations'] / (processed_df['unique_customers'] + 1))

    # Optional feature: only built when BOTH source columns exist. Checking
    # just one of them would raise KeyError on a frame that lacks the other.
    cash_columns = {'number_of_cash_payment_policies', 'number_of_policy_holders'}
    if cash_columns.issubset(processed_df.columns):
        processed_df['cash_payment_ratio'] = (
            processed_df['number_of_cash_payment_policies'] /
            (processed_df['number_of_policy_holders'] + 1))

    # Activity-change features: shorter-window count minus longer-window count
    activity_sources = (
        ('proposal', 'unique_proposals'),
        ('quotation', 'unique_quotations'),
        ('customer', 'unique_customers'),
    )
    for feature, source in activity_sources:
        for recent, older in ((7, 15), (15, 21)):
            processed_df[f'{feature}_decline_{recent}_{older}'] = (
                processed_df[f'{source}_last_{recent}_days'] -
                processed_df[f'{source}_last_{older}_days'])

    # Drop the raw date columns, plus new_policy_count to prevent target
    # leakage; errors='ignore' makes the latter a no-op when it is absent.
    drop_cols = date_columns + ['new_policy_count']
    return processed_df.drop(columns=drop_cols, errors='ignore')

# Run the shared feature engineering on both tables
train_processed = preprocess_data(train_data)
test_processed = preprocess_data(test_data, is_training=False)

# Identifier columns carry no signal; everything else is a feature
X = train_processed.drop(columns=['row_id', 'agent_code'], errors='ignore')
y = target  # label array built from the raw training frame

# List the feature names so leakage can be checked by eye
feature_names = X.columns.tolist()
print("\nFeatures used in the model:")
print(feature_names)

# Stratified 80/20 hold-out split
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Standardise using statistics from the training split only.
# NOTE: the scaled arrays are not what the XGBoost model below is fitted on;
# it is trained on the unscaled X_train.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_valid_scaled = scaler.transform(X_valid)

print("\nTrain features shape:", X_train.shape)
print("Validation features shape:", X_valid.shape)

# Share of each class in the training split
class_shares = pd.Series(y_train).value_counts(normalize=True)
print("\nClass distribution in training set:")
print(class_shares)

# Cross-validation setup: 5 stratified, shuffled folds with a fixed seed
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Initialize XGBoost classifier.
# use_label_encoder is intentionally not passed: it is deprecated, and
# XGBoost releases that accept early_stopping_rounds in the constructor
# ignore it.
xgb_model = xgb.XGBClassifier(
    objective='binary:logistic',
    n_estimators=100,
    learning_rate=0.1,
    max_depth=4,
    min_child_weight=5,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    eval_metric='logloss',
    early_stopping_rounds=20,
)

# Train the model on the unscaled training split.
# With several eval sets XGBoost uses the LAST one for early stopping, so the
# validation split must stay last. Evaluation progress is logged every 100
# rounds.
xgb_model.fit(
    X_train, y_train,
    eval_set=[(X_train, y_train), (X_valid, y_valid)],
    verbose=100,
)

# Cross-validation
# Every fold trains a fresh classifier with the SAME hyperparameters as
# xgb_model, so the scores describe the configuration used for the final
# predictions. Early stopping is switched off because the folds do not supply
# an eval_set. Features are not rescaled per fold: the model is trained on
# unscaled data, and refitting the shared `scaler` here would overwrite the
# statistics it learned from the training split.
cv_params = {**xgb_model.get_params(), 'early_stopping_rounds': None}

cv_scores = []
for train_idx, valid_idx in cv.split(X, y):
    X_cv_train, X_cv_valid = X.iloc[train_idx], X.iloc[valid_idx]
    y_cv_train, y_cv_valid = y[train_idx], y[valid_idx]

    # Train model
    model = xgb.XGBClassifier(**cv_params)
    model.fit(X_cv_train, y_cv_train)

    # Predict hard labels and positive-class probabilities
    y_cv_pred = model.predict(X_cv_valid)
    y_cv_proba = model.predict_proba(X_cv_valid)[:, 1]

    # Calculate metrics for this fold
    cv_scores.append({
        'accuracy': accuracy_score(y_cv_valid, y_cv_pred),
        'precision': precision_score(y_cv_valid, y_cv_pred),
        'recall': recall_score(y_cv_valid, y_cv_pred),
        'f1': f1_score(y_cv_valid, y_cv_pred),
        'roc_auc': roc_auc_score(y_cv_valid, y_cv_proba),
    })

# Print cross-validation results as mean ± standard deviation over the folds
print("\nCross-validation results:")
for metric in ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']:
    values = [score[metric] for score in cv_scores]
    print(f"{metric}: {np.mean(values):.4f} ± {np.std(values):.4f}")

# Make predictions on validation set (positive-class probability, 0.5 cut-off)
y_pred_proba = xgb_model.predict_proba(X_valid)[:, 1]
y_pred = (y_pred_proba >= 0.5).astype(int)

# ROC AUC is needed for both the report and the plot legend; compute it once
valid_auc = roc_auc_score(y_valid, y_pred_proba)

# Evaluate the model
print("\nModel Evaluation:")
print(f"Accuracy: {accuracy_score(y_valid, y_pred):.4f}")
print(f"Precision: {precision_score(y_valid, y_pred):.4f}")
print(f"Recall: {recall_score(y_valid, y_pred):.4f}")
print(f"F1 Score: {f1_score(y_valid, y_pred):.4f}")
print(f"ROC AUC: {valid_auc:.4f}")

# Classification report
print("\nClassification Report:")
print(classification_report(y_valid, y_pred))

# Plot confusion matrix
plt.figure(figsize=(8, 6))
cm = confusion_matrix(y_valid, y_pred)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False)
plt.title('Confusion Matrix')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.savefig('confusion_matrix.png')
plt.close()

# Plot ROC curve
plt.figure(figsize=(8, 6))
fpr, tpr, _ = roc_curve(y_valid, y_pred_proba)
plt.plot(fpr, tpr, lw=2, label=f'ROC curve (AUC = {valid_auc:.4f})')
plt.plot([0, 1], [0, 1], 'k--', lw=2)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')
plt.legend(loc="lower right")
plt.savefig('roc_curve.png')
plt.close()

# Plot feature importance.
# plot_importance() opens its own figure unless it is handed an Axes. Passing
# `ax` makes the 12x8 size apply to the saved image and avoids leaving an
# empty, never-closed figure behind.
fig, ax = plt.subplots(figsize=(12, 8))
xgb.plot_importance(xgb_model, max_num_features=20, height=0.5, ax=ax)
ax.set_title('XGBoost Feature Importance')
fig.savefig('feature_importance.png')
plt.close(fig)

# Prepare test data for prediction: drop the identifier columns, as was done
# for the training features. No scaling is applied because the model was
# trained on unscaled features.
X_test = test_processed.drop(['row_id', 'agent_code'], axis=1, errors='ignore')

# Make predictions on test data (positive-class probability, 0.5 cut-off)
test_pred_proba = xgb_model.predict_proba(X_test)[:, 1]
test_pred = (test_pred_proba >= 0.5).astype(int)

# Create submission file: one prediction per test row_id
submission = pd.DataFrame({
    'row_id': test_data['row_id'],
    'target_column': test_pred
})

# Save submission file
submission.to_csv('xgboost_submission.csv', index=False)
print("\nSubmission file created: xgboost_submission.csv")

# Additional threshold optimization (for better performance)
# Scan nine evenly spaced cut-offs from 0.1 to 0.9 and keep the first one with
# the highest validation F1; 0.5 is kept if no cut-off scores above 0.
# NOTE: the validation split was also used for early stopping, so this F1 is
# an optimistic estimate.
thresholds = np.linspace(0.1, 0.9, 9)
best_threshold = 0.5
best_f1 = 0

for threshold in thresholds:
    y_pred_thresh = (y_pred_proba >= threshold).astype(int)
    f1 = f1_score(y_valid, y_pred_thresh)
    if f1 > best_f1:
        best_f1 = f1
        best_threshold = threshold

print(f"\nBest threshold: {best_threshold:.2f} with F1 score: {best_f1:.4f}")

# Apply best threshold to test predictions
test_pred_optimized = (test_pred_proba >= best_threshold).astype(int)
submission_optimized = pd.DataFrame({
    'row_id': test_data['row_id'],
    'target_column': test_pred_optimized
})

# Save optimized submission file and report the path that was actually
# written (the message used to name a different file).
optimized_path = 'submission2.csv'
submission_optimized.to_csv(optimized_path, index=False)
print(f"\nOptimized submission file created: {optimized_path}")

Train data shape: (15308, 23)
Test data shape: (914, 23)

Training data preview:
   row_id agent_code  agent_age agent_join_month first_policy_sold_month  \
0       1   455ca878         45         2/1/2021                9/1/2023   
1       2   c823ce77         48         4/1/2022                2/1/2024   
2       3   62154055         53         5/1/2020                9/1/2023   
3       4   c58bfa6e         44         7/1/2019                3/1/2022   
4       5   b1e5f770         20         9/1/2020                2/1/2023   

  year_month  unique_proposals_last_7_days  unique_proposals_last_15_days  \
0   1/1/2023                             3                              6   
1   1/1/2023                             1                              4   
2   1/1/2023                             3                              5   
3   1/1/2023                             1                              0   
4   1/1/2023                             0                              6   


<Figure size 1200x800 with 0 Axes>