In [1]:
# 1. Imports and Configurations
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from collections import Counter
from itertools import combinations

# Preprocessing & Feature Engineering
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif, RFE
from sklearn.inspection import permutation_importance

# Imbalance Handling
from imblearn.over_sampling import SMOTE, ADASYN, BorderlineSMOTE
from imblearn.combine import SMOTEENN, SMOTETomek
from imblearn.pipeline import Pipeline as ImbPipeline

# Models & Tuning
import lightgbm as lgb
import optuna
import shap

# Metrics
from sklearn.metrics import (
    f1_score, roc_auc_score, roc_curve, precision_recall_curve, confusion_matrix,
    precision_score, recall_score, accuracy_score
)

# Configurations and Style
# NOTE(review): this silences every warning category for the rest of the kernel
# session, so deprecation or convergence warnings raised by the libraries
# imported above will not be shown.
warnings.filterwarnings('ignore')
# Display every column of a DataFrame, but cap the number of rendered rows at 100.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
# Global plot appearance: matplotlib style sheet plus the seaborn "mako" palette.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("mako")


  from .autonotebook import tqdm as notebook_tqdm


In [None]:
print("Loading data...")
# Directory holding the processed CSV files. The default is the original
# author's local folder; set the INTELLECTRA_DATA_DIR environment variable to
# run the notebook on another machine without editing this cell.
DATA_DIR = os.environ.get(
    'INTELLECTRA_DATA_DIR',
    r'C:\Users\rapha\Documents\VSCode\Kaggle\Intellectra\Datasets\Processed (2)',
)
train_df = pd.read_csv(os.path.join(DATA_DIR, 'Datasetstrain_improved.csv'))
test_df = pd.read_csv(os.path.join(DATA_DIR, 'Datasetstest_improved.csv'))

N_SPLITS = 5        # folds for target encoding and the final cross-validation
RANDOM_STATE = 42   # seed shared by every splitter, sampler and model below
OPTUNA_TRIALS = 50  # adjust for runtime

Loading data...


In [None]:
# Feature Engineering
print("\n2. Creating advanced features...")
# --- Feature Engineering ---
def create_advanced_features(df, high_value_threshold=None):
    """Return a copy of ``df`` with engineered customer features appended.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``total_transactions``, ``recency_days``, ``total_spent``,
        ``customer_lifetime_days``, ``avg_transaction_value``,
        ``transaction_value_std``, ``membership_tenure_days``,
        ``unique_products`` and ``unique_sources``. The family and RFM
        features are only added when their source columns are present.
    high_value_threshold : float, optional
        ``total_spent`` cut-off above which ``is_high_value`` is 1. When
        omitted, the 80th percentile of ``df['total_spent']`` is used. Pass
        the value computed on the training set when transforming any other
        data set, so that the flag means the same thing everywhere.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input is not modified.
    """
    df = df.copy()

    # Behavioral Patterns (the "+ 1" keeps a zero denominator from dividing by zero)
    df['transaction_momentum'] = df['total_transactions'] / (df['recency_days'] + 1)
    df['spending_acceleration'] = df['total_spent'] / (df['customer_lifetime_days'] + 1)
    df['value_consistency'] = df['avg_transaction_value'] / (df['transaction_value_std'].fillna(0) + 1)

    # Engagement Metrics
    df['days_since_last_purchase'] = df['recency_days']
    df['purchase_intensity'] = df['total_transactions'] / (df['membership_tenure_days'] + 1)
    df['loyalty_ratio'] = df['customer_lifetime_days'] / (df['membership_tenure_days'] + 1)

    # Customer Lifecycle Features
    df['is_new_customer'] = (df['membership_tenure_days'] < 30).astype(int)
    df['is_dormant'] = (df['recency_days'] > 90).astype(int)
    if high_value_threshold is None:
        # Backward-compatible default: derive the cut-off from this frame.
        high_value_threshold = df['total_spent'].quantile(0.8)
    df['is_high_value'] = (df['total_spent'] > high_value_threshold).astype(int)

    # Temporal Features
    lifetime_months = (df['customer_lifetime_days'] / 30) + 1
    df['avg_monthly_transactions'] = df['total_transactions'] / lifetime_months
    df['avg_monthly_spend'] = df['total_spent'] / lifetime_months

    # Diversity Features
    df['product_exploration'] = df['unique_products'] / (df['total_transactions'] + 1)
    df['channel_diversity'] = df['unique_sources'] / (df['total_transactions'] + 1)

    # Family-based Features
    has_child_count = 'NoOfChild' in df.columns
    child_age_cols = ['has_children', 'eldest_child_age', 'youngest_child_age']
    if has_child_count and all(col in df.columns for col in child_age_cols):
        mean_child_age = (df['eldest_child_age'].fillna(0) + df['youngest_child_age'].fillna(0)) / 2
        df['child_factor'] = df['has_children'] * mean_child_age
    if has_child_count:
        df['family_spending_per_child'] = df['total_spent'] / (df['NoOfChild'] + 1)

    # Interaction Features
    if all(col in df.columns for col in ['recency_score', 'frequency_score', 'monetary_score']):
        df['rfm_interaction'] = df['recency_score'] * df['frequency_score'] * df['monetary_score']

    # The source columns below are already required by the features above, so
    # no presence check is needed here.
    df['transaction_frequency'] = df['total_transactions'] / (df['customer_lifetime_days'] + 1) * 30
    df['value_frequency_ratio'] = df['avg_transaction_value'] * df['transaction_frequency']

    return df


# Derive the high-value cut-off from the training data only and reuse it for the
# test set; otherwise the same spend could be flagged differently in each frame.
high_value_threshold = train_df['total_spent'].quantile(0.8)
train_df = create_advanced_features(train_df, high_value_threshold=high_value_threshold)
test_df = create_advanced_features(test_df, high_value_threshold=high_value_threshold)

In [None]:
# Preprocessing
print("\n3. Preprocessing data for modeling...")
def create_target_encoding_features(df_train, df_test, categorical_cols, target_col):
    """Add out-of-fold target-mean encodings for the given categorical columns.

    For every column in ``categorical_cols`` that exists in ``df_train`` a new
    column ``<col>_target_enc`` is added to both frames:

    * training rows receive the target mean of their category computed on the
      other ``N_SPLITS - 1`` folds, so a row's own target is not used in its
      encoding;
    * test rows receive the category mean computed on the full training frame.

    Categories without a mapping fall back to the mean of the category means.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Training frame containing ``target_col``.
    df_test : pandas.DataFrame
        Frame to encode with statistics learned from ``df_train``.
    categorical_cols : list of str
        Candidate columns; names missing from ``df_train`` are skipped.
    target_col : str
        Name of the target column in ``df_train``.

    Returns
    -------
    tuple of pandas.DataFrame
        Encoded copies of ``(df_train, df_test)``; the inputs are not modified.
    """
    df_train_out = df_train.copy()
    df_test_out = df_test.copy()

    kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE)

    for col in categorical_cols:
        if col not in df_train_out.columns:
            continue

        # Collect the encoding in a float array addressed by position. KFold
        # yields positional indices, so writing through label-based ``.loc``
        # would target the wrong rows whenever the index is not 0..n-1, and a
        # float buffer avoids assigning floats into an int-initialised column.
        oof_encoding = np.zeros(len(df_train_out), dtype=float)
        for train_idx, val_idx in kf.split(df_train_out):
            train_fold = df_train_out.iloc[train_idx]
            val_fold = df_train_out.iloc[val_idx]
            target_mean = train_fold.groupby(col)[target_col].mean()
            encoded = val_fold[col].map(target_mean).fillna(target_mean.mean())
            oof_encoding[val_idx] = encoded.to_numpy(dtype=float)
        df_train_out[f'{col}_target_enc'] = oof_encoding

        full_train_mean = df_train_out.groupby(col)[target_col].mean()
        df_test_out[f'{col}_target_enc'] = df_test_out[col].map(full_train_mean).fillna(full_train_mean.mean())

    return df_train_out, df_test_out

# --- Data Preprocessing ---
def preprocess_for_modeling(df_train, df_test):
    """Build aligned feature matrices for training and prediction.

    Steps: split off the ``next_buy`` target, drop identifier/date/free-text
    columns, add target-encoded versions of the remaining categorical columns,
    one-hot encode those categorical columns, and make the test matrix carry
    exactly the training columns in the same order.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Training frame containing the ``next_buy`` target column.
    df_test : pandas.DataFrame
        Test frame with the same feature columns.

    Returns
    -------
    tuple
        ``(X, y, X_test)`` where ``X_test.columns`` equals ``X.columns``.
    """
    X = df_train.drop('next_buy', axis=1)
    y = df_train['next_buy']
    X_test = df_test.copy()

    cols_to_drop = [
        'MemberID', 'first_transaction', 'last_transaction', 'JoinDate',
        'EldestKidDOB', 'YoungestKidDOB', 'City', 'preferred_source',
        'preferred_source_grouped'
    ]
    X = X.drop(columns=cols_to_drop, errors='ignore')
    X_test = X_test.drop(columns=cols_to_drop, errors='ignore')

    categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()

    # The encoder needs the target next to the features; remove it again afterwards.
    X_temp = X.copy()
    X_temp['next_buy'] = y
    X_temp, X_test = create_target_encoding_features(X_temp, X_test, categorical_cols, 'next_buy')
    X = X_temp.drop('next_buy', axis=1)

    remaining_categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()

    # Fix the category levels from the training data before one-hot encoding.
    # Encoding each frame on its own with drop_first=True drops that frame's
    # first observed level, so a level missing from the test set would shift
    # the reference category and silently mis-encode rows. With shared levels
    # both frames drop the same reference level and emit the same columns.
    # Test values never seen in training become missing and are flagged by
    # the dummy_na indicator column.
    level_dtypes = {
        col: pd.CategoricalDtype(categories=pd.Categorical(X[col]).categories)
        for col in remaining_categorical_cols
    }
    X = X.astype(level_dtypes)
    X_test = X_test.astype(level_dtypes)

    X = pd.get_dummies(X, columns=remaining_categorical_cols, dummy_na=True, drop_first=True)
    X_test = pd.get_dummies(X_test, columns=remaining_categorical_cols, dummy_na=True, drop_first=True)

    # A left join on columns keeps X unchanged and reindexes X_test to X's
    # columns (same order), adding any column X_test lacks as all zeros.
    X, X_test = X.align(X_test, join='left', axis=1, fill_value=0)

    print(f"Data preprocessed. Number of features: {X.shape[1]}")
    return X, y, X_test

X, y, X_test = preprocess_for_modeling(train_df, test_df)
# The engineered ratio features may contain +/-inf; convert those to NaN so
# they are treated as missing values downstream.
X = X.replace([np.inf, -np.inf], np.nan)
X_test = X_test.replace([np.inf, -np.inf], np.nan)

In [None]:
 # Exploratory Analysis
print("\n4. Running exploratory analysis...")
# --- Analysis & Exploration Functions ---
def compare_sampling_strategies(X, y):
    """Resample (X, y) with several imbalanced-learn samplers and report the outcome.

    Missing values are median-imputed first. Each sampler is tried in turn; a
    sampler that raises is reported on stdout and left out of the result.

    Returns a dict mapping sampler name to
    ``{'X_shape': <resampled shape>, 'class_distribution': <Counter of labels>}``.
    """
    print("\nOriginal class distribution:", Counter(y))

    samplers = {
        'SMOTE': SMOTE(random_state=RANDOM_STATE),
        'ADASYN': ADASYN(random_state=RANDOM_STATE),
        'BorderlineSMOTE': BorderlineSMOTE(random_state=RANDOM_STATE),
        'SMOTEENN': SMOTEENN(random_state=RANDOM_STATE),
        'SMOTETomek': SMOTETomek(random_state=RANDOM_STATE)
    }
    X_filled = SimpleImputer(strategy='median').fit_transform(X)

    summary = {}
    for label, resampler in samplers.items():
        try:
            X_balanced, y_balanced = resampler.fit_resample(X_filled, y)
            distribution = Counter(y_balanced)
            summary[label] = {'X_shape': X_balanced.shape, 'class_distribution': distribution}
            print(f"{label}: {distribution}")
        except Exception as err:
            # Best effort: one failing sampler must not stop the comparison.
            print(f"Error with {label}: {err}")
    return summary

def custom_cost_sensitive_learning(y):
    """Compute class weights for labels 0 and 1 and print them.

    Class 0 gets ``n / (2 * count_0)``; class 1 gets ``n / (2 * count_1)``
    multiplied by 2. Returns the weights as ``{0: ..., 1: ...}``.
    """
    counts = Counter(y)
    n_total = len(y)
    class_weights = {
        0: n_total / (2 * counts[0]),
        1: n_total / (2 * counts[1]) * 2,
    }
    print(f"Custom class weights: {class_weights}")
    return class_weights

# Pie chart of the raw target balance, followed by the resampling comparison
# and the custom class weights.
fig, ax = plt.subplots(figsize=(6, 6))
y.value_counts().plot(kind='pie', autopct='%1.1f%%', colors=sns.color_palette('viridis', 2), ax=ax)
ax.set_title('Original Class Distribution')
ax.set_ylabel('')
plt.show()

sampling_results = compare_sampling_strategies(X, y)
custom_weights = custom_cost_sensitive_learning(y)

In [None]:
# Hyperparameter Tuning
print(f"\n5. Optimizing LightGBM hyperparameters with Optuna ({OPTUNA_TRIALS} trials)...")
# --- Modeling & Tuning ---
def optimize_lightgbm(X, y, n_trials=10, n_jobs=1):
    """Tune LightGBM hyperparameters with Optuna, maximising mean ROC AUC.

    Every trial is scored with 3-fold stratified cross-validation of an
    impute -> scale -> SMOTE -> LightGBM pipeline.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix.
    y : pandas.Series
        Binary target aligned with ``X``.
    n_trials : int, default 10
        Number of Optuna trials.
    n_jobs : int, default 1
        Parallel Optuna trials. The default of 1 keeps the seeded study
        reproducible; running trials in parallel makes their order, and thus
        the sampler's suggestions, non-deterministic.

    Returns
    -------
    dict
        Keyword arguments for ``lgb.LGBMClassifier``: the best suggested
        values plus the fixed settings used during tuning, so that a model
        built from this dict is configured exactly like the best trial.
    """
    # Settings held constant in every trial. ``study.best_params`` only holds
    # suggested values, so these are merged back into the returned dict.
    # ``subsample_freq`` must be non-zero for LightGBM to apply ``subsample``.
    fixed_params = {
        'subsample_freq': 1,
        'random_state': RANDOM_STATE,
        'class_weight': 'balanced'
    }

    def objective(trial):
        params = {
            'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
            'max_depth': trial.suggest_int('max_depth', 3, 15),
            # Learning rates span orders of magnitude, so sample on a log scale.
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
            'subsample': trial.suggest_float('subsample', 0.6, 1.0),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
            'reg_alpha': trial.suggest_float('reg_alpha', 0, 10),
            'reg_lambda': trial.suggest_float('reg_lambda', 0, 10),
            'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
            **fixed_params
        }
        pipeline = ImbPipeline([
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler()),
            ('smote', SMOTE(random_state=RANDOM_STATE)),
            ('classifier', lgb.LGBMClassifier(**params))
        ])

        skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=RANDOM_STATE)
        scores = []
        for train_idx, val_idx in skf.split(X, y):
            X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
            X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

            # Resampling only happens during fit, so validation rows stay untouched.
            pipeline.fit(X_train, y_train)
            y_pred = pipeline.predict_proba(X_val)[:, 1]
            scores.append(roc_auc_score(y_val, y_pred))

        return np.mean(scores)

    study = optuna.create_study(
        direction='maximize',
        sampler=optuna.samplers.TPESampler(seed=RANDOM_STATE),
    )
    study.optimize(objective, n_trials=n_trials, n_jobs=n_jobs)
    return {**study.best_params, **fixed_params}
best_lgbm_params = optimize_lightgbm(X, y, n_trials=OPTUNA_TRIALS)
print(f"Best LightGBM parameters found: {best_lgbm_params}")

In [None]:
 # Model Training and Prediction
print("\n6. Training model with Stratified K-Fold cross-validation...")
def train_and_predict(X, y, X_test, lgbm_params):
    """Cross-validate the modelling pipeline and average its test predictions.

    A single impute -> scale -> SMOTE -> LightGBM pipeline is refitted on each
    of ``N_SPLITS`` stratified folds. Per fold, the held-out probabilities are
    stored as out-of-fold predictions, the test-set probabilities contribute
    ``1 / N_SPLITS`` to the averaged test prediction, and the classifier's
    feature importances are recorded.

    Returns
    -------
    tuple
        ``(oof_preds, test_preds, feature_importances, pipeline)``. The
        returned pipeline is in the state left by the last fold's fit.
    """
    model = ImbPipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
        ('smote', SMOTE(random_state=RANDOM_STATE)),
        ('classifier', lgb.LGBMClassifier(**lgbm_params))
    ])
    splitter = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE)

    oof = np.zeros(len(X))
    test_average = np.zeros(len(X_test))
    importances = pd.DataFrame(index=X.columns)
    metrics = {'f1': [], 'roc_auc': [], 'precision': [], 'recall': [], 'accuracy': []}

    for fold_no, (fit_idx, holdout_idx) in enumerate(splitter.split(X, y), start=1):
        print(f"--- Fold {fold_no}/{N_SPLITS} ---")
        X_holdout, y_holdout = X.iloc[holdout_idx], y.iloc[holdout_idx]

        model.fit(X.iloc[fit_idx], y.iloc[fit_idx])
        holdout_proba = model.predict_proba(X_holdout)[:, 1]
        holdout_label = model.predict(X_holdout)

        oof[holdout_idx] = holdout_proba
        test_average += model.predict_proba(X_test)[:, 1] / N_SPLITS
        importances[f'fold_{fold_no}'] = model.named_steps['classifier'].feature_importances_

        # Ranking metric uses probabilities; the others use the predicted labels.
        fold_scores = {
            'f1': f1_score(y_holdout, holdout_label),
            'roc_auc': roc_auc_score(y_holdout, holdout_proba),
            'precision': precision_score(y_holdout, holdout_label),
            'recall': recall_score(y_holdout, holdout_label),
            'accuracy': accuracy_score(y_holdout, holdout_label),
        }
        for metric_name, value in fold_scores.items():
            metrics[metric_name].append(value)

        print(f"Validation F1-Score: {fold_scores['f1']:.4f}")
        print(f"Validation ROC AUC: {fold_scores['roc_auc']:.4f}\n")

    print("--- Cross-Validation Summary ---")
    for name, vals in metrics.items():
        print(f"Average {name.upper()}: {np.mean(vals):.4f} (Std: {np.std(vals):.4f})")

    return oof, test_average, importances, model
oof_preds, test_preds, feature_importances, final_pipeline = train_and_predict(
    X=X, y=y, X_test=X_test, lgbm_params=best_lgbm_params
)

In [None]:
# Evaluation
print("\n7. Evaluating model performance...")
# --- Evaluation & Analysis ---
def plot_evaluation_results(y_true, oof_preds, best_thresh):
    """Draw the ROC curve, the precision-recall curve and a confusion matrix.

    The two curves share one figure. The confusion matrix is drawn in a
    second figure, with predictions counted as positive when the score is
    strictly greater than ``best_thresh``.
    """
    fig, (roc_ax, pr_ax) = plt.subplots(1, 2, figsize=(16, 6))

    # Left panel: ROC curve with the chance diagonal
    false_pos_rate, true_pos_rate, _ = roc_curve(y_true, oof_preds)
    roc_ax.plot(false_pos_rate, true_pos_rate, label=f"AUC = {roc_auc_score(y_true, oof_preds):.4f}")
    roc_ax.plot([0, 1], [0, 1], 'k--')
    roc_ax.set_title('ROC Curve (OOF)')
    roc_ax.set_xlabel('False Positive Rate')
    roc_ax.set_ylabel('True Positive Rate')
    roc_ax.legend()

    # Right panel: precision-recall curve
    precision, recall, _ = precision_recall_curve(y_true, oof_preds)
    pr_ax.plot(recall, precision, label='PR Curve')
    pr_ax.set_title('Precision-Recall Curve')
    pr_ax.set_xlabel('Recall')
    pr_ax.set_ylabel('Precision')
    pr_ax.legend()
    plt.tight_layout()
    plt.show()

    # Second figure: confusion matrix at the chosen threshold
    class_labels = ['No Buy', 'Buy']
    predicted_labels = (oof_preds > best_thresh).astype(int)
    cm = confusion_matrix(y_true, predicted_labels)
    plt.figure(figsize=(7, 5))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=class_labels, yticklabels=class_labels)
    plt.title(f'Confusion Matrix (OOF) at Threshold {best_thresh:.2f}')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.show()
    
def advanced_threshold_optimization(y_true, y_proba):
    """Scan thresholds from 0.10 up to (excluding) 0.90 in steps of 0.01.

    For each threshold the F1, precision and recall of ``y_proba > threshold``
    are recorded together with a ``business_score``, which is the harmonic
    mean of precision and recall (0 when both are 0).

    Returns ``(best_threshold, metrics_df)`` where ``best_threshold`` is the
    threshold with the highest ``business_score``.
    """
    rows = []
    for cutoff in np.arange(0.1, 0.9, 0.01):
        labels = (y_proba > cutoff).astype(int)
        f1 = f1_score(y_true, labels)
        precision = precision_score(y_true, labels)
        recall = recall_score(y_true, labels)
        if (precision + recall) > 0:
            business_score = 2 * (precision * recall) / (precision + recall)
        else:
            business_score = 0
        rows.append({
            'threshold': cutoff,
            'f1': f1,
            'precision': precision,
            'recall': recall,
            'business_score': business_score,
        })

    metrics_df = pd.DataFrame(rows)
    best_row = metrics_df['business_score'].idxmax()
    best_threshold = metrics_df.loc[best_row, 'threshold']
    print(f"Optimal threshold based on business score: {best_threshold:.2f}")
    return best_threshold, metrics_df

def plot_feature_importance(feature_importances_df):
    """Plot the 20 features with the highest mean importance across folds.

    The per-feature mean is stored in a ``'mean'`` column of
    ``feature_importances_df`` (the frame is modified in place). The mean is
    computed from every column except ``'mean'`` itself, so running this
    function again on the same frame yields the same values instead of
    folding the previous mean into the average.
    """
    fold_columns = [col for col in feature_importances_df.columns if col != 'mean']
    feature_importances_df['mean'] = feature_importances_df[fold_columns].mean(axis=1)
    top_feats = feature_importances_df['mean'].nlargest(20)
    plt.figure(figsize=(10, 10))
    sns.barplot(x=top_feats.values, y=top_feats.index)
    plt.title('Top 20 Feature Importances (LGBM)')
    plt.xlabel('Average Importance')
    plt.ylabel('Feature')
    plt.tight_layout()
    plt.show()
# Choose the decision threshold on the out-of-fold predictions, then plot.
best_threshold, _ = advanced_threshold_optimization(y, oof_preds)
plot_evaluation_results(y, oof_preds, best_threshold)
plot_feature_importance(feature_importances)

In [None]:
 # Advanced Analysis
print("\n8. Performing advanced feature analysis...")
def advanced_feature_selection(X, y, model, k_best=50):
    """Select features by majority vote of four selection methods.

    The methods are the ANOVA F-test, mutual information, recursive feature
    elimination and permutation importance. Each nominates up to ``k_best``
    features; a feature is selected when at least two methods nominate it.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; missing values are median-imputed internally.
    y : array-like
        Target aligned with ``X``.
    model : estimator
        Scikit-learn compatible estimator used for RFE and permutation
        importance. It is cloned before fitting, so the caller's instance is
        left untouched.
    k_best : int, default 50
        Number of features each method may nominate (capped at the number of
        columns in ``X``).

    Returns
    -------
    tuple
        ``(selected_features, feature_scores)``: the selected column names and
        a dict mapping every column to its number of votes (0-4).
    """
    from sklearn.base import clone

    n_keep = min(k_best, X.shape[1])
    imputer = SimpleImputer(strategy='median')
    X_imputed = imputer.fit_transform(X)

    selector_f = SelectKBest(score_func=f_classif, k=n_keep).fit(X_imputed, y)
    f_test_features = X.columns[selector_f.get_support()].tolist()

    selector_mi = SelectKBest(score_func=mutual_info_classif, k=n_keep).fit(X_imputed, y)
    mi_features = X.columns[selector_mi.get_support()].tolist()

    rfe = RFE(estimator=model, n_features_to_select=n_keep, step=1).fit(X_imputed, y)
    rfe_features = X.columns[rfe.support_].tolist()

    # Fit a clone: fitting ``model`` itself would overwrite the state of an
    # estimator the caller still owns (e.g. the classifier inside a pipeline).
    fitted_model = clone(model).fit(X_imputed, y)
    perm_importance = permutation_importance(
        fitted_model, X_imputed, y, n_repeats=5, random_state=RANDOM_STATE, n_jobs=-1
    )
    importance_df = pd.DataFrame(
        {'feature': X.columns, 'importance': perm_importance.importances_mean}
    ).sort_values('importance', ascending=False)
    perm_features = importance_df.head(k_best)['feature'].tolist()

    # Sets make each membership test O(1) when counting the votes.
    nominations = [set(f_test_features), set(mi_features), set(rfe_features), set(perm_features)]
    feature_scores = {f: sum(f in nominated for nominated in nominations) for f in X.columns}
    selected_features = [f for f, score in feature_scores.items() if score >= 2]

    print(f"Selected {len(selected_features)} features out of {len(X.columns)}")
    return selected_features, feature_scores
    
def explain_model_predictions(pipeline, X, y):
    """Fit ``pipeline`` on (X, y) and summarise the classifier with SHAP.

    SHAP values are computed on a random sample of at most 1000 rows of the
    imputed and scaled data, shown as a summary plot, and reduced to a mean
    absolute value per feature.

    Parameters
    ----------
    pipeline : imblearn.pipeline.Pipeline
        Pipeline with ``'imputer'``, ``'scaler'`` and ``'classifier'`` steps.
        It is refitted on the data passed in.
    X : pandas.DataFrame
        Feature matrix.
    y : array-like
        Target aligned with ``X``.

    Returns
    -------
    pandas.DataFrame
        Columns ``feature`` and ``importance``, sorted by descending importance.
    """
    # We fit the pipeline on the training data for SHAP
    pipeline.fit(X, y)

    imputer = pipeline.named_steps['imputer']
    scaler = pipeline.named_steps['scaler']
    classifier = pipeline.named_steps['classifier']

    X_transformed = scaler.transform(imputer.transform(X))
    X_transformed_df = pd.DataFrame(X_transformed, columns=X.columns)

    # Draw the sample once and cap it at the number of rows; asking for more
    # rows than exist raises when sampling without replacement.
    sample_size = min(1000, len(X_transformed_df))
    X_sample = X_transformed_df.sample(sample_size, random_state=RANDOM_STATE)

    explainer = shap.TreeExplainer(classifier)
    shap_values = explainer.shap_values(X_sample)

    # Depending on the SHAP version, a classifier yields a list with one array
    # per class, a 3-D array with the classes on the last axis, or a single
    # 2-D array. Use the positive-class values in every case.
    if isinstance(shap_values, list):
        plot_values = shap_values[1]
    else:
        plot_values = np.asarray(shap_values)
        if plot_values.ndim == 3:
            plot_values = plot_values[..., 1]

    shap.summary_plot(plot_values, X_sample, show=False)
    plt.title('SHAP Feature Importance')
    plt.tight_layout()
    plt.show()

    avg_abs_shap = np.abs(plot_values).mean(0)
    feature_importance = pd.DataFrame({'feature': X.columns, 'importance': avg_abs_shap}).sort_values('importance', ascending=False)
    return feature_importance

def detect_feature_interactions(X, y, top_n=10):
    """Rank pairwise products of the strongest features by mutual information.

    The (up to) 20 features with the highest ANOVA F-score are selected. For
    every pair, the element-wise product is median-imputed and scored against
    ``y`` with mutual information.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix.
    y : array-like
        Target aligned with ``X``.
    top_n : int, default 10
        Number of best-scoring pairs to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``feature1``, ``feature2`` and ``interaction_score``, sorted by
        descending score. Empty when fewer than two features are available.
    """
    imputer = SimpleImputer(strategy='median')
    X_imputed = imputer.fit_transform(X)

    selector = SelectKBest(score_func=f_classif, k=min(20, X.shape[1])).fit(X_imputed, y)
    top_features = X.columns[selector.get_support()].tolist()

    interaction_scores = []
    for feat1, feat2 in combinations(top_features, 2):
        interaction_term = (X[feat1] * X[feat2]).to_numpy(dtype=float).reshape(-1, 1)
        interaction_filled = imputer.fit_transform(interaction_term)
        # mutual_info_classif is a randomised estimator; seed it so the
        # ranking is the same on every run.
        score = mutual_info_classif(interaction_filled, y, random_state=RANDOM_STATE)[0]
        interaction_scores.append({'feature1': feat1, 'feature2': feat2, 'interaction_score': score})

    # Naming the columns keeps sort_values working when no pair was scored.
    interaction_df = pd.DataFrame(
        interaction_scores, columns=['feature1', 'feature2', 'interaction_score']
    ).sort_values('interaction_score', ascending=False)
    return interaction_df.head(top_n)
selected_features, _ = advanced_feature_selection(X, y, final_pipeline.named_steps['classifier'])

print(f"\nTop features from composite selection: {selected_features[:15]}...")

shap_importance = explain_model_predictions(final_pipeline, X, y)
print("\nTop SHAP Feature Importance:\n", shap_importance.head())

top_interactions = detect_feature_interactions(X, y)
print("\nTop feature interactions:\n", top_interactions)

In [None]:
# Submission File Generation
print("\n9. Generating submission file...")
# --- Submission Generation ---
def generate_submission(test_df, test_preds, y, submission_path):
    """Write ``submission.csv`` with labels matching the training class balance.

    The threshold is chosen so that the share of predicted positives
    approximates the share of positives in ``y``: with ``k`` the desired
    number of positives, every score strictly greater than the ``k``-th
    highest remaining score is labelled 1. Tied scores at the threshold are
    labelled 0.

    Parameters
    ----------
    test_df : pandas.DataFrame
        Must contain a ``MemberID`` column, one row per prediction.
    test_preds : array-like
        Predicted positive-class probabilities, aligned with ``test_df``.
    y : pandas.Series
        Training target; the proportion of value 1 defines the desired share
        of positive predictions.
    submission_path : str
        Output directory. It is created if it does not exist.
    """
    original_positive_proportion = y.value_counts(normalize=True).get(1, 0)
    print(f"Original training positive class proportion: {original_positive_proportion:.4f}")

    sorted_test_preds = np.sort(test_preds)[::-1]
    desired_count_of_ones = int(len(test_preds) * original_positive_proportion)

    if desired_count_of_ones >= len(sorted_test_preds):
        # Every row should be positive. The comparison below is strict, so
        # thresholding at the smallest score would exclude the lowest rows.
        threshold_for_imbalance = -np.inf
    else:
        threshold_for_imbalance = sorted_test_preds[desired_count_of_ones]

    print(f"Calculated threshold to match original imbalance: {threshold_for_imbalance:.4f}")

    submission = pd.DataFrame({
        'MemberID': test_df['MemberID'],
        'next_buy_probability': test_preds
    })
    submission['next_buy'] = (submission['next_buy_probability'] > threshold_for_imbalance).astype(int)

    os.makedirs(submission_path, exist_ok=True)
    submission_file = os.path.join(submission_path, 'submission.csv')
    submission[['MemberID', 'next_buy']].to_csv(submission_file, index=False)
    print(f"Submission file created at: {submission_file}")
    print(submission.head())

    print("\nSubmission unique values distribution (after imbalance adjustment):")
    print(submission['next_buy'].value_counts(normalize=True).to_frame('proportion'))
# Output folder for the submission CSV.
SUBMISSION_DIR = r'C:\Users\rapha\Documents\VSCode\Kaggle\Intellectra\Model\Model'
generate_submission(test_df, test_preds, y, SUBMISSION_DIR)

print("\n--- Workflow Complete ---")