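# Data Exports for Power BI Dashboard
This notebook loads the preprocessed churn data and the trained churn model, adds model predictions and prediction-quality columns, and exports the enriched dataset as a CSV for Power BI visualization.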

In [3]:
import json
import pickle
import warnings
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

print("Loading data and model...")

# Load the preprocessed data
df = pd.read_csv('../data/processed/preprocessed_churn_data.csv')

# Load the model, its metadata, and the fitted encoders.
# SECURITY: pickle.load executes arbitrary code embedded in the file. Only load
# artifacts that were produced by this project's own training pipeline.
with open('../models/churn_model.pkl', 'rb') as f:
    model = pickle.load(f)

with open('../models/model_metadata.json', 'r') as f:
    metadata = json.load(f)

with open('../models/encoders.pkl', 'rb') as f:
    encoders = pickle.load(f)

print(f"Data loaded: {len(df)} rows")

# Identify categorical columns
categorical_columns = ['value_segment', 'recency_segment', 'risk_segment']

# For each categorical column: keep a copy of the original labels (they are
# restored after prediction so the export stays human-readable), then replace
# the column in df with its integer codes.
original_cat_values = {}
for col in categorical_columns:
    original_cat_values[col] = df[col].copy()

    if col in encoders:
        df[col] = encoders[col].transform(df[col])
    else:
        # No fitted encoder was saved for this column. Fitting a new one keeps
        # the notebook running, but its codes are derived from this dataset
        # alone and may differ from the codes the model was trained on, so
        # surface that instead of continuing silently.
        warnings.warn(
            f"No saved encoder for '{col}'; fitting a new LabelEncoder. "
            "Its codes may not match those used during model training."
        )
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col])
        encoders[col] = le

print("\nCategorical columns encoded:")
for col in categorical_columns:
    print(f"- {col}: {df[col].unique()}")

Loading data and model...
Data loaded: 49358 rows

Categorical columns encoded:
- value_segment: [3 1 2 0]
- recency_segment: [0 2 1 3]
- risk_segment: [0 2 1 3 4]
Data loaded: 49358 rows

Categorical columns encoded:
- value_segment: [3 1 2 0]
- recency_segment: [0 2 1 3]
- risk_segment: [0 2 1 3 4]


In [4]:
# Columns this cell derives from the model's output. They are excluded from the
# fallback feature list so that re-running the cell never feeds a previous
# run's predictions back into the model.
derived_cols = [
    'model_churn_probability',
    'model_prediction',
    'prediction_correct',
    'false_negative',
    'false_positive',
    'model_confidence',
]

# Prepare features for prediction: prefer the feature list stored in the model
# metadata; otherwise use every column except the target, the visitor id, and
# the derived columns above.
non_feature_cols = ['target_class', 'visitorid', *derived_cols]
feature_cols = metadata['feature_columns'] if 'feature_columns' in metadata else [
    col for col in df.columns if col not in non_feature_cols
]

# Build the model input on its own frame. On a first run df still holds the
# integer codes; on a re-run df holds the original labels (restored at the end
# of this cell), so any non-numeric categorical column is re-encoded here.
X = df[feature_cols].copy()
for col in original_cat_values:
    if col in X.columns and not pd.api.types.is_numeric_dtype(X[col]):
        X[col] = encoders[col].transform(X[col])

print("Adding model predictions...")

# 1. Add model predictions
# NOTE(review): column 1 of predict_proba is taken as the churn probability,
# which assumes the model's positive class is listed second -- confirm against
# model.classes_.
df['model_churn_probability'] = model.predict_proba(X)[:, 1]
df['model_prediction'] = model.predict(X)  # Binary 0/1 prediction

# 2. Add model performance analysis
df['prediction_correct'] = (df['model_prediction'] == df['target_class'])
df['false_negative'] = (df['model_prediction'] == 0) & (df['target_class'] == 1)
df['false_positive'] = (df['model_prediction'] == 1) & (df['target_class'] == 0)

# 3. Add model confidence: distance of the probability from 0.5, rescaled so
# that 0 means a coin flip and 1 means a probability of exactly 0 or 1.
df['model_confidence'] = (df['model_churn_probability'] - 0.5).abs() * 2

# Restore original categorical values so the exported file carries readable
# labels instead of integer codes.
for col, original_values in original_cat_values.items():
    df[col] = original_values

print("\nNew columns added:")
print("- model_churn_probability: Probability of churning (0-1)")
print("- model_prediction: Binary prediction (0/1)")
print("- prediction_correct: Whether prediction matches actual class")
print("- false_negative: Missed actual churners")
print("- false_positive: False churn predictions")
print("- model_confidence: Confidence score (0-1)")

# Show sample of predictions
print("\nSample predictions:")
sample_cols = ['value_segment', 'risk_segment', 'model_churn_probability', 'model_prediction', 'model_confidence']
print(df[sample_cols].head())

Adding model predictions...

New columns added:
- model_churn_probability: Probability of churning (0-1)
- model_prediction: Binary prediction (0/1)
- prediction_correct: Whether prediction matches actual class
- false_negative: Missed actual churners
- false_positive: False churn predictions
- model_confidence: Confidence score (0-1)

Sample predictions:
  value_segment risk_segment  model_churn_probability  model_prediction  \
0   No Purchase         High                 0.690216                 1   
1   No Purchase         High                 0.736367                 1   
2   No Purchase       Medium                 0.705747                 1   
3   No Purchase          Low                 0.584194                 1   
4   No Purchase       Medium                 0.558646                 1   

   model_confidence  
0          0.380431  
1          0.472735  
2          0.411494  
3          0.168388  
4          0.117292  


In [5]:
# Sanity-check the columns added by the prediction step.
# All figures are computed up front, then reported section by section.
prediction_summary = df[['model_churn_probability', 'model_prediction']].describe()
correct_flags = df['prediction_correct']
fn_flags = df['false_negative']
fp_flags = df['false_positive']
confidence_summary = df['model_confidence'].describe()

print("Verification of new columns:")

# Distribution of the raw probability and the binary prediction
print("\nModel Predictions Summary:")
print(prediction_summary)

# Share of correct predictions, plus count and share of each error type
print("\nPrediction Performance:")
print(f"Correct predictions: {correct_flags.mean():.1%}")
print(f"False negatives: {fn_flags.sum()} ({fn_flags.mean():.1%})")
print(f"False positives: {fp_flags.sum()} ({fp_flags.mean():.1%})")

# Distribution of the confidence score
print("\nModel Confidence:")
print(confidence_summary)

Verification of new columns:

Model Predictions Summary:
       model_churn_probability  model_prediction
count             49358.000000      49358.000000
mean                  0.587636          0.685279
std                   0.197596          0.464409
min                   0.008969          0.000000
25%                   0.455756          0.000000
50%                   0.610614          1.000000
75%                   0.746175          1.000000
max                   0.943603          1.000000

Prediction Performance:
Correct predictions: 72.7%
False negatives: 11690 (23.7%)
False positives: 1803 (3.7%)

Model Confidence:
count    49358.000000
mean         0.366709
std          0.228952
min          0.000008
25%          0.165536
50%          0.354587
75%          0.548411
max          0.982062
Name: model_confidence, dtype: float64


In [6]:
# Save the enhanced dataset (original columns plus the model-derived columns)
output_path = '../data/exports/churn_predictions_powerbi.csv'

# to_csv does not create missing directories, so make sure the export folder
# exists; on a fresh checkout it may not.
Path(output_path).parent.mkdir(parents=True, exist_ok=True)

# index=False: the row index carries no information and would otherwise be
# written as an extra unnamed column.
df.to_csv(output_path, index=False)
print(f"\nEnhanced dataset saved to: {output_path}")
print(f"Total columns: {len(df.columns)}")
print(f"Total rows: {len(df)}")


Enhanced dataset saved to: ../data/exports/churn_predictions_powerbi.csv
Total columns: 75
Total rows: 49358
