In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler

# Global plotting defaults: seaborn "whitegrid" theme and a 10x6 inch
# default figure size for every figure that does not set its own.
sns.set(style="whitegrid")
plt.rcParams.update({'figure.figsize': (10, 6)})

# Load the churn dataset from churn_data.csv; when the file does not exist,
# build a seeded synthetic dataset, save it under that name and use it instead.
try:
    df = pd.read_csv('churn_data.csv')
except FileNotFoundError:
    print("churn_data.csv not found, generating synthetic data...")

    # Fixed seed so every run produces the same synthetic customers
    np.random.seed(42)

    n_customers = 100
    customer_emails = [
        f"customer{i}@example.com" for i in range(1, n_customers + 1)
    ]
    months_as_customer = np.random.randint(1, 36, n_customers)
    order_count = np.random.randint(0, 10, n_customers)

    # Recency is drawn per customer from a range chosen by tenure and order
    # volume: long-tenured, frequent buyers get the most recent purchases.
    # One randint call per customer, in customer order, keeps the random
    # stream (and therefore the generated data) stable.
    days_since_last_order = []
    for tenure, orders in zip(months_as_customer, order_count):
        if tenure > 12 and orders > 3:
            bounds = (1, 60)
        elif tenure > 6 or orders > 1:
            bounds = (30, 120)
        else:
            bounds = (60, 180)
        days_since_last_order.append(np.random.randint(*bounds))

    # Churn label: 1 when the last order is more than 90 days old, else 0
    churned = [int(days > 90) for days in days_since_last_order]

    df = pd.DataFrame({
        'customer_email': customer_emails,
        'months_as_customer': months_as_customer,
        'order_count': order_count,
        'days_since_last_order': days_since_last_order,
        'churned': churned
    })

    # Persist the synthetic data so later runs take the CSV path
    df.to_csv('churn_data.csv', index=False)
    print("Synthetic data generated and saved as churn_data.csv")
else:
    print("Data imported successfully!")

# Same short report regardless of where the data came from
print(f"Dataset shape: {df.shape}")
print("\nFirst 5 rows of data:")
print(df.head())

# Data preprocessing: report and repair missing values, then summarise and
# plot the distribution of the churn label.
print("\nData preprocessing...")

# Count missing values per column once; the result drives both the report
# and the decision whether any cleaning is needed.
missing_counts = df.isnull().sum()
print("\nMissing value check:")
print(missing_counts)

if missing_counts.sum() > 0:
    # Numeric features are imputed with their column median
    df = df.fillna({
        'months_as_customer': df['months_as_customer'].median(),
        'order_count': df['order_count'].median(),
        'days_since_last_order': df['days_since_last_order'].median()
    })
    print("Missing values have been filled")

    # A missing churn label cannot be imputed, and NaN targets make model
    # fitting fail later on, so rows without a label are removed instead.
    unlabeled_rows = int(df['churned'].isnull().sum())
    if unlabeled_rows:
        df = df.dropna(subset=['churned'])
        print(f"Dropped {unlabeled_rows} row(s) with a missing churn label")

# Check data types
print("\nData types:")
print(df.dtypes)

# Class balance of the target (mean of a 0/1 label is the churn rate)
print("\nChurn distribution:")
print(df['churned'].value_counts())
print(f"Churn rate: {df['churned'].mean():.2%}")

# Bar chart of the number of customers per churn status
plt.figure(figsize=(6, 4))
sns.countplot(x='churned', data=df)
plt.title('Customer Churn Distribution')
plt.xlabel('Churn Status (0=Active, 1=Churned)')
plt.ylabel('Number of Customers')
plt.xticks([0, 1], ['Active', 'Churned'])
plt.show()

Part 3: Feature Analysis and Visualization (Error-Free Code)

In [None]:
# Feature correlation analysis
print("\nFeature correlation analysis:")

# Correlate numeric columns only. Selecting by dtype (rather than dropping
# 'customer_email' by name) keeps this working when the loaded CSV has no
# email column or carries additional text columns, which DataFrame.corr()
# rejects on recent pandas versions.
corr_matrix = df.select_dtypes(include='number').corr()

# Annotated heatmap of the pairwise correlations
plt.figure(figsize=(10, 6))
sns.heatmap(
    corr_matrix,
    annot=True,
    cmap='coolwarm',
    fmt='.2f',
    linewidths=0.5
)
plt.title('Feature Correlation Heatmap')
plt.show()

# Relationship between each feature and churn: one boxplot per feature,
# grouped by churn status. Each entry is (column, figure title, y-axis label).
boxplot_specs = [
    ('months_as_customer', 'Customer Tenure vs Churn', 'Months as Customer'),
    ('order_count', 'Order Count vs Churn', 'Number of Orders'),
    ('days_since_last_order', 'Days Since Last Order vs Churn', 'Days Since Last Order'),
]

for column, title, y_label in boxplot_specs:
    plt.figure(figsize=(10, 6))
    sns.boxplot(x='churned', y=column, data=df)
    plt.title(title)
    plt.xlabel('Churn Status (0=Active, 1=Churned)')
    plt.ylabel(y_label)
    plt.xticks([0, 1], ['Active', 'Churned'])
    plt.show()

# Per-feature distribution plots: a histogram split by churn status on the
# left and a filled density estimate per group on the right.
features = ['months_as_customer', 'order_count', 'days_since_last_order']

for feature in features:
    # Human-readable feature name, e.g. "Months As Customer"
    pretty_name = feature.replace("_", " ").title()

    fig, (ax_hist, ax_kde) = plt.subplots(1, 2, figsize=(12, 5))

    # Left panel: stepped histogram with KDE overlay, coloured by churn
    sns.histplot(df, x=feature, hue='churned', kde=True, element='step', ax=ax_hist)
    ax_hist.set_title(f'{pretty_name} Distribution by Churn')
    ax_hist.set_xlabel(pretty_name)
    ax_hist.set_ylabel('Count')

    # Right panel: one density curve per churn group
    active_values = df[df['churned'] == 0][feature]
    churned_values = df[df['churned'] == 1][feature]
    sns.kdeplot(active_values, label='Active', fill=True, ax=ax_kde)
    sns.kdeplot(churned_values, label='Churned', fill=True, ax=ax_kde)
    ax_kde.set_title(f'{pretty_name} Density by Churn')
    ax_kde.set_xlabel(pretty_name)
    ax_kde.set_ylabel('Density')
    ax_kde.legend()

    fig.tight_layout()
    plt.show()

Part 4: Model Building and Training (Error-Free Code)

In [None]:
# Build the feature matrix and target, split, scale and fit the classifier
print("\nPreparing data for model training...")

# X holds the three numeric features, y the 0/1 churn label
X = df[['months_as_customer', 'order_count', 'days_since_last_order']]
y = df['churned']

# 70/30 split, stratified on the label so both parts keep the churn ratio
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

print(f"Training set size: {X_train.shape}")
print(f"Testing set size: {X_test.shape}")

# The scaler learns its statistics from the training split only and is then
# applied unchanged to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit the logistic regression on the scaled training data
print("\nBuilding and training the logistic regression model...")
model = LogisticRegression(random_state=42)
model.fit(X_train_scaled, y_train)

# Report one coefficient per feature, then the intercept
print("\nModel coefficients:")
feature_names = X.columns
coefficients = model.coef_[0]
for feature, coef in zip(feature_names, coefficients):
    print(f"{feature}: {coef:.4f}")
print(f"Intercept: {model.intercept_[0]:.4f}")

# Bar chart of absolute coefficient size, used here as feature importance
feature_importance = np.abs(coefficients)
plt.figure(figsize=(10, 6))
sns.barplot(x=feature_names, y=feature_importance)
plt.title('Feature Importance in Churn Prediction')
plt.xlabel('Features')
plt.ylabel('Importance (Absolute Coefficient Value)')
plt.xticks(rotation=45)
plt.show()

Part 5: Model Evaluation (Error-Free Code)

In [None]:
# Score the held-out test split: hard class predictions plus the predicted
# probability of the positive (churned) class.
print("\nMaking predictions on test data...")
y_pred = model.predict(X_test_scaled)
y_pred_proba = model.predict_proba(X_test_scaled)[:, 1]

print("\nModel evaluation:")

# Share of correctly classified test customers
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

# Confusion matrix: rows are actual classes, columns are predicted classes
cm = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix:", cm, sep="\n")

# Same matrix as an annotated heatmap
predicted_labels = ['Predicted Active', 'Predicted Churned']
actual_labels = ['Actual Active', 'Actual Churned']
plt.figure(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=predicted_labels,
    yticklabels=actual_labels,
)
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

# Per-class precision / recall / F1 table
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Additional metrics and the ROC / precision-recall curves for the test split
from sklearn.metrics import (
    average_precision_score,
    f1_score,
    precision_recall_curve,
    precision_score,
    recall_score,
    roc_auc_score,
    roc_curve,
)

# Threshold-based metrics use the hard predictions; ROC AUC uses the
# predicted churn probabilities.
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_proba)

print("\nAdditional Metrics:")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"ROC AUC Score: {roc_auc:.4f}")

# ROC curve with the diagonal of a random classifier for reference
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)

plt.figure(figsize=(10, 6))
plt.plot(fpr, tpr, label=f'Logistic Regression (AUC = {roc_auc:.3f})')
plt.plot([0, 1], [0, 1], 'k--', label='Random Classifier')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.show()

# Precision-recall curve; the *_vals names keep the scalar `precision` and
# `recall` computed above from being overwritten.
precision_vals, recall_vals, _ = precision_recall_curve(y_test, y_pred_proba)
avg_precision = average_precision_score(y_test, y_pred_proba)

plt.figure(figsize=(10, 6))
plt.plot(recall_vals, precision_vals, label=f'Logistic Regression (AP = {avg_precision:.3f})')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')
plt.legend(loc='best')
plt.show()

Part 6: Making Predictions for New Customers (Error-Free Code)

In [None]:
# Create a function to predict churn for new customers
def predict_churn(customer_data, model, scaler):
    """
    Predict churn probability, class and risk band for customer records.

    Parameters:
    customer_data (dict or DataFrame): One customer as a dict, or several as
        a DataFrame. Must provide 'months_as_customer', 'order_count' and
        'days_since_last_order'; any other columns are passed through.
    model: Fitted classifier exposing predict() and predict_proba()
    scaler: Fitted scaler exposing transform()

    Returns:
    DataFrame: Copy of the input with three added columns:
        'churn_probability' (probability of the positive class),
        'churn_prediction' (the model's class prediction) and
        'churn_risk' ('Low' for p <= 0.3, 'Medium' for 0.3 < p <= 0.7,
        'High' above that).

    Raises:
    KeyError: If a required feature column is missing.
    """
    feature_columns = ['months_as_customer', 'order_count', 'days_since_last_order']

    # A single customer given as a dict becomes a one-row DataFrame
    if isinstance(customer_data, dict):
        df_new = pd.DataFrame([customer_data])
    else:
        df_new = customer_data

    # Fail early with a message that names what is missing
    missing = [column for column in feature_columns if column not in df_new.columns]
    if missing:
        raise KeyError(f"customer_data is missing required feature(s): {missing}")

    # Select and scale the model features
    X_new = df_new[feature_columns]
    X_new_scaled = scaler.transform(X_new)

    # Column 1 of predict_proba is the probability of the positive class
    churn_prob = model.predict_proba(X_new_scaled)[:, 1]
    churn_pred = model.predict(X_new_scaled)

    # Work on a copy so the caller's DataFrame is never modified
    results = df_new.copy()
    results['churn_probability'] = churn_prob
    results['churn_prediction'] = churn_pred
    results['churn_risk'] = pd.cut(
        churn_prob,
        bins=[0, 0.3, 0.7, 1],
        labels=['Low', 'Medium', 'High'],
        # Bins are right-closed; without this a probability of exactly 0.0
        # falls outside the first bin and is labelled NaN instead of 'Low'.
        include_lowest=True
    )

    return results

# Example 1: Predict for a single new customer (dict input)
print("\nExample 1: Predicting churn for a single new customer")
new_customer = {
    'customer_email': 'new_customer@example.com',
    'months_as_customer': 3,
    'order_count': 1,
    'days_since_last_order': 75
}

prediction = predict_churn(new_customer, model, scaler)
print(prediction[['customer_email', 'churn_probability', 'churn_prediction', 'churn_risk']])

# Example 2: Predict for multiple new customers (DataFrame input)
print("\nExample 2: Predicting churn for multiple new customers")
new_customers = pd.DataFrame([
    {
        'customer_email': 'customer1@example.com',
        'months_as_customer': 12,
        'order_count': 5,
        'days_since_last_order': 30
    },
    {
        'customer_email': 'customer2@example.com',
        'months_as_customer': 2,
        'order_count': 1,
        'days_since_last_order': 100
    },
    {
        'customer_email': 'customer3@example.com',
        'months_as_customer': 6,
        'order_count': 2,
        'days_since_last_order': 45
    }
])

predictions = predict_churn(new_customers, model, scaler)
print(predictions[['customer_email', 'churn_probability', 'churn_prediction', 'churn_risk']])

# Example 3: Identify high-risk customers from the test set
print("\nExample 3: Identifying high-risk customers from the test set")
# X_test is already a DataFrame with exactly the feature columns. Take an
# explicit copy so that attaching the email column below can never touch the
# test split that the evaluation code relies on.
test_customers = X_test.copy()
# Emails are looked up by the original row labels kept in X_test.index
test_customers['customer_email'] = df.loc[X_test.index, 'customer_email'].values

test_predictions = predict_churn(test_customers, model, scaler)
high_risk = test_predictions[test_predictions['churn_risk'] == 'High']

print(f"Number of high-risk customers in test set: {len(high_risk)}")
if len(high_risk) > 0:
    print("\nHigh-risk customers:")
    print(high_risk[['customer_email', 'churn_probability', 'months_as_customer', 'order_count', 'days_since_last_order']])

Part 7: Model Deployment Preparation (Error-Free Code)

In [None]:
# Persist the trained model together with its scaler and metadata, then load
# the bundle back and check that it still produces a prediction.
import joblib

print("\nSaving model and scaler for deployment...")

# Everything needed at inference time travels in one dictionary
created_at = pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')
model_components = dict(
    model=model,
    scaler=scaler,
    feature_names=X.columns.tolist(),
    description='Customer churn prediction model using logistic regression',
    version='1.0',
    created_at=created_at,
)

joblib.dump(model_components, 'churn_model.pkl')
print("Model components saved to churn_model.pkl")

# Example of how to load the model in a production environment.
# NOTE: joblib files are pickle-based; only load files from a trusted source.
print("\nExample: Loading the model for inference")
loaded_components = joblib.load('churn_model.pkl')
loaded_model = loaded_components['model']
loaded_scaler = loaded_components['scaler']

# Round-trip check: the reloaded objects must be usable by predict_churn
verification_prediction = predict_churn(new_customer, loaded_model, loaded_scaler)
print("Verification prediction with loaded model:")
print(verification_prediction[['customer_email', 'churn_probability', 'churn_prediction', 'churn_risk']])

# Create a simple inference function that could be used in a web application
def churn_inference_api(customer_data):
    """
    API function for churn prediction

    Loads the saved model bundle from 'churn_model.pkl', scores one customer
    with predict_churn() and returns the result as plain Python values.

    Parameters:
    customer_data (dict): Customer features ('months_as_customer',
        'order_count', 'days_since_last_order'); 'customer_email' is
        optional and echoed back when present.

    Returns:
    dict: 'customer_email' (None when not supplied), 'churn_probability'
        (float), 'churn_prediction' (int), 'churn_risk' (label string, or
        None when no risk band could be assigned), 'model_version' and
        'timestamp' (local time, '%Y-%m-%d %H:%M:%S').
    """
    # Load model components.
    # NOTE: the bundle is re-read from disk on every call and joblib files
    # are pickle-based, so the file must come from a trusted source.
    components = joblib.load('churn_model.pkl')
    model = components['model']
    scaler = components['scaler']

    # Convert input to a one-row DataFrame
    df_api = pd.DataFrame([customer_data])

    # Make prediction
    result = predict_churn(df_api, model, scaler)

    # The email is an identifier, not a model feature: do not fail without it
    if 'customer_email' in result.columns:
        customer_email = result['customer_email'].iloc[0]
    else:
        customer_email = None

    # An unassigned risk band is NaN; report it as None instead
    churn_risk = result['churn_risk'].iloc[0]
    if pd.isna(churn_risk):
        churn_risk = None

    # Convert to dictionary of built-in types for the API response
    response = {
        'customer_email': customer_email,
        'churn_probability': float(result['churn_probability'].iloc[0]),
        'churn_prediction': int(result['churn_prediction'].iloc[0]),
        'churn_risk': churn_risk,
        'model_version': components['version'],
        'timestamp': pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')
    }

    return response

# Smoke-test the API function with the single-customer dict from Example 1
# and print the returned response dictionary.
print("\nTesting the inference API function:")
api_response = churn_inference_api(new_customer)
print(api_response)

Part 8: Conclusion and Next Steps (Error-Free Code)

In [None]:
# Summary of findings: feature importance, headline metrics and how the test
# customers are distributed over the predicted risk bands.
print("\n" + "="*60)
print("SUMMARY AND RECOMMENDATIONS")
print("="*60)

print("\nKey Findings:")
print("1. Feature Importance:")
# Absolute coefficient per feature, largest first
importance_table = pd.DataFrame({
    'Feature': X.columns,
    'Importance': np.abs(model.coef_[0])
})
feature_importance = importance_table.sort_values('Importance', ascending=False)
print(feature_importance.to_string(index=False))

print("\n2. Model Performance:")
print(f"Accuracy: {accuracy:.4f}")
print(f"ROC AUC Score: {roc_auc:.4f}")
print(f"F1 Score: {f1:.4f}")

print("\n3. Customer Segmentation:")
# Score the test split and attach each customer's email by original row label
test_predictions = predict_churn(pd.DataFrame(X_test, columns=X.columns), model, scaler)
test_predictions['customer_email'] = df.loc[X_test.index, 'customer_email'].values

# Customers per risk band, reported as a count and as a share of the test set
risk_segments = test_predictions['churn_risk'].value_counts()
segment_total = len(test_predictions)
low_count = risk_segments.get('Low', 0)
medium_count = risk_segments.get('Medium', 0)
high_count = risk_segments.get('High', 0)
print(f"Low-risk customers: {low_count} ({low_count/segment_total:.1%})")
print(f"Medium-risk customers: {medium_count} ({medium_count/segment_total:.1%})")
print(f"High-risk customers: {high_count} ({high_count/segment_total:.1%})")

# Closing text: retention recommendations per risk band followed by next
# steps. Each entry is printed as one output line, in order.
closing_lines = (
    "\nRecommendations:",
    "1. For high-risk customers:",
    "   - Implement immediate retention campaigns (discounts, personalized offers)",
    "   - Increase customer support outreach",
    "   - Analyze why they haven't purchased recently",
    "\n2. For medium-risk customers:",
    "   - Send targeted marketing emails with relevant product recommendations",
    "   - Implement loyalty programs to encourage repeat purchases",
    "   - Monitor their activity closely",
    "\n3. For low-risk customers:",
    "   - Focus on upselling and cross-selling opportunities",
    "   - Request reviews and referrals",
    "   - Maintain regular communication",
    "\nNext Steps:",
    "1. Model Improvement:",
    "   - Collect additional features (e.g., average order value, product categories)",
    "   - Try other algorithms (random forest, gradient boosting)",
    "   - Implement hyperparameter tuning",
    "\n2. Deployment:",
    "   - Integrate the model with your e-commerce platform",
    "   - Set up automated prediction pipelines",
    "   - Create a dashboard for customer retention teams",
    "\n3. Monitoring and Evaluation:",
    "   - Track model performance over time",
    "   - Implement A/B testing for retention strategies",
    "   - Regularly update the model with new data",
)
for line in closing_lines:
    print(line)

# Final banner
print("\n" + "="*60)
print("EXERCISE COMPLETED SUCCESSFULLY!")
print("="*60)