# Customer Churn Prediction Analysis

### 1. Import Packages

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score, confusion_matrix
import warnings
warnings.filterwarnings('ignore')

### 2. Load Data and Drop Column

In [2]:
# Load the dataset
df = pd.read_csv('features_anon.csv')

# Drop the Avg_Spend_Last_90 column
df = df.drop('Avg_Spend_Last_90', axis=1)

print(f"Dataset shape after dropping column: {df.shape}")
print(f"Columns: {list(df.columns)}")

Dataset shape after dropping column: (7372, 15)
Columns: ['Recency', 'Frequency', 'AIT', 'Average Order Value', 'is_using_app', 'has_acc_manager', 'Cross_category_Count', 'Spending_Velocity', 'probability_alive', 'T', 'churn_label', 'price_group', 'item_category', 'Monetary', 'Is_credit_limit']


### 3. Eyeballing and Understanding the Data

In [3]:
# Basic info about the dataset
print("Dataset Info:")
df.info()


Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7372 entries, 0 to 7371
Data columns (total 15 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Recency               7372 non-null   int64  
 1   Frequency             7372 non-null   int64  
 2   AIT                   7372 non-null   float64
 3   Average Order Value   7372 non-null   float64
 4   is_using_app          7372 non-null   object 
 5   has_acc_manager       7372 non-null   object 
 6   Cross_category_Count  7372 non-null   int64  
 7   Spending_Velocity     7372 non-null   float64
 8   probability_alive     7372 non-null   float64
 9   T                     7372 non-null   float64
 10  churn_label           7372 non-null   int64  
 11  price_group           7372 non-null   object 
 12  item_category         7372 non-null   object 
 13  Monetary              7372 non-null   float64
 14  Is_credit_limit       7372 non-null   int64  
dtypes: floa

In [4]:
print("\nFirst 5 rows:")
df.head()



First 5 rows:


Unnamed: 0,Recency,Frequency,AIT,Average Order Value,is_using_app,has_acc_manager,Cross_category_Count,Spending_Velocity,probability_alive,T,churn_label,price_group,item_category,Monetary,Is_credit_limit
0,100,95,10.702128,54812.390909,No,Yes,10,0.533489,0.533821,1087.0,0,SEG_1,CAT_1,24288520.0,1
1,92,70,14.811594,18910.583333,No,Yes,10,1.180944,0.871219,1095.0,0,SEG_2,CAT_2,8531163.0,1
2,88,19,53.611111,39324.5,No,Yes,7,0.337601,0.963892,1034.0,0,SEG_1,CAT_3,4619662.0,0
3,567,20,25.789474,55912.051724,No,Yes,8,0.0,0.000477,1038.0,1,SEG_1,CAT_3,14574180.0,0
4,112,11,90.6,8838.416667,Yes,Yes,2,3.659026,0.955087,999.0,0,SEG_1,CAT_1,828703.6,0


In [5]:
print("\nDataset shape:", df.shape)


Dataset shape: (7372, 15)


In [6]:
print("\nMissing values:")
print(df.isnull().sum())



Missing values:
Recency                 0
Frequency               0
AIT                     0
Average Order Value     0
is_using_app            0
has_acc_manager         0
Cross_category_Count    0
Spending_Velocity       0
probability_alive       0
T                       0
churn_label             0
price_group             0
item_category           0
Monetary                0
Is_credit_limit         0
dtype: int64


In [7]:
print("\nTarget variable distribution:")
print(df['churn_label'].value_counts())
print("\nChurn rate:", df['churn_label'].mean())


Target variable distribution:
churn_label
1    4202
0    3170
Name: count, dtype: int64

Churn rate: 0.5699945740640261


In [8]:
# Statistical summary
print("Statistical Summary:")
df.describe()

Statistical Summary:


Unnamed: 0,Recency,Frequency,AIT,Average Order Value,Cross_category_Count,Spending_Velocity,probability_alive,T,churn_label,Monetary,Is_credit_limit
count,7372.0,7372.0,7372.0,7372.0,7372.0,7372.0,7372.0,7372.0,7372.0,7372.0,7372.0
mean,274.514243,36.274688,65.959732,61589.51,5.001085,0.535036,0.7012591,821.546527,0.569995,10196420.0,0.183804
std,251.340779,150.061325,96.042236,138985.1,3.196245,0.79165,0.3310464,298.781367,0.49511,35580780.0,0.38735
min,80.0,1.0,0.0,-142473.0,0.0,-2.371463,6.341675e-60,61.0,0.0,16156.59,0.0
25%,98.0,3.0,12.322101,14742.75,2.0,0.0,0.5267896,622.0,0.0,954675.6,0.0
50%,154.0,9.0,34.563492,32282.88,4.0,0.189821,0.8653902,958.0,1.0,2778671.0,0.0
75%,360.25,28.0,79.833333,65531.99,7.0,0.969551,0.9444884,1065.0,1.0,8171706.0,0.0
max,1113.0,3152.0,1015.0,5741388.0,16.0,15.682577,1.0,1095.0,1.0,1449995000.0,1.0


### 4. Data Cleaning Pipeline

In [9]:
# Separate features and target
X = df.drop('churn_label', axis=1)
y = df['churn_label']

# Identify categorical and numerical columns
categorical_features = X.select_dtypes(include=['object']).columns.tolist()
numerical_features = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

print(f"Categorical features: {categorical_features}")
print(f"Numerical features: {numerical_features}")

Categorical features: ['is_using_app', 'has_acc_manager', 'price_group', 'item_category']
Numerical features: ['Recency', 'Frequency', 'AIT', 'Average Order Value', 'Cross_category_Count', 'Spending_Velocity', 'probability_alive', 'T', 'Monetary', 'Is_credit_limit']


In [10]:
# Create preprocessing pipeline
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('encoder', LabelEncoder())
])

# Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

print("Data cleaning pipeline created successfully!")

Data cleaning pipeline created successfully!


### 5. Train-Test Split

In [11]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

print(f"Training set shape: {X_train.shape}")
print(f"Test set shape: {X_test.shape}")
print(f"Training set churn rate: {y_train.mean():.3f}")
print(f"Test set churn rate: {y_test.mean():.3f}")

Training set shape: (5897, 14)
Test set shape: (1475, 14)
Training set churn rate: 0.570
Test set churn rate: 0.570


### 6. Base Models Training

In [12]:
# Define base models
base_models = {
    'Logistic Regression': LogisticRegression(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    'SVM': SVC(random_state=42),
    'KNN': KNeighborsClassifier(),
    'Naive Bayes': GaussianNB()
}

print("Base models defined successfully!")

Base models defined successfully!


### 7. Model Training and Evaluation

In [13]:
# Initialize results storage
results = []

# Train and evaluate each model
for name, model in base_models.items():
    print(f"Training {name}...")
    
    # Create pipeline with preprocessing and model
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', model)
    ])
    
    # Train the model
    pipeline.fit(X_train, y_train)
    
    # Make predictions
    y_pred = pipeline.predict(X_test)
    
    # Calculate metrics (focus on churn class - label 1)
    precision = precision_score(y_test, y_pred, pos_label=1)
    recall = recall_score(y_test, y_pred, pos_label=1)
    f1 = f1_score(y_test, y_pred, pos_label=1)
    
    # Store results
    results.append({
        'Model': name,
        'Precision': precision,
        'Recall': recall,
        'F1-Score': f1
    })
    
    print(f"{name} - Precision: {precision:.3f}, Recall: {recall:.3f}, F1-Score: {f1:.3f}")

print("\nAll models trained successfully!")

Training Logistic Regression...


TypeError: LabelEncoder.fit_transform() takes 2 positional arguments but 3 were given

### 8. Results Comparison Table

In [None]:
# Create results DataFrame
results_df = pd.DataFrame(results)
results_df = results_df.round(3)
results_df = results_df.sort_values('F1-Score', ascending=False)

print("Model Performance Comparison (Focus on Churn - Label 1):")
print("=" * 60)
print(results_df.to_string(index=False))

# Find best performing model
best_model = results_df.iloc[0]['Model']
best_f1 = results_df.iloc[0]['F1-Score']
print(f"\nBest performing model: {best_model} (F1-Score: {best_f1})")

### 9. Visualization of Results

In [None]:
# Create visualization
fig, axes = plt.subplots(1, 3, figsize=(15, 5))

# Precision comparison
axes[0].bar(results_df['Model'], results_df['Precision'], color='skyblue')
axes[0].set_title('Precision Comparison')
axes[0].set_ylabel('Precision')
axes[0].tick_params(axis='x', rotation=45)

# Recall comparison
axes[1].bar(results_df['Model'], results_df['Recall'], color='lightgreen')
axes[1].set_title('Recall Comparison')
axes[1].set_ylabel('Recall')
axes[1].tick_params(axis='x', rotation=45)

# F1-Score comparison
axes[2].bar(results_df['Model'], results_df['F1-Score'], color='salmon')
axes[2].set_title('F1-Score Comparison')
axes[2].set_ylabel('F1-Score')
axes[2].tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

### 10. Summary

In [None]:
print("Analysis Summary:")
print("=" * 50)
print(f"Dataset: {df.shape[0]} samples, {df.shape[1]} features (after dropping Avg_Spend_Last_90)")
print(f"Churn rate: {y.mean():.1%}")
print(f"Models evaluated: {len(base_models)}")
print(f"Best model: {best_model}")
print(f"Best F1-Score: {best_f1:.3f}")
print("\nFinal Results Table:")
print(results_df.to_string(index=False))