# Customer Churn Prediction Model
- `The goal of this project is to predict which customers are likely to churn within the next 6 months. The model will help the company put retention measures in place before those customers leave. Because losing customers is far more damaging to the company than the cost of running discounts and marketing campaigns, retaining existing customers is the priority, so this model will prioritize recall: missing a true churner costs more than targeting a customer who would have stayed.`

- `Last year the company's customer database shrank from 600,000 to 500,000 customers, resulting in a profit decline of USD 50,000,000, while the cost of marketing campaigns and initiatives amounted to only USD 400,000.`

- `The end goal of this project is to build a model that catches as many churners as it possibly can.`

In [1]:
import pandas as pd
import numpy as np
import scipy
from scipy.stats import mannwhitneyu, chi2_contingency
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
import seaborn as sns
import sys

In [2]:
# Load the raw customer transactions from CSV.
# Failures are raised instead of printed: if the load is swallowed, `df` is
# never defined and every later cell fails with an unrelated NameError.
FILE_PATH = '../data/raw/customer_churn.csv'

if not Path(FILE_PATH).exists():
    raise FileNotFoundError(f'File not found: {FILE_PATH}')

# pd.read_csv raises pd.errors.EmptyDataError on its own for a zero-byte file;
# the explicit check below covers a file with a header row but no data rows.
df = pd.read_csv(FILE_PATH)
if df.empty:
    raise pd.errors.EmptyDataError(f'No rows loaded from {FILE_PATH}')

In [3]:
# Preview the first 30 rows to get a feel for the columns and their values
df.head(n=30)

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country,Customer_Age,Gender,...,Category,Subcategory,Cost,Discount_Applied,Payment_Method,Promo_Applied,Delivery_Time_Days,Revenue,Profit,Churn_Flag
0,489434,85048,15CM CHRISTMAS GLASS BALL 20 LIGHTS,12,2009-12-01 07:45:00,6.95,13085.0,United Kingdom,26.0,Female,...,Home Decor,Decor,4.469733,0,Credit Card,0,5,83.4,29.763202,0
1,489434,79323P,PINK CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom,29.0,Female,...,Toys,Kids,3.282128,0,PayPal,0,9,81.0,41.614466,1
2,489434,79323W,WHITE CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom,33.0,Female,...,Kitchen,Appliances,4.6126,0,Cash,0,7,81.0,25.648799,1
3,489434,22041,"RECORD FRAME 7"" SINGLE SIZE",48,2009-12-01 07:45:00,2.1,13085.0,United Kingdom,79.0,Female,...,Clothing,Women,1.120376,0,Bank Transfer,0,8,100.8,47.021953,1
4,489434,21232,STRAWBERRY CERAMIC TRINKET BOX,24,2009-12-01 07:45:00,1.25,13085.0,United Kingdom,43.0,Female,...,Clothing,Men,0.848714,0,Cash,0,8,30.0,9.630864,0
5,489434,22064,PINK DOUGHNUT TRINKET POT,24,2009-12-01 07:45:00,1.65,13085.0,United Kingdom,40.0,Female,...,Kitchen,Storage,1.128552,0,Cash,0,7,39.6,12.514755,0
6,489434,21871,SAVE THE PLANET MUG,24,2009-12-01 07:45:00,1.25,13085.0,United Kingdom,21.0,Female,...,Toys,Outdoor,0.724196,0,Credit Card,1,6,30.0,12.619295,1
7,489434,21523,FANCY FONT HOME SWEET HOME DOORMAT,10,2009-12-01 07:45:00,5.95,13085.0,United Kingdom,35.0,Male,...,Toys,Kids,4.500678,0,Cash,1,3,59.5,14.493223,1
8,489435,22350,CAT BOWL,12,2009-12-01 07:46:00,2.55,13085.0,United Kingdom,55.0,Female,...,Stationery,Art,1.773926,0,Cash,0,2,30.6,9.31289,1
9,489435,22349,"DOG BOWL , CHASING BALL DESIGN",12,2009-12-01 07:46:00,3.75,13085.0,United Kingdom,40.0,Male,...,Home Decor,Lights,1.732001,0,Credit Card,0,2,45.0,24.21599,1


In [4]:
# Descriptive statistics for the numeric columns, one row per column
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Quantity,743486.0,10.03642,154.201706,-74215.0,1.0,3.0,10.0,74215.0
Price,743485.0,4.774413,132.680219,-53594.36,1.25,2.1,4.21,25111.09
Customer ID,573570.0,15336.642474,1693.380372,12346.0,13963.0,15289.0,16805.0,18287.0
Customer_Age,573570.0,48.450948,17.873257,18.0,33.0,48.0,64.0,79.0
Cost,743485.0,2.857944,81.024623,-34992.590744,0.712005,1.29253,2.527461,19645.769724
Discount_Applied,743486.0,0.099686,0.299581,0.0,0.0,0.0,0.0,1.0
Promo_Applied,743486.0,0.199984,0.399988,0.0,0.0,0.0,0.0,1.0
Delivery_Time_Days,743486.0,5.002327,2.577885,1.0,3.0,5.0,7.0,9.0
Revenue,743485.0,18.029966,195.06837,-77183.6,3.75,9.95,17.7,77183.6
Profit,743485.0,7.251679,87.173822,-30946.200363,1.390169,3.533854,7.228494,45664.107562


In [5]:
# Dataset dimensions: number of rows (observations) and columns (features)
n_rows, n_cols = df.shape

# Stop early when there is nothing to analyse
if n_rows == 0:
    raise pd.errors.EmptyDataError('DataFrame is Empty!')

summary = dict(observations=n_rows, features=n_cols)
print(summary)

{'observations': 743486, 'features': 24}


# `DATA QUALITY CHECKS`

## Missing Values

In [19]:
# Count nulls per column, keep only columns that have at least one,
# and order them from most to fewest missing values.
missing = (
    df.isna()
    .sum()
    .loc[lambda counts: counts > 0]
    .sort_values(ascending=False)
)

# Share of all rows that is missing, as a percentage
missing_pct = missing.div(len(df)).mul(100)

summary = pd.DataFrame({'missing': missing, 'missing_pct': missing_pct.round(2)})
summary

Unnamed: 0,missing,missing_pct
Customer_Age,169916,22.85
Customer ID,169916,22.85
Customer_Segment,169916,22.85
Marketing_Channel,169916,22.85
Gender,169916,22.85
Description,3772,0.51
Country,1,0.0
Price,1,0.0
InvoiceDate,1,0.0
Signup_Date,1,0.0


In [16]:
# Rows with no Customer ID, limited to the customer-level fields and the item description
inspect_cols = [
    'Customer ID',
    'Customer_Age',
    'Customer_Segment',
    'Marketing_Channel',
    'Gender',
    'Description',
]
df.loc[df['Customer ID'].isna(), inspect_cols]

Unnamed: 0,Customer ID,Customer_Age,Customer_Segment,Marketing_Channel,Gender,Description
263,,,,,,85123a mixed
283,,,,,,short
284,,,,,,21733 mixed
470,,,,,,
577,,,,,,BLUE PULL BACK RACING CAR
...,...,...,...,...,...,...
743104,,,,,,TRAVEL CARD WALLET VINTAGE TICKET
743105,,,,,,TRAVEL CARD WALLET SUKI
743106,,,,,,PACK OF 12 TRADITIONAL CRAYONS
743107,,,,,,WORLD WAR 2 GLIDERS ASSTD DESIGNS


**Insight**
- `Customer ID`, `Customer_Age`, `Customer_Segment`, `Marketing_Channel` and `Gender` are each missing in exactly 169,916 rows (22.85%), and the rows inspected above are missing all five fields together. This pattern suggests the values were omitted systematically rather than at random.

In [None]:
# Duplicates: flag rows that are exact copies of an earlier row.
# With the default keep='first', the first occurrence is not flagged.
duplicates = df.duplicated()

# Count the flagged rows. len(duplicates) is always len(df), so it cannot be
# used to test whether any duplicates exist.
n_duplicates = int(duplicates.sum())

if n_duplicates == 0:
    print('No duplicate rows found')
else:
    print(f'{n_duplicates:,} duplicate rows ({n_duplicates / len(df):.2%} of all rows)')

# Kept as the cell's last top-level expression so Jupyter renders it; an
# expression nested inside an if/else branch is never displayed.
df[duplicates].head(10)