# Hypothesis Testing for influences on Churn Rate

This script uses hypothesis testing to see if whether or not a feature has an influence on if an account holder decided to churn.

First we load and set up the data in the format needed for testing.

In [1]:
import pandas as pd
from datetime import datetime, timedelta
from functools import reduce
import matplotlib.pyplot as plt

a = pd.read_csv('accounts.csv')
c = pd.read_csv('contracts.csv')
c['closingDate'] = pd.to_datetime(c['closingDate'])
c['paymentDate'] = pd.to_datetime(c['paymentDate'])
c['accountID'] = c['contractID'].apply(lambda x:x.split('-')[1])
c['number'] = c['contractID'].apply(lambda x:x.split('-')[2])
def days_between(d1, d2):
    return (d2 - d1).days
c['days_passed'] = c[['closingDate','paymentDate']].apply(lambda row: days_between(row.closingDate,row.paymentDate),axis=1)
df = pd.merge(a,c,on='accountID')
df = df.sort_values(by='closingDate').reset_index().drop('index',axis=1)

from datetime import datetime, timedelta
def date_minus_days(date,days):
    date_format = "%Y-%m-%d"
    date1 = datetime.strptime(date, date_format)
    return (date1 - timedelta(days)).date().strftime("%Y-%m-%d")

df = df[['closingDate','accountID','region','partnerInvolved','contractSize','contractLength']]
cv = df[['closingDate','accountID']].groupby('accountID').count()==1
churns = list(cv[cv['closingDate']==True].index)
df['churn'] = df['accountID'].apply(lambda x: 1 if x in churns else 0)
data = df[['region','partnerInvolved','contractSize','contractLength','churn']]
data.head()

Unnamed: 0,region,partnerInvolved,contractSize,contractLength,churn
0,APAC,No,70.0,2,0
1,APAC,No,55.0,2,0
2,North America,No,95.0,1,0
3,North America,No,50.0,1,0
4,EMEA,No,50.0,1,0


The features of `region`, `partnerInvolved` and `contractLength` are all categorical in structure. Thus, a Chi2 test on each of these can be used to determine whether or a given feature influences churn. The null hypothesis is that there is no influence on the feature and whether or not a row has a churn in it.

In [6]:
import numpy as np
from scipy.stats import chi2_contingency,chi2
def chi_square_test(feature):
    p = list(data[data['churn']==1][feature].value_counts())
    e = list(data[data['churn']==0][feature].value_counts())
    if len(p)-len(e)>0:
        for k in range(len(p)-len(e)):
            e.append(0)
    if len(p)-len(e)<0:
        for j in range(len(e)-len(p)):
            p.append(0)
    obs = np.array([p,e])
    chi_square_stat, prob, dof, ex = chi2_contingency(obs, correction=False)
    value = chi2.ppf(0.99,dof)
    if (value<chi_square_stat) and (prob<0.01):
        return "Reject Null Hypothesis"
    else:
        return "Fail to reject Null Hypothesis"

In [7]:
chi_square_test('region')

'Fail to reject Null Hypothesis'

So region does NOT have an impact as to whether or not they churned.

In [8]:
chi_square_test('partnerInvolved')

'Reject Null Hypothesis'

If they had a partner involved, it influenced whether or not they churned.

In [9]:
chi_square_test('contractLength')

'Reject Null Hypothesis'

The length of their contract, influences whether or not they churned.

For the feature `contractSize` is a continuous input variable. A hypothesis test to measure this would take a continuous input variable and calculate its influence on a categorical response variable (whether or not they churned).

One solution to this problem is bootstraping the difference between the average contract size of those who churned and the average contract size of those who didn't churn.

That is, we find the actual difference between the mean contractsizes of the two groups, and save it. Then we scramble the set many times, on each time, save the simulated mean contract size difference. 

If only a tiny fraction of simulated mean differences are more extreme than the observed one, we can reject the null hypothesis and say that `dontractSize` influences whether or not a customer churned.

In [11]:
from sklearn.utils import shuffle

df = data.sort_values(by='churn',ascending=True).reset_index().drop('index',axis=1)
class_0_max = max(df.loc[df['churn']==0].index)+1

feature='contractSize'



def mean_diff(data,column):
    mean_p = np.mean(data[data['churn']==1][column])
    mean_e = np.mean(data[data['churn']==0][column])
    return mean_e - mean_p

number_of_permutations = 1000

original_mean_diff = mean_diff(df,feature)

def permutate_mean_differences(data,column,number_of_permutations):
    means = []
    for i in range(number_of_permutations):
        permutation = shuffle(data[column],random_state=i)
        a = np.mean(permutation.loc[:class_0_max])
        b = np.mean(permutation.loc[class_0_max:])
        means.append(a-b)
    return pd.DataFrame(means,columns=['means'])


permutate_mean_differences(df,feature,number_of_permutations)


_ = permutate_mean_differences(df,feature,number_of_permutations)
print(feature)
if original_mean_diff>0:
    p = np.sum(_['means'] >= original_mean_diff) / number_of_permutations
    print('Reject Null Hypothesis')
    print('p = ',p)
elif original_mean_diff<0:
    p = np.sum(_['means'] <= original_mean_diff) / number_of_permutations
    print('Reject Null Hypothesis')
    print('p = ',p)
else:
    print('Fail to Reject Null Hypothesis')
if p>0.05:
    print('Fail to Reject Null Hypothesis')

contractSize
Reject Null Hypothesis
p =  0.004


So this means that only 0.4% of simulated mean differences were more extreme than the observed value, thus it's 99% certain that contractSize influenced Churn rate.