# Defining the problem
- Objective: Determine why and when customers are leaving.
- Business Impact: Assess how churn impacts revenue and long-term growth.

In [10]:
import pandas as pd
from scipy.stats import skew,kurtosis, pearsonr
import numpy as np
import seaborn as sns
from itertools import product

In [3]:
# Load the churn dataset (path is relative to the notebook's working directory)
# and display summary statistics for its columns.
df = pd.read_csv('ecom-user-churn-data.csv')
df.describe()

Unnamed: 0,visitorid,ses_rec,ses_rec_avg,ses_rec_sd,ses_rec_cv,user_rec,ses_n,ses_n_r,int_n,int_n_r,...,int_cat16_n,int_cat17_n,int_cat18_n,int_cat19_n,int_cat20_n,int_cat21_n,int_cat22_n,int_cat23_n,int_cat24_n,target_class
count,49358.0,49358.0,49358.0,49358.0,49358.0,49358.0,49358.0,49358.0,49358.0,49358.0,...,49358.0,49358.0,49358.0,49358.0,49358.0,49358.0,49358.0,49358.0,49358.0,49358.0
mean,707298.6,15.45484,11.231611,2.711961,-0.0201,33.822947,3.366445,0.172372,6.716277,1.720975,...,0.955792,0.773714,0.382977,0.732424,0.503343,0.44702,2.102577,0.03813,0.099579,0.885591
std,407209.8,9.184645,18.162743,6.583917,0.917701,25.237703,7.380573,0.372614,38.528882,1.455885,...,6.086722,5.003517,4.569604,4.977989,3.259194,3.873684,16.273213,0.593681,1.135149,0.318311
min,37.0,0.0,0.0,0.0,-1.0,0.0,2.0,-1.0,2.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,353292.0,7.0,0.0,0.0,-1.0,16.0,2.0,0.060606,2.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
50%,710091.0,16.0,2.25,0.0,0.0,26.0,2.0,0.090909,3.0,1.25,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,1060355.0,23.0,14.25,1.0,0.638646,46.0,3.0,0.166667,6.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,1.0
max,1407573.0,31.0,99.0,47.5,11.525121,99.0,475.0,18.0,5549.0,59.0,...,576.0,445.0,481.0,564.0,317.0,420.0,2282.0,54.0,105.0,1.0


# Data prep

In [7]:
# deal with duplicates, handle missing values and anomalies.
# Total number of missing cells in the frame: per-column null counts, summed over all columns.
df.isnull().sum().sum()

0

In [9]:
# Number of rows that are exact duplicates of an earlier row.
df.duplicated().sum()

0

## Univariate analysis
- For Skewness:
    - A skewness close to 0 indicates a symmetrical distribution.
    - A skewness greater than 1 or less than -1 indicates a highly skewed distribution.
    - A skewness between -1 and -0.5 or between 0.5 and 1 indicates moderate skewness.
- For Kurtosis:
    - A kurtosis greater than 3 indicates a leptokurtic distribution (heavier tails, i.e. more outliers than a normal distribution). Expressed as excess kurtosis (kurtosis - 3), this corresponds to a value greater than 0.
    - A kurtosis less than 3 indicates a platykurtic distribution (lighter tails, i.e. fewer outliers than a normal distribution). Expressed as excess kurtosis, this corresponds to a value less than 0.
    - Note: `scipy.stats.kurtosis`, used below, returns excess kurtosis by default, so its output should be compared against 0 rather than 3.
    
These metrics are valuable for data preprocessing in machine learning. Highly skewed or kurtotic data may need transformation, such as logarithmic, square root, or box-cox transformation, to meet the assumptions of various statistical models and algorithms.

In [23]:
#univariate analysis
def univ_analysis(x):
    """
    Summarise the distribution of a single variable.

    Returns a four-item list, in this order:
      0. the 0 %, 25 %, 50 %, 75 % and 100 % quantiles (numpy array),
      1. skewness, from scipy.stats.skew with its default settings,
      2. kurtosis, from scipy.stats.kurtosis with its default settings
         (SciPy's default is the excess / Fisher definition, normal == 0),
      3. sparseness: share of entries equal to zero, rounded to 4 decimals.
    """
    quartiles = np.quantile(x, [0,.25,.5,.75,1])
    # count_nonzero gives the non-zero entries, so the complement is the zero share.
    zero_share = 1 - (np.count_nonzero(x)/len(x))
    return [quartiles, skew(x), kurtosis(x), round(zero_share, 4)]


In [26]:
# Run univ_analysis on every column of df and write the result to df_stats.csv.
df.apply(univ_analysis).to_csv('df_stats.csv')

An open question: can a combination of predictor variables have a stronger impact on the response variable even when some variables in that combination show no relevant correlation with the response on their own? In which kinds of models can that situation arise?

Before getting into bivariate analysis, let's check the correlations and see which predictor variables we want to remove.

In [38]:
# Every ordered pair of columns (self-pairs included, since product uses repeat=2).
pairs = list(product(df.columns, repeat = 2))
corr_ls = []

# One row per pair: [first column, second column, Pearson r, p-value].
for left_col, right_col in pairs:
    rho, rho_pval = pearsonr(df[left_col], df[right_col])
    corr_ls.append([left_col, right_col, rho, rho_pval])



Implementing Weight of Evidence (WoE) and Information Value (IV)

In [114]:
def calc_iv(df,variable,target):
    """
    Compute the Weight of Evidence (WoE) table and Information Value (IV) of one feature.

    Every distinct value of ``df[variable]`` is treated as its own bucket. Missing
    values are grouped into a separate "NULL" bucket so their predictive power is
    evaluated too. ``target`` is read as a binary label: 0 is counted as "churn",
    1 as "no churn"; any other target value only contributes to ``total_ocur``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data. It is not modified.
    variable : str
        Name of the feature column to evaluate.
    target : str
        Name of the binary target column.

    Returns
    -------
    tuple
        ``(IV, woe_df)`` where ``IV`` is the sum of the per-bucket IV contributions and
        ``woe_df`` has one row per bucket with the columns ``feature``, ``feature_val``,
        ``total_ocur``, ``total_churn``, ``total_nochurn``, ``share``, ``share_churn``,
        ``distribution_churn``, ``distribution_nochurn``, ``WoE`` and ``IV``.
        Rows are sorted by ``feature_val``; the index keeps the order of first appearance.
    """
    # Fill missing values on a local copy only, so the caller's frame keeps its NaNs.
    feature_values = df[variable].fillna("NULL")

    # 0/1 indicator columns let a single groupby produce all three counts per bucket.
    # Plain arrays are used so the columns line up by position whatever the index is.
    flags = pd.DataFrame({
        'feature_val': feature_values.to_numpy(),
        'churn': (df[target] == 0).to_numpy().astype(int),
        'nochurn': (df[target] == 1).to_numpy().astype(int),
    })
    woe_df = (
        flags.groupby('feature_val', sort=False)  # sort=False keeps order of first appearance
        .agg(
            total_ocur=('churn', 'size'),
            total_churn=('churn', 'sum'),
            total_nochurn=('nochurn', 'sum'),
        )
        .reset_index()
    )
    woe_df.insert(0, 'feature', variable)

    # share: bucket size relative to all observations.
    # share_churn: churners in the bucket relative to all observations.
    # distribution_*: the bucket's share of all churners / of all non-churners.
    population = woe_df['total_ocur'].sum()
    woe_df['share'] = woe_df['total_ocur']/population
    woe_df['share_churn'] = woe_df['total_churn']/population
    woe_df['distribution_churn'] = woe_df['total_churn']/woe_df['total_churn'].sum()
    woe_df['distribution_nochurn'] = woe_df['total_nochurn']/woe_df['total_nochurn'].sum()

    # WoE = ln(distribution_nochurn / distribution_churn). A bucket with no churners or
    # no non-churners yields +/-inf, which is set to 0 just below, so the numpy
    # warnings for those buckets are silenced rather than shown.
    with np.errstate(divide='ignore', invalid='ignore'):
        woe_df['WoE'] = np.log(woe_df['distribution_nochurn']/woe_df['distribution_churn'])
    woe_df = woe_df.replace({'WoE':{np.inf: 0, -np.inf:0}})

    # Per-bucket IV contribution: WoE weighted by the difference between the two distributions.
    woe_df['IV'] = woe_df['WoE']*(woe_df['distribution_nochurn'] - woe_df['distribution_churn'])

    # Sort by feature value. The "NULL" placeholder cannot be ordered against numeric
    # values, so in that case the real values are sorted and the NULL bucket goes last.
    sort_cols = ['feature','feature_val']
    is_null_bucket = woe_df['feature_val'].map(
        lambda v: isinstance(v, str) and v == "NULL"
    ).astype(bool)
    rest_is_text = woe_df.loc[~is_null_bucket, 'feature_val'].map(
        lambda v: isinstance(v, str)
    ).all()
    if is_null_bucket.any() and not rest_is_text:
        woe_df = pd.concat([
            woe_df[~is_null_bucket].sort_values(by = sort_cols, ascending = [True, True]),
            woe_df[is_null_bucket],
        ])
    else:
        woe_df = woe_df.sort_values(by = sort_cols, ascending = [True, True])

    IV = woe_df['IV'].sum()

    return IV,woe_df


                

In [119]:
# Evaluate `ses_rec` against `target_class`; calc_iv returns (IV, woe_df), which is shown as the cell output.
calc_iv(df,'ses_rec','target_class')
#df['ses_rec'].nunique()

In [87]:
# Tabulate the pairwise results collected in corr_ls.
corr_df = pd.DataFrame(corr_ls, columns=['from', 'to', 'rho', 'pvalue'])
# Keep strongly correlated pairs (|rho| > 0.8), dropping each column's pairing with itself.
corr_df.loc[lambda pairs_tbl: (pairs_tbl['rho'].abs() > .8) & (pairs_tbl['from'] != pairs_tbl['to'])]

Unnamed: 0,from,to,rho,pvalue
51,ses_rec,ses_rec_avg,-0.079104,2.404515e-69
52,ses_rec,ses_rec_sd,-0.079306,1.082075e-69
53,ses_rec,ses_rec_cv,-0.090615,1.707000e-90
54,ses_rec,user_rec,0.232466,0.000000e+00
55,ses_rec,ses_n,-0.069020,3.445606e-53
...,...,...,...,...
2395,target_class,int_cat20_n,-0.073814,1.353599e-60
2396,target_class,int_cat21_n,-0.056880,1.164528e-36
2397,target_class,int_cat22_n,-0.065256,1.005240e-47
2398,target_class,int_cat23_n,-0.033202,1.604395e-13


In [97]:
# Number of rows with target_class == 0 (the class the calc_iv comments label as "churn").
len(df[(df['target_class'] == 0)])

5647

In [71]:
# NOTE(review): `n` is not defined in any cell visible here, so this presumably fails on a
# fresh kernel (Restart & Run All) — define `n` explicitly or remove this scratch cell.
int(n/2) - 1


1