# Predicting customer churn in telecom industry


In [1]:
# Importing the required libraries

import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import scale
import warnings
warnings.filterwarnings('ignore')

## Loading the data in the dataframe

In [2]:
pd.set_option('display.max_columns',250)
telecom_df = pd.read_csv("telecom_churn_data.csv")

In [3]:
telecom_df.head()

Unnamed: 0,mobile_number,circle_id,loc_og_t2o_mou,std_og_t2o_mou,loc_ic_t2o_mou,last_date_of_month_6,last_date_of_month_7,last_date_of_month_8,last_date_of_month_9,arpu_6,arpu_7,arpu_8,arpu_9,onnet_mou_6,onnet_mou_7,onnet_mou_8,onnet_mou_9,offnet_mou_6,offnet_mou_7,offnet_mou_8,offnet_mou_9,roam_ic_mou_6,roam_ic_mou_7,roam_ic_mou_8,roam_ic_mou_9,roam_og_mou_6,roam_og_mou_7,roam_og_mou_8,roam_og_mou_9,loc_og_t2t_mou_6,loc_og_t2t_mou_7,loc_og_t2t_mou_8,loc_og_t2t_mou_9,loc_og_t2m_mou_6,loc_og_t2m_mou_7,loc_og_t2m_mou_8,loc_og_t2m_mou_9,loc_og_t2f_mou_6,loc_og_t2f_mou_7,loc_og_t2f_mou_8,loc_og_t2f_mou_9,loc_og_t2c_mou_6,loc_og_t2c_mou_7,loc_og_t2c_mou_8,loc_og_t2c_mou_9,loc_og_mou_6,loc_og_mou_7,loc_og_mou_8,loc_og_mou_9,std_og_t2t_mou_6,std_og_t2t_mou_7,std_og_t2t_mou_8,std_og_t2t_mou_9,std_og_t2m_mou_6,std_og_t2m_mou_7,std_og_t2m_mou_8,std_og_t2m_mou_9,std_og_t2f_mou_6,std_og_t2f_mou_7,std_og_t2f_mou_8,std_og_t2f_mou_9,std_og_t2c_mou_6,std_og_t2c_mou_7,std_og_t2c_mou_8,std_og_t2c_mou_9,std_og_mou_6,std_og_mou_7,std_og_mou_8,std_og_mou_9,isd_og_mou_6,isd_og_mou_7,isd_og_mou_8,isd_og_mou_9,spl_og_mou_6,spl_og_mou_7,spl_og_mou_8,spl_og_mou_9,og_others_6,og_others_7,og_others_8,og_others_9,total_og_mou_6,total_og_mou_7,total_og_mou_8,total_og_mou_9,loc_ic_t2t_mou_6,loc_ic_t2t_mou_7,loc_ic_t2t_mou_8,loc_ic_t2t_mou_9,loc_ic_t2m_mou_6,loc_ic_t2m_mou_7,loc_ic_t2m_mou_8,loc_ic_t2m_mou_9,loc_ic_t2f_mou_6,loc_ic_t2f_mou_7,loc_ic_t2f_mou_8,loc_ic_t2f_mou_9,loc_ic_mou_6,loc_ic_mou_7,loc_ic_mou_8,loc_ic_mou_9,std_ic_t2t_mou_6,std_ic_t2t_mou_7,std_ic_t2t_mou_8,std_ic_t2t_mou_9,std_ic_t2m_mou_6,std_ic_t2m_mou_7,std_ic_t2m_mou_8,std_ic_t2m_mou_9,std_ic_t2f_mou_6,std_ic_t2f_mou_7,std_ic_t2f_mou_8,std_ic_t2f_mou_9,std_ic_t2o_mou_6,std_ic_t2o_mou_7,std_ic_t2o_mou_8,std_ic_t2o_mou_9,std_ic_mou_6,std_ic_mou_7,std_ic_mou_8,std_ic_mou_9,total_ic_mou_6,total_ic_mou_7,total_ic_mou_8,total_ic_mou_9,spl_ic_mou_6,spl_ic_mou_7,spl_ic_mou_8,spl_ic_mou_9,isd_ic_mou_6,isd_ic_mou_7,isd_ic_mou_8,isd_ic_mou_9,ic_others_6,ic_others_7,ic_others_8,ic_others_9,total_rech_num_6,total_rech_num_7,total_rech_num_8,total_rech_num_9,total_rech_amt_6,total_rech_amt_7,total_rech_amt_8,total_rech_amt_9,max_rech_amt_6,max_rech_amt_7,max_rech_amt_8,max_rech_amt_9,date_of_last_rech_6,date_of_last_rech_7,date_of_last_rech_8,date_of_last_rech_9,last_day_rch_amt_6,last_day_rch_amt_7,last_day_rch_amt_8,last_day_rch_amt_9,date_of_last_rech_data_6,date_of_last_rech_data_7,date_of_last_rech_data_8,date_of_last_rech_data_9,total_rech_data_6,total_rech_data_7,total_rech_data_8,total_rech_data_9,max_rech_data_6,max_rech_data_7,max_rech_data_8,max_rech_data_9,count_rech_2g_6,count_rech_2g_7,count_rech_2g_8,count_rech_2g_9,count_rech_3g_6,count_rech_3g_7,count_rech_3g_8,count_rech_3g_9,av_rech_amt_data_6,av_rech_amt_data_7,av_rech_amt_data_8,av_rech_amt_data_9,vol_2g_mb_6,vol_2g_mb_7,vol_2g_mb_8,vol_2g_mb_9,vol_3g_mb_6,vol_3g_mb_7,vol_3g_mb_8,vol_3g_mb_9,arpu_3g_6,arpu_3g_7,arpu_3g_8,arpu_3g_9,arpu_2g_6,arpu_2g_7,arpu_2g_8,arpu_2g_9,night_pck_user_6,night_pck_user_7,night_pck_user_8,night_pck_user_9,monthly_2g_6,monthly_2g_7,monthly_2g_8,monthly_2g_9,sachet_2g_6,sachet_2g_7,sachet_2g_8,sachet_2g_9,monthly_3g_6,monthly_3g_7,monthly_3g_8,monthly_3g_9,sachet_3g_6,sachet_3g_7,sachet_3g_8,sachet_3g_9,fb_user_6,fb_user_7,fb_user_8,fb_user_9,aon,aug_vbc_3g,jul_vbc_3g,jun_vbc_3g,sep_vbc_3g
0,7000842753,109,0.0,0.0,0.0,6/30/2014,7/31/2014,8/31/2014,9/30/2014,197.385,214.816,213.803,21.1,,,0.0,,,,0.0,,,,0.0,,,,0.0,,,,0.0,,,,0.0,,,,0.0,,,,0.0,,,,0.0,,,,0.0,,,,0.0,,,,0.0,,,,0.0,,,,0.0,,,,0.0,,,,0.0,,,,0.0,,0.0,0.0,0.0,0.0,,,0.16,,,,4.13,,,,1.15,,,,5.44,,,,0.0,,,,0.0,,,,0.0,,,,0.0,,,,0.0,,0.0,0.0,5.44,0.0,,,0.0,,,,0.0,,,,0.0,,4,3,2,6,362,252,252,0,252,252,252,0,6/21/2014,7/16/2014,8/8/2014,9/28/2014,252,252,252,0,6/21/2014,7/16/2014,8/8/2014,,1.0,1.0,1.0,,252.0,252.0,252.0,,0.0,0.0,0.0,,1.0,1.0,1.0,,252.0,252.0,252.0,,30.13,1.32,5.75,0.0,83.57,150.76,109.61,0.0,212.17,212.17,212.17,,212.17,212.17,212.17,,0.0,0.0,0.0,,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,1.0,1.0,1.0,,968,30.4,0.0,101.2,3.58
1,7001865778,109,0.0,0.0,0.0,6/30/2014,7/31/2014,8/31/2014,9/30/2014,34.047,355.074,268.321,86.285,24.11,78.68,7.68,18.34,15.74,99.84,304.76,53.76,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,23.88,74.56,7.68,18.34,11.51,75.94,291.86,53.76,0.0,0.0,0.0,0.0,0.0,2.91,0.0,0.0,35.39,150.51,299.54,72.11,0.23,4.11,0.0,0.0,0.0,0.46,0.13,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.23,4.58,0.13,0.0,0.0,0.0,0.0,0.0,4.68,23.43,12.76,0.0,0.0,0.0,0.0,0.0,40.31,178.53,312.44,72.11,1.61,29.91,29.23,116.09,17.48,65.38,375.58,56.93,0.0,8.93,3.61,0.0,19.09,104.23,408.43,173.03,0.0,0.0,2.35,0.0,5.9,0.0,12.49,15.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.9,0.0,14.84,15.01,26.83,104.23,423.28,188.04,0.0,0.0,0.0,0.0,1.83,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4,9,11,5,74,384,283,121,44,154,65,50,6/29/2014,7/31/2014,8/28/2014,9/30/2014,44,23,30,0,,7/25/2014,8/10/2014,,,1.0,2.0,,,154.0,25.0,,,1.0,2.0,,,0.0,0.0,,,154.0,50.0,,0.0,108.07,365.47,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,,,28.61,7.6,,,0.0,0.0,,0,1,0,0,0,0,2,0,0,0,0,0,0,0,0,0,,1.0,1.0,,1006,0.0,0.0,0.0,0.0
2,7001625959,109,0.0,0.0,0.0,6/30/2014,7/31/2014,8/31/2014,9/30/2014,167.69,189.058,210.226,290.714,11.54,55.24,37.26,74.81,143.33,220.59,208.36,118.91,0.0,0.0,0.0,38.49,0.0,0.0,0.0,70.94,7.19,28.74,13.58,14.39,29.34,16.86,38.46,28.16,24.11,21.79,15.61,22.24,0.0,135.54,45.76,0.48,60.66,67.41,67.66,64.81,4.34,26.49,22.58,8.76,41.81,67.41,75.53,9.28,1.48,14.76,22.83,0.0,0.0,0.0,0.0,0.0,47.64,108.68,120.94,18.04,0.0,0.0,0.0,0.0,46.56,236.84,96.84,42.08,0.45,0.0,0.0,0.0,155.33,412.94,285.46,124.94,115.69,71.11,67.46,148.23,14.38,15.44,38.89,38.98,99.48,122.29,49.63,158.19,229.56,208.86,155.99,345.41,72.41,71.29,28.69,49.44,45.18,177.01,167.09,118.18,21.73,58.34,43.23,3.86,0.0,0.0,0.0,0.0,139.33,306.66,239.03,171.49,370.04,519.53,395.03,517.74,0.21,0.0,0.0,0.45,0.0,0.85,0.0,0.01,0.93,3.14,0.0,0.36,5,4,2,7,168,315,116,358,86,200,86,100,6/17/2014,7/24/2014,8/14/2014,9/29/2014,0,200,86,0,,,,9/17/2014,,,,1.0,,,,46.0,,,,1.0,,,,0.0,,,,46.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.42,,,,2.84,,,,0.0,,,,0.0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,,,,1.0,1103,0.0,0.0,4.17,0.0
3,7001204172,109,0.0,0.0,0.0,6/30/2014,7/31/2014,8/31/2014,9/30/2014,221.338,251.102,508.054,389.5,99.91,54.39,310.98,241.71,123.31,109.01,71.68,113.54,0.0,54.86,44.38,0.0,0.0,28.09,39.04,0.0,73.68,34.81,10.61,15.49,107.43,83.21,22.46,65.46,1.91,0.65,4.91,2.06,0.0,0.0,0.0,0.0,183.03,118.68,37.99,83.03,26.23,14.89,289.58,226.21,2.99,1.73,6.53,9.99,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,29.23,16.63,296.11,236.21,0.0,0.0,0.0,0.0,10.96,0.0,18.09,43.29,0.0,0.0,0.0,0.0,223.23,135.31,352.21,362.54,62.08,19.98,8.04,41.73,113.96,64.51,20.28,52.86,57.43,27.09,19.84,65.59,233.48,111.59,48.18,160.19,43.48,66.44,0.0,129.84,1.33,38.56,4.94,13.98,1.18,0.0,0.0,0.0,0.0,0.0,0.0,0.0,45.99,105.01,4.94,143.83,280.08,216.61,53.13,305.38,0.59,0.0,0.0,0.55,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.8,10,11,18,14,230,310,601,410,60,50,50,50,6/28/2014,7/31/2014,8/31/2014,9/30/2014,30,50,50,30,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,,,,2491,0.0,0.0,0.0,0.0
4,7000142493,109,0.0,0.0,0.0,6/30/2014,7/31/2014,8/31/2014,9/30/2014,261.636,309.876,238.174,163.426,50.31,149.44,83.89,58.78,76.96,91.88,124.26,45.81,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,50.31,149.44,83.89,58.78,67.64,91.88,124.26,37.89,0.0,0.0,0.0,1.93,0.0,0.0,0.0,0.0,117.96,241.33,208.16,98.61,0.0,0.0,0.0,0.0,9.31,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.31,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.98,0.0,0.0,0.0,0.0,127.28,241.33,208.16,104.59,105.68,88.49,233.81,154.56,106.84,109.54,104.13,48.24,1.5,0.0,0.0,0.0,214.03,198.04,337.94,202.81,0.0,0.0,0.86,2.31,1.93,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.93,0.25,0.86,2.31,216.44,198.29,338.81,205.31,0.0,0.0,0.0,0.18,0.0,0.0,0.0,0.0,0.48,0.0,0.0,0.0,5,6,3,4,196,350,287,200,56,110,110,50,6/26/2014,7/28/2014,8/9/2014,9/28/2014,50,110,110,50,6/4/2014,,,,1.0,,,,56.0,,,,1.0,,,,0.0,,,,56.0,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,0.0,,,,0.0,,,,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0.0,,,,1526,0.0,0.0,0.0,0.0


In [4]:
telecom_df.shape

(99999, 226)

# Filtering the high value customers

Definition: Those who have recharged with an amount more than or equal to X, where X is the 70th percentile of the average recharge amount in the first two months (the good phase)



In [5]:
#Identifying the columns with recharge information
telecom_recharge_cols = [col for col in telecom_df.columns if 'rech_' in col]
print(telecom_recharge_cols)

['total_rech_num_6', 'total_rech_num_7', 'total_rech_num_8', 'total_rech_num_9', 'total_rech_amt_6', 'total_rech_amt_7', 'total_rech_amt_8', 'total_rech_amt_9', 'max_rech_amt_6', 'max_rech_amt_7', 'max_rech_amt_8', 'max_rech_amt_9', 'date_of_last_rech_6', 'date_of_last_rech_7', 'date_of_last_rech_8', 'date_of_last_rech_9', 'date_of_last_rech_data_6', 'date_of_last_rech_data_7', 'date_of_last_rech_data_8', 'date_of_last_rech_data_9', 'total_rech_data_6', 'total_rech_data_7', 'total_rech_data_8', 'total_rech_data_9', 'max_rech_data_6', 'max_rech_data_7', 'max_rech_data_8', 'max_rech_data_9', 'count_rech_2g_6', 'count_rech_2g_7', 'count_rech_2g_8', 'count_rech_2g_9', 'count_rech_3g_6', 'count_rech_3g_7', 'count_rech_3g_8', 'count_rech_3g_9', 'av_rech_amt_data_6', 'av_rech_amt_data_7', 'av_rech_amt_data_8', 'av_rech_amt_data_9']


#### Notes:
- Columns starting with total_rech_amt_* indicates the total amount recharged for non-data usage
- Columns starting with total_rech_data_* indicates the total data recharged for data usage
- Columns starting with av_rech_amt_data_* indicates the average amount recharged for data usage
- Net recharge amount for a given month (for e.g month 6) is given by:

   `net_recharge_amt_6 = total_rech_amt_6 + (total_rech_data_6 * av_rech_amt_data_6)`


In [6]:
#Columns required for calculating the net recharge amount for months 6 & 7

recharge_cols = ['total_rech_amt_6','total_rech_amt_7','total_rech_amt_8','total_rech_amt_9',
                 'total_rech_data_6','total_rech_data_7','total_rech_data_8','total_rech_data_9',
                 'av_rech_amt_data_6','av_rech_amt_data_7','av_rech_amt_data_8','av_rech_amt_data_9']


In [7]:
telecom_df[recharge_cols].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99999 entries, 0 to 99998
Data columns (total 12 columns):
total_rech_amt_6      99999 non-null int64
total_rech_amt_7      99999 non-null int64
total_rech_amt_8      99999 non-null int64
total_rech_amt_9      99999 non-null int64
total_rech_data_6     25153 non-null float64
total_rech_data_7     25571 non-null float64
total_rech_data_8     26339 non-null float64
total_rech_data_9     25922 non-null float64
av_rech_amt_data_6    25153 non-null float64
av_rech_amt_data_7    25571 non-null float64
av_rech_amt_data_8    26339 non-null float64
av_rech_amt_data_9    25922 non-null float64
dtypes: float64(8), int64(4)
memory usage: 9.2 MB


#### Notes
- total_rech_data_* and av_rech_amt_data_* columns have null values 
- We decide to replace them with 0 values so that the net recharge amount can be calculated

In [8]:
#Calculating the net recharge amount for all the 4 months

telecom_df['total_recharge_data_amt_6'] = telecom_df['total_rech_data_6'].fillna(0) * telecom_df['av_rech_amt_data_6'].fillna(0)

telecom_df['total_recharge_data_amt_7'] = telecom_df['total_rech_data_7'].fillna(0) * telecom_df['av_rech_amt_data_7'].fillna(0)

telecom_df['total_recharge_data_amt_8'] = telecom_df['total_rech_data_8'].fillna(0) * telecom_df['av_rech_amt_data_8'].fillna(0)

telecom_df['total_recharge_data_amt_9'] = telecom_df['total_rech_data_9'].fillna(0) * telecom_df['av_rech_amt_data_9'].fillna(0)



telecom_df['net_recharge_amt_6'] = telecom_df['total_rech_amt_6'] + telecom_df['total_recharge_data_amt_6']

telecom_df['net_recharge_amt_7'] = telecom_df['total_rech_amt_7'] + telecom_df['total_recharge_data_amt_7']

telecom_df['net_recharge_amt_8'] = telecom_df['total_rech_amt_8'] + telecom_df['total_recharge_data_amt_8']

telecom_df['net_recharge_amt_9'] = telecom_df['total_rech_amt_9'] + telecom_df['total_recharge_data_amt_9']



In [9]:
#Calculating the average recharge amounts for the 2 months
telecom_df['avg_rech_amt_6_7'] = (telecom_df['net_recharge_amt_6']  + telecom_df['net_recharge_amt_7'])/2

In [10]:
##Calculating the 70th percentile of the average recharge amount in the first 2 months
telecom_df["avg_rech_amt_6_7"].describe(percentiles=[.25,.5,.70,.90,.95,.99])

count    99999.000000
mean       485.328068
std        869.423594
min          0.000000
25%        128.000000
50%        275.000000
70%        478.000000
90%       1028.000000
95%       1512.000000
99%       3363.510000
max      61236.000000
Name: avg_rech_amt_6_7, dtype: float64

#### Observation
- The 70th percentile of the average recharge amount in the good phase(first 2 months) is 478

In [11]:
#Filtering the high-value customers based on whether they recharge for an amount greater than or equal to 478
telecom_hv_df = telecom_df[telecom_df['avg_rech_amt_6_7'] >= 478.0]

In [12]:
telecom_hv_df.shape

(30001, 235)

#### Observation
- There are approximately 30K high-value customers

In [13]:
#Dropping the redundant recharge columns
recharge_cols_drop = ['total_rech_data_6','total_rech_data_7','total_rech_data_8','total_rech_data_9',
                      'av_rech_amt_data_6','av_rech_amt_data_7','av_rech_amt_data_8','av_rech_amt_data_9']

telecom_hv_df.drop(recharge_cols_drop, axis=1, inplace=True)

In [14]:
telecom_hv_df.shape

(30001, 227)

# Tagging the churners

Now tag the churned customers (churn=1, else 0) based on the fourth month as follows: Those who have not made any calls (either incoming or outgoing) AND have not used mobile internet even once in the churn phase. The attributes you need to use to tag churners are:

- total_ic_mou_9
- total_og_mou_9
- vol_2g_mb_9
- vol_3g_mb_9

In [15]:
#Checking if any of the above attributes have null values

#  total_ic_mou_9
NullValuesPct = round(100*(telecom_hv_df["total_ic_mou_9"].isnull().sum()/len(telecom_hv_df.index)), 2)
print("% of null values in total_ic_mou_9 : ",NullValuesPct)

#  total_og_mou_9
NullValuesPct = round(100*(telecom_hv_df["total_og_mou_9"].isnull().sum()/len(telecom_hv_df.index)), 2)
print("% of null values in total_og_mou_9: ",NullValuesPct)

#  vol_2g_mb_9
NullValuesPct = round(100*(telecom_hv_df["vol_2g_mb_9"].isnull().sum()/len(telecom_hv_df.index)), 2)
print("% of null values in vol_2g_mb_9 : ",NullValuesPct)

#  vol_3g_mb_9
NullValuesPct = round(100*(telecom_hv_df["vol_3g_mb_9"].isnull().sum()/len(telecom_hv_df.index)), 2)
print("% of null values in vol_3g_mb_9: ",NullValuesPct)

% of null values in total_ic_mou_9 :  0.0
% of null values in total_og_mou_9:  0.0
% of null values in vol_2g_mb_9 :  0.0
% of null values in vol_3g_mb_9:  0.0


In [16]:
# Tagging the churned customers based on the 4th month
telecom_hv_df['churn'] = np.where(telecom_hv_df[['total_ic_mou_9','total_og_mou_9','vol_2g_mb_9','vol_3g_mb_9']].sum(axis=1) == 0, 
                                   1,
                                   0)

In [17]:
#Calculating the total churn %
ChurnPct = round(100*(telecom_hv_df["churn"].sum()/len(telecom_hv_df.index)), 2)
print("% of high-value churn customers: ",ChurnPct)

% of high-value churn customers:  8.14


# Missing value treatment

In [18]:
## Columns with null values
col_with_null = telecom_hv_df.columns[telecom_hv_df.isna().any()].tolist()
print(len(col_with_null))

158


In [19]:
col_to_del=[]
null_col=[]
for col in col_with_null:
    per_null = round((telecom_hv_df[col].isna().sum()/telecom_hv_df.shape[0])*100,2)
    print(col,per_null) 
    if(per_null>30):
        col_to_del.append(col)
    else:
        null_col.append(col)

loc_og_t2o_mou 0.75
std_og_t2o_mou 0.75
loc_ic_t2o_mou 0.75
last_date_of_month_7 0.09
last_date_of_month_8 0.55
last_date_of_month_9 1.22
onnet_mou_6 1.82
onnet_mou_7 1.79
onnet_mou_8 3.91
onnet_mou_9 6.34
offnet_mou_6 1.82
offnet_mou_7 1.79
offnet_mou_8 3.91
offnet_mou_9 6.34
roam_ic_mou_6 1.82
roam_ic_mou_7 1.79
roam_ic_mou_8 3.91
roam_ic_mou_9 6.34
roam_og_mou_6 1.82
roam_og_mou_7 1.79
roam_og_mou_8 3.91
roam_og_mou_9 6.34
loc_og_t2t_mou_6 1.82
loc_og_t2t_mou_7 1.79
loc_og_t2t_mou_8 3.91
loc_og_t2t_mou_9 6.34
loc_og_t2m_mou_6 1.82
loc_og_t2m_mou_7 1.79
loc_og_t2m_mou_8 3.91
loc_og_t2m_mou_9 6.34
loc_og_t2f_mou_6 1.82
loc_og_t2f_mou_7 1.79
loc_og_t2f_mou_8 3.91
loc_og_t2f_mou_9 6.34
loc_og_t2c_mou_6 1.82
loc_og_t2c_mou_7 1.79
loc_og_t2c_mou_8 3.91
loc_og_t2c_mou_9 6.34
loc_og_mou_6 1.82
loc_og_mou_7 1.79
loc_og_mou_8 3.91
loc_og_mou_9 6.34
std_og_t2t_mou_6 1.82
std_og_t2t_mou_7 1.79
std_og_t2t_mou_8 3.91
std_og_t2t_mou_9 6.34
std_og_t2m_mou_6 1.82
std_og_t2m_mou_7 1.79
std_og_t2m_mou

In [20]:
## Dropping columns with more than 73 percent null values
telecom_hv_df=telecom_hv_df.drop(columns=col_to_del)

In [21]:
print(len(col_to_del))
print(telecom_hv_df.shape)

32
(30001, 196)


In [23]:
##Dropping columns with single value
col_to_drop=[]
for col in telecom_hv_df.columns:
   num = telecom_hv_df[col].nunique()
   print(col,num)
   if num ==1:
        col_to_drop.append(col)

mobile_number 30001
circle_id 1
loc_og_t2o_mou 1
std_og_t2o_mou 1
loc_ic_t2o_mou 1
last_date_of_month_6 1
last_date_of_month_7 1
last_date_of_month_8 1
last_date_of_month_9 1
arpu_6 29072
arpu_7 29084
arpu_8 28073
arpu_9 27022
onnet_mou_6 16890
onnet_mou_7 16987
onnet_mou_8 15872
onnet_mou_9 15075
offnet_mou_6 21220
offnet_mou_7 21263
offnet_mou_8 20163
offnet_mou_9 19150
roam_ic_mou_6 4048
roam_ic_mou_7 3385
roam_ic_mou_8 3433
roam_ic_mou_9 3136
roam_og_mou_6 4744
roam_og_mou_7 3994
roam_og_mou_8 4001
roam_og_mou_9 3667
loc_og_t2t_mou_6 10420
loc_og_t2t_mou_7 10394
loc_og_t2t_mou_8 10063
loc_og_t2t_mou_9 9694
loc_og_t2m_mou_6 15625
loc_og_t2m_mou_7 15674
loc_og_t2m_mou_8 15130
loc_og_t2m_mou_9 14593
loc_og_t2f_mou_6 3090
loc_og_t2f_mou_7 3100
loc_og_t2f_mou_8 2989
loc_og_t2f_mou_9 2971
loc_og_t2c_mou_6 1682
loc_og_t2c_mou_7 1764
loc_og_t2c_mou_8 1723
loc_og_t2c_mou_9 1595
loc_og_mou_6 18348
loc_og_mou_7 18502
loc_og_mou_8 17582
loc_og_mou_9 17116
std_og_t2t_mou_6 11102
std_og_t2t_mou_

In [24]:
print(col_to_drop)


['circle_id', 'loc_og_t2o_mou', 'std_og_t2o_mou', 'loc_ic_t2o_mou', 'last_date_of_month_6', 'last_date_of_month_7', 'last_date_of_month_8', 'last_date_of_month_9', 'std_og_t2c_mou_6', 'std_og_t2c_mou_7', 'std_og_t2c_mou_8', 'std_og_t2c_mou_9', 'std_ic_t2o_mou_6', 'std_ic_t2o_mou_7', 'std_ic_t2o_mou_8', 'std_ic_t2o_mou_9']


In [25]:
##Dropping columns with single values
telecom_hv_df=telecom_hv_df.drop(columns=col_to_drop)

In [26]:
### Analyzing columns with null values
col_with_null = telecom_hv_df.columns[telecom_hv_df.isna().any()].tolist()
for col in col_with_null:
    num = telecom_hv_df[col].nunique()
    no_na= round((telecom_hv_df[col].isna().sum()/len(telecom_hv_df.index))*100,2)
    print(col,num,no_na)

onnet_mou_6 16890 1.82
onnet_mou_7 16987 1.79
onnet_mou_8 15872 3.91
onnet_mou_9 15075 6.34
offnet_mou_6 21220 1.82
offnet_mou_7 21263 1.79
offnet_mou_8 20163 3.91
offnet_mou_9 19150 6.34
roam_ic_mou_6 4048 1.82
roam_ic_mou_7 3385 1.79
roam_ic_mou_8 3433 3.91
roam_ic_mou_9 3136 6.34
roam_og_mou_6 4744 1.82
roam_og_mou_7 3994 1.79
roam_og_mou_8 4001 3.91
roam_og_mou_9 3667 6.34
loc_og_t2t_mou_6 10420 1.82
loc_og_t2t_mou_7 10394 1.79
loc_og_t2t_mou_8 10063 3.91
loc_og_t2t_mou_9 9694 6.34
loc_og_t2m_mou_6 15625 1.82
loc_og_t2m_mou_7 15674 1.79
loc_og_t2m_mou_8 15130 3.91
loc_og_t2m_mou_9 14593 6.34
loc_og_t2f_mou_6 3090 1.82
loc_og_t2f_mou_7 3100 1.79
loc_og_t2f_mou_8 2989 3.91
loc_og_t2f_mou_9 2971 6.34
loc_og_t2c_mou_6 1682 1.82
loc_og_t2c_mou_7 1764 1.79
loc_og_t2c_mou_8 1723 3.91
loc_og_t2c_mou_9 1595 6.34
loc_og_mou_6 18348 1.82
loc_og_mou_7 18502 1.79
loc_og_mou_8 17582 3.91
loc_og_mou_9 17116 6.34
std_og_t2t_mou_6 11102 1.82
std_og_t2t_mou_7 11254 1.79
std_og_t2t_mou_8 10251 3.91
s

In [None]:
dates=['date_of_last_rech_6','date_of_last_rech_7','date_of_last_rech_8','date_of_last_rech_9']
for i, col in enumerate(col_with_null):
    if(col not in dates):
        plt.figure(i)
        sns.distplot(telecom_hv_df[~telecom_hv_df[col].isnull()][col])

In [27]:
##Either impute or drop the values
for col in col_with_null:
    #telecom_hv_df = telecom_hv_df[~telecom_hv_df[col].isnull()]
    telecom[col].fillna(value=0,inplace=True)
print(telecom_hv_df.shape)
print(round((telecom_hv_df.shape[0]/30001)*100,2))

(27121, 180)
90.4


In [29]:
col_with_null = telecom_hv_df.columns[telecom_hv_df.isna().any()].tolist()
print(col_with_null)

[]


In [30]:
#Calculating the total churn %
ChurnPct = round(100*(telecom_hv_df["churn"].sum()/len(telecom_hv_df.index)), 2)
print("% of high-value churn customers: ",ChurnPct)

% of high-value churn customers:  2.67
