In [1]:
# Importing the required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# To ignore warnings
import warnings
warnings.filterwarnings("ignore")

In [2]:
#Reading telecom churn data from csv file
telecom_data = pd.read_csv("telecom_churn_data.csv")

In [3]:
#Inspecting columns
telecom_data.head()

Unnamed: 0,mobile_number,circle_id,loc_og_t2o_mou,std_og_t2o_mou,loc_ic_t2o_mou,last_date_of_month_6,last_date_of_month_7,last_date_of_month_8,last_date_of_month_9,arpu_6,...,sachet_3g_9,fb_user_6,fb_user_7,fb_user_8,fb_user_9,aon,aug_vbc_3g,jul_vbc_3g,jun_vbc_3g,sep_vbc_3g
0,7000842753,109,0.0,0.0,0.0,6/30/2014,7/31/2014,8/31/2014,9/30/2014,197.385,...,0,1.0,1.0,1.0,,968,30.4,0.0,101.2,3.58
1,7001865778,109,0.0,0.0,0.0,6/30/2014,7/31/2014,8/31/2014,9/30/2014,34.047,...,0,,1.0,1.0,,1006,0.0,0.0,0.0,0.0
2,7001625959,109,0.0,0.0,0.0,6/30/2014,7/31/2014,8/31/2014,9/30/2014,167.69,...,0,,,,1.0,1103,0.0,0.0,4.17,0.0
3,7001204172,109,0.0,0.0,0.0,6/30/2014,7/31/2014,8/31/2014,9/30/2014,221.338,...,0,,,,,2491,0.0,0.0,0.0,0.0
4,7000142493,109,0.0,0.0,0.0,6/30/2014,7/31/2014,8/31/2014,9/30/2014,261.636,...,0,0.0,,,,1526,0.0,0.0,0.0,0.0


In [4]:
#Imputing NaN values with 0 for totat recharge amounnt of data and voice calls
telecom_data['av_rech_amt_data_6'].fillna(0,inplace=True)
telecom_data['av_rech_amt_data_7'].fillna(0,inplace=True)
telecom_data['total_rech_amt_7'].fillna(0,inplace=True)
telecom_data['total_rech_amt_6'].fillna(0,inplace=True)

In [5]:
#Imputing NaN values with 0 for totat no of recharges for data
telecom_data['total_rech_data_7'].fillna(0,inplace=True)
telecom_data['total_rech_data_6'].fillna(0,inplace=True)

In [6]:
#Step1 : Deriving new metrics
#Calculating total recharge amount for data
telecom_data['total_rech_data_amt_7'] = telecom_data['av_rech_amt_data_7'] * telecom_data['total_rech_data_7']
telecom_data['total_rech_data_amt_6'] = telecom_data['av_rech_amt_data_6'] * telecom_data['total_rech_data_6']

In [7]:
#Step2 : Filtering hig value customers
#Calculating average amount of total recharge done for both Jun(6) and July(7)
telecom_data['avg_rech_6_7'] = (telecom_data['total_rech_data_amt_7'] + telecom_data['total_rech_data_amt_6'] + telecom_data['total_rech_amt_6'] + telecom_data['total_rech_amt_7'])/4

In [8]:
#Inspecting average recharge amount
telecom_data['avg_rech_6_7'].head()

0    279.50
1    153.00
2    120.75
3    135.00
4    150.50
Name: avg_rech_6_7, dtype: float64

In [9]:
#Filtering out high value customers who have done average recharge greater than or equal to 70th percentile of average amount of June and July
telecom_data_high_cust = telecom_data[(telecom_data['avg_rech_6_7'] >= round(np.percentile(telecom_data['avg_rech_6_7'],70),2))]

In [10]:
#Inspecting final data frame of high value customers
telecom_data_high_cust.shape

#We got total 30001 high value customers which is very close to 29.9k customers as mentioned in problem statement

(30001, 229)

In [11]:
#Step3 : Tag churners and remove attributes of the churn phase

#Inspecting null values for total incomming for month of August
telecom_data_high_cust['total_ic_mou_9'].isna().sum()

0

In [12]:
#Inspecting null values for total outgoing for month of August
telecom_data_high_cust['total_og_mou_9'].isna().sum()

0

In [13]:
#Inspecting null values for usage of 2g mobile data for month of August
telecom_data_high_cust['vol_2g_mb_9'].isna().sum()

0

In [14]:
#Inspecting null values for usage of 3g mobile data for month of August
telecom_data_high_cust['vol_3g_mb_9'].isna().sum()

0

In [15]:
#Creating a custom function to tag customers as churn =1/0 based on certain criteria
def isChurn(x):
    if x['total_ic_mou_9'] == 0 and x['total_og_mou_9'] == 0 and x['vol_2g_mb_9'] == 0 and x['vol_3g_mb_9'] == 0:
        return 1
    else:
        return 0

In [16]:
#Tagging customers as churn/Not churn by applying above custom function
telecom_data_high_cust['churn'] = telecom_data_high_cust.apply(isChurn,axis=1)

In [17]:
#Inspecting the total count of churned and not churned customers
telecom_data_high_cust['churn'].value_counts()

#We can conclude that we have total 2.4k churned customers.

0    27560
1     2441
Name: churn, dtype: int64

In [18]:
#Removing attrbiutes from final data frame for churn phase( with _9 suffix)
telecom_data_high_cust.drop([col for col in telecom_data_high_cust.columns if "_9" in col], axis=1, inplace=True)

In [19]:
telecom_data_high_cust.columns

Index(['mobile_number', 'circle_id', 'loc_og_t2o_mou', 'std_og_t2o_mou',
       'loc_ic_t2o_mou', 'last_date_of_month_6', 'last_date_of_month_7',
       'last_date_of_month_8', 'arpu_6', 'arpu_7',
       ...
       'fb_user_8', 'aon', 'aug_vbc_3g', 'jul_vbc_3g', 'jun_vbc_3g',
       'sep_vbc_3g', 'total_rech_data_amt_7', 'total_rech_data_amt_6',
       'avg_rech_6_7', 'churn'],
      dtype='object', length=176)