## Importing Libraries and Data

In [1]:
# Importing the required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.decomposition import PCA
from imblearn.over_sampling import SMOTE
import xgboost as xgb

from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

import warnings
warnings.filterwarnings('ignore')

ModuleNotFoundError: No module named 'imblearn'

In [None]:
# Reading the dataset
churn_data = pd.read_csv("telecom_churn_data.csv")

In [None]:
# Settings for viewing the entire columns
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [None]:
churn_data.shape

In [None]:
# Checking the data
churn_data.head(10)

## Data Cleaning

#### `Missing value percentage`

In [None]:
def nullPercentage(churn_data):
  return round((churn_data.isnull().sum()/len(churn_data)*100),2).sort_values(ascending=False)

In [None]:
percentage_ = nullPercentage(churn_data)
percentage_

#### `Splitting the categorical and continuous columns`

In [None]:
# Checking the null data again
print(nullPercentage(churn_data))

In [None]:
# Categorical variables
categorical_columns = churn_data.select_dtypes(exclude=['int','float']).columns
categorical_columns.values

In [None]:
# Continuous variables
continuous_columns = churn_data.select_dtypes(include=['int','float']).columns
continuous_columns.values

In [None]:
# Checking if there are any non_standard missing values
columns_ = churn_data.columns
for col in columns_:
  print ('{} : Unique values {}\n'.format(col, churn_data[col].unique()))

In [None]:
# Converting the categorical columns into datetime type
for col in categorical_columns.values:
  churn_data[col] = pd.to_datetime(churn_data[col])

In [None]:
# Converting the categorical features to object type.
churn_data[['mobile_number','circle_id']] = churn_data[['mobile_number','circle_id']].astype(object)

#### `Imputing the meaningfull missing data by 0`

In [None]:
# Checking the null values in total recharge amount.
churn_data[['total_rech_amt_6','total_rech_amt_7','total_rech_amt_8','total_rech_amt_9']].isnull().sum()

In [None]:
# Checking the null values in average reacharge data and total recharge data.
churn_data[['av_rech_amt_data_6','av_rech_amt_data_7','av_rech_amt_data_8','av_rech_amt_data_9','total_rech_data_6','total_rech_data_7','total_rech_data_8','total_rech_data_9']].isnull().sum()

In [None]:
# Imputing the recharge data columns with 0.
churn_data[['av_rech_amt_data_6','av_rech_amt_data_7','av_rech_amt_data_8','av_rech_amt_data_9']] = churn_data[['av_rech_amt_data_6','av_rech_amt_data_7','av_rech_amt_data_8','av_rech_amt_data_9']].fillna(0)
churn_data[['total_rech_data_6','total_rech_data_7','total_rech_data_8','total_rech_data_9']] = churn_data[['total_rech_data_6','total_rech_data_7','total_rech_data_8','total_rech_data_9']].fillna(0)

#### `Dropping the columns which has high percentage of missing values`

In [None]:
# Function to create null_percentage, columns dataframe
def createNullDict(data):
  columns_ = data.columns
  null_percentage_ = round((data.isnull().sum()/len(data)*100),2)
  dataframe = pd.DataFrame({'Columns':columns_,'Percentage':null_percentage_})
  dataframe.reset_index(drop=True,inplace=True)
  return dataframe  

# Dropping the columns which has more than 30% of missing values except 9th month data
# creating a dictionary of columns and their respective missing percentage
null_dataframe = createNullDict(churn_data)
columns_to_drop = null_dataframe.loc[(null_dataframe['Columns'].str.contains('_9')==False) & (null_dataframe['Percentage']>20)]['Columns']

# Printing the value which are dropped
print (columns_to_drop)

# Dropping the columns 
churn_data.drop(columns_to_drop, axis=1, inplace=True)


In [None]:
# Checking the dataframe after dropping
print('Shape is',churn_data.shape)
churn_data.head(10)

#### `Checking the Discrete values`

In [None]:
churn_data.describe(percentiles=[0.25,0.5,0.75,0.90,0.95,0.99])

In [None]:
print(nullPercentage(churn_data))

In [None]:
# Dividing the discrete and continuous varibales
dataFrame =  createNullDict(churn_data)

discrete_variables = dataFrame.loc[(dataFrame['Columns'].str.contains('sachet')) | (dataFrame['Columns'].str.contains('aon')) | (dataFrame['Columns'].str.contains('monthly')) ]['Columns']
discrete_variables.values
for feature in discrete_variables.values:
  index = dataFrame[dataFrame['Columns'] == feature].index.values[0]
  print ('{} Missing value {}'.format(feature, dataFrame._get_value(index,'Percentage')))




`As there are no missing discrete variables we will be imputing the continuous variables using k-NN`

#### `Imputing the missing values`

In [None]:
print(nullPercentage(churn_data))

In [None]:
# Creating the list of columns which has to be imputed
columns_to_impute = []
for col in churn_data.columns:
  if 'ic' in col or 'og' in col or '_mou' in col:
    columns_to_impute.append(col)

In [None]:
# Imputing the columns with mean
for col in columns_to_impute:
  churn_data[col].fillna(churn_data[col].mean(),inplace=True)

In [None]:
# Checking the null percentage
nullPercentage(churn_data)

## Filtering the High Value Customers
High value customers are those who have recharged greater than or equal to 70th percentile of the total recharge done by the customer

In [None]:
[ col for col in churn_data.columns if 'date' in col]

In [None]:
[col for col in churn_data.columns if 'rech' in col]

In [None]:
# Deriving the total_recharge_amount column for 6th and 7th months which is known as good phase.
churn_data['total_recharge_amt_6'] = churn_data['av_rech_amt_data_6'] + churn_data['total_rech_num_6']
churn_data['total_recharge_amt_7'] = churn_data['av_rech_amt_data_7'] + churn_data['total_rech_num_7']
churn_data['total_recharge_amt_8'] = churn_data['av_rech_amt_data_8'] + churn_data['total_rech_num_8']
churn_data['total_recharge_amt_9'] = churn_data['av_rech_amt_data_9'] + churn_data['total_rech_num_9']

In [None]:
# Dropping the columns which are used to derive a new column
churn_data.drop(['av_rech_amt_data_6','av_rech_amt_data_7','total_rech_amt_6','total_rech_amt_7'],axis=1,inplace=True)

In [None]:
# Deriving the average of 6th and 7th month recharge column
churn_data['average_amt_6n7_month'] = (churn_data['total_recharge_amt_6']+churn_data['total_recharge_amt_7'])/2


In [None]:
# Checking the derived column
churn_data['average_amt_6n7_month'].head()

In [None]:
# Calculating the 70th percentile
percentile = churn_data['average_amt_6n7_month'].quantile(0.7)
print(f'70th percentile is {percentile}')

In [None]:
# Retaining the data of high value custormers, that who are greater than 70th percentile
churn_data_new = churn_data[churn_data['average_amt_6n7_month']>=percentile]

In [None]:
churn_data_new.head(10)

In [None]:
nullPercentage(churn_data_new)

## Deriving the churn column

In [None]:
# Checking whether there are any null columns while derive the churn columns.
churn_data_new[['total_ic_mou_9','total_og_mou_9','vol_2g_mb_9','vol_3g_mb_9']].isnull().sum()

In [None]:
churn_data_new['churned_column'] = np.where(churn_data_new[['total_ic_mou_9','total_og_mou_9','vol_2g_mb_9','vol_3g_mb_9']].sum(axis=1) == 0,1,0)

In [None]:
churn_data_new[['total_ic_mou_9','total_og_mou_9','vol_2g_mb_9','vol_3g_mb_9','churned_column']].head(10)

#### `Dropping the 9th month data`

In [None]:
# As it is the churn phase so we are dropping all this data
dropping_columns = [col for col in churn_data_new.columns if '_9' in col]

In [None]:
churn_data_new.drop(dropping_columns,axis=1,inplace=True)

In [None]:
churn_data_new.shape

In [None]:
churn_data_new.head(5)

## Data PreProcessing

#### `Deriving the variable`

In [None]:
# Converting the AON(Age on Network) from days to months
churn_data_new['aon_month'] = churn_data_new['aon']/30

In [None]:
churn_data_new[['aon_month','aon']].head()

In [None]:
# Dropping the aon column as we have derived a column from it.
churn_data_new.drop('aon',axis=1,inplace=True)

In [None]:
# churn_data_new[''] = churn_data_new['total_recharge_amt_6']-churn_data_new['total_recharge_amt_8']

#### `Outliers Treatment`

In [None]:
churn_data_new.describe(percentiles=[0.25,0.5,0.75,0.90,0.95,0.99])

In [None]:
# Excluding the column which does not have the outliers
no_outliers = ['loc_og_t2o_mou','std_og_t2o_mou','loc_ic_t2o_mou','std_og_t2c_mou_6','std_og_t2c_mou_7','std_og_t2c_mou_8','std_ic_t2o_mou_6','std_ic_t2o_mou_7','std_ic_t2o_mou_8','mobile_number','circle_id','churned_column','last_date_of_month_6','last_date_of_month_7','last_date_of_month_8']

In [None]:
churn_data_new[no_outliers].describe()

In [None]:
columns_to_treat = []
for col in churn_data_new.columns:
  if col not in  discrete_variables.values and col not in no_outliers and 'date' not in col:
    columns_to_treat.append(col)


In [None]:
churn_data_new[columns_to_treat].describe()

In [None]:
# Checking the percentage of outliers
for col in columns_to_treat:
  q = churn_data_new[col].quantile(0.99)
  percentage = churn_data_new[ churn_data_new[col] >= q ].shape[0]/churn_data_new.shape[0]*100
  print (f"Column: {col} Percentage:{percentage} ")


`Log Transforming the outliers as we have very less percentage`

In [None]:
# Checking the data after dropping
churn_data_new.describe(percentiles=[0.25,0.5,0.75,0.90,0.95,0.99])

In [None]:
# Replacing the negative arpu values with zero
cols = ['arpu_6','arpu_7','arpu_8']
for col in cols:
  churn_data_new[churn_data_new[col]<0] = 0

In [None]:
churn_data_new[columns_to_treat].dtypes

In [None]:
# Applying the log transformation to treat the outliers
churn_data_new[columns_to_treat] = churn_data_new[columns_to_treat].apply(lambda x:np.log(1+x))

In [None]:
churn_data_new.describe(percentiles=[0.25,0.5,0.75,0.90,0.95,0.99])

In [None]:
print(churn_data_new.shape)

#### `Graphs`

In [None]:
plt.figure(figsize=(10,10))
f, axes = plt.subplots(2, 2, figsize=(7, 7), sharex=True)
sns.distplot(churn_data_new['total_recharge_amt_6'],bins=50,rug=False,kde=False,ax=axes[0, 0])
sns.distplot(churn_data_new['total_recharge_amt_7'],bins=50,rug=False,kde=False,ax=axes[0, 1])
sns.distplot(churn_data_new['total_recharge_amt_8'],bins=50,rug=False,kde=False,ax=axes[1, 0])

plt.show()


In [None]:
# Checking the correlation between the churn V\s other columns
plt.figure(figsize=(30,30))
churn_data_new.corr()['churned_column'].sort_values(ascending = False).plot(kind='bar')

In [None]:
sns.boxplot(churn_data_new['churned_column'],churn_data_new['aon_month'])

## Model building

#### `Treating the imbalance dataset`

In [None]:
# Removing all the object columns
categorical_columns_ = churn_data_new.select_dtypes(exclude=['int','float']).columns


In [None]:
for col in categorical_columns_:
  churn_data_new.drop(col,axis=1,inplace=True)

In [None]:
churn_data_new.head(10)
print(churn_data_new.shape)

In [None]:
# We can see that the dataset is imbalance since 92% of customers did't churned.
churn_data_new['churned_column'].value_counts()/churn_data_new.shape[0] * 100


In [None]:
main_df = churn_data_new

In [None]:
X = main_df.drop(['churned_column'],axis=1)
y = main_df['churned_column']

In [None]:
# Spliting the data as train and test sets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, train_size=0.7, random_state=1)
print("Dimension of X_train:", X_train.shape)
print("Dimension of X_test:", X_test.shape)

In [None]:
from imblearn.over_sampling import SMOTE

sm = SMOTE(kind = "regular")
X_train_,y_train_ = sm.fit_sample(X_train,y_train)

In [None]:
# Checking the shape and also the balanced data after treated using the SMOTE
print("Dimension of X_tr Shape:", X_train_.shape)
print("Dimension of y_tr Shape:", y_train_.shape)

print("Imbalance in Training dataset:",(y_train_ != 0).sum()/(y_train_ == 0).sum())

In [None]:
# Confusion metrics
def confusion_matrix_(y_test,y_preds):
  cm1 = confusion_matrix(y_test, y_preds)
  print('Confusion Matrix : \n', cm1)

  total1=sum(sum(cm1))
  #####from confusion matrix calculate accuracy
  accuracy1=(cm1[0,0]+cm1[1,1])/total1
  print ('Accuracy : ', accuracy1)

  sensitivity1 = cm1[0,0]/(cm1[0,0]+cm1[0,1])
  print('Sensitivity : ', sensitivity1 )

  specificity1 = cm1[1,1]/(cm1[1,0]+cm1[1,1])
  print('Specificity : ', specificity1)


#### `Logistic Regression`

In [None]:
lr = LogisticRegression()
lr.fit(X_train_, y_train_)

In [None]:
y_pred = lr.predict(X_test)
metrics.accuracy_score(y_test, y_pred)

In [None]:
confusion_matrix_(y_test,y_pred)  

#### `Random Forest`

Random Forest with default parameters

In [None]:
# Splitting and applying the SMOTE to balance the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, train_size=0.7, random_state=1)

sm = SMOTE(kind = "regular")
X_train_sm, y_train_sm = sm.fit_sample(X_train,y_train)

In [None]:
rfc = RandomForestClassifier()

In [None]:
rfc.fit(X_train_sm,y_train_sm)

In [None]:
predictions = rfc.predict(X_test)

In [None]:
confusion_matrix_(y_test, predictions)

In [None]:
# Dictionary of importance of features from the model.
feature_dict={}
num = len(X_train.columns) - 1
for i in range(0,num):
   feature_dict[X_train.columns[i]] = rfc.feature_importances_[i] 

In [None]:
# Get the 10 most important features
def keyfunction(k):
    return feature_dict[k]
for key in sorted(feature_dict, key=keyfunction, reverse=True)[:10]:
    print (f"{key} : {feature_dict[key]}")

Tuning the hyperparameter

In [None]:
# Tuning the hyper-parameter using the GridSearchCV
# Create the parameter grid based on the results of random search 
param_grid = {
    'max_depth': [10, 20, 30],
    'min_samples_leaf': range(1, 5, 10),
    'min_samples_split': range(2, 10, 20),
    'n_estimators': [50, 75, 100], 
    'max_features': [10, 20, 30]
}
# Create a based model
rf = RandomForestClassifier()
# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
                          cv = 3, n_jobs = -1,verbose = 1)

In [None]:
# Fitting the GridSearchCV to the training data
grid_search.fit(X_train_sm, y_train_sm)

In [None]:
# printing the optimal accuracy score and hyperparameters
print('The accuracy obtained by best parameters is',grid_search.best_score_,'using',grid_search.best_params_)

Random Forest with optimal parameters obatained from Hyperparameter tuning

In [None]:
rfc = RandomForestClassifier(bootstrap=True,
                             max_depth=8,
                             min_samples_leaf=1, 
                             min_samples_split=2,
                             max_features=30,
                             n_estimators=75)

In [None]:
rfc.fit(X_train_sm, y_train_sm)

In [None]:
predictions_hp = rfc.predict(X_test)

In [None]:
confusion_matrix_(y_test, predictions_hp)

#### `PCA`



In [None]:
# Doing the train test split and then applying SMOTE to balance the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, train_size=0.7, random_state=1)

# Applying SMOTE 
sm = SMOTE(kind = "regular")
X_train_sm,y_train_sm = sm.fit_sample(X_train,y_train)
print(X_train_sm.shape)
print(y_train_sm.shape)

In [None]:
# Applying PCA on train data
pca = PCA(random_state=100)
pca.fit(X_train_sm)

In [None]:
X_train_pca = pca.fit_transform(X_train_sm)

In [None]:
X_train_pca.shape

In [None]:
X_train_sm.shape

In [None]:
# Scree plot to check the variance explained by different PCAs
fig = plt.figure(figsize = (12,8))
plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.xlabel('Number of principal components')
plt.ylabel('Explained variance - cumulative')
plt.show()

In [None]:
# From the graph we can see that around 30 features can explain 90% of data
pca_30 = PCA(n_components=30)
X_train_pca_30 = pca_30.fit_transform(X_train_sm)
X_test_pca_30 = pca_30.transform(X_test)

In [None]:
print(X_train_pca_30.shape)
print(y_train_sm.shape)
print(X_test_pca_30.shape)
print(y_test.shape)

#### `XGBoost`

In [None]:
# Applying the XGBoost on the features obtained after doing the PCA
train = xgb.DMatrix(X_train_pca_30,label=y_train_sm)
test=xgb.DMatrix(X_test_pca_30,label=y_test)

In [None]:
param={
    'max_depth':10,
    'eta':0.3,
    'objective':'multi:softmax',
    'num_class':2
}
epochs=10

In [None]:
model = xgb.train(param, train, epochs)

In [None]:
y_pred_xg = model.predict(test)

In [None]:
confusion_matrix_(y_test,y_pred_xg)

#### `Decision Tree`

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
clf = DecisionTreeClassifier( criterion = "entropy",class_weight= "balanced", random_state = 100, max_depth = 3, min_samples_leaf = 5)

In [None]:
clf.fit(X_train_pca_30, y_train_sm)

In [None]:
y_pred = clf.predict(X_test_pca_30)

In [None]:
confusion_matrix_(y_test, y_pred)

#### `Random Forest with PCA and default parameters`

In [None]:
rfc = RandomForestClassifier()

In [None]:
rfc.fit(X_train_pca_30, y_train_sm)

In [None]:
predictions = rfc.predict(X_test_pca_30)

In [None]:
confusion_matrix_(y_test, predictions)

#### `Random Forest with PCA and hyperparameter tuning`

In [None]:
param_grid = {
    'max_depth': [2, 4, 8],
    'min_samples_leaf': range(1, 5, 10),
    'min_samples_split': range(2, 10, 20),
    'n_estimators': [50, 75, 100], 
    'max_features': [10, 20, 30]
}
# Create a based model
rf = RandomForestClassifier()
# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
                          cv = 3, n_jobs = -1,verbose = 1)

In [None]:
grid_search.fit(X_train_pca_30, y_train_sm)

In [None]:
# printing the optimal accuracy score and hyperparameters
print('The accuracy obtained by best parameters is',grid_search.best_score_,'using',grid_search.best_params_)

Using the optimal parameters obtained above

In [None]:
rfc = RandomForestClassifier(bootstrap=True,
                             max_depth=8,
                             min_samples_leaf=1, 
                             min_samples_split=2,
                             max_features=30,
                             n_estimators=100)

In [None]:
rfc.fit(X_train_pca_30, y_train_sm)

In [None]:
predictions_hp = rfc.predict(X_test_pca_30)

In [None]:
confusion_matrix_(y_test, predictions_hp)

# Conclusion

Random Forest is the best model which gives a sensitivity of 97% 

Top 10 most important features which are needed to predict the churn were


- roam_ic_mou_8 (Minutes of usage of  Roaming Incoming calls on 8th month)
- roam_og_mou_8 (Minutes of usage of  Roaming Outgoing calls on 8th month)
- total_ic_mou_8 (Total incoming calles on 8th month)
- total_rech_amt_8 (Total recharge amount on 8th month)
- last_day_rch_amt_8 (Last recharge amount on 8th month) 
- total_recharge_amt_8 (Total recharge amount)
- av_rech_amt_data_8 ( Average recharge amount of data)
- arpu_8 (Average revenue per user)
- total_rech_data_8 (Total recharge of data)
- total_og_mou_8 (Total outgoing minutes of usage)


