#  Importing Packages

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import BaggingClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from xgboost import XGBClassifier

# Data Reading

In [None]:
# Load the competition CSVs; `totaldfs` lets later cells loop over both frames.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/customerattritionprediction/train.csv",
        "../input/customerattritionprediction/test.csv",
    )
)
totaldfs = [train, test]

In [None]:
# Preview the first rows of the training data.
train.head()

In [None]:
# Column dtypes and non-null counts for the training data (reveals missing values).
train.info()

The train dataset has 10 missing values in the GrandPayment column.

In [None]:
# Column dtypes and non-null counts for the test data (reveals missing values).
test.info()

The test dataset has 1 missing value in the GrandPayment column.

In [None]:
# Summary statistics of the numeric columns in the training data.
train.describe()

In [None]:
# Show the training rows whose GrandPayment is missing.
train.loc[train['GrandPayment'].isna()]

These customers seem to be **unsatisfied** with the services. They have the shortest service span and a below-average Quarterly Payment.

In [None]:
# Show the test rows whose GrandPayment is missing.
test.loc[test['GrandPayment'].isna()]

This customer also seems to be unsatisfied, as she also has a very short service span.

I have added a **missing-value indicator column** to both the train and test datasets. This column gives the model extra information about these unsatisfied customers, which increases its accuracy.
I have imputed the **minimum value** of GrandPayment for these missing values. The minimum value of GrandPayment is the most appropriate choice for them.

In [None]:
# Add a missing-value indicator column, then impute GrandPayment with each
# frame's own minimum. The indicator must be computed BEFORE imputation,
# otherwise no row would be flagged.
for frame in (train, test):
    frame['GrandPayment_was_missing'] = frame['GrandPayment'].isnull()
    # Assign the filled Series back instead of calling
    # `frame['GrandPayment'].fillna(..., inplace=True)`: the in-place form
    # mutates an intermediate Series (chained assignment), which pandas
    # deprecates and which leaves the frame unchanged under Copy-on-Write.
    frame['GrandPayment'] = frame['GrandPayment'].fillna(frame['GrandPayment'].min())

# Data Preprocessing

**Feature Selection:** I have dropped the features **FilmSubscription** and **Married**, because removing them increases accuracy and they have very little importance (as we will see in the heatmap ahead).

In [None]:
# Categorical features, including the GrandPayment missing-value indicator.
cat_cols = [
    'sex',
    'Aged',
    'TotalDependents',
    'MobileService',
    'CyberProtection',
    'HardwareSupport',
    'TechnicalAssistance',
    'GrandPayment_was_missing',
]

# Numerical features.
num_cols = [
    'ServiceSpan',
    'QuarterlyPayment',
    'GrandPayment',
]

In [None]:
# Encoding of categorical features.
# One encoder per column is fitted on the values of BOTH frames so that a
# given category always maps to the same integer in train and test. Fitting a
# separate encoder on each frame can assign different codes to the same
# category whenever the two frames do not contain identical category sets.
for col in cat_cols:
    col_encoder = LabelEncoder()
    col_encoder.fit(pd.concat([df[col] for df in totaldfs]).astype('str'))
    for df in totaldfs:
        df[col] = col_encoder.transform(df[col].astype('str'))

# Dedicated encoder for the target; kept under the name `label_encoder` so the
# predictions can later be mapped back with `label_encoder.inverse_transform`.
label_encoder = LabelEncoder()
train['CustomerAttrition'] = label_encoder.fit_transform(train['CustomerAttrition'])

I tried various encodings for the multi-category variables, and the following encoding gives me the best accuracy. This encoding results from treating these variables as **ordinal variables**.

In [None]:
# Multi-category features that receive a hand-written ordinal encoding.
multi_cols = [
    '4GService',
    'SettlementProcess',
]

def map_4G(element):
    """Return the ordinal code for a 4GService value.

    'Satellite Broadband' maps to 1; 'Wifi Broadband' and 'No' both map to 0.
    Any other value (including a missing value) yields None.
    """
    codes = (
        ('Satellite Broadband', 1),
        ('Wifi Broadband', 0),
        ('No', 0),
    )
    for label, code in codes:
        if element == label:
            return code
    return None
# Apply the 4GService encoding to both frames.
for frame in (train, test):
    frame['4GService'] = frame['4GService'].map(map_4G)

def map_SP(element):
    """Return the ordinal code for a SettlementProcess value.

    'Electronic' maps to 2, 'Bank' to 1, and 'Check' and 'Card' both to 0.
    Any other value (including a missing value) yields None.
    """
    codes = (
        ('Electronic', 2),
        ('Check', 0),
        ('Bank', 1),
        ('Card', 0),
    )
    for label, code in codes:
        if element == label:
            return code
    return None
# Apply the SettlementProcess encoding to both frames.
for frame in (train, test):
    frame['SettlementProcess'] = frame['SettlementProcess'].map(map_SP)

In [None]:
# Assemble model inputs: the same feature columns, in the same order, for
# the training and test frames; the target is the encoded CustomerAttrition.
feature_cols = cat_cols + num_cols + multi_cols
X = train[feature_cols]
y = train['CustomerAttrition']
X_test = test[feature_cols]
#X_train, X_val, train_y, test_y = train_test_split(X, y, test_size = 0.3, random_state = 0)

# Exploratory Data Analysis

In [None]:
# Correlation heatmap of every selected feature together with the target.
corr_matrix = pd.concat([X, y], axis=1).corr()
fig, ax = plt.subplots(figsize=(15, 10))
sns.heatmap(corr_matrix, annot=True, cmap='cubehelix_r', ax=ax)
plt.show()

# Model Building

**Hyperparameter Tuning**

In [None]:
# GradientBoosting gave the best results, so it was selected.
# NOTE: the grid search below is disabled. It is wrapped in a triple-quoted
# string literal, so none of it executes; remove the surrounding quotes to
# re-run the search over min_samples_split, learning_rate and n_estimators
# (10-fold CV, accuracy scoring, all cores).
'''
modelcheck6 = GradientBoostingClassifier(learning_rate=0.01,
                                         n_estimators=500,
                                         min_samples_split=2,
                                         max_depth=4,
                                         #max_leaf_nodes=8
                                        )
      
parameters = [{
              'min_samples_split': [2,5,10,20,50,100],
              'learning_rate': [0.1,0.5,0.01,0.05,0.001,0.005],
              'n_estimators':[10,25,50,75,100,250,500,750,1000]
              #'max_depth': [1,2,3,4,5,6,7,8,9,10]
               }]

grid_search = GridSearchCV(estimator = modelcheck6,
                           param_grid = parameters,
                           scoring = 'accuracy',
                           cv = 10,
                           n_jobs = -1)

grid_search = grid_search.fit(X, y)
best_accuracy = grid_search.best_score_
best_parameters = grid_search.best_params_
print("Best Accuracy: {:.2f} %".format(best_accuracy*100))
print("Best Parameters:", best_parameters)
'''

In [None]:
# Final classifier, trained on the full training set.
# `random_state` is fixed so that repeated runs (and the cross-validation that
# clones this estimator) produce the same trees and the same reported score;
# without it the per-split feature permutation is seeded differently each run.
model = GradientBoostingClassifier(learning_rate=0.0501,
                                   n_estimators=49,
                                   min_samples_split=101,
                                   max_depth=6,
                                   random_state=0)
model.fit(X, y)

In [None]:
# 10-fold cross-validated accuracy of the chosen model.
# `cross_val_score` is imported in the notebook's import cell. The metric is
# named explicitly so it matches the scoring used by the grid search.
accuracies = cross_val_score(estimator=model, X=X, y=y, scoring='accuracy', cv=10)
print("Accuracy: {:.2f} %".format(accuracies.mean()*100))

# Making Submission File

In [None]:
# NOTE: this submission cell is disabled. Everything below sits inside a
# triple-quoted string literal and does not run. When un-quoted it predicts on
# X_test, maps the predictions back to the original labels with
# `label_encoder.inverse_transform`, and writes them with the test IDs to
# "Solution.csv".
'''prediction = model.predict(X_test)
y_test_pred = label_encoder.inverse_transform(model.predict(X_test))
StackingSubmission = pd.DataFrame({ 'ID': test.ID,
                            'CustomerAttrition': y_test_pred })
StackingSubmission.to_csv("Solution.csv",index=False)
StackingSubmission.head()'''