In [2]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.base import BaseEstimator,TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder,MinMaxScaler,PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier, GradientBoostingClassifier,VotingClassifier,StackingClassifier
import xgboost as xgb

from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score

import warnings

In [3]:
warnings.filterwarnings('ignore')

In [4]:
data = pd.read_csv('Telco-Customer-Churn.csv')

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [6]:
data.describe()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges
count,7043.0,7043.0,7043.0
mean,0.162147,32.371149,64.761692
std,0.368612,24.559481,30.090047
min,0.0,0.0,18.25
25%,0.0,9.0,35.5
50%,0.0,29.0,70.35
75%,0.0,55.0,89.85
max,1.0,72.0,118.75


In [7]:
[(i,data[i].unique()) for i in data.drop(columns = ['customerID','TotalCharges','Churn']).select_dtypes(object).columns]

[('gender', array(['Female', 'Male'], dtype=object)),
 ('Partner', array(['Yes', 'No'], dtype=object)),
 ('Dependents', array(['No', 'Yes'], dtype=object)),
 ('PhoneService', array(['No', 'Yes'], dtype=object)),
 ('MultipleLines', array(['No phone service', 'No', 'Yes'], dtype=object)),
 ('InternetService', array(['DSL', 'Fiber optic', 'No'], dtype=object)),
 ('OnlineSecurity', array(['No', 'Yes', 'No internet service'], dtype=object)),
 ('OnlineBackup', array(['Yes', 'No', 'No internet service'], dtype=object)),
 ('DeviceProtection',
  array(['No', 'Yes', 'No internet service'], dtype=object)),
 ('TechSupport', array(['No', 'Yes', 'No internet service'], dtype=object)),
 ('StreamingTV', array(['No', 'Yes', 'No internet service'], dtype=object)),
 ('StreamingMovies',
  array(['No', 'Yes', 'No internet service'], dtype=object)),
 ('Contract', array(['Month-to-month', 'One year', 'Two year'], dtype=object)),
 ('PaperlessBilling', array(['Yes', 'No'], dtype=object)),
 ('PaymentMethod',
  

In [8]:
data.isna().sum() 

customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

In [9]:
data.duplicated().sum()

0

In [None]:
data['customerID'].apply(lambda x: x.split('-')[1]).nunique()

In [None]:
# customer ID doesn't have any unique information or classification. So dropping the column

In [None]:
# Imputation with 'TotalCharges' Feature:
-----------------------------------------
# Total Charges has null Value as space ' ', so replaced that with NaN value and then did fillna by grouping
# Datatype of 'TotalCharges' column is Object; which needs to be converted to numeric

In [None]:
'''
pd.to_numeric(data['TotalCharges'])
data.loc[753]
data[data['TotalCharges']==' ']
data['TotalCharges'].replace(' ', np.nan, regex= True, inplace = True)
data['TotalCharges'].isna().sum()
data.groupby(['InternetService','OnlineSecurity','DeviceProtection','StreamingTV','StreamingMovies','Contract'])['TotalCharges'].agg(pd.Series.mode)
data['TotalCharges'] = data.groupby(['InternetService','OnlineSecurity','DeviceProtection','StreamingTV','StreamingMovies','Contract'])['TotalCharges'].transform(lambda x: x.fillna(x.mode()[0]))
data['TotalCharges'] = data['TotalCharges'].astype(float)
'''

In [None]:
data[data['TotalCharges']==' ']

In [None]:
data['TotalCharges'].replace(' ', np.nan, regex= True, inplace = True)

In [None]:
data['TotalCharges'] = data.groupby(['InternetService','OnlineSecurity','DeviceProtection','StreamingTV','StreamingMovies','Contract'])['TotalCharges'].transform(lambda x: x.fillna(x.mode()[0]))

In [None]:
data['TotalCharges'] = data['TotalCharges'].astype(float)

In [9]:
cat_col = data.drop(columns = ['customerID','Churn','TotalCharges']).select_dtypes(object).columns

In [None]:
cat_col

In [None]:
data.select_dtypes([float,int]).columns

In [4]:
class custom_transformer(BaseEstimator,TransformerMixin):
    def __init__(self):
        pass
    def fit(self, X, y = None):
        return self
    def transform(self, X, y = None):
        print(X.info())
        # Total Charges has null Value as space ' ', so replaced that with NaN value and then did fillna by grouping, changed the data type to numeric
        X['TotalCharges'].replace(' ', np.nan, regex= True, inplace = True)
        X['TotalCharges'] = X.groupby(['InternetService','OnlineSecurity','DeviceProtection','StreamingTV','StreamingMovies','Contract'])['TotalCharges'].transform(lambda x: x.fillna(x.mode()[0]))
        X['TotalCharges'] = pd.to_numeric(X['TotalCharges'])
        print(X.info())
        return X

In [5]:
custom_preprocessing = Pipeline([
    ('custom preprocessing',custom_transformer())
])

In [6]:
categorical_preprocessing = Pipeline([
    ('Categorical Imputation',SimpleImputer(strategy = 'most_frequent')),
    ('Encoding',OneHotEncoder(handle_unknown='ignore',sparse= False))
])

In [7]:
numerical_preprocessing = Pipeline([
    ('Numerical Imputation',SimpleImputer(strategy='mean')),
    ('Scaling',MinMaxScaler())
])

In [10]:
Final_preprocessing = ColumnTransformer(
transformers = 
[
    ('custom', custom_preprocessing,['TotalCharges','InternetService','OnlineSecurity','DeviceProtection','StreamingTV','StreamingMovies','Contract']),
    ('Categorical',categorical_preprocessing,cat_col),
    ('numerical',numerical_preprocessing,['SeniorCitizen', 'tenure', 'MonthlyCharges','TotalCharges'])
])

In [11]:
data['TotalCharges'].unique()

array(['29.85', '1889.5', '108.15', ..., '346.45', '306.6', '6844.5'],
      dtype=object)

In [14]:
Package = Pipeline([
    ('Preprocesing',Final_preprocessing)
    #('Polynomial',PolynomialFeatures(degree = 2))
    #('model',LogisticRegression())
])

In [16]:
Package

In [17]:
x = data.drop(columns = ['customerID','Churn'])
y = data['Churn']

In [None]:
y.value_counts()

In [None]:
y.value_counts(normalize = True)*100

In [None]:
# Since Data is unbalanced, its good to do balancing - UnderSampling for space & time constraint

In [20]:
rus = RandomUnderSampler(random_state= 50)

In [21]:
rus_x,rus_y = rus.fit_resample(x,y)

In [None]:
rus_y.value_counts().sum()

In [22]:
train_x,test_x,train_y,test_y = train_test_split(rus_x,rus_y,test_size = 0.2,random_state = 50)

In [23]:
Package1.fit(train_x,train_y)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2990 entries, 783 to 1712
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   TotalCharges      2990 non-null   object
 1   InternetService   2990 non-null   object
 2   OnlineSecurity    2990 non-null   object
 3   DeviceProtection  2990 non-null   object
 4   StreamingTV       2990 non-null   object
 5   StreamingMovies   2990 non-null   object
 6   Contract          2990 non-null   object
dtypes: object(7)
memory usage: 186.9+ KB
None
<class 'pandas.core.frame.DataFrame'>
Int64Index: 2990 entries, 783 to 1712
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   TotalCharges      2990 non-null   float64
 1   InternetService   2990 non-null   object 
 2   OnlineSecurity    2990 non-null   object 
 3   DeviceProtection  2990 non-null   object 
 4   StreamingTV       2990 non-null   ob

ValueError: Cannot use mean strategy with non-numeric data:
could not convert string to float: ' '

In [None]:
data[data['TotalCharges'] == 'No internet service']

In [None]:
train_pred = Package.predict(train_x)
test_pred = Package.predict(test_x)

In [None]:
confusion_matrix(train_y,train_pred),confusion_matrix(test_y,test_pred)

In [None]:
print(classification_report(train_y,train_pred),classification_report(test_y,test_pred))

In [None]:
model_ = {
'Logistic' :LogisticRegression(),
'KNN' :KNeighborsClassifier(),
'Naive' :GaussianNB(),
'Decision' :DecisionTreeClassifier(max_depth= 10),
'RandomForest' :RandomForestClassifier(),
'AdaBoost' : AdaBoostClassifier(),
'GradientBoost' : GradientBoostingClassifier(),
#'XtremeBoost' : xgb.XGBClassifier(),
'Voting_hard': VotingClassifier(estimators=[('lr', LogisticRegression()), 
                                                ('KNN', KNeighborsClassifier()), 
                                                ('Naive', GaussianNB())], 
                                    voting='hard'),
'Voting_soft': VotingClassifier(estimators=[('lr', LogisticRegression()), 
                                                ('Ada', AdaBoostClassifier()), 
                                                ('Naive', GaussianNB())], 
                                    voting='soft'),
'Stacking_1': StackingClassifier(estimators=[('lr', LogisticRegression()), 
                                                ('KNN', KNeighborsClassifier()), 
                                                ('Naive', GaussianNB())], 
                                     final_estimator=KNeighborsClassifier()),
'Stacking_2': StackingClassifier(estimators=[('lr', LogisticRegression()), 
                                                ('Ada', AdaBoostClassifier()), 
                                                ('Naive', GaussianNB())], 
                                     final_estimator=LogisticRegression())
}

In [None]:
res = []
for name,model in model_.items():
    print("Model : ",name)
    Package = Pipeline([
    ('Preprocessing',Final_preprocessing),
    ('Poly',PolynomialFeatures(degree=2)),
    ('model',model)
])
    Package.fit(train_x,train_y)
    y_predict_train = Package.predict(train_x)
    y_predict_test = Package.predict(test_x)
    train_accuracy = accuracy_score(train_y,y_predict_train)
    test_accuracy  = accuracy_score(test_y,y_predict_test)
    print("Train Accuracy",train_accuracy)
    print("Test Accuracy",test_accuracy)
    res.append([name,train_accuracy,test_accuracy])

In [None]:
res = pd.DataFrame(res,columns = ['Model','TrainScore','TestScore'])