In [1]:
import numpy as np
import pandas as pd

df = pd.read_csv("Telco-Customer-Churn.csv")
pd.set_option("display.max_columns", None)
df.head(5)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [2]:
# Checking for missing value
df.isna().sum()

customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

In [3]:
# maximum value of tenure
df['tenure'].max()

72

In [4]:
def cohort(tenure):
    if tenure <= 12:
        return 0
    elif tenure <= 24:
        return 1
    elif tenure <= 36:
        return 2
    elif tenure <= 48:
        return 3
    elif tenure <= 60:
        return 4
    elif tenure <= 72:
        return 5
    else:
        return 6

#  It apply a function to every data element of a column
df['tenure'] = df['tenure'].apply(cohort)
df.head(4)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,0,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,2,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,0,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,3,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No


In [5]:
X = df.drop(['Churn', 'customerID'],axis=1)

# Converting target categorical column into numerical
df['Churn'] = df['Churn'].map( {'Yes':1 ,'No':0})

y = df['Churn']

In [6]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=101)
X_train.head(2)

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges
850,Male,0,No,No,0,Yes,Yes,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Month-to-month,No,Mailed check,24.85,62.0
2122,Male,0,Yes,No,3,No,No phone service,DSL,Yes,Yes,Yes,No,No,No,One year,Yes,Electronic check,42.35,1716.45


In [7]:
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn.compose import ColumnTransformer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import VotingClassifier, RandomForestClassifier, GradientBoostingClassifier, StackingClassifier

combined_trf = ColumnTransformer(
    transformers= [('ss', StandardScaler(), [17, 18]),
    ('ohe', OneHotEncoder(sparse_output=False, handle_unknown='ignore'),
     [0, 2, 3, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16])],
    remainder='passthrough')

pca = PCA(n_components=15)

# Base Learners
knn = KNeighborsClassifier(n_neighbors=11)
svc = SVC(kernel='poly', degree=3, C=1, probability=True)
rfc = RandomForestClassifier(n_estimators=64, min_samples_split=30, max_depth=8)

# Bagging
voting = VotingClassifier(estimators=[('knn', knn), ('svc', svc), ('rfc', rfc)],voting='soft')

# Boosting
gbc = GradientBoostingClassifier(n_estimators=128, min_samples_split=15, max_features=8, max_depth=7, learning_rate=0.6, n_iter_no_change=5, tol=0.0001, random_state=10)

# Stacking
estimator = [
    ('voting', voting),
    ('gbc', gbc)
]

stack = StackingClassifier(estimator, n_jobs=-1, final_estimator= LogisticRegression(penalty='elasticnet',solver='saga', l1_ratio=0.9),stack_method='predict_proba')

In [8]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV

# creating pipeline
pipe = Pipeline([
     ('step1', combined_trf),
     ('step2', pca),
     ('step3', stack)
     ]
)

# hyperparameter tuning
param_grid = {
    "step3__voting__knn__n_neighbors": [3, 5, 7, 9, 11],
    "step3__voting__weights": [[0.25, 0.25, 0.25], [0.5, 0.3, 0.1], [0.1, 0.3, 0.5], [0.2, 0.2, 0.4], [0.1, 0.1, 0.7]],
    "step3__voting__svc__C": [0.1, 1, 10],
    "step3__voting__svc__kernel": ['linear', 'poly', 'rbf', 'sigmoid'],
    "step3__voting__svc__degree": [2, 3],
    "step3__voting__rfc__n_estimators": [100, 200, 300],
    "step3__voting__rfc__max_depth": [None, 10, 20],
    "step3__voting__rfc__min_samples_split": [20, 30, 40],
    "step3__gbc__learning_rate": [0.2, 0.4, 0.6],
    "step3__gbc__max_features": [8, 12],
    "step3__gbc__n_estimators": [128, 256, 512],
    "step3__gbc__min_samples_split": [15, 25, 30],
    "step3__gbc__max_depth": [5, 7, 10]
}

search = RandomizedSearchCV(pipe, param_grid, n_jobs=-1, n_iter=2, scoring= 'accuracy', cv=5)
search.fit(X_train, y_train)

In [9]:
print("Parameters of Best Estimator in Model: ")
print(search.best_params_)
print("Best score in grid search:")
print(search.best_score_)

Parameters of Best Estimator in Model: 
{'step3__voting__weights': [0.1, 0.3, 0.5], 'step3__voting__svc__kernel': 'poly', 'step3__voting__svc__degree': 3, 'step3__voting__svc__C': 1, 'step3__voting__rfc__n_estimators': 100, 'step3__voting__rfc__min_samples_split': 40, 'step3__voting__rfc__max_depth': 10, 'step3__voting__knn__n_neighbors': 3, 'step3__gbc__n_estimators': 128, 'step3__gbc__min_samples_split': 30, 'step3__gbc__max_features': 8, 'step3__gbc__max_depth': 10, 'step3__gbc__learning_rate': 0.6}
Best score in grid search:
0.7969777777777777


In [10]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report, confusion_matrix
y_pred = search.predict(X_test)

print("Accuracy: ", accuracy_score(y_test, y_pred))
print("Precision: ", precision_score(y_test, y_pred))
print("Recall: ", recall_score(y_test, y_pred))

Accuracy:  0.8109452736318408
Precision:  0.6816326530612244
Recall:  0.4704225352112676


In [17]:
# Creative approach
# Using neural networks
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from sklearn.metrics import accuracy_score, precision_score, recall_score

model = Sequential()
model.add(Dense(128, activation='relu', input_shape=(X_train.shape[1],)))
model.add(Dropout(0.2))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=10, batch_size=32)

# Convert non-numeric values to numeric values
X_test = X_test.apply(pd.to_numeric, errors='coerce')


y_pred = model.predict(X_test)
y_pred = (y_pred > 0.5).astype(int)

print("Neural Network Model")
print("Accuracy: ", accuracy_score(y_test, y_pred))
print("Precision: ", precision_score(y_test, y_pred))
print("Recall: ", recall_score(y_test, y_pred))


ModuleNotFoundError: No module named 'tensorflow'