## Importing Required Libraries

In [50]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.pipeline import Pipeline
import mlflow.sklearn
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_selection import SelectKBest,chi2
from sklearn.compose import ColumnTransformer
pd.set_option('display.max_columns', 0)

In [2]:
data=pd.read_csv(r"C:\Users\ASUS\Downloads\WA_Fn-UseC_-Telco-Customer-Churn.csv")

The dataset contains information about the Customer churn from telecom services.
You can find the details of the data columns below:

+ customerID: Customer ID

+ gender: Whether the customer is a male or a female

+ SeniorCitizen: Whether the customer is a senior citizen or not (1, 0)

+ checkPartner: Whether the customer has a partner or not (Yes, No)

+ checkDependents: Whether the customer has dependents or not (Yes, No)

+ Tenure: Number of months the customer has stayed with the company

+ checkPhoneService: Whether the customer has a phone service or not (Yes, No)

+ text_formatMultipleLines: Whether the customer has multiple lines or not (Yes, No, No phone service)

+ text_formatInternetService: Customer’s internet service provider (DSL, Fiber optic, No)

+ text_formatOnlineSecurity: Whether the customer has online security or not (Yes, No, No internet service)

+ text_formatOnlineBackup: Whether the customer has online backup or not (Yes, No, No internet service)

+ text_formatDeviceProtection: Whether the customer has device protection or not (Yes, No, No internet service)

+ text_formatTechSupport: Whether the customer has tech support or not (Yes, No, No internet service)

+ text_formatStreamingTV: Whether the customer has streaming TV or not (Yes, No, No internet service)

+ text_formatStreamingMovies: Whether the customer has streaming movies or not (Yes, No, No internet service)

+ text_formatContract: The contract term of the customer (Month-to-month, One year, Two year)

+ checkPaperlessBilling: Whether the customer has paperless billing or not (Yes, No)

+ text_formatPaymentMethod: The customer’s payment method (Electronic check, Mailed check, Bank transfer (automatic), Credit card (automatic))

+ MonthlyCharges: The amount charged to the customer monthly

+ TotalCharges: The total amount charged to the customer

+ checkChurn: Whether the customer churned or not (Yes or No)


In [3]:
data.drop('customerID',axis=1,inplace=True)

In [4]:
data

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.30,1840.75,No
4,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.70,151.65,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,Male,0,Yes,Yes,24,Yes,Yes,DSL,Yes,No,Yes,Yes,Yes,Yes,One year,Yes,Mailed check,84.80,1990.5,No
7039,Female,0,Yes,Yes,72,Yes,Yes,Fiber optic,No,Yes,Yes,No,Yes,Yes,One year,Yes,Credit card (automatic),103.20,7362.9,No
7040,Female,0,Yes,Yes,11,No,No phone service,DSL,Yes,No,No,No,No,No,Month-to-month,Yes,Electronic check,29.60,346.45,No
7041,Male,1,Yes,No,4,Yes,Yes,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Mailed check,74.40,306.6,Yes


In [53]:
X=data.drop('Churn',axis=1)
y=data['Churn']
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.25,random_state=42)

def encode_data(dataframe):
    if dataframe.dtype == "object":
        dataframe = LabelEncoder().fit_transform(dataframe)
    return dataframe

X_train = X_train.apply(lambda x: encode_data(x))
X_test = X_test.apply(lambda x: encode_data(x))
X_train.head()
X_test.head()

In [60]:
categorical_indices_numeric = [data.columns.get_loc(col) for col in data.select_dtypes(include='object').columns]

In [71]:
categorical_indices_numeric

[0, 2, 3, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 18, 19]

In [64]:
categorical_indices_numeric[:-1]

[0, 2, 3, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 18]

In [65]:
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical_indices_numeric[:-1])
    ]
)

pipelines = {
    'knn': Pipeline([
        ('preprocessor', preprocessor),
        ('feature_selection', SelectKBest(score_func=chi2)),
        ('Estimator',KNeighborsClassifier())
    ]),
    'naive_bayes': Pipeline([
        ('preprocessor', preprocessor),
        ('feature_selection', SelectKBest(score_func=chi2)),
        ('Estimator', MultinomialNB())
    ]),
    'logistic_regression': Pipeline([
        ('preprocessor', preprocessor),
        ('feature_selection', SelectKBest(score_func=chi2)),
        ('Estimator', LogisticRegression())
    ]),
    'random_forest': Pipeline([
        ('preprocessor', preprocessor),
        ('feature_selection', SelectKBest(score_func=chi2)),
        ('Estimator', RandomForestClassifier())
    ]),
    'decision_tree': Pipeline([
        ('preprocessor', preprocessor),
        ('feature_selection', SelectKBest(score_func=chi2)),
        ('Estimator', DecisionTreeClassifier())
    ])
}

param_grids = {
    'knn': [
        {
            'feature_selection__k' : [5,10,15],
            'Estimator__n_neighbors': [3,5,7,9],
            'Estimator__p': [1, 2, 3]
        }
    ],
    'naive_bayes': [
        {
            'feature_selection__k' : [5,10,15],
            'Estimator__alpha': [1, 2, 3]
        }
    ],
    'logistic_regression': [
        {
            'feature_selection__k' : [5,10,15],
            'Estimator__C': [0.1, 1, 10],
            'Estimator__penalty': ['l1', 'l2']
        }
    ],
    'random_forest': [
        {
            'feature_selection__k' : [5,10,15],
            'Estimator__n_estimators': [50, 100, 200]
        }
    ],
    'decision_tree': [
        {
            'feature_selection__k' : [5,10,15],
            'Estimator__max_depth': [None, 5, 10]
        }
    ]
}

In [66]:
best_models = {}
for algo in pipelines.keys():
    print(""*10, algo, ""*10)
    grid_search = GridSearchCV(estimator=pipelines[algo],
                               param_grid=param_grids[algo],
                               cv=5,
                               scoring='accuracy',
                               return_train_score=True,
                               verbose=1)

    grid_search.fit(X_train, y_train)

    train_score = grid_search.best_score_
    test_score = grid_search.score(X_test, y_test)

    print('Train Score: ', train_score)
    print('Test Score: ', test_score)

    best_models[algo] = grid_search.best_estimator_
    print()

 knn 
Fitting 5 folds for each of 36 candidates, totalling 180 fits
Train Score:  0.7754583440841719
Test Score:  0.7728563316297559

 naive_bayes 
Fitting 5 folds for each of 9 candidates, totalling 45 fits
Train Score:  0.7368420486797971
Test Score:  0.7279954571266326

 logistic_regression 
Fitting 5 folds for each of 18 candidates, totalling 90 fits
Train Score:  0.7820873111436025
Test Score:  0.7859170925610448

 random_forest 
Fitting 5 folds for each of 9 candidates, totalling 45 fits
Train Score:  0.7843575298873312
Test Score:  0.7859170925610448

 decision_tree 
Fitting 5 folds for each of 9 candidates, totalling 45 fits
Train Score:  0.7845469238267252
Test Score:  0.7859170925610448



In [68]:
import pickle
pickle.dump(pipelines['decision_tree'].fit(X_train,y_train),open("churn_model.pkl","wb"))
predict = pickle.load(open("churn_model.pkl","rb"))

In [69]:
predict

In [93]:
predict.predict([data.drop('Churn',axis=1).iloc[2]])[0]

'No'

In [72]:
data

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.30,1840.75,No
4,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.70,151.65,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,Male,0,Yes,Yes,24,Yes,Yes,DSL,Yes,No,Yes,Yes,Yes,Yes,One year,Yes,Mailed check,84.80,1990.5,No
7039,Female,0,Yes,Yes,72,Yes,Yes,Fiber optic,No,Yes,Yes,No,Yes,Yes,One year,Yes,Credit card (automatic),103.20,7362.9,No
7040,Female,0,Yes,Yes,11,No,No phone service,DSL,Yes,No,No,No,No,No,Month-to-month,Yes,Electronic check,29.60,346.45,No
7041,Male,1,Yes,No,4,Yes,Yes,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Mailed check,74.40,306.6,Yes


In [91]:
data['PaymentMethod'].unique()

array(['Electronic check', 'Mailed check', 'Bank transfer (automatic)',
       'Credit card (automatic)'], dtype=object)

In [92]:
data.drop('Churn',axis=1).columns

Index(['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure',
       'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity',
       'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV',
       'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod',
       'MonthlyCharges', 'TotalCharges'],
      dtype='object')

In [107]:
[data.drop('Churn',axis=1).iloc[4].values]

[array(['Female', 0, 'No', 'No', 2, 'Yes', 'No', 'Fiber optic', 'No', 'No',
        'No', 'No', 'No', 'No', 'Month-to-month', 'Yes',
        'Electronic check', 70.7, '151.65'], dtype=object)]

In [109]:
result_df=pd.DataFrame(data.drop('Churn',axis=1).iloc[4].values.reshape(1, -1),columns=['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure',
       'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity',
       'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV',
       'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod',
       'MonthlyCharges', 'TotalCharges'])

In [110]:
result_df

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges
0,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65
