In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [2]:
df=pd.read_csv('C:\\Users\\Mohit\\Desktop\\Data Science and Machine Learning\\Data Science and Machine Learning Projects\\Telecom Churn prediction\\dataset\\Clean_dataset.csv')

In [3]:
df.head()

Unnamed: 0,Partner,tenure,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,Yes,1,No phone service,DSL,No,Yes,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,No,34,No,DSL,Yes,No,Yes,No,One year,No,Mailed check,56.95,1889.5,No
2,No,2,No,DSL,Yes,Yes,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,No,45,No phone service,DSL,Yes,No,Yes,Yes,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,No,2,No,Fiber optic,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [4]:
features_to_drop=['OnlineBackup' , 'Partner' ,'DeviceProtection' ]

In [5]:
replace_cols = ['MultipleLines', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport']
for col in replace_cols:
  df[col].replace({'No internet service': 'No' ,
                          'No phone service' : 'No'} , inplace=True)

In [6]:
df.drop(features_to_drop , axis=1 , inplace=True)

In [7]:
df.head()

Unnamed: 0,tenure,MultipleLines,InternetService,OnlineSecurity,TechSupport,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,1,No,DSL,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,34,No,DSL,Yes,No,One year,No,Mailed check,56.95,1889.5,No
2,2,No,DSL,Yes,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,45,No,DSL,Yes,Yes,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,2,No,Fiber optic,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [8]:
columns=list(df.columns)

In [9]:
numeric_features=[col for col in columns if df[col].dtypes!='object']
categorical_feature=[col for col in columns if df[col].dtypes=='object']

In [10]:
numeric_features

['tenure', 'MonthlyCharges', 'TotalCharges']

In [11]:
categorical_feature.pop()

'Churn'

In [12]:
categorical_feature.remove('Contract')

In [13]:
nominal_features=categorical_feature

In [14]:
ordinal_features=['Contract']

In [15]:
from sklearn.preprocessing import LabelEncoder , OneHotEncoder , OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [16]:
categories=['Month-to-month', 'One year', 'Two year']

In [17]:
numeric_pipeline=Pipeline(steps=[
    ('imputer' , SimpleImputer(strategy='median')),
    ('scaler' , StandardScaler())
])

ordinal_pipeline=Pipeline(
    steps=[
        ('imputer' , SimpleImputer(strategy='most_frequent')),
        ('ordinal_encoder' , OrdinalEncoder(categories=[categories]) )
    ]
)

nominal_pipeline=Pipeline(
    steps=[
        ('imputer' , SimpleImputer(strategy='most_frequent'))  ,
        ('onehot_encoder' , OneHotEncoder(handle_unknown='ignore' , drop='first'))
    ]
)

preprocessor=ColumnTransformer([
    ('numeric_pipline' , numeric_pipeline , numeric_features) ,
    ('ordinal_pipeline' , ordinal_pipeline , ordinal_features) , 
    ('nominal_pipeline' , nominal_pipeline , nominal_features)
])



In [18]:
preprocessor

In [19]:
X=df.iloc[: , :-1]
y=df.iloc[: , -1]

In [20]:
X

Unnamed: 0,tenure,MultipleLines,InternetService,OnlineSecurity,TechSupport,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges
0,1,No,DSL,No,No,Month-to-month,Yes,Electronic check,29.85,29.85
1,34,No,DSL,Yes,No,One year,No,Mailed check,56.95,1889.50
2,2,No,DSL,Yes,No,Month-to-month,Yes,Mailed check,53.85,108.15
3,45,No,DSL,Yes,Yes,One year,No,Bank transfer (automatic),42.30,1840.75
4,2,No,Fiber optic,No,No,Month-to-month,Yes,Electronic check,70.70,151.65
...,...,...,...,...,...,...,...,...,...,...
7038,24,Yes,DSL,Yes,Yes,One year,Yes,Mailed check,84.80,1990.50
7039,72,Yes,Fiber optic,No,No,One year,Yes,Credit card (automatic),103.20,7362.90
7040,11,No,DSL,Yes,No,Month-to-month,Yes,Electronic check,29.60,346.45
7041,4,Yes,Fiber optic,No,No,Month-to-month,Yes,Mailed check,74.40,306.60


In [21]:
y=y.map({'No' : 0 , 'Yes' : 1})

In [22]:
y

0       0
1       0
2       1
3       0
4       1
       ..
7038    0
7039    0
7040    0
7041    1
7042    0
Name: Churn, Length: 7043, dtype: int64

In [23]:
from sklearn.model_selection import train_test_split
X_train , X_test , y_train ,y_test = train_test_split(X , y , test_size=0.33 , random_state=101)

In [24]:
print(X_train.shape , y_train.shape)
print(X_test.shape , y_test.shape)

(4718, 10) (4718,)
(2325, 10) (2325,)


In [25]:
X_train_scaled=preprocessor.fit_transform(X_train)
X_test_scaled = preprocessor.transform(X_test)

In [26]:
X_train_scaled=pd.DataFrame(data=X_train_scaled  , columns=preprocessor.get_feature_names_out(input_features=X_train.columns))
X_test_scaled=pd.DataFrame(data=X_test_scaled , columns=preprocessor.get_feature_names_out())

In [27]:
X_train_scaled

Unnamed: 0,numeric_pipline__tenure,numeric_pipline__MonthlyCharges,numeric_pipline__TotalCharges,ordinal_pipeline__Contract,nominal_pipeline__MultipleLines_Yes,nominal_pipeline__InternetService_Fiber optic,nominal_pipeline__InternetService_No,nominal_pipeline__OnlineSecurity_Yes,nominal_pipeline__TechSupport_Yes,nominal_pipeline__PaperlessBilling_Yes,nominal_pipeline__PaymentMethod_Credit card (automatic),nominal_pipeline__PaymentMethod_Electronic check,nominal_pipeline__PaymentMethod_Mailed check
0,-1.150282,0.471274,-0.873980,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
1,1.615478,0.746772,1.756369,2.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0
2,-1.190955,1.134139,-0.873604,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0
3,-1.272301,-1.517319,-0.995147,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4,-0.052113,-0.189920,-0.169577,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4713,1.330767,-1.480586,-0.377066,2.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0
4714,1.615478,0.524704,1.538181,2.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4715,-1.150282,0.457917,-0.857478,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4716,1.249421,1.428003,2.041520,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0


In [28]:
X_test_scaled

Unnamed: 0,numeric_pipline__tenure,numeric_pipline__MonthlyCharges,numeric_pipline__TotalCharges,ordinal_pipeline__Contract,nominal_pipeline__MultipleLines_Yes,nominal_pipeline__InternetService_Fiber optic,nominal_pipeline__InternetService_No,nominal_pipeline__OnlineSecurity_Yes,nominal_pipeline__TechSupport_Yes,nominal_pipeline__PaperlessBilling_Yes,nominal_pipeline__PaymentMethod_Credit card (automatic),nominal_pipeline__PaymentMethod_Electronic check,nominal_pipeline__PaymentMethod_Mailed check
0,1.574805,1.192578,2.168201,2.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0
1,-1.272301,-0.635726,-0.983451,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
2,-1.272301,-1.508970,-0.995036,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
3,-1.231628,-0.487124,-0.952705,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,-1.272301,0.297627,-0.971068,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2320,-0.946917,0.302636,-0.691677,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
2321,-0.987590,1.032288,-0.658982,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2322,1.615478,0.691673,1.755217,2.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0
2323,-0.824899,1.401288,-0.448193,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0


In [29]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier  , GradientBoostingClassifier

In [30]:
models={
    'LogisticRegression' : LogisticRegression(),
    'DecisionTreeClassifier' : DecisionTreeClassifier(),
    'SVC' : SVC(),
    'GaussianNB' : GaussianNB(),
    'RandomForestClassifier' : RandomForestClassifier(),
    'AdaBoostClassifier' : AdaBoostClassifier(),
    'GradientBoostingClassifier' : GradientBoostingClassifier()
}

In [31]:
from sklearn.metrics import accuracy_score , precision_score , recall_score ,f1_score , classification_report , confusion_matrix , roc_auc_score

In [32]:
def evaluate_model(y_true, y_pred):
    print("Accuracy:", accuracy_score(y_true, y_pred))
    print("Precision:", precision_score(y_true, y_pred))
    print("Recall:", recall_score(y_true, y_pred))
    print("F1 Score:", f1_score(y_true, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_true, y_pred))
    print("="*50)


In [33]:
for i in range(len(list(models))):
  model=list(models.values())[i]
  model.fit(X_train_scaled , y_train)
  print(f"Model Trained : {list(models.keys())[i]}")
  y_pred=model.predict(X_test_scaled)
  evaluate_model(y_test , y_pred)

Model Trained : LogisticRegression
Accuracy: 0.8012903225806451
Precision: 0.6645962732919255
Recall: 0.5169082125603864
F1 Score: 0.5815217391304348
Confusion Matrix:
 [[1542  162]
 [ 300  321]]
Model Trained : DecisionTreeClassifier
Accuracy: 0.7255913978494624
Precision: 0.4859504132231405
Recall: 0.47342995169082125
F1 Score: 0.4796084828711256
Confusion Matrix:
 [[1393  311]
 [ 327  294]]
Model Trained : SVC
Accuracy: 0.7991397849462366
Precision: 0.6782407407407407
Recall: 0.4718196457326892
F1 Score: 0.5565052231718899
Confusion Matrix:
 [[1565  139]
 [ 328  293]]
Model Trained : GaussianNB
Accuracy: 0.7582795698924731
Precision: 0.5339470655926352
Recall: 0.7471819645732689
F1 Score: 0.6228187919463087
Confusion Matrix:
 [[1299  405]
 [ 157  464]]
Model Trained : RandomForestClassifier
Accuracy: 0.7759139784946236
Precision: 0.6063829787234043
Recall: 0.45893719806763283
F1 Score: 0.5224564619615032
Confusion Matrix:
 [[1519  185]
 [ 336  285]]
Model Trained : AdaBoostClassifie

In [34]:
# Before dropping the features :

# Model Trained : LogisticRegression
# Accuracy: 0.8060215053763441
# Precision: 0.6741803278688525
# Recall: 0.5297906602254429
# F1 Score: 0.5933273219116321
# Confusion Matrix:
#  [[1545  159]
#  [ 292  329]]
# ==================================================
# Model Trained : DecisionTreeClassifier
# Accuracy: 0.7247311827956989
# Precision: 0.4846029173419773
# Recall: 0.48148148148148145
# F1 Score: 0.48303715670436187
# Confusion Matrix:
#  [[1386  318]
#  [ 322  299]]
# ==================================================
# Model Trained : SVC
# Accuracy: 0.7974193548387096
# Precision: 0.6752336448598131
# Recall: 0.46537842190016104
# F1 Score: 0.5510009532888466
# Confusion Matrix:
#  [[1565  139]
#  [ 332  289]]
# ==================================================
# Model Trained : GaussianNB
# Accuracy: 0.7587096774193548
# Precision: 0.5344036697247706
# Recall: 0.750402576489533
# F1 Score: 0.624246483590087
# Confusion Matrix:
#  [[1298  406]
#  [ 155  466]]
# ==================================================
# Model Trained : RandomForestClassifier
# Accuracy: 0.7836559139784947
# Precision: 0.6260683760683761
# Recall: 0.4718196457326892
# F1 Score: 0.5381083562901745
# Confusion Matrix:
#  [[1529  175]
#  [ 328  293]]
# ==================================================
# Model Trained : AdaBoostClassifier
# Accuracy: 0.8021505376344086
# Precision: 0.6842105263157895
# Recall: 0.48148148148148145
# F1 Score: 0.5652173913043478
# Confusion Matrix:
#  [[1566  138]
#  [ 322  299]]
# ==================================================
# Model Trained : GradientBoostingClassifier
# Accuracy: 0.7995698924731183
# Precision: 0.6591375770020534
# Recall: 0.5169082125603864
# F1 Score: 0.5794223826714802
# Confusion Matrix:
#  [[1538  166]
#  [ 300  321]]
# ==================================================

In [35]:
X_train_scaled.shape

(4718, 13)

In [36]:
X_train_scaled.columns

Index(['numeric_pipline__tenure', 'numeric_pipline__MonthlyCharges',
       'numeric_pipline__TotalCharges', 'ordinal_pipeline__Contract',
       'nominal_pipeline__MultipleLines_Yes',
       'nominal_pipeline__InternetService_Fiber optic',
       'nominal_pipeline__InternetService_No',
       'nominal_pipeline__OnlineSecurity_Yes',
       'nominal_pipeline__TechSupport_Yes',
       'nominal_pipeline__PaperlessBilling_Yes',
       'nominal_pipeline__PaymentMethod_Credit card (automatic)',
       'nominal_pipeline__PaymentMethod_Electronic check',
       'nominal_pipeline__PaymentMethod_Mailed check'],
      dtype='object')

In [37]:
X_train_scaled.shape

(4718, 13)

In [38]:
X.shape

(7043, 10)

In [39]:
X.columns

Index(['tenure', 'MultipleLines', 'InternetService', 'OnlineSecurity',
       'TechSupport', 'Contract', 'PaperlessBilling', 'PaymentMethod',
       'MonthlyCharges', 'TotalCharges'],
      dtype='object')

In [40]:
model=GradientBoostingClassifier(learning_rate=0.05 , loss='exponential' , max_depth=5 , max_features='sqrt' , n_estimators=100)

In [41]:
model.fit(X_train_scaled , y_train)

In [42]:
y_pred=model.predict(X_test_scaled)

In [43]:
evaluate_model(y_test , y_pred)

Accuracy: 0.8047311827956989
Precision: 0.6772823779193206
Recall: 0.5136876006441223
F1 Score: 0.5842490842490843
Confusion Matrix:
 [[1552  152]
 [ 302  319]]


In [44]:
# import pickle

# with open('model1.pkl' , 'wb') as file:
#   pickle.dump(model , file)

# with open('preprocessor1.pkl'  , 'wb') as file:
#   pickle.dump(preprocessor , file)

In [46]:
import joblib

joblib.dump(preprocessor, "preprocessor.pkl")
joblib.dump(model, "model.pkl")

['model.pkl']

In [47]:
clf=Pipeline(
    [
        ('preprocessor' , preprocessor) ,
        ('model' , model)
    ]
)

In [48]:
clf.fit(X_train_scaled , y_train)

ValueError: A given column is not a column of the dataframe