In [19]:
import os, sys, joblib
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import metrics
sns.set()
warnings.filterwarnings('ignore')

In [20]:
# Lift pandas' column display limit so wide frames are shown without truncation.
pd.set_option("display.max_columns", None)

In [21]:
# Load the raw customer table from the working directory and preview its first rows.
churn = pd.read_csv('Telcom Data.csv')
churn.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [22]:
# Inspect dtypes and non-null counts of the freshly loaded frame.
churn.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [23]:
# Count rows that are exact duplicates of an earlier row.
churn.duplicated().sum()

np.int64(0)

In [24]:
# List the column labels of the raw frame.
churn.columns

Index(['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents',
       'tenure', 'PhoneService', 'MultipleLines', 'InternetService',
       'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
       'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
       'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')

In [25]:
# Class balance of the target, expressed as proportions.
churn['Churn'].value_counts(normalize=True)

Churn
No     0.73463
Yes    0.26537
Name: proportion, dtype: float64

In [26]:
# Encode the target as 1 (churned) / 0 (stayed).
# The explicit cast pins the dtype to int64 instead of relying on `replace`
# silently downcasting the object column, which pandas 2.2 deprecates.
# Re-running the cell is a no-op because no 'Yes'/'No' labels remain.
churn['Churn'] = churn['Churn'].replace({'Yes': 1, 'No': 0}).astype('int64')

In [27]:
# Absolute class counts of the encoded target.
churn['Churn'].value_counts()

Churn
0    5174
1    1869
Name: count, dtype: int64

In [28]:
# Turn blank TotalCharges entries into NaN so they can be imputed and then
# cast to float. The regex matches empty and whitespace-only strings of any
# length, not just the single-space string, so other blank variants cannot
# break the later float conversion.
churn['TotalCharges'] = churn['TotalCharges'].replace(r'^\s*$', np.nan, regex=True)

In [29]:
# Impute missing TotalCharges with the column's most frequent value.
# The mode is computed from the data rather than hard-coded (the previous
# literal was '20.2'), so the cell stays correct if the input file changes.
# NOTE(review): mode imputation is kept to preserve the original behavior;
# confirm it is the intended strategy for these rows.
missing_total_charges = churn['TotalCharges'].isnull().sum()
if missing_total_charges:
    # mode() ignores NaN and returns its values sorted; take the first on ties.
    total_charges_mode = churn['TotalCharges'].mode().iloc[0]
    churn['TotalCharges'] = churn['TotalCharges'].fillna(total_charges_mode)

In [30]:
# Re-count missing TotalCharges after imputation.
churn['TotalCharges'].isnull().sum()

np.int64(0)

In [31]:
# Convert TotalCharges to float; every other column keeps its dtype.
churn = churn.astype({'TotalCharges': float})

In [32]:
# Re-inspect dtypes and non-null counts after the TotalCharges cleanup.
churn.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [33]:
# Summary statistics of the numeric columns.
churn.describe()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,Churn
count,7043.0,7043.0,7043.0,7043.0,7043.0
mean,0.162147,32.371149,64.761692,2279.765853,0.26537
std,0.368612,24.559481,30.090047,2266.762876,0.441561
min,0.0,0.0,18.25,18.8,0.0
25%,0.0,9.0,35.5,398.55,0.0
50%,0.0,29.0,70.35,1394.55,0.0
75%,0.0,55.0,89.85,3786.6,1.0
max,1.0,72.0,118.75,8684.8,1.0


In [34]:
# Categorical columns to one-hot encode. The first level of each is dropped
# (drop_first=True), so an n-level column yields n-1 indicator columns.
categorical_cols = [
    'gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines',
    'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
    'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract',
    'PaperlessBilling', 'PaymentMethod',
]
churn = pd.get_dummies(churn, columns=categorical_cols, drop_first=True)

In [36]:
# Persist the one-hot encoded frame to disk.
# NOTE(review): at this point the frame still contains customerID; the column
# is only dropped in a later cell. Confirm that is what consumers expect.
joblib.dump(churn, 'churn_encoded.pkl')

['churn_encoded.pkl']

In [37]:
# Remove the customerID column from the modelling frame.
churn = churn.drop(columns=['customerID'])

In [38]:
# Inspect the encoded frame's columns and dtypes.
churn.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 31 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   SeniorCitizen                          7043 non-null   int64  
 1   tenure                                 7043 non-null   int64  
 2   MonthlyCharges                         7043 non-null   float64
 3   TotalCharges                           7043 non-null   float64
 4   Churn                                  7043 non-null   int64  
 5   gender_Male                            7043 non-null   bool   
 6   Partner_Yes                            7043 non-null   bool   
 7   Dependents_Yes                         7043 non-null   bool   
 8   PhoneService_Yes                       7043 non-null   bool   
 9   MultipleLines_No phone service         7043 non-null   bool   
 10  MultipleLines_Yes                      7043 non-null   bool   
 11  Inte

In [39]:
# Cast the boolean indicator columns to int64 (True -> 1, False -> 0).
# This is a vectorized dtype cast in place of the element-wise
# `DataFrame.applymap`, which is deprecated since pandas 2.1 and calls a
# Python lambda for every cell.
bool_cols = churn.select_dtypes(include='bool').columns
churn = churn.astype({col: 'int64' for col in bool_cols})

In [40]:
# Preview the fully numeric modelling frame.
churn.head()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,Churn,gender_Male,Partner_Yes,Dependents_Yes,PhoneService_Yes,MultipleLines_No phone service,MultipleLines_Yes,InternetService_Fiber optic,InternetService_No,OnlineSecurity_No internet service,OnlineSecurity_Yes,OnlineBackup_No internet service,OnlineBackup_Yes,DeviceProtection_No internet service,DeviceProtection_Yes,TechSupport_No internet service,TechSupport_Yes,StreamingTV_No internet service,StreamingTV_Yes,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_One year,Contract_Two year,PaperlessBilling_Yes,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,0,1,29.85,29.85,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0
1,0,34,56.95,1889.5,0,1,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1
2,0,2,53.85,108.15,1,1,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,1
3,0,45,42.3,1840.75,0,1,0,0,0,1,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0
4,0,2,70.7,151.65,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0


In [41]:
# Hold out 20% of the rows for testing. The split is stratified on the target
# and seeded. `train_test_split` comes from the notebook's top import cell.
features = churn.drop(columns=['Churn'])
target = churn['Churn']
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=42,
    stratify=target,
)

In [None]:
# Class counts in the held-out test target.
y_test.value_counts()

In [42]:
# Oversample the training split with SMOTE (seeded); X_test / y_test are not touched.
# NOTE(review): the resampling happens before any cross-validation split, so
# CV folds drawn from X_train_res contain synthetic rows; confirm that is intended.
from imblearn.over_sampling import SMOTE
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

In [43]:
# Positional indices of the continuous features in the training frame.
# They are looked up by name instead of hard-coded as [1, 2, 3], so a change
# in the encoded column order raises a KeyError or shifts the indices
# correctly rather than silently pointing the scaler at the wrong columns.
# Indices (not labels) are kept so the selection also works on array input.
num_cols = [
    X_train.columns.get_loc(col)
    for col in ('tenure', 'MonthlyCharges', 'TotalCharges')
]

In [44]:
# Column-wise preprocessing step. The transformer named 'scaler' is a
# 'passthrough' placeholder over num_cols; all remaining columns are passed
# through unchanged.
trf1 = ColumnTransformer(
    transformers=[('scaler', 'passthrough', num_cols)],
    remainder='passthrough',
)

In [45]:
# Candidate classifiers, keyed by the names used for the parameter grids and
# the saved model files.
classifiers = {
    'knn': KNeighborsClassifier(),
    'svc': SVC(),
    # Warnings are silenced in the import cell, so a ConvergenceWarning would
    # go unnoticed; give the solver ample iterations instead.
    'logistic_regression': LogisticRegression(max_iter=1000),
    # Seed the randomized tree learners so reruns give the same models,
    # consistent with the seed used for the split and for SMOTE.
    'random_forest': RandomForestClassifier(random_state=42),
    'decision_tree': DecisionTreeClassifier(random_state=42),
}

# One two-step pipeline per classifier: the shared preprocessing step
# ('scaler') followed by the estimator ('classifier'). The step names are
# what the parameter grids address.
models = {
    name: Pipeline([
        ('scaler', trf1),
        ('classifier', clf),
    ])
    for name, clf in classifiers.items()
}

In [46]:
# Candidate neighbourhood sizes for the KNN grid search (used as classifier__n_neighbors).
N_neighbors = [3, 5, 7, 9, 11, 15]
# Candidate values for the KNN `p` parameter (used as classifier__p).
P = [1, 2] 

In [47]:
# Hyper-parameter grids, keyed by the same names as `models`.
#
# 'scaler__scaler' targets the transformer named 'scaler' *inside* the
# pipeline's ColumnTransformer step (also named 'scaler'). The candidate
# scaler therefore replaces only the 'passthrough' placeholder on num_cols.
# A bare 'scaler' key would swap out the whole ColumnTransformer and scale
# every column, including the 0/1 indicator columns.
#
# The tree-based models have no scaler entry: their threshold splits are
# unaffected by rescaling a feature, so searching over scalers only doubles
# the number of fits. For them the placeholder stays 'passthrough'.
param_grid_1 = {
    'knn': [{
        'scaler__scaler': [StandardScaler(), MinMaxScaler()],
        'classifier__n_neighbors': N_neighbors,
        'classifier__p': P,
    }],
    'svc': [{
        'scaler__scaler': [StandardScaler(), MinMaxScaler()],
        'classifier__kernel': ['rbf', "linear", "poly", "sigmoid"],
    }],
    'logistic_regression': [{
        'scaler__scaler': [StandardScaler(), MinMaxScaler()],
    }],
    'random_forest': [{
        'classifier__n_estimators': [100, 200],
        'classifier__criterion': ["gini", "entropy"],
        'classifier__min_samples_split': [2, 4, 6, 8],
        'classifier__min_samples_leaf': [1, 2, 3, 4, 5, 6, 7, 8],
    }],
    'decision_tree': [{
        'classifier__criterion': ["gini", "entropy"],
        'classifier__min_samples_split': [2, 4, 6, 8],
        'classifier__min_samples_leaf': [1, 2, 3, 4, 5, 6, 7, 8],
    }],
}

In [None]:
# Tune every candidate pipeline with 5-fold cross-validation, record the best
# parameters and score, and persist the refitted winner of each search.
results = {}

for name, pipeline in models.items():
    print(f"\n Running GridSearch for {name}...")

    # Oversample inside the cross-validation loop. Prepending SMOTE to the
    # pipeline means it is fitted on each training fold only, so validation
    # folds contain real rows exclusively. Searching on data resampled up
    # front leaks synthetic neighbours of validation rows into training and
    # inflates best_score_. Step names are unchanged, so the existing
    # parameter grids still apply.
    search_pipeline = ImbPipeline(
        [('smote', SMOTE(random_state=42))] + list(pipeline.steps)
    )

    grid = GridSearchCV(
        estimator=search_pipeline,
        param_grid=param_grid_1[name],
        cv=5,
        # NOTE(review): accuracy is kept from the original; with imbalanced
        # validation folds a metric such as f1 or roc_auc may be more telling.
        scoring='accuracy',
        n_jobs=-1,
    )

    # Fit on the original training split; the refit on the full split still
    # applies the same seeded SMOTE before training the final model.
    grid.fit(X_train, y_train)

    results[name] = {
        "best_params": grid.best_params_,
        "best_score": grid.best_score_,
    }

    # Persist only the fitted preprocessing + classifier steps as a plain
    # scikit-learn Pipeline. The sampler is a training-time step and is not
    # needed for prediction.
    best_model = Pipeline(grid.best_estimator_.steps[1:])
    joblib.dump(best_model, f"{name}_best_model.pkl")
    print(f"Saved {name} best model as {name}_best_model.pkl")

# Show results
for model_name, res in results.items():
    print(f"\n{model_name} Best Params: {res['best_params']}")
    print(f"   Best CV Score: {res['best_score']:.4f}")



 Running GridSearch for knn...
Saved knn best model as knn_best_model.pkl

 Running GridSearch for svc...
Saved svc best model as svc_best_model.pkl

 Running GridSearch for logistic_regression...
Saved logistic_regression best model as logistic_regression_best_model.pkl

 Running GridSearch for random_forest...
