In [1]:
import pandas as pd

# Load the raw churn CSV and print a quick structural overview of it.
file_path = "data/TelcoCustomerChurnData.csv"
df = pd.read_csv(file_path)

# Horizontal rule printed before each section of the overview.
separator = "---------------------------------------------------------------------------------------------------------------------"

print("Shape of dataset:",df.shape)

# (heading, value) pairs, printed in this order with a rule before each one.
overview_sections = [
    ("\nColumn Names:\n", df.columns),
    ("\nFirst 5 Rows:\n", df.head()),
    ("\ndata Types:", df.dtypes),
    ("\nMissing Value per Column:\n", df.isnull().sum()),
    ("\nChurn Value Counts:\n", df["Churn"].value_counts()),
]
for heading, summary in overview_sections:
    print(separator)
    print(heading, summary)

Shape of dataset: (7043, 21)
---------------------------------------------------------------------------------------------------------------------

Column Names:
 Index(['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents',
       'tenure', 'PhoneService', 'MultipleLines', 'InternetService',
       'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
       'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
       'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')
---------------------------------------------------------------------------------------------------------------------

First 5 Rows:
    customerID  gender  SeniorCitizen Partner Dependents  tenure PhoneService  \
0  7590-VHVEG  Female              0     Yes         No       1           No   
1  5575-GNVDE    Male              0      No         No      34          Yes   
2  3668-QPYBK    Male              0      No         No       2          Yes   
3 

In [2]:
# Convert TotalCharges to numbers; entries that cannot be parsed become NaN.
total_charges_numeric = pd.to_numeric(df["TotalCharges"], errors="coerce")
df["TotalCharges"] = total_charges_numeric

# Number of rows that failed to parse (the recorded run reported 11).
print(df["TotalCharges"].isnull().sum())

# Discard the rows whose TotalCharges could not be parsed.
df = df.dropna(subset=["TotalCharges"])

11


In [3]:
# Remove the customerID column from the working frame.
# errors="ignore" keeps this cell re-runnable: without it a second execution
# raises KeyError because the column has already been removed.
df = df.drop(columns=["customerID"], errors="ignore")

In [4]:
# Encode the target as integers: "Yes" -> 1, "No" -> 0.
# The mapping also accepts 1/0 so that re-running this cell is a no-op;
# mapping only {"Yes", "No"} would turn an already-encoded column into all-NaN.
churn_codes = {"Yes": 1, "No": 0, 1: 1, 0: 0}
churn_encoded = df["Churn"].map(churn_codes)

# Series.map yields NaN for values missing from the mapping; fail loudly here
# instead of letting NaN labels flow into the later cells.
if churn_encoded.isna().any():
    unexpected_labels = df.loc[churn_encoded.isna(), "Churn"].unique().tolist()
    raise ValueError(f"Unexpected values in Churn column: {unexpected_labels}")

df["Churn"] = churn_encoded.astype("int64")

In [5]:
# Split the column names by dtype: object columns are treated as categorical,
# int64/float64 columns as numerical.
object_dtypes = ["object"]
numeric_dtypes = ["int64","float64"]

categorical_cols = df.select_dtypes(include=object_dtypes).columns
numerical_cols = df.select_dtypes(include=numeric_dtypes).columns

print("Categorical Columns:\n",categorical_cols)
print("\n\nNumerical_Columns:\n",numerical_cols)

Categorical Columns:
 Index(['gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines',
       'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
       'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract',
       'PaperlessBilling', 'PaymentMethod'],
      dtype='object')


Numerical_Columns:
 Index(['SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges', 'Churn'], dtype='object')


In [6]:
# Summary of the frame after cleaning: shape, per-column null counts and the
# class counts of the Churn target.
post_clean_report = (
    ("\ndataset shape after cleaning:", df.shape),
    ("\nMissing values:\n", df.isnull().sum()),
    ("\nTarget distribution:\n", df["Churn"].value_counts()),
)
for caption, value in post_clean_report:
    print(caption, value)


dataset shape after cleaning: (7032, 20)

Missing values:
 gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

Target distribution:
 Churn
0    5163
1    1869
Name: count, dtype: int64


In [7]:
# One-hot encode the categorical columns. drop_first=True omits the first
# level of each encoded column, so k levels produce k-1 indicator columns.
df_encoded = pd.get_dummies(
    data=df,
    columns=categorical_cols,
    drop_first=True,
)
print("Shape after Encoding:",df_encoded.shape)

# Show the un-encoded frame first, then the encoded one as the cell result.
print(df.head())
print("\n\n")
df_encoded.head()

Shape after Encoding: (7032, 31)
   gender  SeniorCitizen Partner Dependents  tenure PhoneService  \
0  Female              0     Yes         No       1           No   
1    Male              0      No         No      34          Yes   
2    Male              0      No         No       2          Yes   
3    Male              0      No         No      45           No   
4  Female              0      No         No       2          Yes   

      MultipleLines InternetService OnlineSecurity OnlineBackup  \
0  No phone service             DSL             No          Yes   
1                No             DSL            Yes           No   
2                No             DSL            Yes          Yes   
3  No phone service             DSL            Yes           No   
4                No     Fiber optic             No           No   

  DeviceProtection TechSupport StreamingTV StreamingMovies        Contract  \
0               No          No          No              No  Month-to-month   

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,Churn,gender_Male,Partner_Yes,Dependents_Yes,PhoneService_Yes,MultipleLines_No phone service,...,StreamingTV_No internet service,StreamingTV_Yes,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_One year,Contract_Two year,PaperlessBilling_Yes,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,0,1,29.85,29.85,0,False,True,False,False,True,...,False,False,False,False,False,False,True,False,True,False
1,0,34,56.95,1889.5,0,True,False,False,True,False,...,False,False,False,False,True,False,False,False,False,True
2,0,2,53.85,108.15,1,True,False,False,True,False,...,False,False,False,False,False,False,True,False,False,True
3,0,45,42.3,1840.75,0,True,False,False,False,True,...,False,False,False,False,True,False,False,False,False,False
4,0,2,70.7,151.65,1,False,False,False,True,False,...,False,False,False,False,False,False,True,False,True,False


In [8]:
# Feature matrix: every encoded column except the target.
X = df_encoded.drop(columns="Churn")
# Target vector: the Churn column.
y = df_encoded["Churn"]

In [9]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. The split is stratified on y and
# seeded with random_state=42.
split_options = {"test_size": 0.2, "random_state": 42, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [10]:
# Standardisation of the numerical columns.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

# "Churn" is the target and is not a column of X_train / X_test, so it must
# not be part of the list of columns to scale.
numerical_cols = [col for col in numerical_cols if col != "Churn"]

# Always start from the raw values held in X (selected by row index) rather
# than from the current contents of X_train / X_test. This cell overwrites
# those frames in place, so fitting on them directly would, on a second run,
# fit the scaler to already-standardised data and leave `scaler` out of step
# with the data the model is trained on.
train_raw = X.loc[X_train.index, numerical_cols]
test_raw = X.loc[X_test.index, numerical_cols]

# Fit on the training rows only; the test rows are transformed with the
# statistics learned from the training rows.
X_train[numerical_cols] = scaler.fit_transform(train_raw)
X_test[numerical_cols] = scaler.transform(test_raw)

In [11]:
# Report the shape of each part produced by the train/test split.
split_parts = (
    ("X_train shape:", X_train),
    ("X_test shape:", X_test),
    ("y_train shape:", y_train),
    ("y_test shape:", y_test),
)
for caption, part in split_parts:
    print(caption, part.shape)

X_train shape: (5625, 30)
X_test shape: (1407, 30)
y_train shape: (5625,)
y_test shape: (1407,)


In [12]:
# Train the classifier.
from sklearn.linear_model import LogisticRegression

# Logistic regression with the solver's iteration cap set to 1000.
model = LogisticRegression(max_iter=1000)

# Fit on the training split; the fitted estimator is the cell's displayed result.
model.fit(X=X_train, y=y_train)

In [13]:
# Predicted class labels for the held-out test rows; shown as the cell result.
y_pred = model.predict(X=X_test)
y_pred

array([0, 1, 0, ..., 0, 0, 0], shape=(1407,))

In [14]:
#comparing the predictions to actuals
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix

In [15]:
# Fraction of test rows whose predicted label equals the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:")
accuracy

Accuracy:


0.8052594171997157

In [16]:
# Confusion matrix of true labels (rows) against predicted labels (columns).
test_confusion = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", test_confusion)

Confusion Matrix:
 [[918 115]
 [159 215]]


In [17]:
# Per-class precision / recall / F1 on the test split.
test_report = classification_report(y_test, y_pred)
print("Classification Report:\n", test_report)

Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.89      0.87      1033
           1       0.65      0.57      0.61       374

    accuracy                           0.81      1407
   macro avg       0.75      0.73      0.74      1407
weighted avg       0.80      0.81      0.80      1407



In [18]:
# Saving the trained model.
import os

import joblib

# joblib.dump writes to the given path but does not create missing parent
# directories, so make sure "models/" exists before writing.
os.makedirs("models", exist_ok=True)
joblib.dump(model,"models/churn-model.pkl")
print("Model Saved As churn-model.pkl")

Model Saved As churn-model.pkl


In [19]:
# Saving the fitted scaler.
import os

# Ensure the output directory exists; joblib.dump does not create it.
os.makedirs("models", exist_ok=True)
joblib.dump(scaler,"models/scaler.pkl")
print("Scaler saved as scaler.pkl")

Scaler saved as scaler.pkl


In [20]:
# Testing the persisted artifacts: reload both files and confirm they behave
# like the in-memory objects they were saved from.
# NOTE(review): joblib.load unpickles the file; only load artifacts from a
# trusted location.
loadedmodel = joblib.load("models/churn-model.pkl")
loadedscaler = joblib.load("models/scaler.pkl")

# X_test already holds scaled values, so it can be fed to the model directly.
sample_rows = X_test[:5]
samplepred = loadedmodel.predict(sample_rows)

# Round-trip checks: printing alone would not reveal a stale or mismatched file.
assert (samplepred == model.predict(sample_rows)).all(), \
    "Reloaded model disagrees with the in-memory model"
assert (loadedscaler.mean_ == scaler.mean_).all(), \
    "Reloaded scaler differs from the in-memory scaler"

print("Predictions from loaded model:",samplepred)

Predictions from loaded model: [0 1 0 0 0]
