In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score
from sklearn.pipeline import Pipeline


In [2]:
original_data = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv')
original_data.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [3]:
def get_unique_values(data):
    for column in data.columns:
        unique_values = data[column].unique()
        print(f"Column: {column}")
        print(f"Unique Values: {unique_values}")
        print("_" * 100)

In [4]:
get_unique_values(original_data)
data = original_data.copy()

Column: customerID
Unique Values: ['7590-VHVEG' '5575-GNVDE' '3668-QPYBK' ... '4801-JZAZL' '8361-LTMKD'
 '3186-AJIEK']
____________________________________________________________________________________________________
Column: gender
Unique Values: ['Female' 'Male']
____________________________________________________________________________________________________
Column: SeniorCitizen
Unique Values: [0 1]
____________________________________________________________________________________________________
Column: Partner
Unique Values: ['Yes' 'No']
____________________________________________________________________________________________________
Column: Dependents
Unique Values: ['No' 'Yes']
____________________________________________________________________________________________________
Column: tenure
Unique Values: [ 1 34  2 45  8 22 10 28 62 13 16 58 49 25 69 52 71 21 12 30 47 72 17 27
  5 46 11 70 63 43 15 60 18 66  9  3 31 50 64 56  7 42 35 48 29 65 38 68
 32 55 37 36 41  6

In [5]:
# Convert 'TotalCharges' to float
data['TotalCharges'] = data['TotalCharges'].replace(' ', 0.0)
data['TotalCharges'] = data['TotalCharges'].astype(float)

# Columns to map using a single mapping dictionary
columns_to_map = [
    'gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 
    'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 
    'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 
    'PaymentMethod', 'PaperlessBilling', 'Churn'
]

In [6]:
# Define the mapping for various values, including gender
value_mapping = {
    'Female': 1, 
    'Male': 0, 
    'Yes': 1, 
    'No': 0, 
    'No phone service': 0, 
    'DSL': 1, 
    'Fiber optic': 1,
    'No internet service': 0, 
    'Month-to-month': 0, 
    'One year': 1, 
    'Two year': 1, 
    'Electronic check': 0, 
    'Mailed check': 0, 
    'Bank transfer (automatic)': 1, 
    'Credit card (automatic)': 1
}

In [7]:
# Apply mapping to the relevant columns
for column in columns_to_map:
    data[column] = data[column].map(value_mapping)

# Check the processed dataset
data.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,1,0,1,0,1,0,0,1,0,...,0,0,0,0,0,1,0,29.85,29.85,0
1,5575-GNVDE,0,0,0,0,34,1,0,1,1,...,1,0,0,0,1,0,0,56.95,1889.5,0
2,3668-QPYBK,0,0,0,0,2,1,0,1,1,...,0,0,0,0,0,1,0,53.85,108.15,1
3,7795-CFOCW,0,0,0,0,45,0,0,1,1,...,1,1,0,0,1,0,1,42.3,1840.75,0
4,9237-HQITU,1,0,0,0,2,1,0,1,0,...,0,0,0,0,0,1,0,70.7,151.65,1


In [8]:
x_data = data.drop(['customerID', 'Dependents', 'tenure', 'MultipleLines', 'MonthlyCharges', 'Churn'], axis=1)
y_data = data['Churn']

x_features = x_data.columns.to_list()
print(x_features)

['gender', 'SeniorCitizen', 'Partner', 'PhoneService', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod', 'TotalCharges']


In [11]:
x = x_data.values
y = y_data.values

length = len(y)
print(f"Length of the data: {length}")

split_index = int(length * 0.90)

x_train = x[:split_index]
y_train = y[:split_index]

x_test = x[split_index:]
y_test = y[split_index:]

shape = x_train.shape
print(f"Shape of training data: {shape}")



Length of the data: 7043
Shape of training data: (6338, 15)


In [13]:
pipeline = Pipeline([
    ('scalar', StandardScaler()),
    ('model', LogisticRegression(penalty='l1', solver='liblinear', max_iter=1000000, C=1.0, random_state=50))
])

In [14]:
pipeline.fit(x_train, y_train)

In [None]:
y_predicted = pipeline.predict(x_test)

In [18]:
score = accuracy_score(y_test, y_predicted)
print(f"Accuracy Score: {score}")

Accuracy Score: 0.7787234042553192


In [20]:
matrix = confusion_matrix(y_test, y_predicted)
print("Confusion Matrix:")
print(matrix)

Confusion Matrix:
[[461  49]
 [107  88]]


In [22]:
report = classification_report(y_test, y_predicted)
print("Classifictaion Report:")
print(report)

Classifictaion Report:
              precision    recall  f1-score   support

           0       0.81      0.90      0.86       510
           1       0.64      0.45      0.53       195

    accuracy                           0.78       705
   macro avg       0.73      0.68      0.69       705
weighted avg       0.76      0.78      0.77       705

