# Chapter 34 - Supervised Learning : Logistic Regression

In [13]:
import pandas as pd
data=pd.read_csv("telecom_data.csv")
print("data")
print(data.head(10))
print("Data Completeness:")
data.isnull().sum()

data
   Age  Gender  PlanType  MonthlyUsage Churn
0   21  Female   Regular            15    No
1   45  Female   Economy            41    No
2   44  Female   Economy            40    No
3   31  Female   Regular            23   Yes
4   33  Female   Regular            12    No
5   42  Female   Regular            52    No
6   20  Female     Ultra            57   Yes
7   26    Male     Ultra            23    No
8   37  Female  Advanced            31    No
9   26    Male   Economy            23    No
Data Completeness:


Age             0
Gender          0
PlanType        0
MonthlyUsage    0
Churn           0
dtype: int64

In [14]:
#Inspect columns and basic info
print("Columns:", data.columns.tolist())

#1.Dataset Basic info
print("\nDataset Info:")
print(data.info())

#2.Completeness : Check for missing values:
#Checks if any column has missing/null values.
print("\nDataset Completeness:")
print(data.isnull().sum())

#3.Dataset Consistency : All numeric columns should be numeric.
print("\nDataset Consistency:")
print(data.dtypes)

Columns: ['Age', 'Gender', 'PlanType', 'MonthlyUsage', 'Churn']

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Age           150 non-null    int64 
 1   Gender        150 non-null    object
 2   PlanType      150 non-null    object
 3   MonthlyUsage  150 non-null    int64 
 4   Churn         150 non-null    object
dtypes: int64(2), object(3)
memory usage: 6.0+ KB
None

Dataset Completeness:
Age             0
Gender          0
PlanType        0
MonthlyUsage    0
Churn           0
dtype: int64

Dataset Consistency:
Age              int64
Gender          object
PlanType        object
MonthlyUsage     int64
Churn           object
dtype: object


In [15]:
#4.Accuracy
print("Database Describe:")
print(data.describe())
#Bias Check
print("\nDataset Bias")
print(data['Gender'].value_counts(normalize=True))
print(data['Churn'].value_counts(normalize=True))

Database Describe:
              Age  MonthlyUsage
count  150.000000    150.000000
mean    35.193333     33.693333
std     10.841566     15.923031
min     19.000000      3.000000
25%     25.000000     23.000000
50%     35.000000     35.000000
75%     44.000000     50.000000
max     54.000000     59.000000

Dataset Bias
Gender
Female    0.793333
Male      0.206667
Name: proportion, dtype: float64
Churn
No     0.893333
Yes    0.106667
Name: proportion, dtype: float64


# Dataset Specification Sheet â€“ Telecom Churn Data

| **Field**           | **Details**                                                                                                                                              |
|---------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------|
| **Dataset Name**    | `telecom_churn_data.csv`                                                                                                                                 |
| **Description**     | Contains customer demographic and usage details from a telecom company used to predict churn behavior. The dataset is used to demonstrate **data quality checks, feature encoding, and model evaluation using Logistic Regression**. |
| **Features**        | `Age`, `Gender`, `PlanType`, `MonthlyUsage`                                                                                                               |
| **Target Variable** | `Churn` *(Yes = churned, No = retained)*                                                                                                                  |
| **Number of Rows**  | **150**                                                                                                                                                  |
| **Null Handling**   | No missing values detected across any feature (`isnull().sum()` confirms all zeroes)                                                                    |
| **Feature Types**   | - **Age**: Numeric (int)  <br> - **Gender**: Categorical (object, Nominal) <br> - **PlanType**: Categorical (object, Nominal) <br> - **MonthlyUsage**: Numeric (int) <br> - **Churn**: Categorical (binary target) |



In [16]:
fdata=data.drop(columns=['Churn'])
X=fdata
y=data["Churn"].map({'Yes':1,'No':0})
categorical = fdata.select_dtypes(include='object').columns
numerical = fdata.select_dtypes(exclude='object').columns
print("Categorical Features:", list(categorical))
print("Numerical Features:", list(numerical))

Categorical Features: ['Gender', 'PlanType']
Numerical Features: ['Age', 'MonthlyUsage']


In [21]:
#Convert text into numbers:
from sklearn.preprocessing import OneHotEncoder, StandardScaler

#Feature Encoding
#====================
# Using pd.get_dummies()
#====================

X_encoded_gd = pd.get_dummies(
    X,
    columns=categorical,
    drop_first=True
)
print("gd Encoded Columns:",X_encoded_gd.columns.tolist())
print()
print(X_encoded_gd['Gender_Male'].head(15))


gd Encoded Columns: ['Age', 'MonthlyUsage', 'Gender_Male', 'PlanType_Economy', 'PlanType_Regular', 'PlanType_Standard', 'PlanType_Ultra']

0     False
1     False
2     False
3     False
4     False
5     False
6     False
7      True
8     False
9      True
10    False
11    False
12    False
13    False
14     True
Name: Gender_Male, dtype: bool


In [23]:
print(X_encoded_gd['PlanType_Economy'].head(15))

0     False
1      True
2      True
3     False
4     False
5     False
6     False
7     False
8     False
9      True
10     True
11    False
12    False
13    False
14    False
Name: PlanType_Economy, dtype: bool


In [None]:
#===================
#Using OneHotEncoder()
#===================
ohe=OneHotEncoder(
    drop='first',
    sparse_output=False
)
encoded_array= ohe.fit_transform(X[categorical])
print(encoded_array)
#Convert back to Dataframe
encoded_df=pd.DataFrame(
    encoded_array,
    columns=ohe.get_feature_names_out(categorical)
)

#Merge encoded columns with numeric ones
X_encoded_ohe=pd.concat(
    [X[numerical].reset_index(drop=True), encoded_df.reset_index(drop=True)],
    axis=1
)

print("OHE Encoded Columns: ",X_encoded_ohe.columns.tolist())


[[0. 0. 1. 0. 0.]
 [0. 1. 0. 0. 0.]
 [0. 1. 0. 0. 0.]
 [0. 0. 1. 0. 0.]
 [0. 0. 1. 0. 0.]
 [0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 1.]
 [1. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0.]
 [1. 1. 0. 0. 0.]
 [0. 1. 0. 0. 0.]
 [0. 0. 1. 0. 0.]
 [0. 0. 0. 1. 0.]
 [0. 0. 0. 1. 0.]
 [1. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0.]
 [1. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1.]
 [1. 0. 1. 0. 0.]
 [1. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 1.]
 [1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 1.]
 [0. 0. 0. 1. 0.]
 [1. 1. 0. 0. 0.]
 [0. 0. 0. 1. 0.]
 [0. 0. 1. 0. 0.]
 [1. 0. 0. 0. 1.]
 [0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0.]
 [1. 0. 0. 1. 0.]
 [0. 1. 0. 0. 0.]
 [0. 1. 0. 0. 0.]
 [1. 0. 0.

In [26]:
print(encoded_df)

     Gender_Male  PlanType_Economy  PlanType_Regular  PlanType_Standard  \
0            0.0               0.0               1.0                0.0   
1            0.0               1.0               0.0                0.0   
2            0.0               1.0               0.0                0.0   
3            0.0               0.0               1.0                0.0   
4            0.0               0.0               1.0                0.0   
..           ...               ...               ...                ...   
145          0.0               0.0               0.0                0.0   
146          0.0               0.0               0.0                0.0   
147          0.0               0.0               0.0                0.0   
148          0.0               1.0               0.0                0.0   
149          0.0               0.0               0.0                1.0   

     PlanType_Ultra  
0               0.0  
1               0.0  
2               0.0  
3          

In [27]:
scaler=StandardScaler()
X_scaled_gd=X_encoded_gd.copy()
X_scaled_gd[numerical]=scaler.fit_transform(X_scaled_gd[numerical])

#Feature Scalling
#===================
#Using OneHotEncoder()
#===================
X_scaled_ohe = X_encoded_ohe.copy()
X_scaled_ohe[numerical]= scaler.fit_transform(X_scaled_ohe[numerical])

df=pd.DataFrame({
    'Age_ohe':X_scaled_ohe[numerical]['Age'],
    'Age_gd':X_scaled_gd[numerical]['Age'],
    'MonthlyUsage_ohe':X_scaled_ohe[numerical]['MonthlyUsage'],
    'MonthlyUsage_gd':X_scaled_gd[numerical]['MonthlyUsage']
})

print("Comparison_Table:\n",df)

Comparison_Table:
       Age_ohe    Age_gd  MonthlyUsage_ohe  MonthlyUsage_gd
0   -1.313545 -1.313545         -1.177914        -1.177914
1    0.907574  0.907574          0.460411         0.460411
2    0.815027  0.815027          0.397399         0.397399
3   -0.388079 -0.388079         -0.673814        -0.673814
4   -0.202986 -0.202986         -1.366951        -1.366951
..        ...       ...               ...              ...
145 -1.498638 -1.498638         -1.934064        -1.934064
146  0.629934  0.629934          1.153549         1.153549
147  1.740493  1.740493         -1.682014        -1.682014
148 -1.406091 -1.406091          1.468612         1.468612
149 -0.388079 -0.388079         -0.673814        -0.673814

[150 rows x 4 columns]
