# Getting the data

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
data = "'https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/chapter-03-churn-prediction/WA_Fn-UseC_-Telco-Customer-Churn.csv'"

In [3]:
!wget $data -O data-week-3.csv 

--2025-10-17 10:57:28--  https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/chapter-03-churn-prediction/WA_Fn-UseC_-Telco-Customer-Churn.csv
185.199.108.133, 185.199.109.133, 185.199.111.133, ...tent.com)... 
connected. to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... 
HTTP request sent, awaiting response... 200 OK
Length: 977501 (955K) [text/plain]
Saving to: ‘data-week-3.csv’


2025-10-17 10:57:37 (144 KB/s) - ‘data-week-3.csv’ saved [977501/977501]



In [4]:
df = pd.read_csv("data-week-3.csv")
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [5]:
df.columns = df.columns.str.lower().str.replace(" ", "_")

## Converting the Index object to a regular Python list.

In [6]:
# Here we list all the columns types
print(df.dtypes)

customerid           object
gender               object
seniorcitizen         int64
partner              object
dependents           object
tenure                int64
phoneservice         object
multiplelines        object
internetservice      object
onlinesecurity       object
onlinebackup         object
deviceprotection     object
techsupport          object
streamingtv          object
streamingmovies      object
contract             object
paperlessbilling     object
paymentmethod        object
monthlycharges      float64
totalcharges         object
churn                object
dtype: object


In [7]:
#
print(df.dtypes[df.dtypes == 'object'])

customerid          object
gender              object
partner             object
dependents          object
phoneservice        object
multiplelines       object
internetservice     object
onlinesecurity      object
onlinebackup        object
deviceprotection    object
techsupport         object
streamingtv         object
streamingmovies     object
contract            object
paperlessbilling    object
paymentmethod       object
totalcharges        object
churn               object
dtype: object


In [8]:
'''What it does: Creates a boolean mask where:
True = column has dtype 'object' (typically strings/categorical data)
False = column has other dtypes (i
nt64, float64, etc.)
Prints where the condition is true'''
print(df.dtypes[df.dtypes == 'object'].index)

Index(['customerid', 'gender', 'partner', 'dependents', 'phoneservice',
       'multiplelines', 'internetservice', 'onlinesecurity', 'onlinebackup',
       'deviceprotection', 'techsupport', 'streamingtv', 'streamingmovies',
       'contract', 'paperlessbilling', 'paymentmethod', 'totalcharges',
       'churn'],
      dtype='object')


In [9]:
#Converts the Index object to a regular Python list.
print(list(df.dtypes[df.dtypes == 'object'].index))

['customerid', 'gender', 'partner', 'dependents', 'phoneservice', 'multiplelines', 'internetservice', 'onlinesecurity', 'onlinebackup', 'deviceprotection', 'techsupport', 'streamingtv', 'streamingmovies', 'contract', 'paperlessbilling', 'paymentmethod', 'totalcharges', 'churn']


In [10]:
df.columns = df.columns.str.lower().str.replace(' ', '_')

categorical_columns = list(df.dtypes[df.dtypes == 'object'].index)

for c in categorical_columns:
    df[c] = df[c].str.lower().str.replace(' ', '_')

In [11]:
df.head().T

Unnamed: 0,0,1,2,3,4
customerid,7590-vhveg,5575-gnvde,3668-qpybk,7795-cfocw,9237-hqitu
gender,female,male,male,male,female
seniorcitizen,0,0,0,0,0
partner,yes,no,no,no,no
dependents,no,no,no,no,no
tenure,1,34,2,45,2
phoneservice,no,yes,yes,no,yes
multiplelines,no_phone_service,no,no,no_phone_service,no
internetservice,dsl,dsl,dsl,dsl,fiber_optic
onlinesecurity,no,yes,yes,yes,no


In [12]:
# Use this as a safety check for the conversion
tc = pd.to_numeric(df.totalcharges, errors='coerce')

In [13]:
# The actual change to the column
df.totalcharges = pd.to_numeric(df.totalcharges, errors='coerce')

In [14]:
# Replace all NaN values with 0
df.totalcharges = df.totalcharges.fillna(0)

In [15]:
df.churn.head()

0     no
1     no
2    yes
3     no
4    yes
Name: churn, dtype: object

In [16]:
df.churn = (df.churn == 'yes').astype(int)

In [17]:
df_full_train, df_test = train_test_split(df, test_size = 0.2, random_state = 1)

In [18]:
df_train, df_val = train_test_split(df_full_train, test_size = 0.25, random_state = 1)

In [19]:
len(df_full_train), len(df_val), len(df_test)

(5634, 1409, 1409)


__Tidy up the row numbers (index) after the train/validation/test split__

train_test_split keeps the original indices, so each subset has
row labels like 12, 57, 123… which can leave gaps and look messy.
reset_index(drop=True) gives each DataFrame a fresh, sequential
index starting at 0.  drop=True discards the old index instead of
keeping it as an extra column.

* Result: df_train, df_val, df_test all have clean, contiguous row
  numbers, making .iloc, .loc and visual inspection easier.

In [20]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
adf_test = df_test.reset_index(drop=True)

__Below we separate the “answers” (churn) from the “questions” (all other columns) for each of the three datasets so scikit-learn can train, tune, and finally grade a model.__


In [21]:
y_train = df_train.churn.values
y_val = df_val.churn.values
y_test = df_test.churn.values

In [22]:
del df_train['churn']
del df_val['churn']
del df_test['churn']

# EDA

In [23]:
# Resetting the original indices; more orderly 
df_full_train = df_full_train.reset_index(drop = True)

In [24]:
# Check null values and calculate the sum per column
df_full_train.isnull().sum()

customerid          0
gender              0
seniorcitizen       0
partner             0
dependents          0
tenure              0
phoneservice        0
multiplelines       0
internetservice     0
onlinesecurity      0
onlinebackup        0
deviceprotection    0
techsupport         0
streamingtv         0
streamingmovies     0
contract            0
paperlessbilling    0
paymentmethod       0
monthlycharges      0
totalcharges        0
churn               0
dtype: int64

In [25]:
# Check the distribution of the variable 'churn'
df_full_train.churn.value_counts(normalize = True)

churn
0    0.730032
1    0.269968
Name: proportion, dtype: float64

In [26]:
global_churn_rate = df_full_train.churn.mean()
round(global_churn_rate, 2)

np.float64(0.27)

In [27]:
df_full_train.dtypes

customerid           object
gender               object
seniorcitizen         int64
partner              object
dependents           object
tenure                int64
phoneservice         object
multiplelines        object
internetservice      object
onlinesecurity       object
onlinebackup         object
deviceprotection     object
techsupport          object
streamingtv          object
streamingmovies      object
contract             object
paperlessbilling     object
paymentmethod        object
monthlycharges      float64
totalcharges        float64
churn                 int64
dtype: object

In [28]:
numeric = ['tenure', 'monthlycharges', 'totalcharges']

In [29]:
categorical = [
    'gender',
    'seniorcitizen',
    'partner',
    'dependents',
    'phoneservice',
    'multiplelines',
    'internetservice',
    'onlinesecurity',
    'onlinebackup',
    'deviceprotection',
    'techsupport',
    'streamingtv',
    'streamingmovies',
    'contract',
    'paperlessbilling',
    'paymentmethod',
]

In [30]:
df_full_train[categorical].nunique()

gender              2
seniorcitizen       2
partner             2
dependents          2
phoneservice        2
multiplelines       3
internetservice     3
onlinesecurity      3
onlinebackup        3
deviceprotection    3
techsupport         3
streamingtv         3
streamingmovies     3
contract            3
paperlessbilling    2
paymentmethod       4
dtype: int64

# Feature importance: Churn rate and risk ratio
Feature importance analysis (part of EDA) - identifying which features affect our target variable

* Churn rate
* Risk ratio
* Mutual information - later

In [31]:
global_churn = df_full_train.churn.mean()
churn_female = df_full_train[df_full_train.gender == 'female'].churn.mean()
churn_male = df_full_train[df_full_train.gender == 'male'].churn.mean()
print(f"Global churn", global_churn)
print(f"Female churn", churn_female)
print(f"Male churn", churn_male)

Global churn 0.26996805111821087
Female churn 0.27682403433476394
Male churn 0.2632135306553911


In [32]:
df_full_train['partner'].value_counts()

partner
no     2932
yes    2702
Name: count, dtype: int64

## Churn rate in specific columns

In [33]:
global_churn = df_full_train.churn.mean()
churn_partner = df_full_train[df_full_train.partner == 'yes'].churn.mean()
churn_no_partner = df_full_train[df_full_train.partner == 'no'].churn.mean()
print(f"Global churn", global_churn)
print(f"Female churn", churn_female)
print(f"Male churn", churn_male)

Global churn 0.26996805111821087
Female churn 0.27682403433476394
Male churn 0.2632135306553911


### What the numbers are really saying

| Group        | Churn rate | vs. global (26.9 %) |
|--------------|------------|---------------------|
| **Has partner**  | **20.5 %** | ➖ **6.4 pp lower** – big protective effect |
| **No partner**   | **33.0 %** | ➕ **6.1 pp higher** – clear risk factor |
| **Female**       | 27.7 %     | spot on average |
| **Male**         | 26.3 %     | spot on average |

Take-away  
Whether someone has a **partner** is a much stronger signal than gender.  
We should keep `partner` as a feature and maybe even derive a “single” flag later.

## Risk Ratio

In [34]:
churn_no_partner / global_churn

np.float64(1.2216593879412643)

In [35]:
churn_partner / global_churn

np.float64(0.7594724924338315)

### 📊 Lesson: “Partner” is a churn-shield, gender is neutral

| Snapshot     | Churn rate | Risk ratio vs global (26.9 %) |
|--------------|------------|-------------------------------|
| Has partner  | **20.5 %** | **0.76×** → 24 % *less* likely to leave |
| No partner   | **33.0 %** | **1.22×** → 22 % *more* likely to leave |
| Female       | 27.7 %     | ≈ 1× (no real effect) |
| Male         | 26.3 %     | ≈ 1× (no real effect) |

**Key takeaway**  
Relationship status is a **strong behavioural signal**; gender is not.  
Keep `partner`, drop or de-prioritise `gender` when you build the model.

__How to do this on several columns__

In [36]:
df_full_train.groupby('gender').churn.mean()

gender
female    0.276824
male      0.263214
Name: churn, dtype: float64

In [37]:
df_full_train.groupby('gender').churn.agg(['mean'], ['count'])

Unnamed: 0_level_0,mean
gender,Unnamed: 1_level_1
female,0.276824
male,0.263214


In [38]:
df_group = df_full_train.groupby('gender').churn.agg(['mean'], ['count'])
df_group['diff'] = df_group['mean'] - global_churn
df_group['risk'] = df_group['mean'] / global_churn
df_group

Unnamed: 0_level_0,mean,diff,risk
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,0.276824,0.006856,1.025396
male,0.263214,-0.006755,0.97498


In [39]:
for c in categorical:
    print(c)
    df_group = df_full_train.groupby(c).churn.agg(['mean', 'count'])
    df_group['diff'] = df_group['mean'] - global_churn
    df_group['risk'] = df_group['mean'] / global_churn
    display(df_group)
    print()
    print()

gender


Unnamed: 0_level_0,mean,count,diff,risk
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
female,0.276824,2796,0.006856,1.025396
male,0.263214,2838,-0.006755,0.97498




seniorcitizen


Unnamed: 0_level_0,mean,count,diff,risk
seniorcitizen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0.24227,4722,-0.027698,0.897403
1,0.413377,912,0.143409,1.531208




partner


Unnamed: 0_level_0,mean,count,diff,risk
partner,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.329809,2932,0.059841,1.221659
yes,0.205033,2702,-0.064935,0.759472




dependents


Unnamed: 0_level_0,mean,count,diff,risk
dependents,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.31376,3968,0.043792,1.162212
yes,0.165666,1666,-0.104302,0.613651




phoneservice


Unnamed: 0_level_0,mean,count,diff,risk
phoneservice,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.241316,547,-0.028652,0.89387
yes,0.273049,5087,0.003081,1.011412




multiplelines


Unnamed: 0_level_0,mean,count,diff,risk
multiplelines,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.257407,2700,-0.012561,0.953474
no_phone_service,0.241316,547,-0.028652,0.89387
yes,0.290742,2387,0.020773,1.076948




internetservice


Unnamed: 0_level_0,mean,count,diff,risk
internetservice,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
dsl,0.192347,1934,-0.077621,0.712482
fiber_optic,0.425171,2479,0.155203,1.574895
no,0.077805,1221,-0.192163,0.288201




onlinesecurity


Unnamed: 0_level_0,mean,count,diff,risk
onlinesecurity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.420921,2801,0.150953,1.559152
no_internet_service,0.077805,1221,-0.192163,0.288201
yes,0.153226,1612,-0.116742,0.56757




onlinebackup


Unnamed: 0_level_0,mean,count,diff,risk
onlinebackup,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.404323,2498,0.134355,1.497672
no_internet_service,0.077805,1221,-0.192163,0.288201
yes,0.217232,1915,-0.052736,0.80466




deviceprotection


Unnamed: 0_level_0,mean,count,diff,risk
deviceprotection,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.395875,2473,0.125907,1.466379
no_internet_service,0.077805,1221,-0.192163,0.288201
yes,0.230412,1940,-0.039556,0.85348




techsupport


Unnamed: 0_level_0,mean,count,diff,risk
techsupport,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.418914,2781,0.148946,1.551717
no_internet_service,0.077805,1221,-0.192163,0.288201
yes,0.159926,1632,-0.110042,0.59239




streamingtv


Unnamed: 0_level_0,mean,count,diff,risk
streamingtv,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.342832,2246,0.072864,1.269897
no_internet_service,0.077805,1221,-0.192163,0.288201
yes,0.302723,2167,0.032755,1.121328




streamingmovies


Unnamed: 0_level_0,mean,count,diff,risk
streamingmovies,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.338906,2213,0.068938,1.255358
no_internet_service,0.077805,1221,-0.192163,0.288201
yes,0.307273,2200,0.037305,1.138182




contract


Unnamed: 0_level_0,mean,count,diff,risk
contract,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
month-to-month,0.431701,3104,0.161733,1.599082
one_year,0.120573,1186,-0.149395,0.446621
two_year,0.028274,1344,-0.241694,0.10473




paperlessbilling


Unnamed: 0_level_0,mean,count,diff,risk
paperlessbilling,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.172071,2313,-0.097897,0.637375
yes,0.338151,3321,0.068183,1.25256




paymentmethod


Unnamed: 0_level_0,mean,count,diff,risk
paymentmethod,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
bank_transfer_(automatic),0.168171,1219,-0.101797,0.622928
credit_card_(automatic),0.164339,1217,-0.10563,0.608733
electronic_check,0.45589,1893,0.185922,1.688682
mailed_check,0.19387,1305,-0.076098,0.718121






# Feature Importance: Mutual Information

## What MI Measures
* Quantifies how much knowing a feature reduces uncertainty about the target

* Measures any relationship - linear, nonlinear, categorical

* Higher MI = more predictive power

In [40]:
from sklearn.metrics import mutual_info_score

In [41]:
mutual_info_score(df_full_train.churn, df_full_train.contract)

0.0983203874041556

In [42]:
mutual_info_score(df_full_train.gender, df_full_train.churn)

0.0001174846211139946

In [43]:
mutual_info_score(df_full_train.contract, df_full_train.churn)
0.0983203874041556

0.0983203874041556

In [44]:
mutual_info_score(df_full_train.partner, df_full_train.churn)
0.009967689095399745

0.009967689095399745

## Interpretation
* High MI: Strong predictor, keep the feature

* Low/Near-zero MI: Weak/no predictive power, consider removing

* Compare scores relatively (ranking matters more than absolute values)

In [45]:
def mutual_info_churn_score(series):
    """
    Calculate Mutual Information between a feature series and the churn target.
    
    This function computes how much information a single feature provides
    about the churn target variable, measuring dependency between them.
    
    Parameters
    ----------
    series : pandas.Series
        A single feature column from the DataFrame to evaluate
        
    Returns
    -------
    float
        Mutual Information score between the feature and churn.
        Higher values indicate stronger predictive power.
    """
    return mutual_info_score(series, df_full_train.churn)

In [46]:
mi = df_full_train[categorical].apply(mutual_info_churn_score)
mi.sort_values(ascending=False)

contract            0.098320
onlinesecurity      0.063085
techsupport         0.061032
internetservice     0.055868
onlinebackup        0.046923
deviceprotection    0.043453
paymentmethod       0.043210
streamingtv         0.031853
streamingmovies     0.031581
paperlessbilling    0.017589
dependents          0.012346
partner             0.009968
seniorcitizen       0.009410
multiplelines       0.000857
phoneservice        0.000229
gender              0.000117
dtype: float64

# Feature Importance Correlation 

## Foundation
* Pearson Correlation measures linear relationship between numerical variables

* Range: -1 (perfect negative) to +1 (perfect positive)

* Positive Correlation vs. Negative Correlation

** When r is positive, an increase in x will increase y.

** When r is negative, an increase in x will decrease y.

** When r is 0, a change in x does not affect y.


* Use .abs() to focus on strength regardless of direction

### Calculating Correlation with Target Variable
* corrwith() calculates pairwise correlation between DataFrame columns and a Series

* Use this when you want correlation between multiple features and one target variable

* Alternative: corr() for full correlation matrix between all features

In [47]:
df_full_train[numeric].corrwith(df_full_train.churn)

tenure           -0.351885
monthlycharges    0.196805
totalcharges     -0.196353
dtype: float64

### Focusing on Relationship strength
__What to learn:__

* Correlation can be negative or positive (-1 to +1)

* .abs() takes absolute value to measure strength regardless of direction

* For feature importance, we often care about magnitude, not direction



In [48]:
df_full_train[numeric].corrwith(df_full_train.churn.abs())

tenure           -0.351885
monthlycharges    0.196805
totalcharges     -0.196353
dtype: float64

### Understanding data ranges 
* Always check value ranges before analysis

* Helps inform meaningful binning strategies

* Contextualizes what "high" and "low" values mean


In [49]:
df_full_train.tenure.max()  # 72

np.int64(72)

### Binning Analysis Technique
* Binning: Converting continuous variables into categorical ranges

* Why bin: Makes relationships interpretable, reveals non-linear patterns

* How to bin: Use business logic (new/medium/long-term customers) or statistical methods

* Analysis pattern: Filter → Calculate mean target → Compare across bins


In [50]:
df_full_train[df_full_train.tenure <= 2].churn.mean()


np.float64(0.5953420669577875)

In [51]:
df_full_train[(df_full_train.tenure > 2) & (df_full_train.tenure <= 12)].churn.mean()


np.float64(0.3994413407821229)

In [52]:
df_full_train[df_full_train.tenure > 12].churn.mean()

np.float64(0.17634908339788277)

# One Hot Encoding

* Use scikit_learn to encode categorical features

In [53]:
from sklearn.feature_extraction import DictVectorizer

In [54]:
#first 10 values in the gender and contract column
df_train[['gender', 'contract']].iloc[:10]

Unnamed: 0,gender,contract
0,female,two_year
1,male,month-to-month
2,female,month-to-month
3,female,month-to-month
4,female,two_year
5,male,month-to-month
6,male,month-to-month
7,female,month-to-month
8,female,two_year
9,female,month-to-month


In [55]:
#Convert into a dictionary. We use orient to target the rows. Otherwise it turns the columns
df_train[['gender', 'contract']].iloc[:10].to_dict(orient = 'records')

[{'gender': 'female', 'contract': 'two_year'},
 {'gender': 'male', 'contract': 'month-to-month'},
 {'gender': 'female', 'contract': 'month-to-month'},
 {'gender': 'female', 'contract': 'month-to-month'},
 {'gender': 'female', 'contract': 'two_year'},
 {'gender': 'male', 'contract': 'month-to-month'},
 {'gender': 'male', 'contract': 'month-to-month'},
 {'gender': 'female', 'contract': 'month-to-month'},
 {'gender': 'female', 'contract': 'two_year'},
 {'gender': 'female', 'contract': 'month-to-month'}]

In [56]:
dicts = df_train[['gender', 'contract']].iloc[:10].to_dict(orient = 'records')

In [57]:
dv = DictVectorizer(sparse = False)

In [58]:
dv.fit(dicts)

0,1,2
,dtype,<class 'numpy.float64'>
,separator,'='
,sparse,False
,sort,True


In [59]:
dv.transform(dicts)

array([[0., 1., 1., 0.],
       [1., 0., 0., 1.],
       [1., 0., 1., 0.],
       [1., 0., 1., 0.],
       [0., 1., 1., 0.],
       [1., 0., 0., 1.],
       [1., 0., 0., 1.],
       [1., 0., 1., 0.],
       [0., 1., 1., 0.],
       [1., 0., 1., 0.]])

In [60]:
dv.get_feature_names_out()

array(['contract=month-to-month', 'contract=two_year', 'gender=female',
       'gender=male'], dtype=object)

__Once we add a numerical column eg. tenure, scilearn understands the values and doesn't convert them__

In [61]:
dicts = df_train[['gender', 'contract', 'tenure']].iloc[:10].to_dict(orient = 'records')

In [62]:
dv = DictVectorizer(sparse = False)
dv.fit(dicts)
dv.transform(dicts)

array([[ 0.,  1.,  1.,  0., 72.],
       [ 1.,  0.,  0.,  1., 10.],
       [ 1.,  0.,  1.,  0.,  5.],
       [ 1.,  0.,  1.,  0.,  5.],
       [ 0.,  1.,  1.,  0., 18.],
       [ 1.,  0.,  0.,  1.,  4.],
       [ 1.,  0.,  0.,  1.,  1.],
       [ 1.,  0.,  1.,  0.,  1.],
       [ 0.,  1.,  1.,  0., 72.],
       [ 1.,  0.,  1.,  0.,  6.]])

## How everything comes together
* We used two then three columns, now lets combine the categorical and numeric columns

In [63]:
train_dicts = df_train[categorical + numeric].to_dict(orient = 'records')
train_dicts[0]

{'gender': 'female',
 'seniorcitizen': 0,
 'partner': 'yes',
 'dependents': 'yes',
 'phoneservice': 'yes',
 'multiplelines': 'yes',
 'internetservice': 'fiber_optic',
 'onlinesecurity': 'yes',
 'onlinebackup': 'yes',
 'deviceprotection': 'yes',
 'techsupport': 'yes',
 'streamingtv': 'yes',
 'streamingmovies': 'yes',
 'contract': 'two_year',
 'paperlessbilling': 'yes',
 'paymentmethod': 'electronic_check',
 'tenure': 72,
 'monthlycharges': 115.5,
 'totalcharges': 8425.15}

In [64]:
dv.fit(train_dicts)

0,1,2
,dtype,<class 'numpy.float64'>
,separator,'='
,sparse,False
,sort,True


In [65]:
dv.transform(train_dicts)

array([[0.00000e+00, 0.00000e+00, 1.00000e+00, ..., 1.00000e+00,
        7.20000e+01, 8.42515e+03],
       [1.00000e+00, 0.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        1.00000e+01, 1.02155e+03],
       [1.00000e+00, 0.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        5.00000e+00, 4.13650e+02],
       ...,
       [1.00000e+00, 0.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        2.00000e+00, 1.90050e+02],
       [0.00000e+00, 0.00000e+00, 1.00000e+00, ..., 0.00000e+00,
        2.70000e+01, 7.61950e+02],
       [1.00000e+00, 0.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        9.00000e+00, 7.51650e+02]], shape=(4225, 45))

In [66]:
#Shorter more pythonic code 
X_train  = dv.fit_transform(train_dicts)

In [67]:
val_dicts = df_val [categorical + numeric].to_dict(orient = 'records')

In [68]:
X_val = dv.transform(val_dicts)

## One-slide cheat-sheet summary
* One hot encoding turns every categorical value into its own 0/1 column (“dummy”).
* Because models need numbers, not strings. Encoding must not impose order (0 < 1 < 2) → one-hot avoids fake ordinality.
* fit = learn the rule (once, only on train).
* transform = apply the rule (on every set: train/val/test/production).
* Never re-fit on val/test → prevents data leakage and false optimism.
* DictVectorizer/OneHotEncoder: set sparse=True by default; turn it to False only when data is small.

Memorise the mantra:
__“Fit once, transform everywhere; keep the zeros sparse and the validation untouched.”__

# Logistic Regression

In [69]:
def sigmoid(z):
    return 1 / (1 + np.exp(-z))

__Linear regression outputs ℝ; logistic regression squashes ℝ → (0, 1) via sigmoid so we get a probability.__

| Step | Formula | Code |
|------|---------|------|
| Linear score | z = w₀ + Σ wⱼ xⱼ | `score = w0 + xi @ w` |
| Probability | p = 1 / (1 + e^(−z)) | `sigmoid(score)` |
| Decision | ŷ = 1 if p ≥ 0.5 else 0 | `y_pred = (model.predict_proba(X

In [70]:
def linear_regression(xi):
    result = w0
    
    for j in range(len(w)):
        result = result + xi[j] * w[j]
        
    return result

In [71]:
def logistic_regression(xi):
    score = w0
    
    for j in range(len(w)):
        score = score + xi[j] * w[j]
        
    result = sigmoid(score)
    return result

# Training Logistic Regression with Scikit-learn
* Turn the one-hot matrix we built in 3.8 into a probabilistic churn classifier and measure its quality on the validation set that was never touched during training.


In [72]:
from sklearn.linear_model import LogisticRegression

### __Model = LogisticRegression(solver='lbfgs')__

* Meaning: Choose the L-BFGS algorithm to find the best weights.

* Analogy: You’re picking a GPS route—“lbfgs” is the fast highway that still avoids traffic jams (bad numerical spots).

In [88]:
model = LogisticRegression(max_iter = 1000)
# solver='lbfgs' is the default solver in newer version of sklearn
# for older versions, you need to specify it explicitly
# Model learns the weights by minimising log-loss on training data only.
model.fit(X_train, y_train)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=1000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,1000


In [89]:
'''We are telling the program/python that for any prediction it makes, 
the calculation must always start with this base value. 
All the other features just get added (or subtracted) onto this number.'''
model.intercept_[0]

np.float64(-0.12066865593034165)

In [91]:
# The learned weight for each feature, showing its push (positive/negative) on the prediction.
model.coef_[0].round(3)

array([ 0.639, -0.016, -0.734, -0.018, -0.093,  0.061, -0.11 , -0.062,
       -0.036, -0.075, -0.365,  0.364, -0.11 ,  0.   , -0.243,  0.118,
        0.014,  0.062, -0.11 , -0.063,  0.231, -0.11 , -0.232, -0.238,
        0.127, -0.123,  0.012, -0.087, -0.037,  0.066, -0.053,  0.118,
       -0.229,  0.193, -0.102, -0.11 ,  0.101, -0.062, -0.11 ,  0.06 ,
        0.181, -0.11 , -0.182, -0.068,  0.   ])

In [86]:
# Get the probability (confidence score) for the positive class (class 1) on the validation data.
y_pred = model.predict_proba(X_val)[:, 1]

In [77]:
churn_decision = (y_pred >= 0.5)

In [92]:
# Calculate the model's overall Accuracy on the unseen validation data.
(y_val == churn_decision).mean()

np.float64(0.8055358410220014)

These lines create a results table, df_pred, to audit the model's performance on the validation data.The probability column holds the confidence score (from $0.0$ to $1.0$) from y_pred.The prediction column holds the final binary result (1 or 0) derived from applying the $\ge 0.5$ decision rule to the probability.The actual column holds the ground truth from y_val.

In [94]:
# Create an empty table (DataFrame) to compile all our results for review.
df_pred = pd.DataFrame()

# Column 1: Store the model's confidence score for each customer.
df_pred['probability'] = y_pred

# Column 2: Store the final 'Yes' (1) or 'No' (0) decision based on the 0.5 threshold.
df_pred['prediction'] = churn_decision.astype(int)

# Column 3: Store the actual true result for comparison.
df_pred['actual'] = y_val

In [95]:
# Create a new column checking if the prediction matches the actual result (True/False).
df_pred['correct'] = df_pred.prediction == df_pred.actual

In [96]:
# Calculate the mean of the 'correct' column to get the overall Accuracy score.
df_pred.correct.mean()

np.float64(0.8055358410220014)

# Model Interpretation

In [101]:
#List a-keys, list b-values
a = [1, 2, 3, 4]
b = 'abcd'

In [102]:
# Building a dictionary in a single line
dict(zip(a, b))

{1: 'a', 2: 'b', 3: 'c', 4: 'd'}

In [103]:
dict(zip(dv.get_feature_names_out(), model.coef_[0].round(3)))

{'contract=month-to-month': np.float64(0.639),
 'contract=one_year': np.float64(-0.016),
 'contract=two_year': np.float64(-0.734),
 'dependents=no': np.float64(-0.018),
 'dependents=yes': np.float64(-0.093),
 'deviceprotection=no': np.float64(0.061),
 'deviceprotection=no_internet_service': np.float64(-0.11),
 'deviceprotection=yes': np.float64(-0.062),
 'gender=female': np.float64(-0.036),
 'gender=male': np.float64(-0.075),
 'internetservice=dsl': np.float64(-0.365),
 'internetservice=fiber_optic': np.float64(0.364),
 'internetservice=no': np.float64(-0.11),
 'monthlycharges': np.float64(0.0),
 'multiplelines=no': np.float64(-0.243),
 'multiplelines=no_phone_service': np.float64(0.118),
 'multiplelines=yes': np.float64(0.014),
 'onlinebackup=no': np.float64(0.062),
 'onlinebackup=no_internet_service': np.float64(-0.11),
 'onlinebackup=yes': np.float64(-0.063),
 'onlinesecurity=no': np.float64(0.231),
 'onlinesecurity=no_internet_service': np.float64(-0.11),
 'onlinesecurity=yes': np.

## Model Optimization
After examining the model coefficients above, we determined that only a few features—primarily related to the customer's contract, duration, and pricing—have a significant influence on the churn prediction. The goal of this phase is to build a much simpler and more efficient model by excluding all low-impact features, while confirming that we retain the same high level of predictive accuracy.

In [124]:
# Select only the features that showed the highest impact on the churn decision.
small = ['contract', 'tenure', 'monthlycharges']

In [125]:
# A new list of dictionaries for training
dicts_train_small = df_train[small].to_dict(orient='records')
# A new list of dictionaries for validation
dicts_val_small = df_val[small].to_dict(orient='records')

In [126]:
# Initialize a NEW DictVectorizer specifically for the 'small' features.
dv_small = DictVectorizer(sparse=False)
# FIT (learn the three features) and TRANSFORM the training data.
dv_small.fit(dicts_train_small)

0,1,2
,dtype,<class 'numpy.float64'>
,separator,'='
,sparse,False
,sort,True


In [127]:
dv_small.get_feature_names_out()

array(['contract=month-to-month', 'contract=one_year',
       'contract=two_year', 'monthlycharges', 'tenure'], dtype=object)

In [128]:
# We FIT (tell the program/python to learn the column names from the three features)
# and TRANSFORM the training dictionaries into a numerical matrix.
X_train_small = dv_small.transform(dicts_train_small)

In [129]:
model_small = LogisticRegression(solver='lbfgs')
model_small.fit(X_train_small, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


## 🔢 4.2 Manual Prediction: The Core Logistic Regression Process

This final section manually verifies the model's logic by predicting the churn probability for a single, hypothetical customer using the weights and intercept we just extracted. This process demonstrates the **exact mathematical steps** the model takes to arrive at a decision.

### 1. Extracting the Model's Components

We first confirm the core parameters learned by our optimized model:

| Component | Example Value | Role in Prediction |
| :--- | :--- | :--- |
| **Intercept ($\boldsymbol{w_0}$)** | $\approx -2.477$ | The **baseline risk** of churn when all feature values are zero. Every prediction starts here. |
| **Weights ($\boldsymbol{w}$)** | e.g., `contract=two_year`: **$-0.949$** | The **influence** of each feature. Negative weights (like long-term contract) reduce risk. |

#### **Hypothetical Customer Profile**

We predict the probability for a customer with the following loyal characteristics:

| Feature | Input Value ($\boldsymbol{x}$) | Why These Values? |
| :--- | :--- | :--- |
| **Contract** | `two_year` | A high-commitment contract with a strong negative weight. |
| **Monthly Charges** | $30$ | A low monthly fee. |
| **Tenure (Months)** | $24$ | A medium-term duration of service. |

---

### 2. Step A: Calculate the Raw Score (Log-Odds, $Z$)

The model performs a **weighted sum** of all features plus the intercept. This raw score, $Z$, can be any number.

#### **Formula Used:**
$$\mathbf{Z} = w_0 + (w_{\text{contract=two\_year}} \cdot 1) + (w_{\text{monthly\_charges}} \cdot 30) + (w_{\text{tenure}} \cdot 24)$$

```python
# The calculation you performed:
-2.47 + (-0.949) + 30 * 0.027 + 24 * (-0.036)

# Raw Score (Log-Odds):
-3.473

In [130]:
# Default bias
w0 = model_small.intercept_[0]
w0

np.float64(-2.477957595829565)

In [131]:
w = model_small.coef_[0]
w.round(3)

array([ 0.971, -0.024, -0.948,  0.027, -0.036])

In [132]:
dict(zip(dv_small.get_feature_names_out(), w.round(3)))
{'contract=month-to-month': 0.97,
 'contract=one_year': -0.025,
 'contract=two_year': -0.949,
 'monthlycharges': 0.027,
 'tenure': -0.036}

{'contract=month-to-month': 0.97,
 'contract=one_year': -0.025,
 'contract=two_year': -0.949,
 'monthlycharges': 0.027,
 'tenure': -0.036}

In [133]:
# Z = Intercept + (w_contract * 1) + (w_monthlycharges * 30) + (w_tenure * 24)
-2.47 + (-0.949) + 30 * 0.027 + 24 * (-0.036)

-3.473

In [134]:
sigmoid(_)

np.float64(0.030090303318277657)

# Using the Model

## 🚀 3.12 Using the Final Model

The goal of this section is to finalize our Logistic Regression model by training it on **all available data** (`train` + `validation`) and then performing the final, definitive test on the reserved **test set**.

### 1. Training the Production Model

We combine the training and validation sets to create a single, robust final model.

| Code Snippet | Pseudocode Action | Purpose |
| :--- | :--- | :--- |
| `dicts_full_train = df_full_train[...].to_dict(orient='records')` | **Prepare the combined data** (all training and validation customers) into dictionaries. | Maximizes the amount of data the model can learn from. |
| `dv.fit_transform(dicts_full_train)` | **Fit the DictVectorizer** *one final time* to the combined data (`X_full_train`). | The model learns the definitive set of all features based on all available non-test data. |
| `model.fit(X_full_train, y_full_train)` | **Train the Logistic Regression model** on the complete feature matrix and target values. | This creates the **production model** that will be used for future predictions. |

**Analogy:** This is like a student who has completed all their studying (`train` and `validation`) and is now **combining everything into one final, polished thesis** (the final model). 

---

### 2. Final Evaluation on the Test Set

We test the model's performance on the completely unseen `df_test` data.

```python
dicts_test = df_test[categorical + numerical].to_dict(orient='records')
X_test = dv.transform(dicts_test)
y_pred = model.predict_proba(X_test)[:, 1]

# Calculate the model's overall Accuracy on the test data.
churn_decision = (y_pred >= 0.5)
(churn_decision == y_test).mean()

In [136]:
dicts_full_train = df_full_train[categorical + numeric].to_dict(orient='records')

In [137]:
dv = DictVectorizer(sparse=False)
X_full_train = dv.fit_transform(dicts_full_train)

In [138]:
y_full_train = df_full_train.churn.values

In [139]:
model = LogisticRegression(solver='lbfgs')
model.fit(X_full_train, y_full_train)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


In [141]:
dicts_test = df_test[categorical + numeric].to_dict(orient='records')

In [142]:
X_test = dv.transform(dicts_test)

In [143]:
y_pred = model.predict_proba(X_test)[:, 1]

In [144]:
churn_decision = (y_pred >= 0.5)

In [145]:
(churn_decision == y_test).mean()

np.float64(0.8140525195173882)

In [146]:
y_test

array([0, 0, 0, ..., 0, 0, 1], shape=(1409,))

In [148]:
customer = dicts_test[-1]
customer

{'gender': 'female',
 'seniorcitizen': 0,
 'partner': 'yes',
 'dependents': 'yes',
 'phoneservice': 'yes',
 'multiplelines': 'yes',
 'internetservice': 'fiber_optic',
 'onlinesecurity': 'yes',
 'onlinebackup': 'no',
 'deviceprotection': 'yes',
 'techsupport': 'no',
 'streamingtv': 'yes',
 'streamingmovies': 'yes',
 'contract': 'month-to-month',
 'paperlessbilling': 'yes',
 'paymentmethod': 'electronic_check',
 'tenure': 17,
 'monthlycharges': 104.2,
 'totalcharges': 1743.5}

In [149]:
X_small = dv.transform([customer])
model.predict_proba(X_small)[0, 1]

np.float64(0.6613470087309817)

In [150]:
y_test[-1]

np.int64(1)

## Here is a breakdown of what is happening above
* Convert the full training data (train + validation) to a list of dictionaries.

dicts_full_train = df_full_train[categorical + numerical].to_dict(orient='records')

* Initialize the final feature vectorizer.

dv = DictVectorizer(sparse=False)

* FIT (learn all feature categories) and TRANSFORM the combined training data.

X_full_train = dv.fit_transform(dicts_full_train)

* Extract the target variable (churn) for the full training set.

y_full_train = df_full_train.churn.values

* Initialize the final Logistic Regression model with the standard solver.

model = LogisticRegression(solver='lbfgs')

* TRAIN the final production model on the entire feature matrix.

model.fit(X_full_train, y_full_train)

* Convert the independent test data into a list of dictionaries.

dicts_test = df_test[categorical + numerical].to_dict(orient='records')

* TRANSFORM the test data using the feature mapping learned from the full training set.

X_test = dv.transform(dicts_test)

* Generate churn probabilities (the second column, index 1) for every customer in the test set.

y_pred = model.predict_proba(X_test)[:, 1]

* Apply the 0.5 threshold to convert probabilities into binary predictions (0 or 1).

churn_decision = (y_pred >= 0.5)

* Calculate the model's final Accuracy on the test set.

(churn_decision == y_test).mean()

__ (Output: Final Accuracy score is displayed here)__
__ 0.815471965933286__

* (Output: The array of actual test outcomes is displayed here)


  y_test

* Select the last customer from the test set for a single prediction demonstration.

customer = dicts_test[-1]

customer

* TRANSFORM the single customer dictionary into a numerical matrix (X_small).

X_small = dv.transform([customer])

* Predict the churn probability for this single customer and show the result.

model.predict_proba(X_small)[0, 1]

__* (Output: The prediction probability is displayed here)__
__* 0.5968852088293909__

* Display the actual outcome (1=Churn, 0=No Churn) for this specific customer.
y_test[-1]

__(Output: The actual outcome is displayed here)__

__1__