In [101]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, f1_score, roc_auc_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline                          
import numpy as np

In [102]:
# Read data
data = pd.read_csv('/Users/mario.iuliano/marios_env/Portfolio/test/feature_selection/churn/telco.csv')

In [103]:
# Explore 
data.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [104]:
# Normalize column names 
data.columns = data.columns.str.lower().str.replace(' ', '_')

In [105]:
data.head()

Unnamed: 0,customerid,gender,seniorcitizen,partner,dependents,tenure,phoneservice,multiplelines,internetservice,onlinesecurity,...,deviceprotection,techsupport,streamingtv,streamingmovies,contract,paperlessbilling,paymentmethod,monthlycharges,totalcharges,churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [106]:
# Info 
data.info() 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerid        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   seniorcitizen     7043 non-null   int64  
 3   partner           7043 non-null   object 
 4   dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   phoneservice      7043 non-null   object 
 7   multiplelines     7043 non-null   object 
 8   internetservice   7043 non-null   object 
 9   onlinesecurity    7043 non-null   object 
 10  onlinebackup      7043 non-null   object 
 11  deviceprotection  7043 non-null   object 
 12  techsupport       7043 non-null   object 
 13  streamingtv       7043 non-null   object 
 14  streamingmovies   7043 non-null   object 
 15  contract          7043 non-null   object 
 16  paperlessbilling  7043 non-null   object 


In [107]:
# Total customers 
data.customerid.nunique() == data.shape[0]
# Each row is a customer, no duplicates

True

In [108]:
# Distribution of churn 
data.churn.value_counts(normalize=True)
# Churn class is imbalanced, 73% of customers are not churning

churn
No     0.73463
Yes    0.26537
Name: proportion, dtype: float64

In [109]:
# Convert numerical columns to numeric
data.tenure = data.tenure.astype('int') 

In [110]:
# Total charges have no values - Why? 
data[data.totalcharges == ' ']
# Not sure whether missing values are due to customers being new or not having a total charge- As the impact is minimum, we will drop these rows
data = data[data.totalcharges != ' ']
data.totalcharges = data.totalcharges.astype('float')

In [111]:
data.describe(include = ['O'])

Unnamed: 0,customerid,gender,partner,dependents,phoneservice,multiplelines,internetservice,onlinesecurity,onlinebackup,deviceprotection,techsupport,streamingtv,streamingmovies,contract,paperlessbilling,paymentmethod,churn
count,7032,7032,7032,7032,7032,7032,7032,7032,7032,7032,7032,7032,7032,7032,7032,7032,7032
unique,7032,2,2,2,2,3,3,3,3,3,3,3,3,3,2,4,2
top,7590-VHVEG,Male,No,No,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,No
freq,1,3549,3639,4933,6352,3385,3096,3497,3087,3094,3472,2809,2781,3875,4168,2365,5163


In [112]:
# Drop customer id
data.drop(columns = ['customerid'], inplace = True)

In [113]:
data['churn_num'] = data.churn.map({'No': 0, 'Yes': 1})

In [118]:
y = data['churn'].map({'No': 0, 'Yes': 1})  # Encode target to 0/1
X = data.drop(['churn_num','churn'], axis=1)  # Drop ID & raw target

In [119]:
#categorical_cols = X.select_dtypes(include=['object']).columns.tolist()
categorical_cols = ['contract',
                    'onlinesecurity',
                    'techsupport',
                    'internetservice',
                    'paymentmethod',
                    'onlinebackup',
                    'deviceprotection',
                    'streamingmovies',
                    'streamingtv',
                    'paperlessbilling',
                    'dependents',
                    'partner']
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

In [120]:
# Create pipeline 
from sklearn.pipeline import Pipeline

numerical_transformer = Pipeline(steps= [
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(
        drop='first', 
        sparse_output=False, 
        handle_unknown='ignore'
    ))
])


# Combine 
preprocessor = ColumnTransformer(transformers = [
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols)
])


In [121]:
# Check the data
from imblearn.pipeline import Pipeline as ImbPipeline 
from imblearn.combine import SMOTEENN                           # Combines SMOTE and ENN for class balancing

smote_enn = SMOTEENN(random_state=42)

# Full pipeline: preprocessing → resampling
pipeline = ImbPipeline(steps=[
    ('preprocessor', preprocessor),
    ('sampler', smote_enn)
])

In [122]:
X_resampled, y_resampled = pipeline.fit_resample(X, y)

In [123]:
# Split resampled data into train & test sets
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled,
    y_resampled,
    test_size=0.2,
    random_state=42,
    stratify=y_resampled
)

print("Training set shape:", X_train.shape, y_train.shape)
print("Testing set shape:", X_test.shape, y_test.shape)

Training set shape: (5116, 26) (5116,)
Testing set shape: (1280, 26) (1280,)


In [124]:
# Initialize the Random Forest model
# – random_state for reproducibility
# – n_jobs=-1 to use all CPU cores
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=10, random_state=42, n_jobs=-1)

# Fit the model on the training data
rf.fit(X_train, y_train)

In [125]:
# Generate predictions and probabilities
y_pred  = rf.predict(X_test)              # class labels
y_proba = rf.predict_proba(X_test)[:, 1]  # probability of positive class (Churn)

# Print key metrics
print("Default Random Forest Performance")
print(f"F1-score : {f1_score(y_test, y_pred):.4f}")
print(f"AUC-ROC  : {roc_auc_score(y_test, y_proba):.4f}\n")
print(classification_report(y_test, y_pred))

Default Random Forest Performance
F1-score : 0.9655
AUC-ROC  : 0.9928

              precision    recall  f1-score   support

           0       0.95      0.96      0.96       569
           1       0.97      0.96      0.97       711

    accuracy                           0.96      1280
   macro avg       0.96      0.96      0.96      1280
weighted avg       0.96      0.96      0.96      1280



In [126]:
# Convert lists to pandas Series and then concatenate
X_labels = pd.concat([pd.Series(categorical_cols), pd.Series(numerical_cols)], axis=0)

In [127]:
# Logreg 
lr = LogisticRegression()
lr.fit(X_train, y_train)


In [128]:
lr_y_pred = lr.predict(X_test)
lr_y_proba = lr.predict_proba(X_test)[:,1]

In [129]:
print(classification_report(y_test, lr_y_pred))
print(f"F1-score : {f1_score(y_test, lr_y_pred):.4f}")
print(f"AUC-ROC  : {roc_auc_score(y_test, lr_y_proba):.4f}")

              precision    recall  f1-score   support

           0       0.91      0.91      0.91       569
           1       0.92      0.92      0.92       711

    accuracy                           0.92      1280
   macro avg       0.91      0.91      0.91      1280
weighted avg       0.92      0.92      0.92      1280

F1-score : 0.9241
AUC-ROC  : 0.9727


In [131]:
# Fit the pipeline
X_resampled, y_resampled = pipeline.fit_resample(X, y)

# Get the fitted preprocessor
fitted_preprocessor = pipeline.named_steps['preprocessor']

# Access fitted one-hot encoder
fitted_ohe = fitted_preprocessor.named_transformers_['cat'].named_steps['onehot']

# Get feature names
ohe_feature_names = fitted_ohe.get_feature_names_out(categorical_cols)

# Combine with numerical columns
feature_names = list(ohe_feature_names) + numerical_cols


In [132]:
coefficients = lr.coef_[0]

coeff_df = pd.DataFrame({
    'Feature': feature_names,
    'Coefficient': coefficients
})

In [133]:
coeff_df['odds_ratio'] = np.exp(coeff_df['Coefficient'])

In [134]:
coeff_df['abs_log_odds'] = coeff_df['Coefficient'].abs()

In [135]:
coeff_df_sorted = coeff_df.sort_values(by='abs_log_odds', ascending=False)
coeff_df_sorted

Unnamed: 0,Feature,Coefficient,odds_ratio,abs_log_odds
5,techsupport_Yes,-2.894072,0.05535,2.894072
1,contract_Two year,-2.357927,0.094616,2.357927
10,paymentmethod_Mailed check,2.115332,8.292336,2.115332
4,techsupport_No internet service,-1.340694,0.261664,1.340694
3,onlinesecurity_Yes,1.197936,3.313271,1.197936
9,paymentmethod_Electronic check,-1.080032,0.339585,1.080032
22,seniorcitizen,1.0095,2.744228,1.0095
20,dependents_Yes,0.862371,2.36877,0.862371
23,tenure,0.802899,2.232002,0.802899
2,onlinesecurity_No internet service,-0.732373,0.480767,0.732373



Customers with this feature (Partner) are only 12% as likely to churn as those without it.

Or, better phrased:

They are 88% less likely to churn.

 

In [136]:
def interpret_odds_ratio_message(odds):
    if odds > 1.0:
        return f"{round((odds - 1) * 100)}% more likely to churn"
    elif odds < 1.0:
        return f"{round((1 - odds) * 100)}% less likely to churn"
    else:
        return "no effect on churn"

In [137]:
np.exp(-0.85)

np.float64(0.4274149319487267)

In [138]:
X_processed = preprocessor.fit_transform(X)
feature_names = preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_cols)
all_feature_names = numerical_cols + list(feature_names)
X_df = pd.DataFrame(X_processed, columns=all_feature_names)

In [139]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

vif_df = pd.DataFrame()
vif_df["feature"] = X_df.columns
vif_df["VIF"] = [variance_inflation_factor(X_df.values, i) for i in range(X_df.shape[1])]


  vif = 1. / (1. - r_squared_i)


In [140]:
vif_df

Unnamed: 0,feature,VIF
0,seniorcitizen,1.149547
1,tenure,7.507553
2,monthlycharges,15.158174
3,totalcharges,10.643472
4,contract_One year,1.995152
5,contract_Two year,3.395735
6,onlinesecurity_No internet service,inf
7,onlinesecurity_Yes,1.863099
8,techsupport_No internet service,inf
9,techsupport_Yes,2.050593


# When VIF = 866 for monthlycharges, what does that mean?
It means that monthlycharges is extremely linearly correlated with one or more of the other features in the model.

But: VIF doesn’t tell you which one directly.
It just says:

“Hey, this feature is mostly predictable from a combination of the others.”

So yes — in this case, it’s very likely that the strongest culprit is:

✅ totalcharges
Why?

totalcharges
≈
monthlycharges
×
tenure
totalcharges≈monthlycharges×tenure
This means:

If you know monthlycharges and tenure, you can almost exactly predict totalcharges

So they’re mathematically tied

This creates severe multicollinearity.



In [99]:
data[['monthlycharges', 'tenure', 'totalcharges']].corr()

Unnamed: 0,monthlycharges,tenure,totalcharges
monthlycharges,1.0,0.246862,0.651065
tenure,0.246862,1.0,0.82588
totalcharges,0.651065,0.82588,1.0
