In [49]:
'''
MACHINE LEARNING PROBLEM:

This is a binary classification problem because we are predicting whether a customer will leave (churn) or stay.

POTENTIAL MODELS:

1. Logistic Regression is a simple, interpretable linear model commonly used for binary classification.
Its coefficients would show which features are likely to increase churn likelihood.

2. Random Forest Classifier is an ensemble tree-based model suitable for mixed data types and non-linear relationships,
and it is robust to outliers. It also shows which features are most important for predicting churn.

3. XGBoost is a gradient boosting ensemble. It is also commonly used for churn prediction. It captures complex patterns
and feature interactions and can handle imbalanced data.
'''

'\nMACHINE LEARNING PROBLEM:\n\nThis is a regression because we are predicting whether a customer will leave or stay.\n\nPOTENTIAL MODELS:\n\n1. Logistic Regression is a linear model that is interpretable and simple commonly used for binary classification. \nIt would show the features that are likely to increase churn likelihood.\n\n2. Random Forest Classifier is an ensemble tree-based model. suitable for mixed data types, non-linear relationships \nand robust outliers. It also show the features that are likely to increase churn likelihood.\n\n3. XGBoost is a gradient boosting ensemble. It slo is commonly used for churn prediction. It captures complex patterns \nand interaction and handles imbalanced data.\n'

In [50]:
!pip install pandas




In [51]:
pip install scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [52]:
## Import libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, accuracy_score, classification_report

In [53]:
## Load Data and view
# Read the churn dataset from the working directory and take a first look at it.
df = pd.read_csv("Telco-Customer-Churn.csv")
print(df.head())
print(df.shape)
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in print() would emit a stray "None" line.
df.info()


   customerID  gender  SeniorCitizen Partner Dependents  tenure PhoneService  \
0  7590-VHVEG  Female              0     Yes         No       1           No   
1  5575-GNVDE    Male              0      No         No      34          Yes   
2  3668-QPYBK    Male              0      No         No       2          Yes   
3  7795-CFOCW    Male              0      No         No      45           No   
4  9237-HQITU  Female              0      No         No       2          Yes   

      MultipleLines InternetService OnlineSecurity  ... DeviceProtection  \
0  No phone service             DSL             No  ...               No   
1                No             DSL            Yes  ...              Yes   
2                No             DSL            Yes  ...               No   
3  No phone service             DSL            Yes  ...              Yes   
4                No     Fiber optic             No  ...               No   

  TechSupport StreamingTV StreamingMovies        Contract Pape

In [54]:
## Clean the column names
# Normalise the headers to lowercase identifiers without spaces or parentheses.
# regex=False makes every replacement a literal substring match: "(" and ")" are
# regex metacharacters, and the default value of `regex` differs between pandas versions.
df.columns = (
    df.columns.str.strip()                          # remove leading/trailing spaces
              .str.lower()                          # convert to lowercase
              .str.replace(' ', '_', regex=False)   # replace spaces with underscores
              .str.replace('(', '', regex=False)    # remove opening parentheses
              .str.replace(')', '', regex=False)    # remove closing parentheses
)

## View the cleaned column names
print("\nAfter cleaning:\n", df.columns.tolist())


After cleaning:
 ['customerid', 'gender', 'seniorcitizen', 'partner', 'dependents', 'tenure', 'phoneservice', 'multiplelines', 'internetservice', 'onlinesecurity', 'onlinebackup', 'deviceprotection', 'techsupport', 'streamingtv', 'streamingmovies', 'contract', 'paperlessbilling', 'paymentmethod', 'monthlycharges', 'totalcharges', 'churn']


In [55]:
## Drop non-informative data
# customerid is an identifier string, not a feature. The result is rebound to df
# instead of using inplace=True, and errors="ignore" keeps the cell safe to re-run
# once the column is already gone.
df = df.drop(columns="customerid", errors="ignore")

In [56]:
## Convert totalcharges from object to numeric
# errors='coerce' turns entries that cannot be parsed as numbers into NaN.
df['totalcharges'] = pd.to_numeric(df['totalcharges'], errors='coerce')

# Impute the resulting NaNs with the column median. The filled Series is assigned back
# to the column: calling fillna(inplace=True) on a selected column is chained
# assignment, which raises a FutureWarning in pandas 2.x and leaves df unchanged
# under Copy-on-Write.
df['totalcharges'] = df['totalcharges'].fillna(df['totalcharges'].median())

In [57]:
## Encode dependent variable
df['churn'] = df['churn'].map({'Yes': 1, 'No': 0})
# Series.map yields NaN for every value missing from the mapping. That also happens
# when this cell is executed a second time (the labels are then 1/0, not 'Yes'/'No'),
# so fail loudly here instead of passing NaN labels on to the later cells.
assert df['churn'].notna().all(), "churn contains values other than 'Yes'/'No' (was this cell re-run?)"

## Encode categorical variables
# One-hot encode every remaining object-dtype column; drop_first=True omits the first
# level of each feature so the dummy columns are not perfectly collinear.
cat_cols = df.select_dtypes('object').columns
df = pd.get_dummies(df, columns=cat_cols, drop_first=True)

In [58]:
## Show the column names of df at this point in the notebook
print(df.columns)


Index(['seniorcitizen', 'tenure', 'monthlycharges', 'totalcharges', 'churn',
       'gender_Male', 'partner_Yes', 'dependents_Yes', 'phoneservice_Yes',
       'multiplelines_No phone service', 'multiplelines_Yes',
       'internetservice_Fiber optic', 'internetservice_No',
       'onlinesecurity_No internet service', 'onlinesecurity_Yes',
       'onlinebackup_No internet service', 'onlinebackup_Yes',
       'deviceprotection_No internet service', 'deviceprotection_Yes',
       'techsupport_No internet service', 'techsupport_Yes',
       'streamingtv_No internet service', 'streamingtv_Yes',
       'streamingmovies_No internet service', 'streamingmovies_Yes',
       'contract_One year', 'contract_Two year', 'paperlessbilling_Yes',
       'paymentmethod_Credit card (automatic)',
       'paymentmethod_Electronic check', 'paymentmethod_Mailed check'],
      dtype='object')


In [60]:
## Split data
# Separate the target column from the feature matrix.
y = df["churn"]
X = df.drop(columns="churn")

# Hold out 20% of the rows for testing. Stratifying on y keeps the churn ratio the
# same in both partitions, and the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [61]:
## Standardise the feature matrix
# The scaler is fitted on the training split only and then applied to both splits,
# so no test-set statistics enter the transform. Every column of X is scaled.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [63]:
## Logistic Regression
# Train on the standardised training features; the solver may run for up to 1000 iterations.
log_reg = LogisticRegression(max_iter=1000).fit(X_train_scaled, y_train)
y_pred_lr = log_reg.predict(X_test_scaled)

## Evaluate Logistic Regression
# F1 score of the predictions on the held-out test split.
f1_lr = f1_score(y_true=y_test, y_pred=y_pred_lr)
print("Logistic Regression F1 Score:", round(f1_lr, 4))

Logistic Regression F1 Score: 0.6092


In [64]:
## Random Forest Classifier
# 200 trees with a fixed seed for reproducibility, fitted on the unscaled training split.
rf = RandomForestClassifier(n_estimators=200, random_state=42).fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

## Evaluate Random Forest
# F1 score of the predictions on the held-out test split.
f1_rf = f1_score(y_true=y_test, y_pred=y_pred_rf)
print("Random Forest F1 Score:", round(f1_rf, 4))

## Compare results
# Both models are scored on the same test split, so the two F1 values are directly comparable.
print("\n=== MODEL COMPARISON ===")
print(f"Logistic Regression F1: {f1_lr:.4f}")
print(f"Random Forest F1:      {f1_rf:.4f}")

Random Forest F1 Score: 0.5556

=== MODEL COMPARISON ===
Logistic Regression F1: 0.6092
Random Forest F1:      0.5556


In [None]:
'''
Logistic regression (F1 = 0.6092) performs better than the Random Forest (F1 = 0.5556).
This makes logistic regression a reasonable baseline model. Both scores show that there is
still room for improvement, for example with hyperparameter tuning.
'''