In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Open the Excel workbook that holds the churn data.
# NOTE: point this path at your own copy of the workbook.
url = '/content/drive/MyDrive/POA FY25/a_IBM Telco Customers Churn Datasets.xlsx'
excel = pd.ExcelFile(url)

# Show which sheets the workbook contains
excel.sheet_names

In [None]:
# Load each worksheet we need into its own DataFrame (the merge happens in a later cell)
df_telecho, df_status, df_customer_info, df_loc = (
    pd.read_excel(excel, sheet_name=sheet)
    for sheet in ('Telco_Churn', 'Status_Analysis', 'Customer_Info', 'Location_Data')
)


In [None]:
# Preview the first two rows of the Status_Analysis sheet
df_status.head(2)

In [None]:
# Preview the first two rows of the Customer_Info sheet
df_customer_info.head(2)

In [None]:
# Preview the first two rows of the Location_Data sheet
df_loc.head(2)

In [None]:
# Give the usage sheet the same key name as the other sheets
df_telecho.rename(columns={"Customer ID": "customer_id"}, inplace=True)

# Customer-level table: location + customer info + churn label
loc_with_info = df_loc.merge(df_customer_info, on='customer_id', how='left')
churn_labels  = df_status[['customer_id', 'churn_value']]
# NOTE(review): this merge relies on pandas defaults (inner join on the shared column names)
cust_info = loc_with_info.merge(churn_labels)

# Attach the customer-level columns to every row of the usage sheet
df = df_telecho.merge(cust_info, on='customer_id', how='left')

In [None]:
# List the columns of the merged frame
df.columns

In [None]:
# Remove the columns that will not be used as model features.
# A single `drop` with errors='ignore' keeps this cell safe to re-run; the
# previous `del df[...]` statements raised KeyError on a second execution.
unused_cols = ['Count', 'Quarter', 'customer_id', 'zip_code', 'latitude', 'longitude', 'city']
df = df.drop(columns=unused_cols, errors='ignore')

#### Data Preprocessing

In [None]:
# Count the missing values in every column
df.isna().sum()

Both Internet Type and Offer are missing at random. Instead of deleting these records — which could introduce bias into our data distribution — we will handle the missing values by imputing them with a constant value. This approach preserves the integrity of our dataset while allowing us to account for these features consistently.

In [None]:
# Replace missing values with an explicit placeholder category
placeholders = {'Internet Type': "no internet", 'Offer': "no offer"}
for column, placeholder in placeholders.items():
    df[column] = df[column].fillna(placeholder)

In [None]:
# Partition the column labels by dtype
numeric_kinds     = ['float', 'int']
categorical_kinds = ['object', 'bool', 'string']

numerical_cols   = df.select_dtypes(include=numeric_kinds).columns
categorical_cols = df.select_dtypes(include=categorical_kinds).columns


#### Data Encoding

In [None]:
# Cardinality check: number of distinct values in each categorical column
for col, n_levels in df[categorical_cols].nunique().items():
    print(f'{col} has {n_levels} No of Elements')

**Many columns have a manageable number of distinct values**

**Now we can encode the categorical columns. We will use the simple one-hot encoder from pandas (`get_dummies`); however, always remember to check for ordinal categorical data first — if you have any, use a label (ordinal) encoder instead.**
**NB: Some ensemble methods, for example boosting models, are able to handle categorical data directly.**

In [None]:
# One-hot encode the categorical columns with pandas and glue them back onto the
# numeric columns. (If any column were ordinal, an ordinal/label encoding would
# be the better choice for it.)
numeric_part = df[numerical_cols]
encodedf     = pd.get_dummies(df[categorical_cols], dtype=float)

df = pd.concat([numeric_part, encodedf], axis=1)
df.sample(2)

**At this point we could consider feature scaling; however, tree-based models and their ensembles are not sensitive to feature scale or outliers, so we will skip it for now.**
**We will revisit this later.**

#### Prepare data for modeling

In [None]:
# Class balance of the target, expressed in percent
churn_share = df['churn_value'].value_counts(normalize=True)
churn_share * 100

**Not bad! About 26% of the data is labelled as churn. The cost of introducing imbalance-handling techniques may outweigh the benefits, so we will avoid them here.**

In [None]:
# Separate the target (y) from the feature matrix (X)
target_col = 'churn_value'

y = df[target_col]
X = df.drop(columns=[target_col])

In [None]:
# Hold-out split: keep 20% of the rows aside as a test set.
# This is the simplest evaluation scheme (a single hold-out); train/validation/test
# splits or cross-validation are alternatives.

from sklearn.model_selection import train_test_split

# stratify=y keeps the churn / no-churn proportions the same in the train and test
# partitions, which matters because the classes are imbalanced.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=111, stratify=y
)

**Now our data is ready for demonstrating bias-variance trade-off concepts using decision trees and ensembles**

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score


# Accuracy of a single decision tree at increasing depth, measured on the training
# and test splits.
# NOTE: despite their names these lists hold accuracies, not error rates; the names
# are kept because the plotting cell reads them.

train_erros = []
test_errors = []

max_depths = range(1, 12)

for depth in max_depths:
    # Fixed seed so the curve is reproducible from run to run
    model = DecisionTreeClassifier(max_depth=depth, random_state=42)
    model.fit(X_train, y_train)

    # Record train / test accuracy for this depth
    train_erros.append(accuracy_score(y_train, model.predict(X_train)))
    test_errors.append(accuracy_score(y_test, model.predict(X_test)))


In [None]:
# Training vs. test accuracy as the tree gets deeper
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(max_depths, train_erros, marker='o', label='Training accuracy')
ax.plot(max_depths, test_errors, marker='*', label='Test accuracy')
ax.set_xlabel('Tree depth')
ax.set_ylabel('Accuracy')
ax.legend()
plt.show()

**One of the challenges of the above approach is that it relies on a single hold-out split**
**Remember:**

1. It does not give us any insight into model stability across different splits
2. Bias in random splits
3. Unreliable on small datasets

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier

# Same depth sweep as before, but scored with 5-fold cross-validation.
max_depths           = range(1, 12)
train_accuracies     = []
cross_val_accuracies = []

for depth in max_depths:
    model = DecisionTreeClassifier(max_depth=depth, random_state=42)

    # The full data set is passed here because K-fold CV does its own splitting
    cv_scores = cross_val_score(model, X, y, cv=5, scoring='accuracy')
    cross_val_accuracies.append(cv_scores.mean())

    # Training accuracy: fit on everything, score on the same rows
    model.fit(X, y)
    train_accuracies.append((model.predict(X) == y).mean())

In [None]:
# Training vs. cross-validated accuracy as the tree gets deeper
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(max_depths, train_accuracies, marker="o", label="Training Accuracy")
ax.plot(max_depths, cross_val_accuracies, marker="o", label="Cross-Validation Accuracy")
ax.set_title("Bias-Variance Tradeoff in Decision Trees (with Cross-Validation)")
ax.set_xlabel("Tree Depth")
ax.set_ylabel("Accuracy")
ax.legend()
ax.grid()
plt.show()

Now that we have coded a validation curve from scratch, let's use a ready-made one (yellowbrick's `ValidationCurve`, which is built on scikit-learn).

**Lets now use a validation curve from yellowbrick**

Ensure you have installed the yellowbrick package, e.g. with `pip install yellowbrick`

In [None]:
# Validation curve for `max_depth`: where does the tree start to under-/over-fit?

from yellowbrick.model_selection import ValidationCurve

depth_grid = np.arange(1, 11)

viz = ValidationCurve(
    DecisionTreeClassifier(),
    param_name="max_depth",
    param_range=depth_grid,
    scoring='accuracy',
    cv=10,
)

viz.fit(X, y)
viz.show()

In [None]:

# Validation curve for `min_samples_split`: where does the tree start to under-/over-fit?

split_grid = np.arange(2, 100)

viz = ValidationCurve(
    DecisionTreeClassifier(),
    param_name="min_samples_split",
    param_range=split_grid,
    scoring='accuracy',
    cv=10,
    n_jobs=10,
)

viz.fit(X, y)
viz.show()

Now let's train our decision tree with a max_depth of 5

In [None]:
from sklearn.metrics import accuracy_score

def check_overfitting(model, X_train, X_test, y_train, y_test):
    """Score an already-fitted model on the training and the test split.

    Parameters
    ----------
    model : object exposing ``predict(X)``
    X_train, X_test : feature sets passed to ``model.predict``
    y_train, y_test : true labels compared against the predictions

    Returns
    -------
    dict
        ``{"Train Score": <train accuracy>, "Test Score": <test accuracy>}``.
        A large gap between the two suggests overfitting.
    """
    return {
        "Train Score": accuracy_score(y_train, model.predict(X_train)),
        "Test Score": accuracy_score(y_test, model.predict(X_test)),
    }


In [None]:
# Single decision tree with the regularisation settings chosen above.
# random_state is fixed so the fitted tree (and every plot or score derived
# from it) is reproducible across runs.
dt_model = DecisionTreeClassifier(
        max_depth= 5,
        criterion= 'entropy',
        min_samples_split= 100,
        random_state= 42
    )

dt_model.fit(X_train,y_train)
dt_model


In [None]:
# Compare train vs. test accuracy of the single decision tree
check_overfitting(dt_model, X_train, X_test, y_train, y_test)

In [None]:
from sklearn.tree import export_graphviz
from sklearn import tree


# Draw the fitted tree.
fig, axes = plt.subplots(figsize = (4,4), dpi=1200)
tree.plot_tree(dt_model,
            #    max_depth=1,   # uncomment to draw only the top of the tree
               # A plain list is used because some scikit-learn releases reject a pandas Index here
               feature_names= list(X.columns),
               # class_names are matched to the classes in ascending order.
               # Assumes churn_value is coded 0 = stayed, 1 = churned — TODO confirm.
               class_names= ["No churn","Churn"],
               filled=True);

In [None]:
# Let's see which features carry the most predictive power for the decision tree.
# `topn=10` asks the visualizer for ten features.

from yellowbrick.model_selection import FeatureImportances

viz = FeatureImportances(dt_model, topn=10)
viz.fit(X, y)
viz.show()

### Ensemble Methods

### 1. Bagging - Bootstrap Aggregating, is an ensemble method that builds multiple versions of a model (usually decision trees) and combines their predictions to reduce variance and improve accuracy.

In [None]:
from sklearn.ensemble import BaggingClassifier

# Bag 1000 depth-5 trees, each trained on a bootstrap sample of the training set
base_tree = DecisionTreeClassifier(max_depth=5)

bagging_clf = BaggingClassifier(
    estimator=base_tree,
    n_estimators=1000,
    bootstrap=True,   # this is also the default
    n_jobs=-1,
    random_state=11,
)
bagging_clf.fit(X_train, y_train)

check_overfitting(bagging_clf, X_train, X_test, y_train, y_test)


Random Forest is a good example of a bagging method, so let's use it too

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest: 100 entropy-based trees, each limited to depth 5
rf_settings = dict(
    n_estimators=100,
    max_depth=5,
    criterion='entropy',
    random_state=112,
)
rf_model = RandomForestClassifier(**rf_settings)

rf_model.fit(X_train, y_train)
check_overfitting(rf_model, X_train, X_test, y_train, y_test)


In [None]:
# Feature importances of the random forest (`topn=20` asks for twenty features)
viz = FeatureImportances(rf_model,topn=20)
viz.fit(X, y)
viz.show()

2. Voting Classifier - combines predictions from multiple models to improve overall performance. There are two types of voting:

 -  Hard Voting: Majority voting, where the final prediction is the class most predicted by the base models.
 -  Soft Voting: Probability averaging, where the final prediction is based on the average predicted probabilities

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler


# Logistic regression and the SVM are sensitive to feature scale (unlike the tree),
# so each is wrapped in a pipeline that standardises the features first. The scaler
# is fitted only on the data passed to `fit`, so nothing leaks from the test split.
clf1 = make_pipeline(StandardScaler(), LogisticRegression(random_state=42,penalty='l2'))
clf2 = DecisionTreeClassifier(random_state=42, max_depth=5)
clf3 = make_pipeline(StandardScaler(), SVC(probability=True, random_state=42))

# Create VotingClassifier with soft voting (average of the predicted probabilities)

voting_clf = VotingClassifier(
                        estimators=[('lr', clf1), ('dt', clf2), ('svc', clf3)],
                        voting='soft'
                    )

voting_clf.fit(X_train,y_train)

check_overfitting(voting_clf, X_train, X_test, y_train, y_test)

In [None]:
from sklearn.ensemble import StackingClassifier


# Base models (level-0): their cross-validated predictions feed the meta-model
base_models = [
    ('dt', DecisionTreeClassifier(random_state=42,max_depth=5)),
    ('svc', SVC(probability=True, random_state=42)),
    ('rf', RandomForestClassifier(random_state=42,max_depth=2))
]

# Define meta-model (level-1)
meta_model = LogisticRegression(penalty='l2')

# Create StackingClassifier
stacking_clf = StackingClassifier(
                estimators      = base_models,
                final_estimator = meta_model,
                cv=5
            )

# Train the StackingClassifier on the training split only. Fitting on the full
# X, y would let the model see the test rows and inflate the test score below.
stacking_clf.fit(X_train, y_train)

check_overfitting(stacking_clf, X_train, X_test, y_train, y_test)


3.Boosting Models

Models are trained sequentially, and each new model focuses on correcting the errors of the previous ones. This process combines weak learners (usually shallow decision trees) to form a strong predictive model.

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient boosting settings
gb_settings = dict(
    n_estimators=100,    # number of boosting stages
    learning_rate=0.1,   # step size shrinkage
    max_depth=3,         # maximum depth of the trees
    random_state=42,
)
boosting_clf = GradientBoostingClassifier(**gb_settings)

# Fit on the training split, then compare train vs. test accuracy
boosting_clf.fit(X_train, y_train)
check_overfitting(boosting_clf, X_train, X_test, y_train, y_test)

**Let's try a neural network to see whether it can match our boosting ensemble**

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

In [None]:
# Build the ANN model: a small fully connected network for binary classification
n_features = X_train.shape[1]

ann_model = Sequential()
ann_model.add(Dense(64, activation='relu', input_shape=(n_features,)))  # first dense layer; input_shape sets the feature count
ann_model.add(Dropout(0.2))                                             # dropout for regularization
ann_model.add(Dense(128, activation='relu'))                            # hidden layer
ann_model.add(Dense(256, activation='relu'))                            # hidden layer
ann_model.add(Dropout(0.2))
ann_model.add(Dense(1, activation='sigmoid'))                           # output layer for binary classification

# Compile the model
ann_model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Stop once val_loss has not improved for 5 epochs and restore the best weights
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)




In [None]:
# Train the model.
# Early stopping picks the epoch with the best validation loss, so the validation
# data must NOT be the test split; otherwise the test accuracy reported afterwards
# is optimistically biased. A validation set is carved out of the training data.
from sklearn.model_selection import train_test_split

X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=111, stratify=y_train
)

history = ann_model.fit(X_fit,
                    y_fit,
                    validation_data=(X_val, y_val),
                    epochs=100,
                    batch_size=32,
                    callbacks = [early_stopping],
                    verbose=1
                    )

In [None]:
# Evaluate the trained network on both splits
train_loss, train_acc = ann_model.evaluate(X_train, y_train, verbose=0)
test_loss,  test_acc  = ann_model.evaluate(X_test, y_test, verbose=0)

for split_name, split_acc in (("Train", train_acc), ("Test", test_acc)):
    print(f"{split_name} Accuracy: {split_acc:.2f}")

In [None]:
from sklearn.model_selection import GridSearchCV


# Exhaustive search over 3 x 3 x 3 = 27 gradient-boosting settings
param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7],
}

grid_search = GridSearchCV(estimator=boosting_clf,
                           param_grid=param_grid,
                           cv=5,
                           scoring='accuracy',
                           n_jobs=-1)

# Tune on the training split only, so the test split stays unseen and can give
# an honest estimate of the tuned model's performance.
grid_search.fit(X_train, y_train)

# Retrieve the best parameters and the refitted best model
best_params   = grid_search.best_params_
best_model    = grid_search.best_estimator_

# Score on held-out data: scoring the model on the rows it was fitted on would
# only report its training accuracy.
accuracy_best = accuracy_score(y_test, best_model.predict(X_test))

print("Best Parameters:", best_params)
print(f"Best CV Accuracy: {grid_search.best_score_:.4f}")
print(f"Best Model Test Accuracy: {accuracy_best:.4f}")

In [None]:
from sklearn.model_selection import RandomizedSearchCV


# Same search space as the grid search, but only `n_iter` random candidates are tried
param_dist = {
    'n_estimators'  : [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth'    : [3, 5, 7],
}

# Initialize RandomizedSearchCV (random_state fixes which candidates are sampled,
# so the search is reproducible)
random_search = RandomizedSearchCV(estimator=boosting_clf,
                                   param_distributions=param_dist,
                                   n_iter=10,
                                   cv=5,
                                   scoring='accuracy',
                                   n_jobs=-1,
                                   random_state=42)


# Tune on the training split only, so the test split stays unseen
random_search.fit(X_train, y_train)

# Retrieve the best parameters and the refitted best model
best_params_random   = random_search.best_params_
best_model_random    = random_search.best_estimator_

# Score on held-out data rather than on the rows the model was fitted on
accuracy_best_random = accuracy_score(y_test, best_model_random.predict(X_test))

print("Best Parameters (Random Search):", best_params_random)
print(f"Best CV Accuracy (Random Search): {random_search.best_score_:.4f}")
print(f"Best Model Test Accuracy (Random Search): {accuracy_best_random:.4f}")