In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import ExtraTreesClassifier

In [2]:
df = pd.read_csv("./Downloads/WA_Fn-UseC_-Telco-Customer-Churn.csv")

In [3]:
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [4]:
df.tail()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
7038,6840-RESVB,Male,0,Yes,Yes,24,Yes,Yes,DSL,Yes,...,Yes,Yes,Yes,Yes,One year,Yes,Mailed check,84.8,1990.5,No
7039,2234-XADUH,Female,0,Yes,Yes,72,Yes,Yes,Fiber optic,No,...,Yes,No,Yes,Yes,One year,Yes,Credit card (automatic),103.2,7362.9,No
7040,4801-JZAZL,Female,0,Yes,Yes,11,No,No phone service,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.6,346.45,No
7041,8361-LTMKD,Male,1,Yes,No,4,Yes,Yes,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Mailed check,74.4,306.6,Yes
7042,3186-AJIEK,Male,0,No,No,66,Yes,No,Fiber optic,Yes,...,Yes,Yes,Yes,Yes,Two year,Yes,Bank transfer (automatic),105.65,6844.5,No


In [5]:
df.describe()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges
count,7043.0,7043.0,7043.0
mean,0.162147,32.371149,64.761692
std,0.368612,24.559481,30.090047
min,0.0,0.0,18.25
25%,0.0,9.0,35.5
50%,0.0,29.0,70.35
75%,0.0,55.0,89.85
max,1.0,72.0,118.75


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [7]:
df.shape

(7043, 21)

# Todo

## Data Pre-Processing

1. Perform initial data preparation by converting the 'TotalCharges' column to numeric values and filling missing values with 0.

In [8]:
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
df['TotalCharges'] = df['TotalCharges'].fillna(0)
df

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.50,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.30,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.70,151.65,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,6840-RESVB,Male,0,Yes,Yes,24,Yes,Yes,DSL,Yes,...,Yes,Yes,Yes,Yes,One year,Yes,Mailed check,84.80,1990.50,No
7039,2234-XADUH,Female,0,Yes,Yes,72,Yes,Yes,Fiber optic,No,...,Yes,No,Yes,Yes,One year,Yes,Credit card (automatic),103.20,7362.90,No
7040,4801-JZAZL,Female,0,Yes,Yes,11,No,No phone service,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.60,346.45,No
7041,8361-LTMKD,Male,1,Yes,No,4,Yes,Yes,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Mailed check,74.40,306.60,Yes


In [9]:
print(df['TotalCharges'].isnull().sum())

0


2. Convert the 'Churn' column to binary values, where 'No' is mapped to 0 and 'Yes' is mapped to 1.

In [10]:
df['Churn'] = df['Churn'].replace({'No': 0, 'Yes': 1})
df

  df['Churn'] = df['Churn'].replace({'No': 0, 'Yes': 1})


Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,0
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.50,0
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,1
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.30,1840.75,0
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.70,151.65,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,6840-RESVB,Male,0,Yes,Yes,24,Yes,Yes,DSL,Yes,...,Yes,Yes,Yes,Yes,One year,Yes,Mailed check,84.80,1990.50,0
7039,2234-XADUH,Female,0,Yes,Yes,72,Yes,Yes,Fiber optic,No,...,Yes,No,Yes,Yes,One year,Yes,Credit card (automatic),103.20,7362.90,0
7040,4801-JZAZL,Female,0,Yes,Yes,11,No,No phone service,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.60,346.45,0
7041,8361-LTMKD,Male,1,Yes,No,4,Yes,Yes,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Mailed check,74.40,306.60,1


In [11]:
df['Churn'].unique()

array([0, 1], dtype=int64)

3. Split the data into an 80-20 train-test split with a random state of “1”

In [12]:
X = df.drop('Churn', axis=1)
y = df['Churn']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

4. Select these features:  

    categorical = ['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService','OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies','Contract', 'PaperlessBilling', 'PaymentMethod']

    numerical = ['tenure', 'MonthlyCharges', 'TotalCharges']

In [13]:
categorical = ['gender', 'SeniorCitizen', 'Partner', 'Dependents',
               'PhoneService', 'MultipleLines', 'InternetService',
               'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
               'TechSupport', 'StreamingTV', 'StreamingMovies','Contract',
               'PaperlessBilling', 'PaymentMethod']

numerical = ['tenure', 'MonthlyCharges', 'TotalCharges']

## Feature engineering

1. The numerical features should be scaled using StandardScaler, convert the output back to a dataframe and put back the column names.

In [14]:
# Scale numerical features using StandardScaler
scaler = StandardScaler()

X_train_numerical_scaled = pd.DataFrame(scaler.fit_transform(X_train[numerical]), columns=numerical)
X_test_numerical_scaled = pd.DataFrame(scaler.transform(X_test[numerical]), columns=numerical)

X_train_numerical_scaled_df = pd.DataFrame(X_train_numerical_scaled, columns=numerical)
X_test_numerical_scaled_df = pd.DataFrame(X_test_numerical_scaled, columns=numerical)

print("Scaled Training Data:")
print(X_train_numerical_scaled_df.head())
print("\nScaled Test Data:")
print(X_test_numerical_scaled_df.head())

Scaled Training Data:
     tenure  MonthlyCharges  TotalCharges
0 -0.825884       -1.497530     -0.890947
1  0.395961        0.302996      0.389693
2  1.577078        0.012320      1.060945
3  1.577078        0.686687      1.775397
4 -0.092777        0.186726     -0.102671

Scaled Test Data:
     tenure  MonthlyCharges  TotalCharges
0  0.355233        0.500655      0.460383
1  1.373437        1.249767      1.850854
2 -0.825884       -0.657063     -0.773570
3 -1.110981       -0.471031     -0.894653
4 -0.907340        0.037235     -0.713691


In [15]:
encoder = OneHotEncoder(sparse_output=False, drop='first')

# Fit the encoder on the training data and transform both train and test data
X_train_categorical_encoded = encoder.fit_transform(X_train[categorical])
X_test_categorical_encoded = encoder.transform(X_test[categorical])

encoder_columns = encoder.get_feature_names_out(categorical)

X_train_categorical_encoded_df = pd.DataFrame(X_train_categorical_encoded, columns = encoder_columns)
X_test_categorical_encoded_df = pd.DataFrame(X_test_categorical_encoded, columns = encoder_columns)

print("Processed Training Data:")
print(X_train_categorical_encoded_df.head())
print("\nProcessed Test Data:")
print(X_test_categorical_encoded_df.head())

Processed Training Data:
   gender_Male  SeniorCitizen_1  Partner_Yes  Dependents_Yes  \
0          1.0              0.0          1.0             1.0   
1          0.0              0.0          0.0             0.0   
2          1.0              0.0          1.0             0.0   
3          1.0              0.0          1.0             1.0   
4          1.0              0.0          0.0             0.0   

   PhoneService_Yes  MultipleLines_No phone service  MultipleLines_Yes  \
0               1.0                             0.0                0.0   
1               1.0                             0.0                0.0   
2               1.0                             0.0                1.0   
3               1.0                             0.0                1.0   
4               1.0                             0.0                0.0   

   InternetService_Fiber optic  InternetService_No  \
0                          0.0                 1.0   
1                          0.0       

3. Combine scaled numerical and one-hot encoded categorical features into train and test set dataframes (use pd.concat)

In [16]:
# Combining scaled numerical and one-hot encoded categorical features
X_train_processed = pd.concat([X_train_numerical_scaled_df, X_train_categorical_encoded_df], axis=1)
X_test_processed = pd.concat([X_test_numerical_scaled_df, X_test_categorical_encoded_df], axis=1)

print("Processed Training Data:")
print(X_train_processed.head())
print("\nProcessed Test Data:")
print(X_test_processed.head())

Processed Training Data:
     tenure  MonthlyCharges  TotalCharges  gender_Male  SeniorCitizen_1  \
0 -0.825884       -1.497530     -0.890947          1.0              0.0   
1  0.395961        0.302996      0.389693          0.0              0.0   
2  1.577078        0.012320      1.060945          1.0              0.0   
3  1.577078        0.686687      1.775397          1.0              0.0   
4 -0.092777        0.186726     -0.102671          1.0              0.0   

   Partner_Yes  Dependents_Yes  PhoneService_Yes  \
0          1.0             1.0               1.0   
1          0.0             0.0               1.0   
2          1.0             0.0               1.0   
3          1.0             1.0               1.0   
4          0.0             0.0               1.0   

   MultipleLines_No phone service  MultipleLines_Yes  ...  \
0                             0.0                0.0  ...   
1                             0.0                0.0  ...   
2                           

4. Use scikit learn to train a random forest and extra trees classifier, and use xgboost and lightgbm to train an extreme boosting model and a light gradient boosting model. Use random_state = 1 for training all models and evaluate on the test set. Answer from question 14

In [17]:
# Initialize models
models = {
    'RandomForest': RandomForestClassifier(random_state=1),
    'ExtraTrees': ExtraTreesClassifier(random_state=1),
    'XGBoost': XGBClassifier(random_state=1, use_label_encoder=False, eval_metric='logloss'),
    'LightGBM': LGBMClassifier(random_state=1)
}

# Train and evaluate models
for name, model in models.items():
    model.fit(X_train_processed, y_train)
    y_pred = model.predict(X_test_processed)
    print(f"{name} Classification Report:")
    print(classification_report(y_test, y_pred))
    print("-" * 80)

RandomForest Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.88      0.87      1061
           1       0.61      0.55      0.58       348

    accuracy                           0.80      1409
   macro avg       0.73      0.72      0.72      1409
weighted avg       0.80      0.80      0.80      1409

--------------------------------------------------------------------------------
ExtraTrees Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.87      0.86      1061
           1       0.57      0.51      0.54       348

    accuracy                           0.78      1409
   macro avg       0.71      0.69      0.70      1409
weighted avg       0.78      0.78      0.78      1409

--------------------------------------------------------------------------------
XGBoost Classification Report:
              precision    recall  f1-score   support

           0       0.86     

### Q17
To improve the Extra Trees Classifier, you will use the following parameters (number of estimators, minimum number of samples, minimum number of samples for leaf node and the number of features to consider when looking for the best split) for the hyperparameter grid needed to run a Randomized Cross Validation Search (RandomizedSearchCV). 

n_estimators = [50, 100, 300, 500, 1000]

min_samples_split = [2, 3, 5, 7, 9]

min_samples_leaf = [1, 2, 4, 6, 8]

max_features = ['auto', 'sqrt', 'log2', None] 

hyperparameter_grid = {'n_estimators': n_estimators,

                       'min_samples_leaf': min_samples_leaf,

                       'min_samples_split': min_samples_split,

                       'max_features': max_features}

Using the ExtraTreesClassifier as your estimator with cv=5, n_iter=10, scoring = 'accuracy', n_jobs = -1, verbose = 1 and random_state = 1. What are the best hyperparameters from the randomized search CV?

Options
N_estimators = 1000 , min_samples_split = 9 , min_samples_leaf = 8, max_features = None

N_estimators = 1000 , min_samples_split = 2 , min_samples_leaf = 8, max_features = None

N_estimators = 500 , min_samples_split = 2 , min_samples_leaf = 8, max_features = ‘log2‘

N_estimators = 300 , min_samples_split = 5 , min_samples_leaf = 6, max_features = ‘auto’

In [18]:
# Define hyperparameter grid
n_estimators = [50, 100, 300, 500, 1000]
min_samples_split = [2, 3, 5, 7, 9]
min_samples_leaf = [1, 2, 4, 6, 8]
max_features = ['auto', 'sqrt', 'log2', None]

hyperparameter_grid = {
    'n_estimators': n_estimators,
    'min_samples_leaf': min_samples_leaf,
    'min_samples_split': min_samples_split,
    'max_features': max_features
}

# Initialize the model
et_classifier = ExtraTreesClassifier(random_state=1)

# Perform Randomized Search with Cross-Validation
random_search = RandomizedSearchCV(estimator=et_classifier,
                                   param_distributions=hyperparameter_grid,
                                   n_iter=10,
                                   scoring='accuracy',
                                   cv=5,
                                   n_jobs=-1,
                                   verbose=1,
                                   random_state=1)

# Fit the random search model
random_search.fit(X_train_processed, y_train)

# Get the best parameters from the random search
best_params = random_search.best_params_
print("Best parameters found: ", best_params)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


20 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
20 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\G-One\anaconda3\envs\TfNew\Lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\G-One\anaconda3\envs\TfNew\Lib\site-packages\sklearn\base.py", line 1467, in wrapper
    estimator._validate_params()
  File "C:\Users\G-One\anaconda3\envs\TfNew\Lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "C:\Users\G-One\anaconda3\envs\TfNew\Lib\site-packages\sklearn\utils\_param_validation.py", line 95, in validate_para

Best parameters found:  {'n_estimators': 1000, 'min_samples_split': 2, 'min_samples_leaf': 8, 'max_features': None}


### Q18
Train a new ExtraTreesClassifier Model with the new Hyperparameters from the RandomizedSearchCV (with random_state = 1). Is the accuracy of the new optimal model higher or lower than the initial ExtraTreesClassifier model with no hyperparameter tuning?

In [19]:
optimal_et_classifier = ExtraTreesClassifier(random_state=1, **best_params)
optimal_et_classifier.fit(X_train_processed, y_train)
accuracy_optimal = optimal_et_classifier.score(X_test_processed, y_test)

print("Accuracy of the new optimal ExtraTreesClassifier model:", accuracy_optimal)

Accuracy of the new optimal ExtraTreesClassifier model: 0.8048261178140526


### Q20
Find the feature importance using the optimal ExtraTreesClassifier model. Which features are the two most important respectively?

In [21]:
feature_importance = optimal_et_classifier.feature_importances_
feature_importance_df = pd.DataFrame({'Feature': X_train_processed.columns, 'Importance': feature_importance})
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

print("Top two most important features:", feature_importance_df.head(2))

Top two most important features:                         Feature  Importance
0                        tenure    0.246930
10  InternetService_Fiber optic    0.219039
