In [2]:
import pandas as pd
import numpy as np
pd.set_option("display.max_columns", 300)

# visualization
import matplotlib.pyplot as plt
import seaborn as sns

# utility
from pprint import pprint
import time


from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score, roc_auc_score

# Data

In [4]:
# Location of the Telco customer-churn CSV.
# NOTE(review): absolute, machine-specific path - consider a path relative to the repository.
DATA_PATH = r"D:\Git-GitHub\Repositories\data-science-track\data-science-track\03-machine-learning\resources\Telco-Customer-Churn.csv"

data = pd.read_csv(DATA_PATH)
print(data.shape)

data.head()

(7043, 21)


Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


# Check for missing values and duplicates

In [5]:
data.isna().sum()

customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

In [6]:
print(data.drop_duplicates().shape)

(7043, 21)


There are no duplicate rows and no missing (NaN) values.

# Train Test Split

In [None]:
# Stratified hold-out split: 12% of the rows go to test, with the churn ratio preserved.
train, test = train_test_split(
    data,
    test_size=0.12,
    stratify=data['Churn'],
    random_state=19,
)

for label, frame in (('train shape: ', train), ('test shape: ', test)):
    print(label, frame.shape)

train.head(2)

# EDA

In [None]:
train.describe()

In [None]:
train['SeniorCitizen'].value_counts()

As you can see, "SeniorCitizen" is in fact a categorical (binary) feature, even though it shows up in the `describe()` output because it is stored as integers!

In [None]:
train['Churn'].value_counts(normalize=True)

It is always good to check the distribution of different levels in your target variable. Data is more or less imbalanced! But nothing terrible.

In [None]:
data['customerID'].value_counts()

I could not find anything useful in this feature. It is only an ID. I also investigated both number and letter parts of it. I could not find any pattern in it. So as an ID, it does not provide any information, and we will ignore this feature.

In [None]:
num_features = ['tenure', 'MonthlyCharges']

# Everything that is neither the target, the ID, nor a known numerical column
# is treated as categorical for now.
excluded_columns = {'Churn', 'customerID', *num_features}
cat_features = [column for column in train.columns if column not in excluded_columns]

print('num of cat features: ', len(cat_features))
pprint(cat_features)

In [None]:
# Number of distinct levels per categorical feature
for col, n_levels in train[cat_features].nunique().items():
  print(col, ': ', n_levels)

We have one odd categorical feature! "TotalCharges"!

**Question:** Is it really categorical?

In [None]:
train['TotalCharges']

In [None]:
train['TotalCharges'].iloc[0]

In [None]:
#train['TotalCharges'].astype(float)
# This line fails. It means we have missing values in a special form " "

In [None]:
train.loc[train['TotalCharges']==' ']

In [None]:
test.loc[test['TotalCharges']==' ']

**Explanation**

Because of missing values in a special format " ", the column has been cast as an object. In Pandas, if a column has a mixture of strings and floats, its type becomes object and therefore it does not appear in train.describe() function!

Otherwise, the column is in essence numerical! We have to consider it as such and impute missing values!

In [None]:
train['TotalCharges'].dtype

For the moment, let us impute the missing values because we want to perform EDA. Later on, we'll do the imputation in a pipeline!

In [None]:
train, test = train_test_split(
    data,
    test_size=0.12,
    stratify=data['Churn'],
    random_state=19,
)

# Index labels of the rows whose TotalCharges is the blank placeholder " "
missing_indices_train = train.index[train['TotalCharges'] == ' ']
missing_indices_test = test.index[test['TotalCharges'] == ' ']

# Median of the non-blank training values; the same value is reused for test
observed_train_charges = train['TotalCharges'].drop(index=missing_indices_train)
median_value = observed_train_charges.astype(float).median()

for frame, missing_indices in ((train, missing_indices_train), (test, missing_indices_test)):
    frame.loc[missing_indices, 'TotalCharges'] = median_value
    # Every entry is numeric now, so the column can be cast to float
    frame['TotalCharges'] = frame['TotalCharges'].astype(float)


num_features = ['tenure', 'MonthlyCharges', 'TotalCharges']
excluded_columns = {'Churn', 'customerID', *num_features}
cat_features = [column for column in train.columns if column not in excluded_columns]

## Univariate EDA

In [None]:
fig, axs = plt.subplots(1, 3, figsize=(12, 4))
axs = axs.ravel()

# One histogram panel per numerical feature
for ax, col in zip(axs, num_features):
    ax.hist(train[col], bins=20, edgecolor='black')
    ax.set(xlabel=col, ylabel='Counts')


**Observation**

1. All numerical features seem to have a typical distribution with respect to what they represent. Nothing odd, such as outliers, can be observed!


Next, let us see some stats on our categorical features as well!

In [None]:
print(len(cat_features))

In [None]:
fig, axs = plt.subplots(4, 4, figsize=(10, 15))
axs = axs.flatten()

# One bar chart of level counts per categorical feature
for ax, col in zip(axs, cat_features):
    train[col].value_counts(ascending=True).plot(kind='bar', color='blue', ax=ax)
    # Label each panel so it is clear which feature it shows
    ax.set(title=col, ylabel='Counts')

# Hide any unused subplots (computed from the feature count, so it also works
# when there are no categorical features and the loop above never runs)
for ax in axs[len(cat_features):]:
    fig.delaxes(ax)

# Adjust layout to prevent overlap
plt.tight_layout()


## Bivariate EDA

In [None]:
output_dic = {'Yes': 1, 'No': 0}
train['Churn_encoded'] = train['Churn'].map(output_dic)
train['Churn_encoded'].value_counts(normalize=True)

In [None]:
def plotCorrelationMatrix(df, figsize=(5, 5)):
    """Show an annotated heatmap of the pairwise correlations of ``df``'s columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column-to-column correlation (``df.corr()``) is plotted.
    figsize : tuple, default (5, 5)
        Size of the matplotlib figure, passed through unchanged.
    """
    # Correlation is computed first so that a failure here leaves no empty figure behind
    correlations = df.corr()

    fig, ax = plt.subplots(figsize=figsize)
    sns.heatmap(
        correlations,
        annot=True,
        cmap='coolwarm',
        fmt=".2f",
        annot_kws={"size": 10},
        ax=ax,
    )

    ax.set_title('Correlation Matrix')
    fig.tight_layout()
    plt.show()

In [None]:
plotCorrelationMatrix(train[num_features+['Churn_encoded']])

**Observations**

There is a pretty substantial relationship between the target and three other numerical features.

In [None]:
cat_features

In [None]:
def one_hot_encode(train, test, columns):
  """One-hot encode ``columns`` in both frames, fitting each encoder on ``train`` only.

  For every column a fresh ``OneHotEncoder(drop='if_binary', handle_unknown='ignore')``
  is fit on ``train`` and applied to ``test``. The original column is removed and the
  indicator columns, named ``<column>_<category>``, are appended.

  Parameters
  ----------
  train, test : pandas.DataFrame
      Frames to encode. Both are modified in place and also returned, so pass
      copies if the originals must be preserved.
  columns : iterable of str
      Names of the categorical columns to encode.

  Returns
  -------
  (train, test) : the two frames with the encoded columns.
  """
  for col in columns:
    enc = OneHotEncoder(drop='if_binary', handle_unknown='ignore', sparse_output=False)
    train_encoded = enc.fit_transform(train[[col]])
    test_encoded = enc.transform(test[[col]])

    features = [col + '_' + str(el) for el in enc.categories_[0]]
    if train_encoded.shape[1] != len(features):
      # Binary column: the encoder dropped the first category, so only the
      # second one has an output column. A column with a single category keeps
      # its one output column and must keep its one name (previously this case
      # raised an IndexError).
      features = features[1:]

    train.drop(columns=[col], inplace=True)
    train[features] = train_encoded

    test.drop(columns=[col], inplace=True)
    test[features] = test_encoded

  return train, test

In [None]:
train.head(3)

In [None]:
train_encoded, test_encoded = one_hot_encode(train.copy(), test.copy(), cat_features)
print(train_encoded.shape)
print(test_encoded.shape)

train_encoded.head()

In [None]:
# Columns that exist only after encoding, i.e. the one-hot indicator columns
is_new_column = ~train_encoded.columns.isin(train.columns)
ohe_features = train_encoded.columns[is_new_column].tolist()

print(len(ohe_features))
pprint(ohe_features)

In [None]:
plotCorrelationMatrix(train_encoded[ohe_features+['Churn_encoded']], figsize=(25, 25))

**Observations**

1. The feature "Contract_Month-to-month" has the strongest positive correlation with churn="yes". What does this mean? It means, if the contract is month to month, there is a high likelihood that the customer might want to cancel it soon. It makes sense!

2. After that, the other important features are "TechSupport_No" and "OnlineSecurity_No".

3. Some features in the middle have almost 100% (and in one case -100%) correlation with each other! This means we can probably drop some of them, because the remaining ones carry the same information!

...

For the moment, we first train a baseline model just as it is without any feature selection or engineering!

# Baseline Model

In [None]:
# It is better to repeat every operation we performed on train and test so far, just in case we might have done a mistake.
# This makes sure our data is what we expect it to be

def preprocess_data_LR(data, log_transform=True, test_size=0.12, random_state=19):
  """Split, impute, encode and optionally log-transform the churn data.

  Steps: stratified train/test split on ``Churn``; ``TotalCharges`` converted
  to float with missing entries replaced by the training median; ``Churn``
  mapped to 1/0; optional ``log1p`` of the numerical features; one-hot
  encoding of every remaining column except ``Churn`` and ``customerID``.

  Parameters
  ----------
  data : pandas.DataFrame
      Raw data containing at least ``Churn``, ``TotalCharges``, ``tenure``
      and ``MonthlyCharges``.
  log_transform : bool, default True
      Apply ``np.log1p`` to the numerical features.
  test_size : float, default 0.12
      Fraction of rows held out for the test set.
  random_state : int, default 19
      Seed of the train/test split.

  Returns
  -------
  (train_encoded, test_encoded, num_features, ohe_features)
      The two encoded frames, the list of numerical feature names and the list
      of one-hot indicator column names.
  """
  train, test = train_test_split(data, test_size=test_size, stratify=data['Churn'], random_state=random_state)
  # Defensive copies: the column assignments below must never act on a view of `data`
  train, test = train.copy(), test.copy()

  # Taking care of missing values: anything that cannot be parsed as a number
  # (e.g. the blank placeholder " ") becomes NaN ...
  train['TotalCharges'] = pd.to_numeric(train['TotalCharges'], errors='coerce').astype(float)
  test['TotalCharges'] = pd.to_numeric(test['TotalCharges'], errors='coerce').astype(float)
  # ... and is imputed with the median of the observed training values
  median_value = train['TotalCharges'].median()
  train['TotalCharges'] = train['TotalCharges'].fillna(median_value)
  test['TotalCharges'] = test['TotalCharges'].fillna(median_value)  # Use the same median from train for test

  # map the outputs as well
  output_dic = {'Yes': 1, 'No': 0}
  train['Churn'] = train['Churn'].map(output_dic)
  test['Churn'] = test['Churn'].map(output_dic)

  # get numerical and categorical features
  num_features = ['tenure', 'MonthlyCharges', 'TotalCharges']
  cat_features = [el for el in train.columns if el not in ['Churn', 'customerID'] + num_features]

  # Log-transform (log1p) the numerical features for Logistic Regression
  if log_transform:
    for col in num_features:
      train[col] = np.log1p(train[col])
      test[col] = np.log1p(test[col])

  # one hot encode categorical features (on copies, so `train` keeps its
  # original columns for the comparison below)
  train_encoded, test_encoded = one_hot_encode(train.copy(), test.copy(), cat_features)

  ohe_features = [el for el in train_encoded.columns if el not in train.columns]

  return train_encoded, test_encoded, num_features, ohe_features

In [None]:
def get_model_inputs(train_encoded, test_encoded, num_features, ohe_features):
  """Extract feature matrices and target vectors as NumPy arrays.

  The feature columns are ``num_features + ohe_features`` (in that order) and
  the target is the ``Churn`` column of each frame.

  Returns
  -------
  (X_train, y_train, X_test, y_test)
  """
  feature_columns = num_features + ohe_features

  def to_arrays(frame):
    # (features, target) for one frame
    return frame[feature_columns].values, frame['Churn'].values

  X_train, y_train = to_arrays(train_encoded)
  X_test, y_test = to_arrays(test_encoded)

  return X_train, y_train, X_test, y_test

In [None]:
# Preprocess once, then reuse the same outputs to build the model matrices
preprocessed = preprocess_data_LR(data, log_transform=True)
train_encoded, test_encoded, num_features, ohe_features = preprocessed
X_train, y_train, X_test, y_test = get_model_inputs(*preprocessed)

for label, matrix in (('X_train shape: ', X_train), ('X_test shape: ', X_test)):
    print(label, matrix.shape)

In [None]:
train_encoded.head(2)

In [None]:
logistic_model = LogisticRegression(max_iter=1000)
logistic_model.fit(X_train, y_train)

In [None]:
# Define the evaluation function
def evaluate_model(model, X_train, y_train, X_test, y_test):
    """Print classification metrics of a fitted binary classifier on train and test data.

    Printed, in order: accuracy, confusion matrix, precision / recall / F1
    and ROC AUC, each for the training set and then the test set.

    Parameters
    ----------
    model : fitted classifier exposing ``predict`` (and ideally
        ``predict_proba`` or ``decision_function``).
    X_train, y_train, X_test, y_test : array-like
        Features and 0/1 targets of the two sets.

    Returns
    -------
    None. The metrics are only printed.
    """
    def _ranking_scores(X, hard_predictions):
        # ROC AUC measures how well the model *ranks* samples, so it needs a
        # continuous score. Feeding it hard 0/1 predictions collapses the curve
        # to a single operating point. Prefer probabilities, then decision
        # values, and only fall back to the labels if neither exists.
        if hasattr(model, "predict_proba"):
            return model.predict_proba(X)[:, 1]  # column of the positive class
        if hasattr(model, "decision_function"):
            return model.decision_function(X)
        return hard_predictions

    # Train predictions
    train_pred = model.predict(X_train)

    # Test predictions
    test_pred = model.predict(X_test)

    # Train accuracy
    train_accuracy = accuracy_score(y_train, train_pred)
    print("Train Accuracy:", train_accuracy)

    # Test accuracy
    test_accuracy = accuracy_score(y_test, test_pred)
    print("Test Accuracy:", test_accuracy)

    # Train confusion matrix
    train_conf_matrix = confusion_matrix(y_train, train_pred)
    print("\nTrain Confusion Matrix:")
    print(train_conf_matrix)

    # Test confusion matrix
    test_conf_matrix = confusion_matrix(y_test, test_pred)
    print("\nTest Confusion Matrix:")
    print(test_conf_matrix)

    # Calculate precision, recall, and F1-score for training set
    train_precision = precision_score(y_train, train_pred)
    train_recall = recall_score(y_train, train_pred)
    train_f1_score = f1_score(y_train, train_pred)

    print("\nTrain Precision:", train_precision)
    print("Train Recall:", train_recall)
    print("Train F1 Score:", train_f1_score)

    # Calculate precision, recall, and F1-score for test set
    test_precision = precision_score(y_test, test_pred)
    test_recall = recall_score(y_test, test_pred)
    test_f1_score = f1_score(y_test, test_pred)

    print("\nTest Precision:", test_precision)
    print("Test Recall:", test_recall)
    print("Test F1 Score:", test_f1_score)

    # Train ROC AUC score (from continuous scores, see _ranking_scores)
    train_roc_auc = roc_auc_score(y_train, _ranking_scores(X_train, train_pred))
    print("\nTrain ROC AUC Score:", train_roc_auc)

    # Test ROC AUC score
    test_roc_auc = roc_auc_score(y_test, _ranking_scores(X_test, test_pred))
    print("Test ROC AUC Score:", test_roc_auc)

In [None]:
evaluate_model(logistic_model, X_train, y_train, X_test, y_test)

In [None]:
# If we remember we had
train['Churn'].value_counts(normalize=True)

It means, if we predict everything as No, we already get an accuracy of 73%. This is called the null classifier.

Our logistic Regression achieved an accuracy of 81.8% with log transformation. That is already better than the null classifier! One step to the positive direction. Also, if we don't perform log transformation, the accuracy is 81.4%!

Log transformation is awesome for numerical features!

Can we do better?

Although Logistic regression has some hyperparameters, and we can obviously tune them, it is not worth the effort because there are better models. We save the efforts for them.

But at least we have a baseline.

Can we do better before we try another model? Perhaps let us do some feature selection!

In [None]:
ohe_features

In [None]:
# We will drop the following features

# Indicator columns with a +/-100% correlation to another column
redundant_indicators = [
    'MultipleLines_No phone service',  # for -100% correlation
    'InternetService_No',
    'OnlineSecurity_No internet service',
    'OnlineBackup_No internet service',
    'DeviceProtection_No internet service',
    'TechSupport_No internet service',
    'StreamingMovies_No internet service',
]

# Columns whose correlation with the target is about 1%
uninformative_features = [
    'gender_Male',
    'PhoneService_Yes',
]

features_to_drop = redundant_indicators + uninformative_features

In [None]:
train_encoded.head(1)

In [None]:
# One-hot columns that survive the feature selection
kept_ohe_features = [el for el in ohe_features if el not in features_to_drop]

X_train, y_train, X_test, y_test = get_model_inputs(
    train_encoded.drop(columns=features_to_drop),
    test_encoded.drop(columns=features_to_drop),
    num_features,
    kept_ohe_features,
)

for label, matrix in (('X_train shape: ', X_train), ('X_test shape: ', X_test)):
    print(label, matrix.shape)

In [None]:
logistic_model = LogisticRegression(max_iter=1000)
logistic_model.fit(X_train, y_train)
evaluate_model(logistic_model, X_train, y_train, X_test, y_test)

We saw that we could further improve test accuracy by deselecting some unimportant features, especially the last two. Could it improve further?

In [None]:
# Indicator columns with a +/-100% correlation to another column
redundant_indicators = [
    'MultipleLines_No phone service',
    'InternetService_No',
    'OnlineSecurity_No internet service',
    'OnlineBackup_No internet service',
    'DeviceProtection_No internet service',
    'TechSupport_No internet service',
    'StreamingMovies_No internet service',
]

# Columns whose correlation with the target is about 1%
uninformative_features = [
    'gender_Male',
    'PhoneService_Yes',
]

# Columns whose correlation with the target is about 5%
weak_features = [
    'MultipleLines_No',
    'MultipleLines_Yes',
]

features_to_drop = redundant_indicators + uninformative_features + weak_features

In [None]:
X_train, y_train, X_test, y_test = get_model_inputs(train_encoded.drop(columns=features_to_drop),
                                                    test_encoded.drop(columns=features_to_drop),
                                                    num_features,
                                                    [el for el in ohe_features if el not in features_to_drop])

print('X_train shape: ', X_train.shape)
print('X_test shape: ', X_test.shape)

In [None]:
logistic_model = LogisticRegression(max_iter=1000)
logistic_model.fit(X_train, y_train)
evaluate_model(logistic_model, X_train, y_train, X_test, y_test)

It was too much. Previous feature selection was the best! The last two features we dropped had some information which we lost!

# KNN

Since we have some numerical features, it is better to scale them before applying KNN. We use log transformation. Although one can try StandardScaler and MinMaxScaler as well!

Although there are some 9 missing values in the train data, doing a pipeline to prevent data leakage from happening is not worth the effort. Also, log transformation does not lead to data leakage because there is no encoder being fit on train!

So we leave out pipeline and proceed with the data we preprocessed for Logistic Regression.

In [None]:
# Same preprocessing as for Logistic Regression (log-transformed numerical features)
train_encoded, test_encoded, num_features, ohe_features = preprocess_data_LR(data)

# Indicator columns with a +/-100% correlation to another column
redundant_indicators = [
    'MultipleLines_No phone service',
    'InternetService_No',
    'OnlineSecurity_No internet service',
    'OnlineBackup_No internet service',
    'DeviceProtection_No internet service',
    'TechSupport_No internet service',
    'StreamingMovies_No internet service',
]
# Columns whose correlation with the target is about 1%
uninformative_features = ['gender_Male', 'PhoneService_Yes']

features_to_drop = redundant_indicators + uninformative_features
kept_ohe_features = [el for el in ohe_features if el not in features_to_drop]

X_train, y_train, X_test, y_test = get_model_inputs(
    train_encoded.drop(columns=features_to_drop),
    test_encoded.drop(columns=features_to_drop),
    num_features,
    kept_ohe_features,
)
for label, matrix in (('X_train shape: ', X_train), ('X_test shape: ', X_test)):
    print(label, matrix.shape)

In [None]:
k_values = list(range(1, 71))
param_grid = {'n_neighbors': k_values}

In [None]:
start = time.time()

# 10-fold cross-validated search over the number of neighbours
clf = KNeighborsClassifier()
search_settings = dict(
    param_grid=param_grid,
    scoring='accuracy',  # 'f1',
    n_jobs=-1,
    cv=10,
    verbose=3,
)
grid = GridSearchCV(estimator=clf, **search_settings)
grid.fit(X_train, y_train)

elapsed_seconds = time.time() - start
print("Time taken: %d seconds." % elapsed_seconds)

In [None]:
grid.cv_results_['mean_test_score']

In [None]:
# Plot mean cross-validation scores vs K
# Mean cross-validation accuracy for every candidate K
fig, ax = plt.subplots(figsize=(20, 6))
ax.plot(k_values, grid.cv_results_['mean_test_score'], marker='o', linestyle='-')
ax.set_title('Mean Cross-Validation Score vs K for KNN')
ax.set_xlabel('K')
ax.set_ylabel('Mean Cross-Validation Score')
ax.set_xticks(k_values)
ax.grid(True)
plt.show()

In [None]:
print(grid.best_score_)
print(grid.best_params_)
best_estimator = grid.best_estimator_

In [None]:
knn_model = KNeighborsClassifier(n_neighbors=60)
knn_model.fit(X_train, y_train)
evaluate_model(knn_model, X_train, y_train, X_test, y_test)

We see that KNN worked well, but not as good as the best LR we trained!

# SVM

For SVM, we also scale the numerical features like in KNN. Therefore, we can already use X_train, y_train, ...

In [None]:
C_values = [0.001, 0.01, 0.1, 1, 10, 100]
param_grid = {'kernel': ['linear'],
              'C': C_values}

In [None]:
start = time.time()

# 10-fold cross-validated search over the regularisation strength C
clf = SVC(kernel='linear')
search_settings = dict(
    param_grid=param_grid,
    scoring='accuracy',  # 'f1',
    n_jobs=-1,
    cv=10,
    verbose=3,
)
grid = GridSearchCV(estimator=clf, **search_settings)
grid.fit(X_train, y_train)

elapsed_seconds = time.time() - start
print("Time taken: %d seconds." % elapsed_seconds)

In [None]:
# Plot cross-validation scores vs C
# Mean cross-validation accuracy for every candidate C (log-scaled x axis)
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(C_values, grid.cv_results_['mean_test_score'], marker='o', linestyle='-')
ax.set_title('Mean Cross-Validation Score vs C for SVM with Linear Kernel')
ax.set_xlabel('C')
ax.set_ylabel('Mean Cross-Validation Score')
ax.set_xscale('log')
ax.grid(True)
plt.show()




> ### High values for C mean a harder margin, and potentially more overfitting.



> ### Smaller values for C mean more of soft margin, allowing for more mistakes to happen and less of overfitting!





In [None]:
# Evaluate the model
best_svm_model = SVC(kernel='linear', C=0.1)
best_svm_model.fit(X_train, y_train)
evaluate_model(best_svm_model, X_train, y_train, X_test, y_test)

In [None]:
# Evaluate the model
best_svm_model = SVC(kernel='rbf', C=0.1)
best_svm_model.fit(X_train, y_train)
evaluate_model(best_svm_model, X_train, y_train, X_test, y_test)

So far, LR has been the best model we have achieved!

# Decision Tree


In [None]:
def preprocess_data_DT(data, test_size=0.12, random_state=19, fill_value=-99):
  """Split, impute and encode the churn data for tree-based models.

  Steps: stratified train/test split on ``Churn``; ``TotalCharges`` converted
  to float with missing entries replaced by the sentinel ``fill_value``;
  ``Churn`` mapped to 1/0; one-hot encoding of every remaining column except
  ``Churn`` and ``customerID``. Numerical features are left unscaled.

  Parameters
  ----------
  data : pandas.DataFrame
      Raw data containing at least ``Churn``, ``TotalCharges``, ``tenure``
      and ``MonthlyCharges``.
  test_size : float, default 0.12
      Fraction of rows held out for the test set.
  random_state : int, default 19
      Seed of the train/test split.
  fill_value : float, default -99
      Sentinel written in place of missing ``TotalCharges`` values.

  Returns
  -------
  (train_encoded, test_encoded, num_features, ohe_features)
      The two encoded frames, the list of numerical feature names and the list
      of one-hot indicator column names.
  """
  train, test = train_test_split(data, test_size=test_size, stratify=data['Churn'], random_state=random_state)
  # Defensive copies: the column assignments below must never act on a view of `data`
  train, test = train.copy(), test.copy()

  # Taking care of missing values: anything that cannot be parsed as a number
  # (e.g. the blank placeholder " ") becomes NaN and is then replaced by the sentinel
  for frame in (train, test):
    charges = pd.to_numeric(frame['TotalCharges'], errors='coerce')
    frame['TotalCharges'] = charges.fillna(fill_value).astype(float)

  # map the outputs as well
  output_dic = {'Yes': 1, 'No': 0}
  train['Churn'] = train['Churn'].map(output_dic)
  test['Churn'] = test['Churn'].map(output_dic)

  # get numerical and categorical features
  num_features = ['tenure', 'MonthlyCharges', 'TotalCharges']
  cat_features = [el for el in train.columns if el not in ['Churn', 'customerID'] + num_features]

  # one hot encode categorical features (on copies, so `train` keeps its
  # original columns for the comparison below)
  train_encoded, test_encoded = one_hot_encode(train.copy(), test.copy(), cat_features)

  ohe_features = [el for el in train_encoded.columns if el not in train.columns]

  return train_encoded, test_encoded, num_features, ohe_features

In [None]:
train_encoded, test_encoded, num_features, ohe_features = preprocess_data_DT(data)

In [None]:
train_encoded.head(1)

In [None]:
# Only the indicator columns with a +/-100% correlation to another column are dropped here.
# The ~1% correlation columns ('gender_Male', 'PhoneService_Yes') are kept for the tree.
features_to_drop = [
    'MultipleLines_No phone service',
    'InternetService_No',
    'OnlineSecurity_No internet service',
    'OnlineBackup_No internet service',
    'DeviceProtection_No internet service',
    'TechSupport_No internet service',
    'StreamingMovies_No internet service',
]
kept_ohe_features = [el for el in ohe_features if el not in features_to_drop]

X_train, y_train, X_test, y_test = get_model_inputs(
    train_encoded.drop(columns=features_to_drop),
    test_encoded.drop(columns=features_to_drop),
    num_features,
    kept_ohe_features,
)
for label, matrix in (('X_train shape: ', X_train), ('X_test shape: ', X_test)):
    print(label, matrix.shape)

In [None]:
decision_tree = DecisionTreeClassifier(random_state=11)
decision_tree.fit(X_train, y_train)

# Evaluate the model
evaluate_model(decision_tree, X_train, y_train, X_test, y_test)

In [None]:
help(DecisionTreeClassifier)

In [None]:
decision_tree = DecisionTreeClassifier(max_depth=4,
                                       random_state=11)
decision_tree.fit(X_train, y_train)

# Evaluate the model
evaluate_model(decision_tree, X_train, y_train, X_test, y_test)

In [None]:
# Search space for the decision tree ('entropy' and 'log_loss' are not searched)
param_grid = dict(
    criterion=['gini'],
    max_depth=[4, 5, 6, 7, 8, 9, 10, 15],
    min_samples_split=[2, 4, 6],
    min_samples_leaf=[1, 2, 4],
)

start = time.time()

clf = DecisionTreeClassifier(random_state=11)
search_settings = dict(
    param_grid=param_grid,
    scoring='accuracy',  # 'f1',
    n_jobs=-1,
    cv=10,
    verbose=3,
)
grid = GridSearchCV(estimator=clf, **search_settings)
grid.fit(X_train, y_train)

elapsed_seconds = time.time() - start
print("Time taken: %d seconds." % elapsed_seconds)

In [None]:
print(grid.best_score_)
print(grid.best_params_)
best_estimator = grid.best_estimator_

In [None]:
decision_tree = DecisionTreeClassifier(**grid.best_params_,
                                       random_state=11)
decision_tree.fit(X_train, y_train)

# Evaluate the model
evaluate_model(decision_tree, X_train, y_train, X_test, y_test)

# Random Forest

The data we use for random forest is similar to what we use for decision trees.

In [None]:
rf_classifier = RandomForestClassifier(n_estimators=50, random_state=11)

rf_classifier.fit(X_train, y_train)

evaluate_model(rf_classifier, X_train, y_train, X_test, y_test)


Clearly we have some overfitting in Random Forest because train accuracy is very high. Now we need to optimize hyperparameters!

In [None]:
# Search space for the random forest.
# 'auto' is intentionally absent from max_features: for classifiers it was an
# alias of 'sqrt' and it was removed in scikit-learn 1.3, where it makes every
# candidate that draws it fail to fit.
param_grid = {
    'n_estimators': [100, 150, 200, 250, 300],
    'max_features': ['sqrt', 'log2'],
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False],
    # Part of the grid so that best_params_ carries the seed to the final model
    'random_state': [11]
}

In [None]:
start = time.time()

# Randomised search: 100 sampled candidates, each scored with 10-fold cross-validation
clf = RandomForestClassifier()
random_search = RandomizedSearchCV(
    estimator=clf,
    param_distributions=param_grid,
    n_iter=100,
    cv=10,
    verbose=2,
    random_state=11,
    n_jobs=-1,
)
random_search.fit(X_train, y_train)

elapsed_seconds = time.time() - start
print("Time taken: %d seconds." % elapsed_seconds)

In [None]:
print(random_search.best_score_)
print(random_search.best_params_)
best_estimator = random_search.best_estimator_

In [None]:
rf = RandomForestClassifier(**random_search.best_params_)
rf.fit(X_train, y_train)

evaluate_model(rf, X_train, y_train, X_test, y_test)

# AdaBoost

In [None]:
# Same preprocessing as for Logistic Regression (log-transformed numerical features)
train_encoded, test_encoded, num_features, ohe_features = preprocess_data_LR(data)

# Indicator columns with a +/-100% correlation to another column
redundant_indicators = [
    'MultipleLines_No phone service',
    'InternetService_No',
    'OnlineSecurity_No internet service',
    'OnlineBackup_No internet service',
    'DeviceProtection_No internet service',
    'TechSupport_No internet service',
    'StreamingMovies_No internet service',
]
# Columns whose correlation with the target is about 1%
uninformative_features = ['gender_Male', 'PhoneService_Yes']

features_to_drop = redundant_indicators + uninformative_features
kept_ohe_features = [el for el in ohe_features if el not in features_to_drop]

X_train, y_train, X_test, y_test = get_model_inputs(
    train_encoded.drop(columns=features_to_drop),
    test_encoded.drop(columns=features_to_drop),
    num_features,
    kept_ohe_features,
)
for label, matrix in (('X_train shape: ', X_train), ('X_test shape: ', X_test)):
    print(label, matrix.shape)

In [None]:
base_estimator = DecisionTreeClassifier(max_depth=3)
adaboost = AdaBoostClassifier(estimator=base_estimator,
                         learning_rate=0.02,
                         n_estimators=300,
                         random_state=11)

adaboost.fit(X_train, y_train)
evaluate_model(adaboost, X_train, y_train, X_test, y_test)

In [None]:
param_grid = {
    'n_estimators': [200, 300, 400],
    'learning_rate': [0.01, 0.02],
    'estimator__max_depth': [3, 4],
    'estimator__min_samples_split': [2, 5],
    'estimator__min_samples_leaf': [1, 2,]
}

In [None]:
start = time.time()

base_estimator = DecisionTreeClassifier()
# Seed the ensemble so the search is reproducible from run to run; AdaBoost
# passes seeds derived from random_state to its base estimators. 11 is the
# seed already used for the other models in this notebook.
ada = AdaBoostClassifier(estimator=base_estimator, random_state=11)

grid_search = GridSearchCV(ada, param_grid, cv=10, verbose=2, n_jobs=-1)
grid_search.fit(X_train, y_train)

print("Time taken: %d seconds."%(time.time()-start))

In [None]:
print(grid_search.best_score_)
best_estimator = grid_search.best_estimator_
print(best_estimator)

In [None]:
# GridSearchCV refits the best configuration on the whole training set by
# default (refit=True), so best_estimator is already trained on X_train and is
# evaluated as-is. Fitting it again would only repeat that work.
evaluate_model(best_estimator, X_train, y_train, X_test, y_test)

In [None]:
param_grid = {
    'n_estimators': [300, 350, 400],
    'learning_rate': [0.01, 0.02, 0.3, 0.4, 0.5],
    'estimator__max_depth': [3],
    'estimator__min_samples_split': [4, 5],
    'estimator__min_samples_leaf': [2, 3]
}

In [None]:
start = time.time()

base_estimator = DecisionTreeClassifier()
# Seed the ensemble so the search is reproducible from run to run; AdaBoost
# passes seeds derived from random_state to its base estimators. 11 is the
# seed already used for the other models in this notebook.
ada = AdaBoostClassifier(estimator=base_estimator, random_state=11)

grid_search = GridSearchCV(ada, param_grid, cv=10, verbose=2, n_jobs=-1)
grid_search.fit(X_train, y_train)

print("Time taken: %d seconds."%(time.time()-start))

In [None]:
print(grid_search.best_score_)
best_estimator = grid_search.best_estimator_
print(best_estimator)

In [None]:
# GridSearchCV refits the best configuration on the whole training set by
# default (refit=True), so best_estimator is already trained on X_train and is
# evaluated as-is. Fitting it again would only repeat that work.
evaluate_model(best_estimator, X_train, y_train, X_test, y_test)