In [7]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import ExtraTreesClassifier


In [8]:
# Load the raw churn dataset from the hard-coded absolute path and clean two columns:
#   * TotalCharges: entries that are exactly a single space become '0' before
#     the whole column is cast to float.
#   * Churn: encoded as 1 where the value is 'Yes' and 0 otherwise.
df = pd.read_csv('/content/WA_Fn-UseC_-Telco-Customer-Churn.csv').assign(
    TotalCharges=lambda frame: frame['TotalCharges'].replace(' ', '0').astype(float),
    Churn=lambda frame: frame['Churn'].eq('Yes').astype(int),
)

# Separate the target column from the predictors.
y = df['Churn']
X = df.drop(columns='Churn')

# Split the data into an 80-20 train-test split with a random state of 1.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

# Columns that are one-hot encoded further down.
categorical = [
    'gender',
    'SeniorCitizen',
    'Partner',
    'Dependents',
    'PhoneService',
    'MultipleLines',
    'InternetService',
    'OnlineSecurity',
    'OnlineBackup',
    'DeviceProtection',
    'TechSupport',
    'StreamingTV',
    'StreamingMovies',
    'Contract',
    'PaperlessBilling',
    'PaymentMethod',
]
# Columns that are standardised further down.
numerical = ['tenure', 'MonthlyCharges', 'TotalCharges']

# Keep only the selected features in both splits, categorical columns first.
feature_columns = categorical + numerical
X_train = X_train.loc[:, feature_columns]
X_test = X_test.loc[:, feature_columns]


# Scale the numerical features. The scaler is fitted on the training split
# only and then reused, unchanged, on the test split.
scaler = StandardScaler()
X_train_numerical = scaler.fit_transform(X_train[numerical])
X_test_numerical = scaler.transform(X_test[numerical])

# One-hot encode the categorical features. handle_unknown='ignore' makes a
# category level that was absent from the training split encode as all zeros
# for that feature instead of raising an error during transform.
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
X_train_categorical = encoder.fit_transform(X_train[categorical])
X_test_categorical = encoder.transform(X_test[categorical])

# Convert the scaled and encoded arrays back to dataframes, restoring the
# column names and the original row index. Keeping the index means the feature
# rows stay aligned with y_train / y_test in any index-based pandas operation
# (the transformers return plain arrays, which would otherwise get a fresh
# 0..n-1 index).
encoded_columns = encoder.get_feature_names_out()

X_train_scaled = pd.DataFrame(X_train_numerical, columns=numerical, index=X_train.index)
X_test_scaled = pd.DataFrame(X_test_numerical, columns=numerical, index=X_test.index)

X_train_encoded = pd.DataFrame(X_train_categorical, columns=encoded_columns, index=X_train.index)
X_test_encoded = pd.DataFrame(X_test_categorical, columns=encoded_columns, index=X_test.index)



# Combine the scaled numerical and one-hot encoded categorical features into
# the final train and test dataframes (numerical columns first).
X_train_combined = pd.concat((X_train_scaled, X_train_encoded), axis='columns')
X_test_combined = pd.concat((X_test_scaled, X_test_encoded), axis='columns')

# Models to compare: random forest and extra trees from scikit-learn, an
# extreme gradient boosting model from xgboost and a light gradient boosting
# model from lightgbm. Every model is created with random_state=1.
models = [
    estimator_cls(random_state=1)
    for estimator_cls in (
        RandomForestClassifier,
        ExtraTreesClassifier,
        XGBClassifier,
        LGBMClassifier,
    )
]

# Fit each model on the training set and report its score on the test set.
for model in models:
    score = model.fit(X_train_combined, y_train).score(X_test_combined, y_test)
    print(f"{type(model).__name__}: {score}")


RandomForestClassifier: 0.7913413768630234
ExtraTreesClassifier: 0.7672107877927609
XGBClassifier: 0.7934705464868701
[LightGBM] [Info] Number of positive: 1521, number of negative: 4113
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001005 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 670
[LightGBM] [Info] Number of data points in the train set: 5634, number of used features: 46
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.269968 -> initscore=-0.994785
[LightGBM] [Info] Start training from score -0.994785
LGBMClassifier: 0.8034066713981547


In [None]:

# Define the hyperparameter grid
n_estimators = [50, 100, 300, 500, 1000]
min_samples_split = [2, 3, 5, 7, 9]
min_samples_leaf = [1, 2, 4, 6, 8]
# 'auto' is intentionally not listed: for classifiers it was only an alias of
# 'sqrt', and scikit-learn 1.3+ rejects it, so every sampled candidate using
# it would fail to fit and be scored as NaN.
max_features = ['sqrt', 'log2', None]

hyperparameter_grid = {
    'n_estimators': n_estimators,
    'min_samples_leaf': min_samples_leaf,
    'min_samples_split': min_samples_split,
    'max_features': max_features
}

# Initialize the ExtraTreesClassifier
et_clf = ExtraTreesClassifier(random_state=1)

# Perform randomized search with cross-validation: 10 sampled candidates,
# 5 folds each, scored by accuracy, seeded for reproducibility.
random_search = RandomizedSearchCV(
    estimator=et_clf,
    param_distributions=hyperparameter_grid,
    cv=5,
    n_iter=10,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1,
    random_state=1,
)

# Fit the search on the training data only
random_search.fit(X_train_combined, y_train)

# Get the best hyperparameters
best_params = random_search.best_params_

# Print the best hyperparameters
print("Best hyperparameters:", best_params)


Fitting 5 folds for each of 10 candidates, totalling 50 fits


In [None]:
# Create a new ExtraTreesClassifier with the best hyperparameters
et_clf_tuned = ExtraTreesClassifier(random_state=1, **best_params)

# Train the new model
et_clf_tuned.fit(X_train_combined, y_train)

# Calculate the accuracy of the new model on the test set
et_clf_tuned_score = et_clf_tuned.score(X_test_combined, y_test)

# Calculate the accuracy of the initial model: an ExtraTreesClassifier with
# default hyperparameters and the same seed, scored on the same test set.
# (random_search.cv_results_['mean_test_score'][0] is the cross-validated
# score of the first randomly sampled candidate, not of the initial model,
# and is not measured on the test set, so it is not a valid baseline.)
et_clf_initial = ExtraTreesClassifier(random_state=1)
et_clf_initial.fit(X_train_combined, y_train)
et_clf_initial_score = et_clf_initial.score(X_test_combined, y_test)

# Print the accuracy of both models
print(f"Initial ExtraTreesClassifier accuracy: {et_clf_initial_score}")
print(f"Tuned ExtraTreesClassifier accuracy: {et_clf_tuned_score}")

# Compare the accuracies
if et_clf_tuned_score > et_clf_initial_score:
    print("The accuracy of the tuned model is higher than the initial model.")
else:
    print("The accuracy of the tuned model is lower than or equal to the initial model.")


In [None]:
# Feature importances of the tuned extra-trees model and the matching column names.
importances = et_clf_tuned.feature_importances_
features = X_train_combined.columns

# Rank the (importance, feature) pairs from most to least important.
sorted_features = list(zip(importances, features))
sorted_features.sort(reverse=True)

# Print the two most important features
top_feature = sorted_features[0][1]
runner_up = sorted_features[1][1]
print(f"The two most important features are: {top_feature} and {runner_up}")
