In [15]:
import pandas as pd
from sklearn.model_selection import train_test_split

# For Loading the dataset


df = pd.read_csv(r"C:\Users\kandh\OneDrive\Desktop\WA_Fn-UseC_-Telco-Customer-Churn.csv")

# To Convert 'TotalCharges' to numeric and fill missing values with 0
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
df['TotalCharges'].fillna(0, inplace=True)

# To Convert 'Churn' to binary values
df['Churn'] = df['Churn'].map({'Yes': 1, 'No': 0})

# To Split the data into an 80-20 train-test split with a random state of 1
X = df.drop(['Churn'], axis=1)  
y = df['Churn']  

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# To Select features
categorical_features = ['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines',
                       'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
                       'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod']

numerical_features = ['tenure', 'MonthlyCharges', 'TotalCharges']

# To Create dataframes with only selected features
X_train_categorical = X_train[categorical_features]
X_test_categorical = X_test[categorical_features]

X_train_numerical = X_train[numerical_features]
X_test_numerical = X_test[numerical_features]


Feature Scaling for Numerical Features:

In [16]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_numerical_scaled = scaler.fit_transform(X_train_numerical)
X_test_numerical_scaled = scaler.transform(X_test_numerical)

# To Convert the scaled arrays back to dataframes with column names
X_train_numerical_scaled_df = pd.DataFrame(X_train_numerical_scaled, columns=numerical_features)
X_test_numerical_scaled_df = pd.DataFrame(X_test_numerical_scaled, columns=numerical_features)


In [17]:
# One-Hot Encoding for Categorical Features:

from sklearn.preprocessing import OneHotEncoder

encoder = OneHotEncoder(sparse=False)
X_train_categorical_encoded = encoder.fit_transform(X_train_categorical)
X_test_categorical_encoded = encoder.transform(X_test_categorical)

# Get the column names for one-hot encoded features
categorical_feature_names = encoder.get_feature_names(input_features=categorical_features)

# Convert the encoded arrays back to dataframes with column names
X_train_categorical_encoded_df = pd.DataFrame(X_train_categorical_encoded, columns=categorical_feature_names)
X_test_categorical_encoded_df = pd.DataFrame(X_test_categorical_encoded, columns=categorical_feature_names)




In [18]:
# To Combine Scaled Numerical and One-Hot Encoded Categorical Features:

X_train_combined = pd.concat([X_train_numerical_scaled_df, X_train_categorical_encoded_df], axis=1)
X_test_combined = pd.concat([X_test_numerical_scaled_df, X_test_categorical_encoded_df], axis=1)


In [19]:
# To Train Machine Learning Models:
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

# To Set random_state to ensure reproducibility
random_state = 1

# To Train Random Forest Classifier
rf_classifier = RandomForestClassifier(random_state=random_state)
rf_classifier.fit(X_train_combined, y_train)

# To Train XGBoost Classifier
xgb_classifier = XGBClassifier(random_state=random_state)
xgb_classifier.fit(X_train_combined, y_train)

# To Train LightGBM Classifier
lgbm_classifier = LGBMClassifier(random_state=random_state)
lgbm_classifier.fit(X_train_combined, y_train)


LGBMClassifier(random_state=1)

In [22]:
# Evaluate Models on the Test Set:

from sklearn.metrics import accuracy_score, classification_report

# To Evaluate Random Forest model
rf_predictions = rf_classifier.predict(X_test_combined)
rf_accuracy = accuracy_score(y_test, rf_predictions)
rf_classification_report = classification_report(y_test, rf_predictions)

print(f"Random Forest Model Accuracy: {rf_accuracy}")
print("Classification Report for Random Forest Model:\n", rf_classification_report)


Random Forest Model Accuracy: 0.7913413768630234
Classification Report for Random Forest Model:
               precision    recall  f1-score   support

           0       0.85      0.88      0.86      1061
           1       0.58      0.53      0.56       348

    accuracy                           0.79      1409
   macro avg       0.72      0.71      0.71      1409
weighted avg       0.79      0.79      0.79      1409



In [24]:
from sklearn.ensemble import ExtraTreesClassifier

# To Define and train the ExtraTreesClassifier model
et_classifier = ExtraTreesClassifier(random_state=1)
et_classifier.fit(X_train_combined, y_train)


ExtraTreesClassifier(random_state=1)

In [25]:
# Assuming that we have already trained the ExtraTreesClassifier model et_classifier

# To Get feature importances
feature_importances = et_classifier.feature_importances_

# To Create a DataFrame to associate feature names with their importances
feature_importance_df = pd.DataFrame({'Feature': X_train_combined.columns, 'Importance': feature_importances})

# To Sort the DataFrame by importance in descending order
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

# To Get the two most important features
top_2_features = feature_importance_df.head(2)

# To Display the two most important features
print("Two Most Important Features:")
print(top_2_features)


Two Most Important Features:
        Feature  Importance
2  TotalCharges    0.126948
0        tenure    0.117973


In [27]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import accuracy_score

# To Define the hyperparameters obtained from RandomizedSearchCV
new_hyperparameters = {
    'n_estimators': 300, 
    'max_depth': 15,     
    'min_samples_split': 2,  
    'min_samples_leaf': 1,   
    'max_features': 'auto',  
    'random_state': 1
}

# To Create and train the new ExtraTreesClassifier model with hyperparameters
new_et_classifier = ExtraTreesClassifier(**new_hyperparameters)
new_et_classifier.fit(X_train_combined, y_train)

# To  Make predictions on the test set for the new model
new_et_predictions = new_et_classifier.predict(X_test_combined)

# To Calculate the accuracy of the new model
new_et_accuracy = accuracy_score(y_test, new_et_predictions)

# To Calculate the accuracy of the initial model
initial_et_predictions = et_classifier.predict(X_test_combined) 
initial_et_accuracy = accuracy_score(y_test, initial_et_predictions)

# To Print the accuracy comparison
print("Accuracy of the New Optimal ExtraTreesClassifier Model:", new_et_accuracy)
print("Accuracy of the Initial ExtraTreesClassifier Model (No Hyperparameter Tuning):", initial_et_accuracy)

# To Compare the accuracies
if new_et_accuracy > initial_et_accuracy:
    print("The accuracy of the new optimal model is higher than the initial model.")
elif new_et_accuracy < initial_et_accuracy:
    print("The accuracy of the new optimal model is lower than the initial model.")
else:
    print("The accuracies of the new optimal model and the initial model are the same.")


Accuracy of the New Optimal ExtraTreesClassifier Model: 0.7828246983676366
Accuracy of the Initial ExtraTreesClassifier Model (No Hyperparameter Tuning): 0.7672107877927609
The accuracy of the new optimal model is higher than the initial model.


In [28]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import RandomizedSearchCV

# to Define the hyperparameter grid
n_estimators = [50, 100, 300, 500, 1000]
min_samples_split = [2, 3, 5, 7, 9]
min_samples_leaf = [1, 2, 4, 6, 8]
max_features = ['auto', 'sqrt', 'log2', None]

hyperparameter_grid = {
    'n_estimators': n_estimators,
    'min_samples_leaf': min_samples_leaf,
    'min_samples_split': min_samples_split,
    'max_features': max_features
}

# To Create an ExtraTreesClassifier model
et_classifier = ExtraTreesClassifier(random_state=1)

# To Perform RandomizedSearchCV
random_search = RandomizedSearchCV(
    et_classifier,  # Estimator
    param_distributions=hyperparameter_grid,
    cv=5,
    n_iter=10,  # Number of iterations for random search
    scoring='accuracy',
    n_jobs=-1,  # Use all available CPU cores
    verbose=1,  # Verbosity level
    random_state=1
)

# To Fit the random search to the data
random_search.fit(X_train_combined, y_train)

# To Get the best hyperparameters
best_hyperparameters = random_search.best_params_

print("Best Hyperparameters:", best_hyperparameters)


Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best Hyperparameters: {'n_estimators': 1000, 'min_samples_split': 9, 'min_samples_leaf': 8, 'max_features': 'sqrt'}


In [29]:
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score

# Lets Assume we have defined hyperparameters for the LGBM model
lgbm_hyperparameters = {
    'n_estimators': 100, 
    'max_depth': 6,      
    'learning_rate': 0.1, 
    'random_state': 1
}

# Create and train the LGBM classifier
lgbm_classifier = LGBMClassifier(**lgbm_hyperparameters)
lgbm_classifier.fit(X_train_combined, y_train)

# Make predictions on the test set
lgbm_predictions = lgbm_classifier.predict(X_test_combined)

# Calculate the accuracy of the LGBM model on the test set
lgbm_accuracy = accuracy_score(y_test, lgbm_predictions)

print("Accuracy of the LGBM Classifier on the Test Set:", lgbm_accuracy)


Accuracy of the LGBM Classifier on the Test Set: 0.8090844570617459


In [30]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

# By Assuming we have defined hyperparameters for the XGBoost model
xgb_hyperparameters = {
    'n_estimators': 100, 
    'max_depth': 6,      
    'learning_rate': 0.1, 
    'random_state': 1
}

# To Create and train the XGBoost classifier
xgb_classifier = XGBClassifier(**xgb_hyperparameters)
xgb_classifier.fit(X_train_combined, y_train)

# To Make predictions on the test set
xgb_predictions = xgb_classifier.predict(X_test_combined)

# To Calculate the accuracy of the XGBoost model on the test set
xgb_accuracy = accuracy_score(y_test, xgb_predictions)

print("Accuracy of the XGBoost Classifier on the Test Set:", xgb_accuracy)


Accuracy of the XGBoost Classifier on the Test Set: 0.8048261178140526


In [31]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# By Assuming we have defined hyperparameters for the Random Forest model
rf_hyperparameters = {
    'n_estimators': 100,  
    'max_depth': 6,      
    'min_samples_split': 2,  
    'min_samples_leaf': 1,  
    'max_features': 'auto',  
    'random_state': 1
}

# To Create and train the Random Forest classifier
rf_classifier = RandomForestClassifier(**rf_hyperparameters)
rf_classifier.fit(X_train_combined, y_train)

# To Make predictions on the test set
rf_predictions = rf_classifier.predict(X_test_combined)

# To Calculate the accuracy of the Random Forest model on the test set
rf_accuracy = accuracy_score(y_test, rf_predictions)

print("Accuracy of the Random Forest Classifier on the Test Set:", rf_accuracy)


Accuracy of the Random Forest Classifier on the Test Set: 0.8140525195173882
