In [22]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import  LabelEncoder, OneHotEncoder, TargetEncoder,OrdinalEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
import pickle

## Data Preprocessing

In [3]:
## Import the data 
df = pd.read_csv('Cleaned_churn_data.csv')

# separate features and Target 
X = df.drop(['Customer Status','Customer ID','Zip Code','Latitude','Longitude'], axis = 1)
y = df['Customer Status']

In [4]:
cat_cols = X.select_dtypes(include = 'object').columns.tolist()
cat_cols

['Gender',
 'Married',
 'City',
 'Phone Service',
 'Multiple Lines',
 'Internet Service',
 'Internet Type',
 'Online Security',
 'Online Backup',
 'Device Protection Plan',
 'Premium Tech Support',
 'Streaming TV',
 'Streaming Movies',
 'Streaming Music',
 'Unlimited Data',
 'Contract',
 'Paperless Billing',
 'Payment Method']

### Splitting the dataset into Training and Test set

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.2, random_state = 42)

#### Change Yes or No Values with 1s and 0s

In [8]:
# columns with Yes or No values
cat_cols_with_Yes_or_No_val = ['Married','Phone Service','Multiple Lines','Internet Service',
                               'Online Security','Online Backup','Device Protection Plan','Premium Tech Support',
                               'Streaming TV','Streaming Movies','Streaming Music','Unlimited Data','Paperless Billing']

# change Yes or No values to 0 and 1 
X_train[cat_cols_with_Yes_or_No_val] = X_train[cat_cols_with_Yes_or_No_val].replace({'Yes':1,'No':0})
X_test[cat_cols_with_Yes_or_No_val] = X_test[cat_cols_with_Yes_or_No_val].replace({'Yes':1,'No':0})

  X_train[cat_cols_with_Yes_or_No_val] = X_train[cat_cols_with_Yes_or_No_val].replace({'Yes':1,'No':0})
  X_test[cat_cols_with_Yes_or_No_val] = X_test[cat_cols_with_Yes_or_No_val].replace({'Yes':1,'No':0})


In [9]:
# categorical columns in X
categorical_cols = X.select_dtypes(include ='object').columns.tolist()

#### Create a pipleline for Eccoding Categorical Columns

In [36]:
# One-Hot Encoding (Gender, Internet Type, Payment Method)
ohe_features = ['Gender', 'Internet Type', 'Payment Method']
ohe_transformer = Pipeline(steps=[
    ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

#Ordinalencoding for ordinal feature(Contract)
contract_categories_ordered = ['Month-to-Month', 'One Year', 'Two Year']
ordinal_features = ['Contract']                         
ordinal_transformer = Pipeline(steps=[
    ('encoder', OrdinalEncoder(categories=[contract_categories_ordered], handle_unknown='use_encoded_value', unknown_value=-1))
])

# Target encoding for City 
target_encode_features = ['City']
target_encoder_transformer = Pipeline(steps=[
    ('encoder', TargetEncoder(smooth=10)) # Adjust smoothing params 
])


preprocessor = ColumnTransformer(
    transformers=[
        ('ordinal_features', ordinal_transformer, ordinal_features),
        ('ohe_features', ohe_transformer, ohe_features),
        ('target_encode_city', target_encoder_transformer, target_encode_features),
    ],
    remainder='passthrough'
)
# encode the Target Variable
le= LabelEncoder()
y_train_encoded = le.fit_transform(y_train)

# 3. Transform y_test using the SAME fitted LabelEncoder
y_test_encoded = le.transform(y_test)
y_test_encoded = le.transform(y_test)

### Create a Model Pipline and Train the Model 

In [39]:
pipeline = Pipeline(steps=[
    ('preprocessor',preprocessor),
    ('Classifier', None)
])

param_grid = [
    {
        # Parameters for RandomForestClassifier
        'Classifier': [RandomForestClassifier(random_state=42)], # The model instance
        'Classifier__n_estimators': [100, 200],
        'Classifier__max_depth': [10, 20],
        'Classifier__min_samples_leaf': [1, 5]
    },
    {
        # Parameters for XGBClassifier
        'Classifier': [XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42)], # The model instance
        'Classifier__n_estimators': [100, 200],
        'Classifier__max_depth': [5, 10],
        'Classifier__learning_rate': [0.05, 0.1]
    },
    {
        # Parameters for LGBMClassifier
        'Classifier': [LGBMClassifier(random_state=42)], # The model instance
        'Classifier__n_estimators': [100, 200],
        'Classifier__max_depth': [10, -1], # -1 means no limit
        'Classifier__learning_rate': [0.05, 0.1]
    }
]

grid_search = GridSearchCV(pipeline, param_grid, cv=5, n_jobs=-1, verbose=2, scoring='accuracy')
grid_search.fit(X_train, y_train_encoded)

print(f"Best parameters: {grid_search.best_params_}")
print(f"Best cross-validation score: {grid_search.best_score_:.4f}")

Fitting 5 folds for each of 24 candidates, totalling 120 fits
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000532 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2188
[LightGBM] [Info] Number of data points in the train set: 3868, number of used features: 35
[LightGBM] [Info] Start training from score -1.123214
[LightGBM] [Info] Start training from score -3.024051
[LightGBM] [Info] Start training from score -0.468144
Best parameters: {'Classifier': LGBMClassifier(random_state=42), 'Classifier__learning_rate': 0.1, 'Classifier__max_depth': -1, 'Classifier__n_estimators': 100}
Best cross-validation score: 0.8239


### Prediction on Test set

In [34]:
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
acc = accuracy_score(y_test_encoded,y_pred)
print(f'Accuracy : {acc * 100:.2f} %')

Accuracy : 81.80 %


### Save the Model and Load Model to Make Sure 

In [81]:
with open('LightGBM_Model.pkl', 'wb') as file:
    pickle.dump(best_model, file)

In [88]:
with open('LightGBM_Model.pkl', 'rb') as file:
        loaded_model = pickle.load(file)
        print(f"Model successfully loaded from {'LightGBM_Model.pkl'}")

Model successfully loaded from LightGBM_Model.pkl
