# Importing the Dataset and data cleaning

In [10]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [11]:


def preprocess_data(file_path):

  # load the telecom churn dataset
  df = pd.read_csv(file_path)

  # drop customer id columns from the dataset
  df = df.drop('customerID',axis=1)

  # separate categorical and numerical columns
  categorical_columns = df.select_dtypes(include=['object'])
  numerical_columns = df.select_dtypes(include=['int64','float64'])


  # encode categorical Columns:
  le = LabelEncoder()
  for cat in categorical_columns:
      df[cat] = le.fit_transform(df[cat])

  # standardize the numerical columns
  scaler = StandardScaler()
  df[numerical_columns.columns] = scaler.fit_transform(df[numerical_columns.columns])

  df.to_csv('preprocessed_telecom_churn_data.csv',index=False)
  return df

In [12]:
filepath = "telecom_churn_dataset.csv"
df = preprocess_data(filepath)
print("Preprocessed Data is saved as 'preprocessed_telecom_churn_data.csv")



Preprocessed Data is saved as 'preprocessed_telecom_churn_data.csv


# Exploratory Data Analysis(EDA)

In [15]:
import matplotlib.pyplot as plt
import seaborn as sns

def perform_eda(file_path):
  df = pd.read_csv(file_path)

  # target variable churn distribution
  plt.figure(figsize=(12,6))
  sns.countplot(x='Churn',data=df)
  plt.title("Churn distribution")
  plt.savefig('Churn_distibution.png')
  plt.close()

  # correlation between the columns of the dataframe
  plt.figure(figsize=(12,6))
  sns.heatmap(df.corr(),annot = True,cmap = 'coolwarm',fmt = '.2f')
  plt.title("Correlation between the columns")
  plt.savefig('correlation_heatmap.png')
  plt.close()


  # tenure vs churn
  plt.figure(figsize=(8, 6))
  sns.boxplot(x='Churn', y='tenure', data=df)
  plt.title('Tenure vs Churn')
  plt.savefig('tenure_vs_churn.png')
  plt.close()




In [21]:
file_path = "preprocessed_telecom_churn_data.csv"
perform_eda(file_path)
print("EDA plots are saved as : Churn_distibution.png,correlation_heatmap.png,tenure_vs_churn.png")

EDA plots are saved as : Churn_distibution.png,correlation_heatmap.png,tenure_vs_churn.png


# model Training


In [23]:
import xgboost as xgb
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    roc_auc_score,
    f1_score,
    confusion_matrix,
    classification_report
)
import joblib

In [26]:
def train_model(file_path):
  df = pd.read_csv(file_path)

  # split the dataset feature and target
  X = df.drop('Churn',axis=1)
  y = df['Churn']

  # split dataset into train and test
  X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)

  # train xgboost model
  base_model = xgb.XGBClassifier(use_label_encoder=False,eval_metrics='logloss')

  param_grid = {
      'max_depth':[3,5,7],
      'learning_rate':[0.01,0.1,0.3],
      'n_estimators':[100,200]
  }

  # perfomring  gris search sv for hyperparameter tuning
  grid_search = GridSearchCV(estimator=base_model,param_grid=param_grid,
                             cv = 5,scoring='roc_auc',n_jobs=1,verbose=1)
  grid_search.fit(X_train,y_train)

  # get the best model
  best_model = grid_search.best_estimator_

  # Evaluate the model
  y_train_pred = best_model.predict(X_train)

   # Evaluate model on training set
  y_train_pred = best_model.predict(X_train)
  train_metrics = {
        'accuracy': accuracy_score(y_train, y_train_pred),
        'precision': precision_score(y_train, y_train_pred),
        'recall': recall_score(y_train, y_train_pred),
        'roc_auc': roc_auc_score(y_train, y_train_pred)
  }

    # Evaluate model on test set
  y_pred = best_model.predict(X_test)
  test_metrics = {
        'accuracy': accuracy_score(y_test, y_pred),
        'precision': precision_score(y_test, y_pred),
        'recall': recall_score(y_test, y_pred),
        'roc_auc': roc_auc_score(y_test, y_pred)
  }
  print(f"Test Accuracy: {test_metrics['accuracy']:.2f}")
  print(f"Test Precision: {test_metrics['precision']:.2f}")
  print(f"Test Recall: {test_metrics['recall']:.2f}")
  print(f"Test ROC-AUC: {test_metrics['roc_auc']:.2f}")

    # Save model
  joblib.dump(best_model, 'churn_model.pkl')

  return best_model

In [27]:

model = train_model(file_path)
print("Model saved as 'churn_model.pkl'")

Fitting 5 folds for each of 18 candidates, totalling 90 fits


Parameters: { "eval_metrics", "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "eval_metrics", "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "eval_metrics", "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "eval_metrics", "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "eval_metrics", "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "eval_metrics", "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "eval_metrics", "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "eval_metrics", "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "eval_metrics", "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
P

Test Accuracy: 0.82
Test Precision: 0.69
Test Recall: 0.54
Test ROC-AUC: 0.73
Model saved as 'churn_model.pkl'


In [28]:
def plot_feature_importance(model_path, feature_names):
    # Load model
    model = joblib.load(model_path)

    # Plot feature importance
    plt.figure(figsize=(10, 6))
    xgb.plot_importance(model, max_num_features=10)
    plt.title('Feature Importance')
    plt.savefig('feature_importance.png')
    plt.close()



df = pd.read_csv(file_path)
feature_names = df.drop('Churn', axis=1).columns
plot_feature_importance('churn_model.pkl', feature_names)
print("Feature importance plot saved as 'feature_importance.png'")

Feature importance plot saved as 'feature_importance.png'


<Figure size 1000x600 with 0 Axes>

# retention strategy

In [31]:
def get_retention_strategy(churn_prob,customer_data):
  suggestions = []

  # cutomer with high churn probability
  if(churn_prob>0.7):
    suggestions.append("Offer a X % discount on the next billing cycle.")
    suggestions.append("Schedule a follow up call to address concerns")

  # medium churn probabilty
  elif churn_prop>0.3:
    suggestions.append("Send a personalized email with loyalty rewards.")
    suggestions.append("Offer a free upgrade to a premium feature.")

  if customer_data.get('tenure', 0) < 12:
      suggestions.append("Provide a welcome package to new customers.")
  if customer_data.get('Contract') == 0:  # Month-to-month contract
      suggestions.append("Promote a 1-year contract with a discount.")

  return suggestions if suggestions else ["Monitor customer engagement."]


In [32]:
customer = {'tenure': 6, 'Contract': 0, 'MonthlyCharges': 80}
churn_prob = 0.75
strategies = get_retention_strategy(churn_prob, customer)
print("Retention Strategies:", strategies)

Retention Strategies: ['Offer a X % discount on the next billing cycle.', 'Schedule a follow up call to address concerns', 'Provide a welcome package to new customers.', 'Promote a 1-year contract with a discount.']
