In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import seaborn as sns
import warnings
import plotly.graph_objects as go

from sklearn.preprocessing import LabelEncoder

# Silence every warning for the rest of the notebook.
# NOTE(review): this also hides pandas / scikit-learn deprecation warnings — consider narrowing the filter.
warnings.filterwarnings("ignore")

In [2]:
# Load the churn dataset; the path is relative to the current working directory.
teleco = pd.read_csv('telco_customer_churn.csv')

In [3]:
# Preview the first rows of the raw data.
teleco.head()

In [4]:
# Print column dtypes and non-null counts.
teleco.info()

In [5]:
# Convert TotalCharges to a numeric dtype; entries that cannot be parsed become NaN.
teleco['TotalCharges'] = pd.to_numeric(teleco['TotalCharges'], errors='coerce')

In [6]:
# Confirm the conversion produced a numeric dtype.
teleco['TotalCharges'].dtype

In [7]:
# Summary statistics for the numeric columns.
teleco.describe()

In [8]:
# Number of missing values per column.
teleco.isna().sum()

In [9]:
# Discard every row that contains at least one missing value.
teleco = teleco.dropna()

In [10]:
# Working copy without the customerID column.
tchurn = teleco.drop(columns=['customerID'])

In [11]:
# Preview the frame after dropping customerID.
tchurn.head()

In [12]:
# Encode the target as integers (Yes -> 1, No -> 0). The result is assigned back
# rather than calling replace(inplace=True) on the selected column: that chained
# in-place form is deprecated and does not update the frame under pandas
# copy-on-write. Values that are already 0/1 pass through unchanged, so the cell
# can be re-run safely. The explicit cast keeps the column numeric so that
# get_dummies does not one-hot encode the target.
tchurn['Churn'] = tchurn['Churn'].replace({'Yes': 1, 'No': 0}).astype(int)

# One-hot encode the remaining non-numeric columns.
df_dummies = pd.get_dummies(tchurn)

# Kept so `label_encoder` remains available to later cells; on a 0/1 integer
# column this transform maps 0 -> 0 and 1 -> 1.
label_encoder = LabelEncoder()
df_dummies['Churn'] = label_encoder.fit_transform(df_dummies['Churn'])
df_dummies.head()

In [13]:
# Bar chart of the encoded target: one bar per class (0 = No, 1 = Yes).
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(data=df_dummies, x='Churn', palette='coolwarm', ax=ax)
ax.set_title('Churn Distribution')
ax.set_xlabel('Churn')
ax.set_ylabel('Count')
ax.set_xticks([0, 1])
ax.set_xticklabels(['No', 'Yes'])
plt.show()

# Share of churned customers in percent: the mean of a 0/1 column is the
# fraction of ones.
churn_rate = 100 * df_dummies['Churn'].mean()
churn_rate

- The majority of customers have not churned (represented by "No").

- Approximately 26.58% of customers have churned (represented by "Yes").

This information provides valuable insights into customer retention and can guide further analysis to understand the factors influencing churn.

In [14]:
# Correlation of every encoded column with the Churn target, as a one-column table.
df_dummies.corr()['Churn'].to_frame()

Each coefficient measures the linear relationship between a feature and the "Churn" variable: the sign gives the direction (positive means the feature rises with churn) and the magnitude gives the strength.

In [15]:
# Correlations with Churn, sorted from most positive to most negative
# (the Churn column itself is included and correlates 1.0 with itself).
plt.figure(figsize=(12, 8))
df_dummies.corr()['Churn'].sort_values(ascending=False).plot.bar()

In [16]:
# Same sorted correlations rendered as a single-column heatmap; center=0 puts
# the neutral colour of the diverging palette at zero correlation.
plt.figure(figsize=(18, 22))
sns.heatmap(
    df_dummies.corr()['Churn'].sort_values(ascending=False).to_frame(),
    annot=False,
    cmap='coolwarm',
    center=0,
    linewidths=0.5,
)

Positive Correlations: Features like "InternetService_Fiber optic," "PaymentMethod_Electronic check," and "MonthlyCharges" have positive correlations, suggesting that higher values in these features may be associated with higher churn.

Negative Correlations: Features like "tenure," "Contract_Two year," and "TotalCharges" have negative correlations, suggesting that higher values in these features may be associated with lower churn.

## Demographics

In [17]:
# List the available column names before picking the demographic ones.
teleco.columns

In [18]:
# Churn rate (in percent) within each level of the four demographic columns,
# drawn on a 2x2 grid.
demograph = ['gender', 'SeniorCitizen', 'Partner', 'Dependents']
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(20, 15))

# The loop variable gets its own name so the `demograph` list is not overwritten.
for n_col, column in enumerate(demograph):
    # Fill the grid column by column: items 0-1 go in the left column,
    # items 2-3 in the right column, each from top to bottom.
    grid_row, grid_col = n_col % 2, n_col // 2

    # Per-group share of each Churn value, as percentages that sum to 100 per group.
    churn_counts = teleco.groupby(column)['Churn'].value_counts(normalize=True).unstack() * 100
    ax = churn_counts.plot(kind='bar', stacked=False, ax=axes[grid_row, grid_col], rot=0)

    ax.yaxis.set_major_formatter(mtick.PercentFormatter(xmax=100.0))
    ax.set_title(column)
    ax.set_ylabel('Percentage of Total Count')
    ax.legend(title='Churn', labels=['No', 'Yes'])

plt.tight_layout()
plt.show()

---

In [19]:
# Histogram of customer tenure using 30 bins.
sns.displot(data=teleco, x='tenure', bins=30)
plt.ylabel('Number of customers')

In [20]:
# Number of customers per contract type (counted once, then plotted).
contract_counts = teleco['Contract'].value_counts()
plt.bar(contract_counts.index, contract_counts.values)

In [21]:
# Tenure histograms, one panel per contract type, sharing the y-axis so the
# counts are directly comparable.
fig, (ax1, ax2, ax3) = plt.subplots(nrows=1, ncols=3, sharey=True, figsize=(20, 6))

panels = [
    (ax1, 'Month-to-month', 'Month-to-month Contract'),
    (ax2, 'One year', 'One Year Contract'),
    (ax3, 'Two year', 'Two Year Contract'),
]
for panel_ax, contract_type, panel_title in panels:
    tenure_for_contract = teleco.loc[teleco['Contract'] == contract_type, 'tenure']
    sns.histplot(tenure_for_contract, ax=panel_ax)
    panel_ax.set_title(panel_title)

plt.show()

In [22]:
# Monthly vs. total charges, coloured by churn status.
sns.scatterplot(data=teleco, x='MonthlyCharges', y='TotalCharges', hue='Churn')

In [23]:
# Churn rate (in percent) within each level of every service column, on a 3x3 grid.
services = [
    'PhoneService', 'MultipleLines', 'InternetService',
    'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
    'TechSupport', 'StreamingTV', 'StreamingMovies',
]

fig, axes = plt.subplots(nrows=3, ncols=3, figsize=(20, 15))

for n_col, item in enumerate(services):
    # Column-major placement: the first three services fill the left column top
    # to bottom, the next three the middle column, the last three the right one.
    grid_col, grid_row = divmod(n_col, 3)

    # Per-group share of each Churn value, scaled to percentages.
    churn_counts = teleco.groupby(item)['Churn'].value_counts(normalize=True).unstack() * 100
    ax = churn_counts.plot(kind='bar', stacked=False, ax=axes[grid_row, grid_col], rot=0)

    ax.yaxis.set_major_formatter(mtick.PercentFormatter(xmax=100.0))
    ax.set_title(item)
    ax.set_ylabel('Percentage of Total Count')
    ax.legend(title='Churn', labels=['No', 'Yes'])

plt.tight_layout()
plt.show()

# Predictive Modeling

In [24]:
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.ensemble import AdaBoostClassifier

from sklearn.metrics import accuracy_score,recall_score, precision_score, f1_score, roc_auc_score, classification_report
from sklearn.model_selection import GridSearchCV, cross_validate, RandomizedSearchCV, validation_curve
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, confusion_matrix  

from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn import metrics


In [37]:
# Feature matrix: every encoded column except the target.
X = df_dummies.drop('Churn', axis=1)
# Target vector as a NumPy array of 0/1 labels.
y = df_dummies['Churn'].to_numpy()

In [38]:
# Hold out 20% of the rows as a test set; random_state fixes the shuffle so the
# split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Show the resulting (rows, columns) of both partitions.
X_train.shape, X_test.shape

In [39]:
# Keep the column names for the feature-importance plot further down.
feature_names = X.columns

# Learn the min/max of each feature from the training rows only, then apply the
# same scaling to both partitions so no test-set information leaks into the fit.
scaler = MinMaxScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

## Choosing Best Model

In [28]:
# Candidate classifiers as (short name, estimator) pairs.
# NOTE(review): the next cell rebinds `models` to a dict, so this list is not
# the collection that the evaluation code iterates over.
models = [
    ('LOGR', LogisticRegression(random_state=42, max_iter=1000)),
    ('SVC', SVC(probability=True)),
    ('DT', DecisionTreeClassifier(random_state=42)),
    ('RF', RandomForestClassifier(random_state=42)),
    (
        'XGB',
        XGBClassifier(objective='binary:logistic', random_state=42).set_params(eval_metric='error'),
    ),
    ('ADA', AdaBoostClassifier(random_state=42)),
]

In [29]:
# Candidate classifiers keyed by display name. Every estimator is seeded so the
# comparison table is reproducible from a fresh kernel.
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    # probability=True enables predict_proba (needed for ROC AUC); its internal
    # cross-validation is randomised, hence the seed.
    'SVC': SVC(probability=True, random_state=42),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    # Binary target: use the binary log-loss metric rather than the multiclass
    # 'mlogloss'. The deprecated `use_label_encoder` argument is dropped.
    'XGBoost Classifier': XGBClassifier(
        objective='binary:logistic',
        eval_metric='logloss',
        random_state=42,
    ),
    'AdaBoost': AdaBoostClassifier(random_state=42)
}

def evaluate_model(model, X_train, y_train, X_test, y_test):
    """Fit ``model`` on the training data and score it on the test data.

    The estimator is fitted in place, so it is left trained after the call.

    Parameters
    ----------
    model : estimator exposing ``fit``, ``predict`` and ``predict_proba``.
    X_train, y_train : training features and labels passed to ``fit``.
    X_test, y_test : held-out features and labels used for scoring.

    Returns
    -------
    dict
        Maps 'Accuracy', 'Recall', 'Precision', 'ROC AUC' and 'F1 Score'
        (in that order) to the corresponding test-set score.
    """
    model.fit(X_train, y_train)

    predicted_labels = model.predict(X_test)
    # Second column of predict_proba: the score used for the ROC curve.
    positive_scores = model.predict_proba(X_test)[:, 1]

    scores = {}
    scores['Accuracy'] = accuracy_score(y_test, predicted_labels)
    scores['Recall'] = recall_score(y_test, predicted_labels)
    scores['Precision'] = precision_score(y_test, predicted_labels)
    scores['ROC AUC'] = roc_auc_score(y_test, positive_scores)
    scores['F1 Score'] = f1_score(y_test, predicted_labels)
    return scores

# Fit and score every candidate on the same train/test split.
model_results = {}
for name, model in models.items():
    model_results[name] = evaluate_model(model, X_train, y_train, X_test, y_test)

# One row per model, one column per metric.
model_results_df = pd.DataFrame(model_results).transpose()
model_results_df


In [30]:
# Grouped bars: one group per model, one bar per metric.
ax = model_results_df.plot(kind='bar', figsize=(15, 8), colormap='viridis')
ax.set_title('Comparison of Different Classification Models')
ax.set_ylabel('Score')
ax.set_xlabel('Model')
plt.setp(ax.get_xticklabels(), rotation=45)
ax.set_ylim(0, 1)
# Place the legend outside the axes so it does not cover the bars.
ax.legend(title='Metrics', bbox_to_anchor=(1.05, 1), loc='upper left')
ax.grid(axis='y', linestyle='--')
plt.tight_layout()
plt.show()

## ADABoost

### Hyperparameter tuning

In [31]:
# Hyperparameter values to try: 3 x 5 x 2 = 30 combinations.
# NOTE(review): 'SAMME.R' is deprecated in recent scikit-learn releases —
# confirm the installed version still accepts it.
param_grid = dict(
    n_estimators=[50, 100, 150],
    learning_rate=[0.01, 0.05, 0.1, 0.5, 1],
    algorithm=['SAMME', 'SAMME.R'],
)

# Base estimator whose parameters the search varies.
adaboost_model = AdaBoostClassifier(random_state=42)

# Exhaustive search scored by mean accuracy over 5 cross-validation folds,
# run on all available cores.
grid_search = GridSearchCV(
    estimator=adaboost_model,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train, y_train)

# Winning combination and its mean cross-validated accuracy.
best_params, best_score = grid_search.best_params_, grid_search.best_score_

best_params, best_score

In [58]:
# Build an AdaBoost model from the hyperparameters selected by the grid search.
# Unpacking best_params keeps this cell in sync if the search grid changes.
best_adaboost_model = AdaBoostClassifier(**best_params, random_state=42)

# evaluate_model fits the estimator on the training set before scoring it on
# the test set, so no separate fit call is needed (it would only train the
# same model twice). The model is left fitted for the cells that follow.
adaboost_test_results = evaluate_model(best_adaboost_model, X_train, y_train, X_test, y_test)

# One row per metric, a single 'AdaBoost' column.
adaboost_test_results_df = pd.DataFrame(adaboost_test_results, index=['AdaBoost']).T
adaboost_test_results_df


In [60]:
# Test-set predictions of the tuned model and the resulting confusion matrix.
y_pred = best_adaboost_model.predict(X_test)
cm = confusion_matrix(y_test, y_pred)

# Annotated heatmap of the raw counts.
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False, ax=ax)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix for AdaBoost Model')
# Ticks at 0.5 and 1.5 sit at the centre of each heatmap cell.
ax.set_xticks([0.5, 1.5])
ax.set_xticklabels(['No Churn', 'Churn'])
ax.set_yticks([0.5, 1.5])
ax.set_yticklabels(['No Churn', 'Churn'])
plt.show()

# Display the underlying array as the cell output.
cm

True Positives (TP): 190 customers were correctly predicted to churn.

True Negatives (TN): 923 customers were correctly predicted not to churn.

False Positives (FP): 110 customers were incorrectly predicted to churn (they did not actually churn).

False Negatives (FN): 184 customers were incorrectly predicted not to churn (they did actually churn).

### Feature Importance

In [63]:
# Importance score of each input feature according to the fitted AdaBoost model.
feature_importance = best_adaboost_model.feature_importances_

# Pair every score with its column name and rank from most to least important.
feature_importance_df = pd.DataFrame(
    {'Feature': feature_names, 'Importance': feature_importance}
).sort_values(by='Importance', ascending=False)

# The ten highest-ranked features, reused by both the plot and the table.
top_features = feature_importance_df.head(10)

# Horizontal bar chart of the top ten.
plt.figure(figsize=(12, 8))
plt.barh(top_features['Feature'], top_features['Importance'], color='teal')
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.title('Feature Importance for AdaBoost Model')
plt.gca().invert_yaxis() # Invert the y-axis to have the most important features on top
plt.show()

# Show the top ten with importance expressed in percent. Only the numeric
# column is scaled: multiplying the whole frame by 100 would also "multiply"
# the Feature strings, repeating each name 100 times.
top_features.assign(Importance=top_features['Importance'] * 100)

In [61]:
import pickle

In [62]:
 # Persist the tuned model. The context manager guarantees the file is flushed
 # and closed even if pickling fails.
 # NOTE(review): the fitted `scaler` is not saved here; the model was trained on
 # scaled features, so inputs must be scaled the same way before predicting.
 with open('best_adaboost.pickle', 'wb') as model_file:
     pickle.dump(best_adaboost_model, model_file)