In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report

In [2]:
import warnings
# Silences every warning category for the rest of the session.
# NOTE(review): this also hides deprecation notices from the plotting and
# data libraries used below — consider narrowing the filter.
warnings.filterwarnings('ignore')

In [3]:
# Load the telecom churn data; the path is relative to the notebook's
# working directory.
df = pd.read_csv('telecom.csv')
# Preview the first rows only instead of echoing the whole frame.
df.head()

FileNotFoundError: [Errno 2] No such file or directory: 'telecom.csv'

In [None]:
# Five randomly chosen rows; no random_state is passed, so the rows differ between runs
df.sample(5)

In [None]:
# (number of rows, number of columns)
df.shape

In [None]:
# Column dtypes and non-null counts
df.info()

In [None]:
# Normalise every column label to lower case
df.columns = df.columns.map(str.lower)

In [None]:
# Current column labels
df.columns

In [None]:
# Remove the customer identifier column. errors='ignore' keeps the cell
# re-runnable: once the column is gone, a second run is a no-op instead of
# raising KeyError.
df = df.drop(columns=['customerid'], errors='ignore')

In [None]:
# Column labels after the drop above
df.columns

In [None]:
# Convert totalcharges to float. Anything that cannot be parsed as a number
# (a single blank ' ', an empty string, several blanks, ...) becomes NaN,
# whereas replacing only the exact string ' ' would make astype(float) fail
# on any other blank-like entry.
df['totalcharges'] = pd.to_numeric(df['totalcharges'], errors='coerce')

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns
df.describe()

In [None]:
# Absolute number of rows per churn label
df['churn'].value_counts()

Data is highly imbalanced

In [None]:
# Share of each churn label, in percent of all rows
df['churn'].value_counts().mul(100).div(len(df))

74% of the Churn data is "No" and 26% is "Yes"

In [None]:
# Shape of the sub-frame that keeps only object-dtype columns
df.select_dtypes(include=['object']).shape

In [None]:
# Shape of the sub-frame that keeps only float64/int64 columns
df.select_dtypes(include=['float64','int64']).shape

In [None]:
# Number of missing values per column
df.isnull().sum()

11 null values in Total Charges

In [None]:
# Rows whose totalcharges value is missing
df[df['totalcharges'].isna()]

In [None]:
# Missing-value ratio of totalcharges in percent, computed from the frame
# itself so it stays correct if the data changes (no hard-coded counts).
df['totalcharges'].isnull().mean() * 100

In [None]:
# Discard every row that has at least one missing value
df = df.dropna(axis='index', how='any')

In [None]:
# Independent copy: df_updated is the frame that gets encoded and modelled
# further down, while df keeps the original string labels for plotting.
df_updated = df.copy()

Next, divide customers into bins based on tenure: e.g. customers with a tenure of up to 12 months are assigned the tenure group 1-12, customers with a tenure between 1 and 2 years the tenure group 13-24, and so on. (The binning cells below are currently commented out.)

In [None]:
# Largest tenure value present in the working copy
longest_tenure = df_updated['tenure'].max()
print(longest_tenure)

In [None]:
# # Group the tenure in bins of 12 months
# labels = ["{0} - {1}".format(i, i + 11) for i in range(1, 72, 12)]

# df_updated['tenure_group'] = pd.cut(df_updated.tenure, range(1, 80, 12), right=False, labels=labels)

In [None]:
# df_updated['tenure_group'].value_counts()

In [None]:
# Re-count missing values per column to confirm none are left in df
df.isnull().sum()

In [None]:
# Count fully identical rows before de-duplication
n_duplicates_before = df.duplicated().sum()
print("\nNumber of duplicated rows before cleaning:", n_duplicates_before)

In [None]:
# Remove duplicated rows.
df = df.drop_duplicates()
# df_updated was copied from df before this step, so it still holds the
# duplicates. Drop them there too; otherwise the frame that is encoded and
# used for modelling differs from the cleaned frame used for the plots.
df_updated = df_updated.drop_duplicates()

In [None]:
# Count fully identical rows again; expected to be zero after cleaning
n_duplicates_after = df.duplicated().sum()
print("Number of duplicated rows after cleaning:", n_duplicates_after)

In [None]:
# Histogram of totalcharges with a kernel-density overlay
sns.histplot(data=df, x='totalcharges', kde=True)


The ‘totalcharges’ data is right-skewed, indicating most customers have lower charges. The distribution peaks on the left, showing a large number of customers with low charges. As charges increase, customer frequency decreases. A long tail on the right suggests a few customers have very high charges, but they are relatively rare.

In [None]:
# Horizontal box plot of totalcharges. The values are passed explicitly as
# x= because the first positional argument of sns.boxplot is bound
# differently depending on the seaborn version.
sns.boxplot(x=df['totalcharges'])

In [None]:
# Number of customers per gender
sns.countplot(data=df, x='gender')
plt.title('Telecom Churn Gender')
plt.show()

# Data Exploration

# Univariate Analysis
1. Plot distribution of individual predictors by churn

In [None]:
# One count plot per predictor, split by churn; the target itself and the
# two charge columns are left out.
skipped = {'churn', 'totalcharges', 'monthlycharges'}
predictors = [name for name in df.columns if name not in skipped]
for figure_number, predictor in enumerate(predictors):
    plt.figure(figure_number)
    sns.countplot(data=df, x=predictor, hue='churn')

2. Convert the target variable 'Churn' into a binary numeric variable, i.e. Yes = 1; No = 0

In [None]:
# Encode the target as Yes=1 / No=0. The lookup also accepts values that are
# already 0/1, so re-running the cell keeps the labels intact (comparing with
# 'Yes' a second time would turn every row into 0). Any unexpected label
# becomes NaN and makes astype(int) fail loudly instead of silently
# counting as "no churn".
churn_codes = {'Yes': 1, 'No': 0, 1: 1, 0: 0}
df_updated['churn'] = df_updated['churn'].map(churn_codes).astype(int)

In [None]:
# First rows of the working copy, to check the numeric churn column
df_updated.head()

In [None]:
# Convert all the categorical variables into dummy (one-hot) columns;
# the result is a new frame, df_updated itself is not modified here.
df_updated_dummies = pd.get_dummies(df_updated)
df_updated_dummies.head()

In [None]:
# Churn by monthly charges: overlay the density of monthlycharges for
# non-churned (red) and churned (blue) customers on the same axes.
# fill=True replaces the deprecated shade=True keyword.
monthly_charges = df_updated_dummies["monthlycharges"]
churn_flag = df_updated_dummies["churn"]
ax = sns.kdeplot(monthly_charges[churn_flag == 0], color="Red", fill=True)
ax = sns.kdeplot(monthly_charges[churn_flag == 1], ax=ax, color="Blue", fill=True)
# Legend entries follow the drawing order above.
ax.legend(["No Churn", "Churn"], loc='upper right')
ax.set_ylabel('Density')
ax.set_xlabel('Monthly Charges')
ax.set_title('Monthly charges by churn')

Churn is high when Monthly Charges are high

In [None]:
# Churn by total charges: overlay the density of totalcharges for
# non-churned (red) and churned (blue) customers on the same axes.
# fill=True replaces the deprecated shade=True keyword.
total_charges = df_updated_dummies["totalcharges"]
churn_flag = df_updated_dummies["churn"]
ax = sns.kdeplot(total_charges[churn_flag == 0], color="Red", fill=True)
ax = sns.kdeplot(total_charges[churn_flag == 1], ax=ax, color="Blue", fill=True)
# Legend entries follow the drawing order above.
ax.legend(["No Churn", "Churn"], loc='upper right')
ax.set_ylabel('Density')
ax.set_xlabel('Total Charges')
ax.set_title('Total charges by churn')

Churn is higher at lower Total Charges.
Nonetheless, the picture becomes somewhat clearer when we combine the insights of the three criteria, namely tenure, monthly charges, and total charges: a higher monthly charge at a shorter tenure results in a lower total charge. Therefore, high churn is associated with all three of these factors: a higher monthly charge, a shorter tenure, and a lower total charge.

In [None]:
# Bar chart of every column's correlation with churn, strongest positive first
plt.figure(figsize=(19, 7))
churn_correlation = df_updated_dummies.corr()['churn']
churn_correlation.sort_values(ascending=False).plot(kind='bar')

 HIGH Churn seen in case of Month to month contracts, No online security, No Tech support, First year of subscription and Fibre Optics Internet

LOW Churn is seen in the case of long-term contracts, subscriptions without internet service, and customers engaged for 5+ years

Factors like Gender, availability of PhoneService, and number of multiple lines have almost NO impact on Churn

This is also evident from the Heatmap below


In [None]:
# Correlation heatmap of all dummy-encoded columns. Correlations are
# continuous values in [-1, 1], so a diverging colormap pinned to that range
# is used (same "coolwarm" map as the later heatmaps); a qualitative palette
# such as "Paired" gives neighbouring values unrelated colours.
plt.figure(figsize=(10,10))
sns.heatmap(df_updated_dummies.corr(), cmap="coolwarm", vmin=-1, vmax=1)

# Bivariate Analysis

In [None]:
# Split the working copy by target value: 0 = not churned, 1 = churned
churn_value = df_updated["churn"]
new_df1_target0 = df_updated[churn_value.eq(0)]
new_df1_target1 = df_updated[churn_value.eq(1)]

In [None]:
def uniplot(df, col, title, hue=None):
    """Draw a log-scaled count plot of ``col``, optionally split by ``hue``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that holds ``col`` (and ``hue`` when given).
    col : str
        Column whose values go on the x-axis, ordered by frequency.
    title : str
        Title of the figure.
    hue : str, optional
        Column used to split each bar into coloured groups.

    Notes
    -----
    The seaborn style/context and three ``plt.rcParams`` entries are set
    globally, so they also affect figures drawn after this call.
    """
    sns.set_style('whitegrid')
    sns.set_context('talk')
    plt.rcParams["axes.labelsize"] = 20
    plt.rcParams['axes.titlesize'] = 22
    plt.rcParams['axes.titlepad'] = 30

    # Count the levels of the hue *column*. Wrapping the column name itself
    # in a Series always produced exactly one "level", so the figure never
    # widened with the number of hue groups.
    n_hue_levels = len(df[hue].unique()) if hue is not None else 0
    width = len(df[col].unique()) + 7 + 4 * n_hue_levels

    fig, ax = plt.subplots()
    fig.set_size_inches(width, 8)
    plt.xticks(rotation=45)
    plt.yscale('log')
    plt.title(title)
    # Draw on the axes created above; bars are ordered by descending count.
    sns.countplot(data=df, x=col, order=df[col].value_counts().index,
                  hue=hue, palette='bright', ax=ax)

    plt.show()

In [None]:
# The x-axis column is partner (split by gender), so the title names Partner.
uniplot(new_df1_target1,col='partner',title='Distribution of Partner for Churned Customers',hue='gender')

In [None]:
# The x-axis column is partner (split by gender), so the title names Partner.
uniplot(new_df1_target0,col='partner',title='Distribution of Partner for Non Churned Customers',hue='gender')

In [None]:
# Payment method of churned customers, split by gender
uniplot(
    df=new_df1_target1,
    col='paymentmethod',
    title='Distribution of PaymentMethod for Churned Customers',
    hue='gender',
)

In [None]:
# Contract type of churned customers, split by gender
uniplot(
    df=new_df1_target1,
    col='contract',
    title='Distribution of Contract for Churned Customers',
    hue='gender',
)

In [None]:
# Tech-support status of churned customers, split by gender
uniplot(
    df=new_df1_target1,
    col='techsupport',
    title='Distribution of TechSupport for Churned Customers',
    hue='gender',
)

In [None]:
# Senior-citizen flag of churned customers, split by gender
uniplot(
    df=new_df1_target1,
    col='seniorcitizen',
    title='Distribution of SeniorCitizen for Churned Customers',
    hue='gender',
)

Customers paying by electronic check are the highest churners

Contract Type - Monthly customers are more likely to churn because of no contract terms, as they are free to go customers.

No Online security, No Tech Support category are high churners

Non senior Citizens are high churners

In [None]:

# Total charges against monthly charges, one point per row of df
sns.scatterplot(data=df, x='monthlycharges', y='totalcharges')

Total Charges Increase as Monthly charges Increases

In [None]:
# Labels of the object-dtype columns of df (at this point this still
# includes the target column churn)
cat_cols = df.select_dtypes(include=['object']).columns
cat_cols

In [None]:
# Remove the target column from the categorical column labels.
# errors='ignore' keeps the cell re-runnable: once 'churn' is gone, a second
# run leaves cat_cols unchanged instead of raising KeyError.
cat_cols = cat_cols.drop('churn', errors='ignore')

In [None]:
# Categorical column labels without the target
cat_cols

In [None]:
# Histogram of monthlycharges with a kernel-density overlay
sns.histplot(data=df, x='monthlycharges', kde=True)

In [None]:
# Histogram of tenure with a kernel-density overlay
sns.histplot(data=df, x='tenure', kde=True)


In [None]:
# Count plot of the target column; only churn is drawn here, so the title
# names it instead of "categorical columns".
plt.figure(figsize=(25, 12))
sns.countplot(x='churn', data=df)
plt.title('Distribution of Churn')
plt.show()

The data is imbalanced, so we will have to balance it later

In [None]:
# Integer-encode the categorical columns by hand (instead of one-hot encoding).
# Each entry maps a column name to its {label: code} lookup; values that are
# not listed in a lookup are left as they are by replace().
yes_no = {'Yes': 1, 'No': 0}
internet_addon = {'Yes': 1, 'No': 0, 'No internet service': 0}
label_codes = {
    'gender': {'Female': 0, 'Male': 1},
    'partner': yes_no,
    'phoneservice': yes_no,
    'dependents': yes_no,
    'multiplelines': {'Yes': 1, 'No': 0, 'No phone service': 2},
    'internetservice': {'DSL': 1, 'Fiber optic': 2, 'No': 0},
    'onlinesecurity': internet_addon,
    'onlinebackup': internet_addon,
    'deviceprotection': internet_addon,
    'techsupport': internet_addon,
    'streamingtv': internet_addon,
    'streamingmovies': internet_addon,
    # One-year and two-year contracts share code 0; month-to-month is 1.
    'contract': {'Month-to-month': 1, 'One year': 0, 'Two year': 0},
    'paperlessbilling': yes_no,
    'paymentmethod': {
        'Electronic check': 1,
        'Mailed check': 0,
        'Bank transfer (automatic)': 2,
        'Credit card (automatic)': 3,
    },
    'churn': yes_no,
}
for column, codes in label_codes.items():
    df_updated[column] = df_updated[column].replace(codes)


In [None]:
# Ten random rows of the working copy to eyeball the integer codes
df_updated.sample(10)

In [None]:
# Summary statistics of tenure
df.tenure.describe()

In [None]:
# Labels of the object-dtype columns of df (df itself is not encoded; only df_updated is)
df.select_dtypes(include=['object']).columns

In [None]:
# Distinct churn labels present in df
df.churn.unique()

In [None]:
# Pairwise correlation between the encoded service columns and churn
service_cols = ['phoneservice', 'multiplelines', 'internetservice', 'churn']
correlation_matrix = df_updated[service_cols].corr()

In [None]:
# Annotated heatmap of the service/churn correlation matrix
plt.figure(figsize=(10, 8))
heatmap_style = {'annot': True, 'cmap': 'coolwarm', 'fmt': ".2f", 'linewidths': 0.5}
sns.heatmap(correlation_matrix, **heatmap_style)
plt.title('Heatmap of PhoneService, MultipleLines, InternetService, and Churn')
plt.show()


In [None]:
# Pairwise correlation between all hand-encoded columns and churn
encoded_cols = [
    'gender', 'partner', 'dependents', 'phoneservice', 'multiplelines',
    'internetservice', 'onlinesecurity', 'onlinebackup', 'deviceprotection',
    'techsupport', 'streamingtv', 'streamingmovies', 'contract',
    'paperlessbilling', 'paymentmethod', 'churn',
]
correlation_matrix_cats = df_updated[encoded_cols].corr()

In [None]:
# Annotated heatmap of the correlations between all encoded categorical
# columns and churn. The title describes this matrix rather than repeating
# the title of the three-column service heatmap.
plt.figure(figsize=(20, 9))
sns.heatmap(correlation_matrix_cats, annot=True, cmap='coolwarm', fmt=".2f", linewidths=0.5)
plt.title('Heatmap of Encoded Categorical Features and Churn')
plt.show()

In [None]:
# Print the distinct values of every numeric column of df
numeric_dtypes = ['float64', 'int64', 'int32']
num_cols = df.select_dtypes(include=numeric_dtypes).columns
for column in num_cols:
    print(f"Column '{column}': {df[column].unique()}")

In [None]:
# Feature matrix: every column of the encoded frame except the target
x = df_updated.drop(columns='churn')

x.head()

In [None]:
# Target vector: the churn column of the encoded frame
y = df_updated['churn']

y.head()

# Train Test Split

In [None]:
# Hold out 20% of the rows for testing.
# random_state makes the split (and every metric below) reproducible between
# runs; 100 matches the seed used for the decision tree.
# stratify=y keeps the churn ratio the same in both parts, which matters
# because the target is imbalanced.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=100, stratify=y
)

# Decision Tree Classifier

In [None]:
# Gini-based decision tree with limited depth and leaf size; fixed seed
model_dt = DecisionTreeClassifier(
    criterion="gini",
    max_depth=6,
    min_samples_leaf=8,
    random_state=100,
)

In [None]:
# Fit the tree on the training part only
model_dt.fit(x_train,y_train)

In [None]:
# Predicted churn label for every row of the held-out test set
y_pred=model_dt.predict(x_test)
y_pred

In [None]:
# Mean accuracy of the fitted tree on the test set
model_dt.score(x_test,y_test)

In [None]:
# Per-class precision, recall and F1 for labels 0 (no churn) and 1 (churn)
print(classification_report(y_test, y_pred, labels=[0,1]))

As you can see, the accuracy is quite low. Because this is an imbalanced dataset, accuracy should not be the metric used to judge the model, since accuracy is misleading on imbalanced data.
Hence, we need to check recall, precision and F1 score for the minority class, and it is quite evident that precision, recall and F1 score are too low for Class 1, i.e. churned customers.