In [None]:
# Mount Google Drive at /content/drive so the dataset path used later resolves.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from PIL import Image
import plotly.offline as py # visualization
py.init_notebook_mode(connected=True) # visualization
import plotly.graph_objs as go # visualization
from plotly.subplots import make_subplots
import plotly.figure_factory as ff # visualization
from sklearn.model_selection import train_test_split

In [None]:
# !pip install plotly --upgrade
# ref : https://www.kaggle.com/code/bhartiprasad17/customer-churn-prediction

In [None]:
# Location of the churn CSV on the mounted Drive; edit here if the file moves.
CHURN_CSV_PATH = "/content/drive/My Drive/customerChurn/Telecom_Customer_Churn.csv"
df = pd.read_csv(CHURN_CSV_PATH)

In [None]:
# Lift the column display limit so wide frames are shown in full, then preview the first rows.
pd.set_option("display.max_columns", None)
df.head()

In [None]:
# Get number of unique values in each column - helps identify categorical and continuous variables.
# Columns with only a handful of distinct values are the categorical candidates.
df.nunique()

In [None]:
# (number of rows, number of columns) of the loaded frame.
df.shape

In [None]:
# 21 columns (features) and 7043 rows of data

In [None]:
# Summary statistics for the numeric columns.
df.describe()

In [None]:
# Mean Tenure of any customer is 32 months
# A customer can stay at the company anywhere between zero to 72 months.
# More than 50% of customers stay for more than 29 months

In [None]:
# Column names as an array.
df.columns.values

In [None]:
# missingno matrix plot of the frame, used here to eyeball where values are missing.
msno.matrix(df)

In [None]:
# It looks like there is no missing data.
# Let's check for nulls explicitly.

In [None]:
# Parse TotalCharges as numbers; entries that cannot be parsed become NaN.
total_charges_numeric = pd.to_numeric(df['TotalCharges'], errors='coerce')
df['TotalCharges'] = total_charges_numeric

# Missing-value count per column, which now exposes any unparseable TotalCharges.
df.isna().sum()

In [None]:
# Rows whose TotalCharges is missing after the numeric conversion.
missing_total_charges = df['TotalCharges'].isna()
df[missing_total_charges]

In [None]:
# The tenure is also zero for most of these rows,
# so their TotalCharges cannot be calculated either.
# Therefore, we drop them, as they would lead the model to learn wrong information.

In [None]:
# Remove, in place, every customer whose tenure is zero, then confirm the new size.
zero_tenure_index = df.index[df['tenure'] == 0]
df.drop(index=zero_tenure_index, inplace=True)
df.shape

In [None]:
# 11 rows have been dropped

In [None]:
# Inspect per-column cardinality to identify the categorical variables
# that will be converted to numbers further down (nothing is converted in this cell).
df.nunique()

In [None]:
# Number of customers that churned.
is_churned = df['Churn'] == 'Yes'
int(is_churned.sum())

In [None]:
# Drop the customerID column (presumably a per-customer identifier with no predictive value).
# errors='ignore' keeps the cell re-runnable: without it a second run raises KeyError
# because the column is already gone.
df.drop(columns=['customerID'], inplace=True, errors='ignore')

In [None]:
def encode_yes_no(col, df2):
    """Overwrite column ``col`` of ``df2`` in place with 1 for 'Yes' and 0 for 'No'.

    Any other value becomes NaN, because ``Series.map`` yields NaN for keys
    missing from the mapping. Nothing is returned.
    """
    yes_no_codes = {'Yes':1, 'No':0}
    encoded = df2[col].map(yes_no_codes)
    df2[col] = encoded

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Donut chart of the overall churn split.

# Count churned vs. retained customers once, in a fixed Yes/No order.
churn_counts = df["Churn"].value_counts()
data = {
    "Churn": ["Yes", "No"],
    "Count": [churn_counts["Yes"], churn_counts["No"]],
}

# Two-row frame backing the chart.
churn_df = pd.DataFrame(data)

# Set style and palette in one call: a separate sns.set(...) issued after
# sns.set_palette(...) resets the palette to the default, discarding "pastel".
sns.set(style="whitegrid", palette="pastel")

fig, ax = plt.subplots(figsize=(6, 6))
ax.pie(churn_df["Count"], labels=churn_df["Churn"], autopct='%1.1f%%', startangle=90, pctdistance=0.85)
ax.set_title("Customer Churn")

# A white disc over the centre turns the pie into a donut.
centre_circle = plt.Circle((0, 0), 0.70, fc='white')
ax.add_artist(centre_circle)

ax.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.
plt.show()


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Pairwise correlation is only defined for numeric columns. At this point the frame
# still holds string-valued categoricals, which a bare df.corr() refuses to process
# in pandas >= 2.0, so restrict the computation to the numeric columns explicitly.
correlation = df.select_dtypes(include="number").corr()

fig, ax = plt.subplots(figsize=(10, 8))

# Annotated heatmap of the correlation coefficients.
sns.heatmap(correlation, annot=True, cmap="viridis", fmt=".2f", linewidths=0.5, ax=ax)
ax.set_title("Feature Correlation Matrix")

plt.show()


In [None]:
# Number of churned customers in each SeniorCitizen group.
churned = df.loc[df["Churn"] == "Yes", "Churn"]
churned.groupby(by=df["SeniorCitizen"]).count()

In [None]:
# Number of retained (non-churned) customers in each SeniorCitizen group.
retained = df.loc[df["Churn"] == "No", "Churn"]
retained.groupby(by=df["SeniorCitizen"]).count()

In [None]:
def get_size_list(df, col):
  """Return churn counts split by the two categories of ``col``.

  The categories of ``col`` are taken in sorted order (e.g. 0 < 1, 'No' < 'Yes');
  the second one is treated as the "positive" category. The result is

      [churned & positive, churned & negative,
       retained & positive, retained & negative]

  where "churned" means ``Churn == "Yes"`` and "retained" means ``Churn == "No"``.

  Categories are selected by position rather than with ``[1]`` / ``[0]``, so the
  function does not depend on pandas' deprecated positional fallback for
  string-labelled Series, and a category absent from one churn group counts as 0
  instead of raising.

  Raises:
    IndexError: if ``col`` has fewer than two distinct non-null values.
  """
  # Fix the category order once so both churn groups are read identically.
  categories = sorted(df[col].dropna().unique())
  sizes = []
  for churn_label in ("Yes", "No"):
    subset = df[df["Churn"] == churn_label]
    counts = subset.groupby(col)["Churn"].count().reindex(categories, fill_value=0)
    sizes.extend([counts.iloc[1], counts.iloc[0]])
  return sizes

In [None]:
# Nested pie: outer ring = churn split, inner ring = SeniorCitizen split within each churn group.
plt.figure(figsize=(6, 6))
labels =["Churn: Yes","Churn:No"]
# Derive the ring sizes from the data instead of hard-coding counts, so the chart
# stays correct if the rows in df change.
churn_counts = df["Churn"].value_counts()
values = [churn_counts["Yes"], churn_counts["No"]]
labels_gender = ["S","N","S","N"]
# Order: churned senior, churned non-senior, retained senior, retained non-senior.
sizes_gender = get_size_list(df, 'SeniorCitizen')
colors = ['#ff6666', '#66b3ff']
colors_gender = ['#c2c2f0','#ffb3e6', '#c2c2f0','#ffb3e6']
explode = (0.3,0.3)
explode_gender = (0.1,0.1,0.1,0.1)
textprops = {"fontsize":15}
#Plot
plt.pie(values, labels=labels,autopct='%1.1f%%',pctdistance=1.08, labeldistance=0.8,colors=colors, startangle=90,frame=True, explode=explode,radius=10, textprops =textprops, counterclock = True, )
plt.pie(sizes_gender,labels=labels_gender,colors=colors_gender,startangle=90, explode=explode_gender,radius=7, textprops =textprops, counterclock = True, )
#Draw circle: a white disc in the middle turns the nested pies into rings
centre_circle = plt.Circle((0,0),5,color='black', fc='white',linewidth=0)
fig = plt.gcf()
fig.gca().add_artist(centre_circle)

plt.title('Churn Distribution w.r.t Senior Citizen: Senior(S), Not Senior(N)', fontsize=15, y=1.1)

# show plot

plt.axis('equal')
plt.tight_layout()
plt.show()

In [None]:
# Nested pie: outer ring = churn split, inner ring = Dependents split within each churn group.
plt.figure(figsize=(6, 6))
labels =["Churn: Yes","Churn:No"]
# Compute the outer-ring sizes from the data rather than hard-coding them, so they
# always agree with the inner ring (which is already computed from df).
churn_counts = df["Churn"].value_counts()
values = [churn_counts["Yes"], churn_counts["No"]]
labels_gender = ["Y","N","Y","N"]
sizes_gender = get_size_list(df, 'Dependents')
colors = ['#ff6666', '#66b3ff']
colors_gender = ['#c2c2f0','#ffb3e6', '#c2c2f0','#ffb3e6']
explode = (0.3,0.3)
explode_gender = (0.1,0.1,0.1,0.1)
textprops = {"fontsize":15}
#Plot
plt.pie(values, labels=labels,autopct='%1.1f%%',pctdistance=1.08, labeldistance=0.8,colors=colors, startangle=90,frame=True, explode=explode,radius=10, textprops =textprops, counterclock = True, )
plt.pie(sizes_gender,labels=labels_gender,colors=colors_gender,startangle=90, explode=explode_gender,radius=7, textprops =textprops, counterclock = True, )
#Draw circle: a white disc in the middle turns the nested pies into rings
centre_circle = plt.Circle((0,0),5,color='black', fc='white',linewidth=0)
fig = plt.gcf()
fig.gca().add_artist(centre_circle)

plt.title('Churn Distribution w.r.t Dependents: Having Dependents(Y), No Dependents(N)', fontsize=15, y=1.1)

# show plot

plt.axis('equal')
plt.tight_layout()
plt.show()

In [None]:
# Nested pie: outer ring = churn split, inner ring = Partner split within each churn group.
plt.figure(figsize=(6, 6))
labels =["Churn: Yes","Churn:No"]
# Compute the outer-ring sizes from the data rather than hard-coding them, so they
# always agree with the inner ring (which is already computed from df).
churn_counts = df["Churn"].value_counts()
values = [churn_counts["Yes"], churn_counts["No"]]
labels_gender = ["Y","N","Y","N"]
sizes_gender = get_size_list(df, 'Partner')
colors = ['#ff6666', '#66b3ff']
colors_gender = ['#c2c2f0','#ffb3e6', '#c2c2f0','#ffb3e6']
explode = (0.3,0.3)
explode_gender = (0.1,0.1,0.1,0.1)
textprops = {"fontsize":15}
#Plot
plt.pie(values, labels=labels,autopct='%1.1f%%',pctdistance=1.08, labeldistance=0.8,colors=colors, startangle=90,frame=True, explode=explode,radius=10, textprops =textprops, counterclock = True, )
plt.pie(sizes_gender,labels=labels_gender,colors=colors_gender,startangle=90, explode=explode_gender,radius=7, textprops =textprops, counterclock = True, )
#Draw circle: a white disc in the middle turns the nested pies into rings
centre_circle = plt.Circle((0,0),5,color='black', fc='white',linewidth=0)
fig = plt.gcf()
fig.gca().add_artist(centre_circle)

plt.title('Churn Distribution w.r.t Partner: Being Partner(Y), Not a Partner(N)', fontsize=15, y=1.1)

# show plot

plt.axis('equal')
plt.tight_layout()
plt.show()

In [None]:
# Customers per (Contract, Churn) pair, pivoted so each Churn value becomes a column.
contract_churn_counts = df.groupby(['Contract', 'Churn']).size()
grouped_data = contract_churn_counts.unstack()

# Side-by-side bars: one cluster per contract type, one bar per Churn value.
fig, ax = plt.subplots(figsize=(7, 5))
grouped_data.plot(kind='bar', stacked=False, ax=ax)
ax.set(xlabel='Contract', ylabel='Count', title='Customer contract distribution')
ax.legend(title='Churn')


In [None]:
# Distinct values present in the PhoneService column.
df['PhoneService'].unique()

In [None]:
import matplotlib.pyplot as plt

# Categorical features whose churn split is plotted, one figure per feature.
contract_values = ['Contract', 'PaymentMethod', 'OnlineSecurity', 'OnlineBackup', 'TechSupport', 'PhoneService', 'DeviceProtection']

for contract_value in contract_values:
    fig, ax = plt.subplots(figsize=(7, 5))

    # Rows: categories of the current feature; columns: Churn values; cells: customer counts.
    grouped_data = df.groupby([contract_value, 'Churn']).size().unstack()

    # Side-by-side bars per category.
    grouped_data.plot(kind='bar', stacked=False, ax=ax)
    # The x-axis carries the feature's categories (Churn is shown in the legend),
    # so it is labelled with the feature name rather than 'Churn'.
    ax.set_xlabel(contract_value)
    ax.set_ylabel('Count')
    ax.set_title(f'Customer contract distribution for {contract_value}')
    ax.legend(title='Churn')

plt.show()  # Show all the plots


In [None]:
# Per-column distinct-value counts, checked again before encoding.
df.nunique()

In [None]:
# Convert Categorical to Numeric

In [None]:
# Single encoder instance, re-fitted for each categorical column in the encoding loop.
le = LabelEncoder()

In [None]:
# Distinct values present in the gender column.
df['gender'].unique()

In [None]:
# Distinct values present in the Partner column.
df['Partner'].unique()

In [None]:
# Distinct values present in the Dependents column.
df['Dependents'].unique()

In [None]:
# Distinct values present in the PhoneService column.
df['PhoneService'].unique()

In [None]:
# Columns with 2 to 4 distinct values are treated as the categorical ones to encode.
cols_to_encode = [name for name, n_unique in df.nunique().items() if 1 < n_unique < 5]

In [None]:
# Inspect which columns were selected for encoding.
cols_to_encode

In [None]:
# Distinct values present in the SeniorCitizen column.
df['SeniorCitizen'].unique()

In [None]:
# SeniorCitizen is excluded from label encoding.
# The membership guard keeps the cell re-runnable: list.remove raises ValueError
# when the item has already been removed.
if 'SeniorCitizen' in cols_to_encode:
    cols_to_encode.remove('SeniorCitizen')

In [None]:
# Replace each selected column with integer codes, in place on df.
# The same encoder is re-fitted on every iteration, so after the loop `le`
# only holds the classes of the last column processed.
for col in cols_to_encode:
  df[col] = le.fit_transform(df[col])

In [None]:
# Preview the frame after encoding.
df.head()

In [None]:
# Separate the target (as a NumPy array) from the predictor columns.
target_column = 'Churn'
y = df[target_column].values
X = df.drop(columns=[target_column])

In [None]:
# Hold out 30% of the rows for testing; the fixed seed makes the split reproducible.
# (The original `test_size = 0.3 0` contained a stray space and was a syntax error.)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)
# X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size = 0.15, random_state = 42)

In [None]:
def distplot(feature, frame, color='r'):
    """Plot the distribution of one column as a density histogram with a KDE curve.

    Args:
        feature: Name of the column in ``frame`` to plot.
        frame: DataFrame holding the column.
        color: Matplotlib colour used for the bars and the KDE line.

    Returns:
        The matplotlib Axes the plot was drawn on.

    Uses ``sns.histplot(..., kde=True, stat="density")`` in place of the
    deprecated ``sns.distplot``.
    """
    fig, ax = plt.subplots(figsize=(8, 3))
    ax.set_title("Distribution for {}".format(feature))
    sns.histplot(frame[feature], color=color, kde=True, stat="density", ax=ax)
    return ax

# Continuous features; their raw distributions are plotted one figure each.
num_cols = ["tenure", 'MonthlyCharges', 'TotalCharges']
for feat in num_cols:
    distplot(feat, df)

In [None]:
# Standardise the continuous features. The scaler is fitted on the training split
# only and then applied to both splits, so no test-set statistics reach training.
scaler = StandardScaler()
scaler.fit(X_train[num_cols])

X_train[num_cols] = scaler.transform(X_train[num_cols])
X_test[num_cols] = scaler.transform(X_test[num_cols])


In [None]:
# Re-plot the continuous features after standardisation. The scaled values live in
# X_train (df itself is never scaled), so plotting df here would just repeat the
# earlier, unscaled figures.
num_cols = ["tenure", 'MonthlyCharges', 'TotalCharges']
for feat in num_cols:
    distplot(feat, X_train)

In [None]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve

In [None]:
# Random forest configuration; oob_score=True additionally computes an out-of-bag score.
rf_settings = dict(
    n_estimators=100,
    oob_score=True,
    n_jobs=-1,
    random_state=40,
    max_features="sqrt",
    max_leaf_nodes=60,
)
model_rf = RandomForestClassifier(**rf_settings)
model_rf.fit(X_train, y_train)

# Accuracy on the held-out split.
prediction_test = model_rf.predict(X_test)
rf_accuracy = metrics.accuracy_score(y_test, prediction_test)
print(rf_accuracy)

In [None]:
# Per-class precision, recall and F1 for the random forest predictions.
print(classification_report(y_test, prediction_test))

In [None]:
# Confusion matrix of the random forest on the test split, drawn as an annotated heatmap.
rf_confusion = confusion_matrix(y_test, prediction_test)

plt.figure(figsize=(4, 3))
sns.heatmap(rf_confusion, annot=True, fmt="d", linecolor="k", linewidths=3)
plt.title(" RANDOM FOREST CONFUSION MATRIX", fontsize=14)
plt.show()

In [None]:
# Probability assigned to the second class column, used as the score for the ROC curve.
rf_class_probs = model_rf.predict_proba(X_test)
y_rfpred_prob = rf_class_probs[:, 1]
fpr_rf, tpr_rf, thresholds = roc_curve(y_test, y_rfpred_prob)

# Dashed diagonal = chance level; red line = the model.
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(fpr_rf, tpr_rf, label='Random Forest', color="r")
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Random Forest ROC Curve', fontsize=16)
plt.show()

In [None]:
# AdaBoost baseline. A fixed random_state makes the fitted model, and therefore the
# reported accuracy, reproducible across runs (the seed matches the train/test split).
a_model = AdaBoostClassifier(random_state=42)
a_model.fit(X_train,y_train)
a_preds = a_model.predict(X_test)
print("AdaBoost Classifier accuracy")
# Last expression: the accuracy is shown as the cell output.
metrics.accuracy_score(y_test, a_preds)


In [None]:
# Per-class precision, recall and F1 for the AdaBoost predictions.
print(classification_report(y_test, a_preds))


In [None]:
# Confusion matrix of the AdaBoost model on the test split, drawn as an annotated heatmap.
ada_confusion = confusion_matrix(y_test, a_preds)

plt.figure(figsize=(4, 3))
sns.heatmap(ada_confusion, annot=True, fmt="d", linecolor="k", linewidths=3)
plt.title("AdaBoost Confusion Matrix", fontsize=14)
plt.show()

In [None]:
# ROC curve for the AdaBoost model. This cell was copied from the random forest one:
# the label and title now name the right model, and the results get their own
# variables so the random forest's fpr/tpr are not overwritten.
y_adapred_prob = a_model.predict_proba(X_test)[:,1]
fpr_ada, tpr_ada, thresholds = roc_curve(y_test, y_adapred_prob)
plt.plot([0, 1], [0, 1], 'k--' )  # chance-level diagonal
plt.plot(fpr_ada, tpr_ada, label='AdaBoost',color = "r")
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('AdaBoost ROC Curve',fontsize=16)
plt.show();

In [None]:
# Gradient boosting baseline. A fixed random_state makes the fitted model, and
# therefore the reported accuracy, reproducible across runs.
gb = GradientBoostingClassifier(random_state=42)
gb.fit(X_train, y_train)
gb_pred = gb.predict(X_test)
print("Gradient Boosting Classifier", metrics.accuracy_score(y_test, gb_pred))

In [None]:
# Per-class precision, recall and F1 for the gradient boosting predictions.
print(classification_report(y_test, gb_pred))

In [None]:
# ROC curve for the gradient boosting model. This cell was copied from the random
# forest one: the label and title now name the right model, and the results get
# their own variables so the random forest's fpr/tpr are not overwritten.
y_gbpred_prob = gb.predict_proba(X_test)[:,1]
fpr_gb, tpr_gb, thresholds = roc_curve(y_test, y_gbpred_prob)
plt.plot([0, 1], [0, 1], 'k--' )  # chance-level diagonal
plt.plot(fpr_gb, tpr_gb, label='Gradient Boosting',color = "r")
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Gradient Boosting ROC Curve',fontsize=16)
plt.show();