<a href="https://colab.research.google.com/github/ManullangJihan/100-Day-ML-Challenge/blob/main/09_Telco_Churn_IBM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Telco Churn

[Source](https://www.kaggle.com/datasets/yeanzc/telco-customer-churn-ibm-dataset)

In [1]:
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio

In [2]:
import random

def Default_seed(s):
    np.random.seed(s)
    random.seed(s)

global_seed = 42
Default_seed(global_seed)

# Setting Visualization Default
pio.templates["pio.default.templates"] = "plotly_dark"
plt.rcParams["figure.figsize"] = [12, 9]
plt.rcParams.update({"font.size":12})

In [3]:
main_df = pd.read_excel("/content/drive/MyDrive/Telco Churn IBM/Telco_customer_churn.xlsx")
n_samples, n_features = main_df.shape

print(f"Number of samples: {n_samples}")
print(f"Number of features: {n_features}\n")

main_df.head()

Number of samples: 7043
Number of features: 33



Unnamed: 0,CustomerID,Count,Country,State,City,Zip Code,Lat Long,Latitude,Longitude,Gender,...,Contract,Paperless Billing,Payment Method,Monthly Charges,Total Charges,Churn Label,Churn Value,Churn Score,CLTV,Churn Reason
0,3668-QPYBK,1,United States,California,Los Angeles,90003,"33.964131, -118.272783",33.964131,-118.272783,Male,...,Month-to-month,Yes,Mailed check,53.85,108.15,Yes,1,86,3239,Competitor made better offer
1,9237-HQITU,1,United States,California,Los Angeles,90005,"34.059281, -118.30742",34.059281,-118.30742,Female,...,Month-to-month,Yes,Electronic check,70.7,151.65,Yes,1,67,2701,Moved
2,9305-CDSKC,1,United States,California,Los Angeles,90006,"34.048013, -118.293953",34.048013,-118.293953,Female,...,Month-to-month,Yes,Electronic check,99.65,820.5,Yes,1,86,5372,Moved
3,7892-POOKP,1,United States,California,Los Angeles,90010,"34.062125, -118.315709",34.062125,-118.315709,Female,...,Month-to-month,Yes,Electronic check,104.8,3046.05,Yes,1,84,5003,Moved
4,0280-XJGEX,1,United States,California,Los Angeles,90015,"34.039224, -118.266293",34.039224,-118.266293,Male,...,Month-to-month,Yes,Bank transfer (automatic),103.7,5036.3,Yes,1,89,5340,Competitor had better devices


In [None]:
main_df.isna().any()

In [None]:
main_df["Churn Reason"].isna().sum()

5174

In [None]:
main_df["Churn Value"].value_counts()

0    5174
1    1869
Name: Churn Value, dtype: int64

Churn Value: 1 = the customer left the company this quarter. 0 = the customer remained with the company.

In [None]:
# Plot the distribution of target value to check balancesses of the dataset

target_counts = main_df["Churn Value"].value_counts().values
target_percentage = target_counts / main_df.shape[0] * 100

trace = go.Bar(
    x = sorted(main_df["Churn Value"].unique().tolist()),
    y = target_counts,
    text = [f"{target_percentage[0]:.2f}%", f"{target_percentage[1]:.2f}%"],
    textposition="inside"
)

layout = go.Layout(
    title="Distribution of the target feature",
    height=500,
    width=700,
)

fig = go.Figure(data=[trace], layout=layout)
fig.update_traces(
    marker_color='rgb(158,202,225)', 
    marker_line_color='rgb(8,48,107)',
    marker_line_width=1.5, 
    opacity=0.6)

fig.show()

In [4]:
# Balance the dataset
df = main_df.copy()
left = df.loc[df["Churn Value"] == 1]
stay = df.loc[df["Churn Value"] == 0][:1869]

df = pd.concat([left, stay], axis=0)
df = df.sample(frac=1)
df = df.reset_index(drop=True)

df["Churn Value"].value_counts()

1    1869
0    1869
Name: Churn Value, dtype: int64

In [None]:
df.head()

In [None]:
category_cols = df.columns[df.dtypes == "object"]

features_unique_val = []
features_n_unique = []

for col in category_cols:
    features_unique_val.append(df[col].unique())
    features_n_unique.append(df[col].nunique())

features_n_unique_percentage = np.array(features_n_unique) / df.shape[0] * 100
   
category_df = pd.DataFrame(
    {"Features": category_cols,
     "N uniques": features_n_unique,
     "N uniques in Percentage": features_n_unique_percentage,
     "Unique values:": features_unique_val},
).sort_values(by="N uniques", ascending=False)


category_df.head()

Unnamed: 0,Features,N uniques,N uniques in Percentage,Unique values:
0,CustomerID,3738,100.0,"[6267-DCFFZ, 5134-IKDAY, 3887-PBQAO, 0618-XWMS..."
21,Total Charges,3506,93.793472,"[2627.2, 69.8, 1216.6, 2068.55, 372.45, 4131.9..."
4,Lat Long,1652,44.194757,"[41.042003, -120.506086, 40.022184, -121.06238..."
3,City,1129,30.203317,"[Madeline, Twain, Hermosa Beach, Wasco, Shaver..."
23,Churn Reason,20,0.535045,"[nan, Service dissatisfaction, Long distance c..."


In [5]:
df["Total Charges"] = pd.to_numeric(df["Total Charges"], errors="coerce") 
df =  df.drop(["CustomerID", "Lat Long", "City"], axis=1)

In [6]:
df["Churn Reason"] = df["Churn Reason"].fillna(0)

In [7]:
df["Total Charges"] = df["Total Charges"].fillna(df["Total Charges"].median())

In [None]:
# Plot the distribution of the feature

In [8]:
df = pd.get_dummies(df)
X = df.drop(["Churn Value"], axis=1)
y = df["Churn Value"]

In [None]:
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA, TruncatedSVD

tsne = TSNE(n_components=3)
X_tsne = tsne.fit_transform(X)

pca = PCA(n_components=3)
X_pca = pca.fit_transform(X)

svd = TruncatedSVD(n_components=3)
X_svd = svd.fit_transform(X)

In [None]:
def plot_data(data, title=None):
    fig = go.Figure()
    
    for i in range(2):
        fig.add_trace(
            go.Scatter3d(
                x = data[y == i, 0][:100],
                y = data[y == i, 1][:100],
                z = data[y == i, 2][:100],
                mode = "markers",
                name = f"Target {i}",
            )
        )
    
    fig.update_layout(
        title = title,
        height = 500,
        width = 800,
        scene=dict(
            xaxis = dict(
                title="Component 1",
                showticklabels=False),
            yaxis = dict(
                title="Component 2",
                showticklabels=False),
            zaxis = dict(
                title="Component 3",
                showticklabels=False)
            ),
        )
    
    fig.show()


plot_data(X_tsne, title="<b>TSNE Decomposition</b>")

In [19]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=True, random_state=global_seed
)


In [20]:
ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)

In [21]:
svc = SVC()
svc.fit(X_train, y_train)

acc = svc.score(X_test, y_test)
print(f"Accuracy: {acc:.4f}")

Accuracy: 1.0000


In [22]:
clf = LogisticRegression(max_iter=1e6)
clf.fit(X_train, y_train)

acc = clf.score(X_test, y_test)
print(f"Accuracy: {acc:.4f}")

Accuracy: 1.0000


In [23]:
tree = DecisionTreeClassifier()
tree.fit(X_train, y_train)

acc = tree.score(X_test, y_test)
print(f"Accuracy: {acc:.4f}")

Accuracy: 1.0000


In [None]:
# Plot the learning curve for every classifier
# Only use the training data when we want to adding the meta-data (SMOTE etc)
# Plot the confusion matrix
# Use also deep learning model to compare with traditional ML