In [35]:
#Lib
import pandas as pd
from joblib import load
#Clustering
from numpy import unique
from numpy import where
from sklearn.cluster import KMeans
#Zips
import py7zr
#elbow point
%pip install kneed
from kneed import KneeLocator
from sklearn.preprocessing import MinMaxScaler
# plots 
%pip install plotly
import plotly.express as px
import matplotlib.pyplot as plt
#Save plots as .png
%pip install -U kaleido

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [36]:
#Unpack the 7z archive with the transformed data into a local folder
archive_path = '../../../../data/WA_Fn-UseC_-Telco-Customer-Churn/full_transformated_data.7z'
with py7zr.SevenZipFile(archive_path, mode='r') as archive:
    archive.extractall(path='full_transformed_data/')

In [37]:
#Target column name and the value treated as the positive class
target, target_value = 'Churn', 'Yes'

In [38]:
#Load the extracted CSV into a dataframe and display it
transformed_csv = './full_transformed_data/full_transformated_data.csv'
df = pd.read_csv(transformed_csv)
df

Unnamed: 0,tenure,StreamingTV,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,1.247129,1.549122,-0.274724,-1.467651,1.083457,No
1,1.613587,0.238870,1.466061,0.652651,1.187405,No
2,1.613587,0.238870,1.466061,-0.369281,1.599502,No
3,-0.503722,0.238870,-1.145117,0.650990,0.770558,No
4,-1.236636,-1.071381,-1.145117,0.700840,1.506635,Yes
...,...,...,...,...,...,...
7038,-1.277354,-1.071381,0.595668,-0.648443,-1.649262,No
7039,-0.870179,-1.071381,-1.145117,0.052785,-1.040349,Yes
7040,-1.114484,1.549122,1.466061,-1.492576,0.827017,Yes
7041,-1.236636,-1.071381,-1.145117,0.695855,-0.269975,Yes


In [39]:
#Split the data for clustering: `x` keeps every column except the target, `y` keeps the target
x = df.drop(columns=[target])
y = df[target]
x

Unnamed: 0,tenure,StreamingTV,PaymentMethod,MonthlyCharges,TotalCharges
0,1.247129,1.549122,-0.274724,-1.467651,1.083457
1,1.613587,0.238870,1.466061,0.652651,1.187405
2,1.613587,0.238870,1.466061,-0.369281,1.599502
3,-0.503722,0.238870,-1.145117,0.650990,0.770558
4,-1.236636,-1.071381,-1.145117,0.700840,1.506635
...,...,...,...,...,...
7038,-1.277354,-1.071381,0.595668,-0.648443,-1.649262
7039,-0.870179,-1.071381,-1.145117,0.052785,-1.040349
7040,-1.114484,1.549122,1.466061,-1.492576,0.827017
7041,-1.236636,-1.071381,-1.145117,0.695855,-0.269975


In [40]:
#Getting number of k-means clusters
def elbow(dataset):
    """Estimate the number of k-means clusters for ``dataset`` with the elbow method.

    The data is min-max scaled, k-means is fitted once for every k from 1 up to
    the number of columns of ``dataset``, and the knee of the resulting inertia
    curve is located with ``KneeLocator``.

    Parameters
    ----------
    dataset : table-like accepted by ``MinMaxScaler`` and exposing ``.shape``
        Feature table; ``dataset.shape[1]`` is used as the largest k tried.

    Returns
    -------
    int or None
        The k at the knee of the inertia curve. The knee is searched first with
        sensitivity ``S=1.0``; the search is repeated with ``S=0.0`` only when
        the first one finds no knee. ``None`` is returned when neither search
        finds one.
    """
    X = MinMaxScaler().fit_transform(dataset)
    #candidate k values: 1 .. number of variables
    cluster_number = list(range(1, dataset.shape[1] + 1))
    inertia = []
    for i in cluster_number:
        kmeans = KMeans(
            n_clusters=i, init="k-means++",
            n_init=10,
            tol=1e-04, random_state=42
        )
        kmeans.fit(X)
        inertia.append(kmeans.inertia_)
    #locating elbow point: KneeLocator leaves `.knee` as None when it finds nothing,
    #so the less strict S=0.0 search only runs as a fallback for that case
    clusters_number = None
    for sensitivity in (1.0, 0.0):
        kneedle = KneeLocator(cluster_number, inertia, S=sensitivity, curve="convex", direction="decreasing")
        clusters_number = kneedle.knee
        if clusters_number is not None:
            break
    if clusters_number is None:
        return None
    #plain int so the value can be used directly as a cluster count or loop bound
    return int(clusters_number)

In [41]:
#Elbow-method estimate of the number of clusters for the feature set `x`
clusters_number = elbow(x)
clusters_number

3

In [42]:
#Clustering function
def clustering(dataframe, clusters_number):
    """Run k-means on ``dataframe`` and return a labeled copy of it.

    Parameters
    ----------
    dataframe : pandas.DataFrame (or data accepted by ``pd.DataFrame``)
        Features to cluster. The object passed in is not modified.
    clusters_number : int
        Number of clusters, passed to ``KMeans`` as ``n_clusters``.

    Returns
    -------
    pandas.DataFrame
        The input data with an extra ``'label'`` column holding the cluster
        assigned to each row by the fitted model.
    """
    #k-means algorithm
    kmeans = KMeans(
            n_clusters=clusters_number, #number of clusters for general dataset
            init="k-means++",
            n_init=10,
            tol=1e-04,
            random_state=42
        )
    #fitting the algorithm
    kmeans.fit(dataframe)
    #labeling: pd.DataFrame(...) alone does not guarantee a copy, so copy explicitly
    #to keep the new 'label' column out of the caller's dataframe
    clusters = pd.DataFrame(dataframe).copy()
    clusters['label'] = kmeans.labels_
    return clusters

In [43]:
#Label the transformed features with their main cluster
df = pd.DataFrame(clustering(x, clusters_number))
#Main clustering polar graph: per-cluster mean of every variable, in long format
polar = df.groupby("label").mean().reset_index().melt(id_vars=["label"])
fig0 = px.line_polar(
    polar,
    r="value",
    theta="variable",
    color="label",
    line_close=True,
    height=800,
    width=1400,
)
fig0.show()
fig0.write_image('main_cluster_img.png')
#Main clustering pie plot to see clustering distribution (rows per cluster)
pie0 = df.groupby('label').size().reset_index(name='value')
pie0 = px.pie(pie0, values='value', names='label')
pie0.show()
pie0.write_image('main_cluster_distribution.png')



The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



In [44]:
#Save the labeled main dataframe as csv
df.to_csv('./main_cluster.csv')
#Main dataframe clustered
df

Unnamed: 0,tenure,StreamingTV,PaymentMethod,MonthlyCharges,TotalCharges,label
0,1.247129,1.549122,-0.274724,-1.467651,1.083457,2
1,1.613587,0.238870,1.466061,0.652651,1.187405,0
2,1.613587,0.238870,1.466061,-0.369281,1.599502,0
3,-0.503722,0.238870,-1.145117,0.650990,0.770558,1
4,-1.236636,-1.071381,-1.145117,0.700840,1.506635,1
...,...,...,...,...,...,...
7038,-1.277354,-1.071381,0.595668,-0.648443,-1.649262,1
7039,-0.870179,-1.071381,-1.145117,0.052785,-1.040349,1
7040,-1.114484,1.549122,1.466061,-1.492576,0.827017,2
7041,-1.236636,-1.071381,-1.145117,0.695855,-0.269975,1


In [45]:
#Write one csv per cluster holding only its rows with a positive target
df[target] = y #attach the target values to the labeled dataframe
clust = 0 #cluster counter
while clust < clusters_number:
    #rows that belong to this cluster AND have the positive target value
    positive_in_cluster = (df['label'] == clust) & (df[target] == target_value)
    dataframe = df.loc[positive_in_cluster].drop(columns=[target, 'label'])
    dataframe.to_csv(f'./cluster{clust}.csv') #creating filtered csv
    clust += 1
dataframe

Unnamed: 0,tenure,StreamingTV,PaymentMethod,MonthlyCharges,TotalCharges
99,-1.277354,1.549122,-0.274724,-1.500885,-1.452975
312,-1.277354,1.549122,-0.274724,-1.469313,-0.914768
366,-1.277354,1.549122,-1.145117,-1.489253,0.458715
463,-1.277354,1.549122,0.595668,-1.485930,-1.321061
479,-1.277354,1.549122,-0.274724,-1.456019,-0.257311
...,...,...,...,...,...
6845,-1.277354,1.549122,-0.274724,-1.490915,-1.453502
6965,-1.277354,1.549122,-0.274724,-1.505870,-1.614437
7013,-0.585157,1.549122,0.595668,-1.482606,-0.750668
7028,-1.195919,0.238870,1.466061,-0.982441,0.299891


In [46]:
#Sub-cluster the csv of every main cluster and plot the result
clust = 0 #cluster counter
while clust < clusters_number:
    #read the sub-cluster .csv and drop the added 'Unnamed: 0' column
    sub_cluster = pd.read_csv(f'./cluster{clust}.csv').drop(columns=['Unnamed: 0'])
    #elbow point for this sub-cluster, then k-means with that many clusters
    sub_clusters_number = elbow(sub_cluster)
    clusters = pd.DataFrame(clustering(sub_cluster, sub_clusters_number))
    #polar sub-clusters graph: per-label mean of every variable, in long format
    sub_polar = clusters.groupby("label").mean().reset_index().melt(id_vars=["label"])
    fig = px.line_polar(
        sub_polar,
        r="value",
        theta="variable",
        color="label",
        line_close=True,
        height=800,
        width=1400,
    )
    fig.show() #print here
    fig.write_image(f'cluster{clust}img.png') #save as .png file
    #Pie plot to see sub-cluster's distribution (rows per label)
    pie = clusters.groupby('label').size().reset_index(name='value')
    pie = px.pie(pie, values='value', names='label')
    pie.show() #print here
    pie.write_image(f'cluster{clust}distribution.png') #save as .png file
    clust += 1
    
    


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.




The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.




No knee/elbow found


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.

