In [1]:
#create dataset
import numpy as np 
import pandas as pd

operating_systems = ["Android","iOS"]
isp_names = ["Cox","HughesNet","Xfinity","AT&T"]

data = []
for i in range(100):
    row = []
    row.append(np.random.choice(operating_systems)) #OS
    row.append(np.random.choice(isp_names)) #ISP
    row.append(np.random.poisson(lam=25)) #Age 
    row.append(np.random.uniform(low=0.5, high=1000)) #Time Spent
    data.append(row)

customers = pd.DataFrame(data, columns = ['OS', 'ISP','Age','Time Spent'])

In [2]:
customers.head()

Unnamed: 0,OS,ISP,Age,Time Spent
0,Android,Xfinity,24,313.349416
1,iOS,HughesNet,26,889.388494
2,Android,Cox,28,647.061506
3,iOS,HughesNet,20,938.587428
4,iOS,AT&T,28,419.232237


In [17]:
from sklearn import preprocessing
customers_norm = customers.copy()
scaler = preprocessing.MinMaxScaler()
customers_norm[['Age','Time Spent']] = scaler.fit_transform(customers_norm[['Age','Time Spent']])

In [18]:
from kmodes.kprototypes import KPrototypes
kproto = KPrototypes(n_clusters=3)
clusters = kproto.fit_predict(customers_norm, categorical=[0, 1])
#join data with labels 
labels = pd.DataFrame(clusters)
labeledCustomers = pd.concat((customers,labels),axis=1)
labeledCustomers = labeledCustomers.rename({0:'labels'},axis=1)

In [19]:
labeledCustomers

Unnamed: 0,OS,ISP,Age,Time Spent,labels
0,Android,Xfinity,24,313.349416,2
1,iOS,HughesNet,26,889.388494,1
2,Android,Cox,28,647.061506,2
3,iOS,HughesNet,20,938.587428,1
4,iOS,AT&T,28,419.232237,0
...,...,...,...,...,...
95,iOS,Cox,24,495.128035,1
96,iOS,Cox,23,796.184094,1
97,iOS,Cox,17,695.493478,1
98,Android,Xfinity,25,189.237671,0


# K-Means with One Hot Encoding

In [13]:
customers_norm = pd.get_dummies(customers, columns=["OS","ISP"])

In [14]:
customers_norm.head()

Unnamed: 0,Age,Time Spent,OS_Android,OS_iOS,ISP_AT&T,ISP_Cox,ISP_HughesNet,ISP_Xfinity
0,24,313.349416,1,0,0,0,0,1
1,26,889.388494,0,1,0,0,1,0
2,28,647.061506,1,0,0,1,0,0
3,20,938.587428,0,1,0,0,1,0
4,28,419.232237,0,1,1,0,0,0


In [15]:
from sklearn.cluster import KMeans
kmeans = KMeans(3)
clusters = kmeans.fit_predict(customers_norm)
labels = pd.DataFrame(clusters)
labeledCustomers = pd.concat((customers,labels),axis=1)
labeledCustomers = labeledCustomers.rename({0:'labels'},axis=1)

In [16]:
labeledCustomers

Unnamed: 0,OS,ISP,Age,Time Spent,labels
0,Android,Xfinity,24,313.349416,1
1,iOS,HughesNet,26,889.388494,2
2,Android,Cox,28,647.061506,0
3,iOS,HughesNet,20,938.587428,2
4,iOS,AT&T,28,419.232237,0
...,...,...,...,...,...
95,iOS,Cox,24,495.128035,0
96,iOS,Cox,23,796.184094,2
97,iOS,Cox,17,695.493478,0
98,Android,Xfinity,25,189.237671,1
