## K-Means Clustering in Python

Source: http://stamfordresearch.com/k-means-clustering-in-python/ 

Source: https://www.youtube.com/watch?v=Lm1c2U8BmoA (PySpark, ML)

In [None]:
# Disable warnings
import warnings
warnings.filterwarnings('ignore')

In [None]:
#set Matplotlib inline plotting and load Pandas package
%matplotlib inline
import pandas as pd
pd.options.display.mpl_style = 'default'

In [None]:
# Load data 

data = hive_ctx.sql("Select * from bi_temp_kmeanClusteringtable")

In [None]:
# Convert the Spark query result to a local pandas DataFrame, then preview
# the first 5 rows. The preview is transposed so that each column of the
# table is shown on its own line.

df = data.toPandas()

df.head(5).transpose()

In [None]:
# Number of features

len(df.columns)

### Summary Statistics

In [None]:
df.describe().transpose().tail(5)

### Replace NaNs with the column mean for firstsessionduration


In [None]:
# Impute missing first-session durations with the mean of the column.
# The filled column is assigned back to df explicitly: calling
# fillna(inplace=True) on the object returned by df[...] only updates df
# when that object happens to be a view, which pandas does not guarantee.
df['firstsessionduration'] = df['firstsessionduration'].fillna(
    df['firstsessionduration'].mean()
)

### Replace NaNs with 0 in all other columns

In [None]:
# Every value that is still missing becomes 0; df is modified in place.
# The result is bound to `_` only so the cell does not echo anything.
_ = df.fillna(0, inplace = True)

In [None]:
df.head(5).transpose()

In [None]:
# Keep only the numeric feature columns (dtype int64 or float64)

cols = df.select_dtypes(include=['int64', 'float64']).columns

len(cols)

In [None]:
df[cols].head(5).transpose()

In [None]:
# New DataFrame without the s__uid 

df = df[cols]

In [None]:
# Scatter-plot matrix (pairwise scatter plots of the features), drawn on a
# random 10% sample of the rows

sampled_data = df[cols].sample(frac=0.1)

axs = pd.scatter_matrix(sampled_data, figsize=(12, 12))

# Tidy the outer axes of the grid: horizontal, right-aligned y labels on the
# first column; vertical x labels on the last row; no tick marks on either.
n = len(sampled_data.columns)

for v in axs[:, 0]:
    y_label = v.yaxis.label
    y_label.set_rotation(0)
    y_label.set_ha('right')
    v.set_yticks(())

for h in axs[n - 1, :]:
    h.xaxis.label.set_rotation(90)
    h.set_xticks(())

In [None]:
# Another type of visualization: correlation heat map

# Source: http://datascience.stackexchange.com/questions/10459/calculation-and-visualization-of-correlation-matrix-with-pandas

def correlation_matrix(df):
    """Draw a heat map of the pairwise correlations between the columns of df.

    Parameters
    ----------
    df : pandas.DataFrame
        Data whose column-to-column correlations (``df.corr()``) are plotted.

    The figure is shown with ``plt.show()``; nothing is returned.
    """
    import numpy as np
    from matplotlib import pyplot as plt

    corr = df.corr()
    # Label the axes with the columns that are actually present in the
    # correlation matrix, so labels and cells cannot get out of step.
    labels = corr.columns

    fig = plt.figure()
    ax1 = fig.add_subplot(111)
    # 'jet' colormap quantized to 30 levels
    cmap = plt.get_cmap('jet', 30)
    cax = ax1.imshow(corr, interpolation="nearest", cmap=cmap)
    ax1.grid(True)
    plt.title('Feature Correlation')

    # Put one tick on every row/column before naming the ticks; otherwise the
    # labels are attached to whatever tick positions matplotlib picked and do
    # not line up with the matrix cells.
    positions = np.arange(len(labels))
    ax1.set_xticks(positions)
    ax1.set_yticks(positions)
    ax1.set_xticklabels(labels, fontsize=6)
    ax1.set_yticklabels(labels, fontsize=6)

    # Add colorbar, make sure to specify tick locations to match desired ticklabels
    fig.colorbar(cax, ticks=[.25,.3,.35,.4,.45,.5,.55,.6,.65,.70,.75,.8,.85,.90,.95,1])
    plt.show()
    
correlation_matrix(sampled_data)

 ### Standardize Features
 
 source: http://stackoverflow.com/questions/12525722/normalize-data-in-pandas 
 
"In cluster analysis variables with large values contribute more to the distance calculations. Variables measured on different scales should be standardized prior to clustering, so that the solution is not driven by variables measured on larger scales." 

In [None]:
# Rescale the data attributes with scikit-learn's MinMaxScaler
# (min-max normalization, not z-score standardization)

from sklearn import preprocessing

min_max_scaler = preprocessing.MinMaxScaler()
np_scaled = min_max_scaler.fit_transform(df)
# The frame is rebuilt from the scaled values without column names here;
# the names are assigned in the following cell.
df_normalized = pd.DataFrame(np_scaled)

df_normalized.head(5).transpose()

In [None]:
# Set column names back

df_normalized.columns = cols

df_normalized.head().transpose()

### Feature Selection

- Random Forest
- Lasso
- PCA

## K-means clustering using Spark MLlib

In [None]:
from pyspark.mllib.clustering import KMeans, KMeansModel
from numpy import array
from math import sqrt

In [None]:
df_normalized.dtypes

type(1)

In [None]:
from pyspark.mllib.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

# Give the Spark copy its own name. `df_normalized` must stay a pandas
# DataFrame because the scikit-learn sections further down still use it as one
# (as_matrix(), .index, column assignment).
spark_df_normalized = sqlContext.createDataFrame(df_normalized)

# Alternative (not used): build the feature vector with Spark ML instead.
# vectorAssembler = VectorAssembler(inputCols=spark_df_normalized.columns,
#                                   outputCol="features")
# df = vectorAssembler.transform(spark_df_normalized)

# One dense vector per row. Go through `.rdd` explicitly rather than calling
# map() on the DataFrame, which only exists in Spark 1.x.
rdd = spark_df_normalized.rdd.map(lambda row: Vectors.dense([float(c) for c in row]))

clusters = KMeans.train(rdd, 5, maxIterations=10, initializationMode="random") # 5 clusters

In [None]:
from sklearn.base import TransformerMixin
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction import DictVectorizer


class RowIterator(TransformerMixin):
    """Stateless transformer that exposes a DataFrame as a stream of its rows.

    Intended as the step in front of DictVectorizer in a pipeline.
    """

    def fit(self, X, y=None):
        # Nothing is learned from the data; return self so calls can be chained.
        return self

    def transform(self, X):
        # iterrows() yields (index, row) pairs; only the row is passed on.
        indexed_rows = X.iterrows()
        return (record for _index, record in indexed_rows)


vectorizer = make_pipeline(RowIterator(), DictVectorizer())

In [None]:
# Squared error of a single point with respect to its assigned cluster center

def error(point):
    """Return the squared Euclidean distance from `point` to its cluster center.

    The center is looked up in the module-level `clusters` model at call
    time, using the cluster that the model predicts for `point`.

    The distance is deliberately NOT square-rooted: the per-point values are
    summed and reported as a "Sum of Squared Error", so each term has to be
    a squared distance.
    """
    center = clusters.centers[clusters.predict(point)]
    return float(sum(x ** 2 for x in (point - center)))

In [None]:
# Within-cluster total: evaluate error(point) for every point in the RDD and
# add the per-point values together

per_point_errors = rdd.map(lambda point: error(point))
WSSE = per_point_errors.reduce(lambda x, y: x + y)

print("Within Set Sum of Squared Error = " + str(WSSE))

In [None]:
# Try with a range of number of clusters (1 to 5)

for l in range(1,6):
    # error() reads the global `clusters`, so rebinding it here makes the map
    # below score the model that was just trained for this cluster count.
    clusters = KMeans.train(rdd, l, maxIterations = 100, runs = 100, initializationMode = 'random')
    WSSSE = (rdd.map(lambda point: error(point))
                .reduce(lambda x,y: x+y))
    # Report the value computed in this iteration (WSSSE), not the WSSE left
    # over from the earlier 5-cluster cell.
    print("With " + str(l) + ' clusters: Within Set Sum of Squared Error =' + str(WSSSE))

## K-means clustering using sklearn

### Sources

MasterClass: https://github.com/Marie-de-Leseleuc/Python-Code/blob/master/exo%2B1%20(2).ipynb 

1. https://www.datascience.com/blog/introduction-to-k-means-clustering-algorithm-learn-data-science-tutorials 

2. https://datasciencelab.wordpress.com/2013/12/12/clustering-with-k-means-in-python/

3. http://mnemstudio.org/clustering-k-means-example-1.htm

4. https://www.dataquest.io/blog/k-means-clustering-us-senators/

### Method 1

In [None]:
# Source: http://stackoverflow.com/questions/28017091/will-pandas-dataframe-object-work-with-sklearn-kmeans-clustering 

# NOTE: from here on `KMeans` refers to scikit-learn's estimator, replacing
# the pyspark.mllib class of the same name imported earlier.
from sklearn.cluster import KMeans

dataset = df_normalized

# Convert DataFrame to a NumPy matrix. `.values` is used instead of
# as_matrix(), which is deprecated and absent from newer pandas releases.
mat = dataset.values

# Using sklearn
km = KMeans(n_clusters=5)
km.fit(mat)

# Get cluster assignment labels
labels = km.labels_

# Format results as a DataFrame: column 0 holds the row index of `dataset`,
# column 1 the cluster assigned to that row
results = pd.DataFrame([dataset.index,labels]).T

In [None]:
results.head(5)

In [None]:
# Add the cluster number and the player id (s__uid) to the feature table

print(len(dataset), len(results))

# NOTE: this is an alias, not a copy. The two columns added below therefore
# also appear on `dataset`; the cell at the start of Method 2 drops them again.
dataset_f = dataset

dataset_f['cluster'] = results[1] # Add cluster number to df

# The ids are taken from a fresh pandas conversion of the original Spark
# result, because `df` was reduced to its numeric columns earlier.
dataset_f['s__uid'] = data.toPandas()['s__uid'] # add s__uid to df

dataset_f[['cluster', 's__uid']].head(5) # produce cluster by player

In [None]:
# Summarize each cluster: print its label and summary statistics, then draw
# histograms of its members

labels = dataset_f.cluster.unique()

for label in set(labels):
    members = dataset_f[dataset_f["cluster"]==label]
    print("Label:",label)
    print(members.describe())
    members.hist()

### Method 2

In [None]:
# Method 1 added 's__uid' and 'cluster' to `dataset` (dataset_f is the same
# object), so remove them to get back to the pure feature table.
dataset.drop('s__uid', axis=1, inplace=True)

dataset.drop('cluster', axis=1, inplace=True)

In [None]:
# Source: http://stackoverflow.com/questions/34958994/how-to-use-scikit-kmeans-when-i-have-a-dataframe

import sklearn
from sklearn import cross_validation
from sklearn.cross_validation import train_test_split

# 60% of the rows are used to fit the model; the rest are only predicted on.
sample_df_train, sample_df_test = train_test_split(dataset, train_size=0.6)

# Estimator settings, spelled out in full
kmeans_options = dict(
    n_clusters=5,
    init='k-means++',
    n_init=10,
    max_iter=300,
    tol=0.0001,
    precompute_distances='auto',
    verbose=0,
    random_state=None,
    copy_x=True,
    n_jobs=1,
)
cluster = sklearn.cluster.KMeans(**kmeans_options)

cluster.fit(sample_df_train)

# Cluster number for each held-out row, in the row order of sample_df_test
result = cluster.predict(sample_df_test)

In [None]:
result = pd.DataFrame(result)

result.head(5)

In [None]:
# Note: the two tables could also be joined with pd.merge(), using the index as key.

# NOTE: alias, not a copy - the columns added below also land on `dataset`.
dataset_f2 = dataset

# `result` holds one prediction per row of sample_df_test, in that order, but
# it carries a fresh 0..n-1 index. Assigning it directly would align on that
# index and attach the labels to the first n rows of the table instead of the
# held-out rows. Re-label the predictions with the test rows' own index first.
# Rows that were in the training split get NaN.
# (Assumes train_test_split returned DataFrames, i.e. sample_df_test has .index.)
dataset_f2['cluster'] = pd.Series(result[0].values, index=sample_df_test.index) # Add cluster number to df

dataset_f2['s__uid'] = data.toPandas()['s__uid'] # add s__uid to df

dataset_f2[['cluster', 's__uid']].head(5) # produce cluster by player

In [None]:
# Summarize each cluster: print its label and summary statistics, then draw
# histograms of its members

labels = dataset_f2.cluster.unique()

for label in set(labels):
    members = dataset_f2[dataset_f2["cluster"]==label]
    print("Label:",label)
    print(members.describe())
    members.hist()

In [None]:
dataset_f2.cluster.unique()

### Method 3

In [None]:
# Method 2 added 's__uid' and 'cluster' to `dataset` (dataset_f2 is the same
# object), so remove them to get back to the pure feature table.
dataset.drop('s__uid', axis=1, inplace=True)

dataset.drop('cluster', axis=1, inplace=True)

In [None]:
#Source: 
#https://www.coursera.org/learn/machine-learning-data-analysis/lecture/Ebb2M/running-a-k-means-cluster-analysis-in-python-pt-1

import numpy as np
import matplotlib.pylab as plt
from sklearn.cross_validation import train_test_split
from sklearn import preprocessing
from scipy.spatial.distance import cdist

# 70/30 train/test split with a fixed seed
clus_train, clus_test = train_test_split(dataset, test_size=.3, random_state=123)

# Candidate cluster counts (1 to 9) and, for each of them, the average
# distance between a training observation and its nearest centroid
clusters=range(1,10)
meandist=[]

for k in clusters:
    # fit a k-cluster model on the training rows
    model = KMeans(n_clusters=k)
    model.fit(clus_train)
    # cluster number assigned to each training observation
    clusassign = model.predict(clus_train)
    # distance from every observation to every centroid, reduced to the
    # distance to the closest centroid
    point_to_centroid = cdist(clus_train, model.cluster_centers_, 'euclidean')
    nearest = np.min(point_to_centroid, axis=1)
    # average over the number of training observations
    meandist.append(sum(nearest) / clus_train.shape[0])

In [None]:
"""
Plot average distance from observations from the cluster centroid
to use the Elbow Method to identify number of clusters to choose
"""

plt.plot(clusters, meandist)
plt.xlabel('Number of clusters')
plt.ylabel('Average distance')
plt.title('Selecting k with the Elbow Method')

In [None]:
# Interpret 3 cluster solution: fit a 3-cluster model on the training rows
model3=KMeans(n_clusters=3)
model3.fit(clus_train)
# cluster number for each training row (the plot below colors by
# model3.labels_ rather than by this variable)
clusassign=model3.predict(clus_train)
# plot clusters

# Project the training rows onto 2 PCA components so the clusters can be
# drawn in a plane; the axis labels call these "canonical variables".
from sklearn.decomposition import PCA
pca_2 = PCA(2)
plot_columns = pca_2.fit_transform(clus_train)
# one point per training row, colored by its cluster label
plt.scatter(x=plot_columns[:,0], y=plot_columns[:,1], c=model3.labels_,)
plt.xlabel('Canonical variable 1')
plt.ylabel('Canonical variable 2')
plt.title('Scatterplot of Canonical Variables for 3 Clusters')
plt.show()

In [None]:
"""
BEGIN multiple steps to merge cluster assignment with clustering variables to examine
cluster variable means by cluster
"""
# create a unique identifier variable from the index for the 
# cluster training data to merge with the cluster assignment variable
# (the old index becomes a regular column, read back as 'index' below;
# clus_train itself is modified in place)
clus_train.reset_index(level=0, inplace=True)
# create a list that has the new index variable
cluslist=list(clus_train['index'])
# create a list of cluster assignments, in the same row order as clus_train
labels=list(model3.labels_)

In [None]:
# combine index variable list with cluster assignment list into a dictionary
# (despite its name, newlist is a dict: original row index -> cluster label)
newlist=dict(zip(cluslist, labels))
newlist

In [None]:
# convert newlist dictionary to a dataframe:
# the dictionary keys become the index, the cluster labels the single column
newclus= pd.DataFrame.from_dict(newlist, orient='index')
newclus

In [None]:
# rename the cluster assignment column (the only column of newclus)
newclus.columns = ['cluster']

In [None]:
# now do the same for the cluster assignment variable
# create a unique identifier variable from the index for the 
# cluster assignment dataframe 
# to merge with cluster training data
# (after this call newclus has an 'index' column next to 'cluster')
newclus.reset_index(level=0, inplace=True)
# merge the cluster assignment dataframe with the cluster training variable dataframe
# by the index variable
merged_train=pd.merge(clus_train, newclus, on='index')
# preview the first 100 merged rows
merged_train.head(n=100)

In [None]:
# cluster frequencies
merged_train.cluster.value_counts()

"""
END multiple steps to merge cluster assignment with clustering variables to examine
cluster variable means by cluster
"""

In [None]:
# FINALLY calculate clustering variable means by cluster
clustergrp = merged_train.groupby('cluster').mean()
print ("Clustering variable means by cluster")
print(clustergrp)

# validate clusters in training data by examining cluster differences in GPA using ANOVA
# first have to merge GPA with clustering variables and cluster assignment data 
# NOTE(review): `data_clean` and a 'GPA1' column are not defined anywhere in
# the visible part of this notebook - this section looks carried over from the
# source tutorial. Confirm which validation variable is intended before running.
gpa_data=data_clean['GPA1']
# split GPA data into train and test sets
# (same test_size and random_state as the split that produced clus_train)
gpa_train, gpa_test = train_test_split(gpa_data, test_size=.3, random_state=123)
gpa_train1=pd.DataFrame(gpa_train)
# expose the original row index as an 'index' column so it can be the merge key
gpa_train1.reset_index(level=0, inplace=True)
merged_train_all=pd.merge(gpa_train1, merged_train, on='index')
# keep only the rows where both the validation variable and the cluster are present
sub1 = merged_train_all[['GPA1', 'cluster']].dropna()

import statsmodels.formula.api as smf
import statsmodels.stats.multicomp as multi 

# OLS fit with cluster as a categorical predictor of GPA1
gpamod = smf.ols(formula='GPA1 ~ C(cluster)', data=sub1).fit()
print (gpamod.summary())

print ('means for GPA by cluster')
m1= sub1.groupby('cluster').mean()
print (m1)

print ('standard deviations for GPA by cluster')
m2= sub1.groupby('cluster').std()
print (m2)

# Tukey HSD pairwise comparison of GPA1 between the clusters
mc1 = multi.MultiComparison(sub1['GPA1'], sub1['cluster'])
res1 = mc1.tukeyhsd()
print(res1.summary())

### Method 4