In [3]:
import numpy as np
 
def cluster_points(X, mu):
    clusters  = {}
    for x in X:
        bestmukey = min([(i[0], np.linalg.norm(x-mu[i[0]])) \
                    for i in enumerate(mu)], key=lambda t:t[1])[0]
        try:
            clusters[bestmukey].append(x)
        except KeyError:
            clusters[bestmukey] = [x]
    return clusters
 
def reevaluate_centers(mu, clusters):
    newmu = []
    keys = sorted(clusters.keys())
    for k in keys:
        newmu.append(np.mean(clusters[k], axis = 0))
    return newmu
 
def has_converged(mu, oldmu):
    return (set([tuple(a) for a in mu]) == set([tuple(a) for a in oldmu]))
 

def find_centers(X, K):
    # Initialize to K random centers
    oldmu = random.sample(X, K)
    mu = random.sample(X, K)
    while not has_converged(mu, oldmu):
        oldmu = mu
        # Assign all points in X to clusters
        clusters = cluster_points(X, mu)
        # Reevaluate centers
        mu = reevaluate_centers(oldmu, clusters)
    return(mu, clusters)

In [4]:
import sys
import math
import random
import subprocess

"""
This is a pure Python implementation of the K-Means Clustering algorithmn. The
original can be found here:
http://pandoricweb.tumblr.com/post/8646701677/python-implementation-of-the-k-means-clustering
I have refactored the code and added comments to aid in readability.
After reading through this code you should understand clearly how K-means works.
If not, feel free to email me with questions and suggestions. (iandanforth at
gmail)
This script specifically avoids using numpy or other more obscure libraries. It
is meant to be *clear* not fast.
I have also added integration with the plot.ly plotting service. If you put in
your (free) plot.ly credentials below, it will automatically plot the discovered
clusters and their centroids.
To use plotly integration you will need to:
1. Get a username/key from www.plot.ly/api and enter them below
2. Install the plotly module: pip install plotly
"""

PLOTLY_USERNAME = None
PLOTLY_KEY = None

if PLOTLY_USERNAME:
    from plotly import plotly

def main():
    
    # How many points are in our dataset?
    num_points = 10
    
    # For each of those points how many dimensions do they have?
    dimensions = 2
    
    # Bounds for the values of those points in each dimension
    lower = 0
    upper = 200
    
    # The K in k-means. How many clusters do we assume exist?
    num_clusters = 4
    
    # When do we say the optimization has 'converged' and stop updating clusters
    opt_cutoff = 0.5
    
    # Generate some points
    points = [makeRandomPoint(dimensions, lower, upper) for i in xrange(num_points)]
    
    # Cluster those data!
    clusters = kmeans(points, num_clusters, opt_cutoff)

    # Print our clusters
    for i,c in enumerate(clusters):
        for p in c.points:
            print " Cluster: ", i, "\t Point :", p
    
    # Display clusters using plotly for 2d data
    # This uses the 'open' command on a URL and may only work on OSX.
    if dimensions == 2 and PLOTLY_USERNAME:
        print "Plotting points, launching browser ..."
        plotClusters(clusters)

class Point:
    '''
    An point in n dimensional space
    '''
    def __init__(self, coords):
        '''
        coords - A list of values, one per dimension
        '''
        
        self.coords = coords
        self.n = len(coords)
        
    def __repr__(self):
        return str(self.coords)

class Cluster:
    '''
    A set of points and their centroid
    '''
    
    def __init__(self, points):
        '''
        points - A list of point objects
        '''
        
        if len(points) == 0: raise Exception("ILLEGAL: empty cluster")
        # The points that belong to this cluster
        self.points = points
        
        # The dimensionality of the points in this cluster
        self.n = points[0].n
        
        # Assert that all points are of the same dimensionality
        for p in points:
            if p.n != self.n: raise Exception("ILLEGAL: wrong dimensions")
            
        # Set up the initial centroid (this is usually based off one point)
        self.centroid = self.calculateCentroid()
        
    def __repr__(self):
        '''
        String representation of this object
        '''
        return str(self.points)
    
    def update(self, points):
        '''
        Returns the distance between the previous centroid and the new after
        recalculating and storing the new centroid.
        '''
        old_centroid = self.centroid
        self.points = points
        self.centroid = self.calculateCentroid()
        shift = getDistance(old_centroid, self.centroid) 
        return shift
    
    def calculateCentroid(self):
        '''
        Finds a virtual center point for a group of n-dimensional points
        '''
        numPoints = len(self.points)
        # Get a list of all coordinates in this cluster
        coords = [p.coords for p in self.points]
        # Reformat that so all x's are together, all y'z etc.
        unzipped = zip(*coords)
        # Calculate the mean for each dimension
        centroid_coords = [math.fsum(dList)/numPoints for dList in unzipped]
        
        return Point(centroid_coords)

def kmeans(points, k, cutoff):
    
    # Pick out k random points to use as our initial centroids
    initial = random.sample(points, k)
    
    # Create k clusters using those centroids
    clusters = [Cluster([p]) for p in initial]
    
    # Loop through the dataset until the clusters stabilize
    loopCounter = 0
    while True:
        # Create a list of lists to hold the points in each cluster
        lists = [ [] for c in clusters]
        clusterCount = len(clusters)
        
        # Start counting loops
        loopCounter += 1
        # For every point in the dataset ...
        for p in points:
            # Get the distance between that point and the centroid of the first
            # cluster.
            smallest_distance = getDistance(p, clusters[0].centroid)
        
            # Set the cluster this point belongs to
            clusterIndex = 0
        
            # For the remainder of the clusters ...
            for i in range(clusterCount - 1):
                # calculate the distance of that point to each other cluster's
                # centroid.
                distance = getDistance(p, clusters[i+1].centroid)
                # If it's closer to that cluster's centroid update what we
                # think the smallest distance is, and set the point to belong
                # to that cluster
                if distance < smallest_distance:
                    smallest_distance = distance
                    clusterIndex = i+1
            lists[clusterIndex].append(p)
        
        # Set our biggest_shift to zero for this iteration
        biggest_shift = 0.0
        
        # As many times as there are clusters ...
        for i in range(clusterCount):
            # Calculate how far the centroid moved in this iteration
            shift = clusters[i].update(lists[i])
            # Keep track of the largest move from all cluster centroid updates
            biggest_shift = max(biggest_shift, shift)
        
        # If the centroids have stopped moving much, say we're done!
        if biggest_shift < cutoff:
            print "Converged after %s iterations" % loopCounter
            break
    return clusters

def getDistance(a, b):
    '''
    Euclidean distance between two n-dimensional points.
    Note: This can be very slow and does not scale well
    '''
    if a.n != b.n:
        raise Exception("ILLEGAL: non comparable points")
    
    ret = reduce(lambda x,y: x + pow((a.coords[y]-b.coords[y]), 2),range(a.n),0.0)
    return math.sqrt(ret)

def makeRandomPoint(n, lower, upper):
    '''
    Returns a Point object with n dimensions and values between lower and
    upper in each of those dimensions
    '''
    p = Point([random.uniform(lower, upper) for i in range(n)])
    return p

def plotClusters(data):
    '''
    Use the plotly API to plot data from clusters.
    
    Gets a plot URL from plotly and then uses subprocess to 'open' that URL
    from the command line. This should open your default web browser.
    '''
    
    # List of symbols each cluster will be displayed using    
    symbols = ['circle', 'cross', 'triangle-up', 'square']

    # Convert data into plotly format.
    traceList = []
    for i, c in enumerate(data):
        data = []
        for p in c.points:
            data.append(p.coords)
        # Data
        trace = {}
        trace['x'], trace['y'] = zip(*data)
        trace['marker'] = {}
        trace['marker']['symbol'] = symbols[i]
        trace['name'] = "Cluster " + str(i)
        traceList.append(trace)
        # Centroid (A trace of length 1)
        centroid = {}
        centroid['x'] = [c.centroid.coords[0]]
        centroid['y'] = [c.centroid.coords[1]]
        centroid['marker'] = {}
        centroid['marker']['symbol'] = symbols[i]
        centroid['marker']['color'] = 'rgb(200,10,10)'
        centroid['name'] = "Centroid " + str(i)
        traceList.append(centroid)
    
    # Log in to plotly
    py = plotly(username=PLOTLY_USERNAME, key=PLOTLY_KEY)

    # Style the chart
    datastyle = {'mode':'markers',
             'type':'scatter',
             'marker':{'line':{'width':0},
                       'size':12,
                       'opacity':0.6,
                       'color':'rgb(74, 134, 232)'}}
    
    resp = py.plot(*traceList, style = datastyle)
    
    # Display that plot in a browser
    cmd = "open " + resp['url']
    subprocess.call(cmd, shell=True)

if __name__ == "__main__": 
    main()

Converged after 2 iterations
 Cluster:  0 	 Point : [89.96414655004223, 18.38519860018839]
 Cluster:  0 	 Point : [35.23550391919492, 36.85302554728449]
 Cluster:  0 	 Point : [53.493855329763676, 8.045527560869248]
 Cluster:  1 	 Point : [104.40858864345768, 165.83245625564297]
 Cluster:  1 	 Point : [119.90939649593165, 123.2629442611433]
 Cluster:  1 	 Point : [81.48866152480238, 132.0061421383687]
 Cluster:  1 	 Point : [3.1806701793372527, 131.7165815542929]
 Cluster:  2 	 Point : [170.95244759557318, 49.757635765189455]
 Cluster:  2 	 Point : [81.51323509110952, 52.47401430129912]
 Cluster:  2 	 Point : [92.62045700843697, 21.17508976902245]


In [5]:
from pylab            import plot,show
from numpy            import vstack,array
from numpy.random     import rand
from scipy.cluster.vq import kmeans, vq, whiten

import csv

if __name__ == "__main__":

    # clusters
    K = 3

    data_arr = []
    meal_name_arr = []

    with open(.csv', 'rb') as f:
        reader = csv.reader(f)
        for row in reader:
            data_arr.append([float(x) for x in row[1:]])
            meal_name_arr.append([row[0]])

    data = vstack( data_arr )
    meal_name = vstack(meal_name_arr)

    # normalization
    data = whiten(data)

    # computing K-Means with K (clusters)
    centroids, distortion = kmeans(data,3)
    print "distortion = " + str(distortion)

    # assign each sample to a cluster
    idx,_ = vq(data,centroids)

    # some plotting using numpy's logical indexing
    plot(data[idx==0,0], data[idx==0,1],'ob',
         data[idx==1,0], data[idx==1,1],'or',
         data[idx==2,0], data[idx==2,1],'og')

    print meal_name
    print data

    for i in range(K):
        result_names = meal_name[idx==i, 0]
        print "================================="
        print "Cluster " + str(i+1)
        for name in result_names:
            print name

    plot(centroids[:,0],
         centroids[:,1],
         'sg',markersize=8)

    show()

ValueError: could not convert string to float: Violation Date

In [1]:
import os
import numpy as np

# kmeans clustering algorithm
# data = set of data points
# k = number of clusters
# c = initial list of centroids (if provided)
#
def kmeans(data, k, c):
    centroids = []

    centroids = randomize_centroids(data, centroids, k)  

    old_centroids = [[] for i in range(k)] 

    iterations = 0
    while not (has_converged(centroids, old_centroids, iterations)):
        iterations += 1

        clusters = [[] for i in range(k)]

        # assign data points to clusters
        clusters = euclidean_dist(data, centroids, clusters)

        # recalculate centroids
        index = 0
        for cluster in clusters:
            old_centroids[index] = centroids[index]
            centroids[index] = np.mean(cluster, axis=0).tolist()
            index += 1


    print("The total number of data instances is: " + str(len(data)))
    print("The total number of iterations necessary is: " + str(iterations))
    print("The means of each cluster are: " + str(centroids))
    print("The clusters are as follows:")
    for cluster in clusters:
        print("Cluster with a size of " + str(len(cluster)) + " starts here:")
        print(np.array(cluster).tolist())
        print("Cluster ends here.")

    return

# Calculates euclidean distance between
# a data point and all the available cluster
# centroids.      
def euclidean_dist(data, centroids, clusters):
    for instance in data:  
        # Find which centroid is the closest
        # to the given data point.
        mu_index = min([(i[0], np.linalg.norm(instance-centroids[i[0]])) \
                            for i in enumerate(centroids)], key=lambda t:t[1])[0]
        try:
            clusters[mu_index].append(instance)
        except KeyError:
            clusters[mu_index] = [instance]

    # If any cluster is empty then assign one point
    # from data set randomly so as to not have empty
    # clusters and 0 means.        
    for cluster in clusters:
        if not cluster:
            cluster.append(data[np.random.randint(0, len(data), size=1)].flatten().tolist())

    return clusters


# randomize initial centroids
def randomize_centroids(data, centroids, k):
    for cluster in range(0, k):
        centroids.append(data[np.random.randint(0, len(data), size=1)].flatten().tolist())
    return centroids


# check if clusters have converged    
def has_converged(centroids, old_centroids, iterations):
    MAX_ITERATIONS = 1000
    if iterations > MAX_ITERATIONS:
        return True
    return old_centroids == centroids

In [113]:
# -*- coding: utf-8 -*-
"""
A program to carry out Kmeans clustering where K=4
on data relating to wine marketing from book 
"Data Smart: Using Data Science to Transform Information into Insight"

Requires csv input file OfferInfo.csv with headings
'Campaign', 'Varietal', 'Minimum Qty (kg)', 'Discount (%)', 'Origin', 'Past Peak'
and input file Transactions.csv with headings
'Customer Last Name', 'Offer #'
"""

#make more similar to Python 3
from __future__ import print_function, division, absolute_import, unicode_literals

#other stuff we need to import
import csv
import numpy as np
from sklearn.cluster import KMeans
 

#beginning of main program

#read in OfferInfo.csv
csvf = open('EcbNoticeofViolations.csv','rU')
rows = csv.reader(csvf)
ecb_sheet = [row for row in rows]
#print (ecb_sheet)
csvf.close()

#read in Transactions.csv
csvf = open('ParkingViolations.csv','rU')
rows = csv.reader(csvf)
parking_sheet = [row for row in rows]
#print (parking_sheet)
csvf.close()

#converting time to secs
def get_sec(s):
    l = s.split(':')
    return int(l[0]) 

#first row of each spreadsheet is column headings, so we remove them
ecb_sheet_data = ecb_sheet[1:]
parking_sheet_data = parking_sheet[1:]
#print (ecb_sheet_data)
#print (parking_sheet_data)


K=2 #five clusters
num_times = len(ecb_sheet) #assume listed offers are distinct
#print (num_times)

#find the sorted list of customer last names
county_names = []
for row in ecb_sheet_data:
    county_names.append(row[4])
county_names = list(set(county_names))
x=county_names.sort()
#print (county_names)
num_counties = len(county_names)

#times = []
#for row in ecb_sheet_data:
 #   times.append(row[2])
#times = list(set(times))
#num_times = len(times)
#print (num_times)

#create a num_deals x num_customers matrix of which customer took which deal
county_time_matrix = np.zeros((num_times,num_counties))
for row in ecb_sheet_data:
    county_number = county_names.index(row[4])
    #print (range(county_number))
    #time_secs = ecb_sheet_data(row[2])
    #time_num = get_sec(time_secs)
    time_number = int(get_sec(row[2]))
    #print (time_number)
    county_time_matrix[time_number-1,county_number] = 1
time_county_matrix = county_time_matrix.transpose()
print (county_time_matrix)
print (county_number)
print (time_number)

#initialize and carry out clustering
km = KMeans(n_clusters = K)
km.fit(time_county_matrix)

#find center of clusters
centers = km.cluster_centers_
centers[centers<0] = 0 #the minimization function may find very small negative numbers, we threshold them to 0
centers = centers.round(2)
print('\n--------Centers of the four different clusters--------')
print('Time\t Cent1\t Cent2')
for i in range(num_times):
    print(i+1,'\t',centers[0,i],'\t',centers[1,i])


#find which cluster each customer is in
prediction = km.predict(time_county_matrix)
#print (prediction)
print('\n--------Which cluster each county is in--------')
print('{:<15}\t{}'.format('County','Cluster'))
for i in range(len(prediction)):
    #print (len(prediction))
    print('{:<15}\t{}'.format(county_names[i],prediction[i]+1))
    
#determine which deals are most often in each cluster
county_cluster_matrix = np.zeros((num_times,K),dtype=np.int)
print('\n-----How many of each deal involve a customer in each cluster-----')
print('Time\t Clust1\t Clust2')            
for i in range(num_times):
    for j in range(num_counties):
        if county_time_matrix[i,j] == 1:
            county_cluster_matrix[i,prediction[j]] += 1

for i in range(num_times):
    print(i+1,'\t',end='')
    for j in range(K):
        print(county_cluster_matrix[i,j],'\t',end='')
    print()
print()

print('The total distance of the solution found is',sum((km.transform(time_county_matrix)).min(axis=1)))


[[ 0.  0.]
 [ 0.  0.]
 [ 0.  0.]
 [ 0.  0.]
 [ 0.  0.]
 [ 0.  0.]
 [ 0.  0.]
 [ 0.  0.]
 [ 0.  0.]
 [ 1.  0.]
 [ 1.  1.]
 [ 0.  1.]
 [ 0.  1.]
 [ 0.  1.]
 [ 1.  1.]
 [ 0.  0.]
 [ 1.  0.]]
1
11

--------Centers of the four different clusters--------
Time	 Cent1	 Cent2
1 	 0.0 	 0.0
2 	 0.0 	 0.0
3 	 0.0 	 0.0
4 	 0.0 	 0.0
5 	 0.0 	 0.0
6 	 0.0 	 0.0
7 	 0.0 	 0.0
8 	 0.0 	 0.0
9 	 0.0 	 0.0
10 	 0.0 	 1.0
11 	 1.0 	 1.0
12 	 1.0 	 0.0
13 	 1.0 	 0.0
14 	 1.0 	 0.0
15 	 1.0 	 1.0
16 	 0.0 	 0.0
17 	 0.0 	 1.0

--------Which cluster each county is in--------
County         	Cluster
MANHATTAN      	2
QUEENS         	1

-----How many of each deal involve a customer in each cluster-----
Time	 Clust1	 Clust2
1 	0 	0 	
2 	0 	0 	
3 	0 	0 	
4 	0 	0 	
5 	0 	0 	
6 	0 	0 	
7 	0 	0 	
8 	0 	0 	
9 	0 	0 	
10 	0 	1 	
11 	1 	1 	
12 	1 	0 	
13 	1 	0 	
14 	1 	0 	
15 	1 	1 	
16 	0 	0 	
17 	0 	1 	

The total distance of the solution found is 0.0


In [125]:
# -*- coding: utf-8 -*-
"""
A program to carry out Kmeans clustering where K=4
on data relating to wine marketing from book 
"Data Smart: Using Data Science to Transform Information into Insight"

Requires csv input file OfferInfo.csv with headings
'Campaign', 'Varietal', 'Minimum Qty (kg)', 'Discount (%)', 'Origin', 'Past Peak'
and input file Transactions.csv with headings
'Customer Last Name', 'Offer #'
"""

#make more similar to Python 3
from __future__ import print_function, division, absolute_import, unicode_literals

#other stuff we need to import
import csv
import numpy as np
from sklearn.cluster import KMeans

#beginning of main program

#read in OfferInfo.csv
csvf = open('EcbNoticeofViolations.csv','rU')
rows = csv.reader(csvf)
ecb_sheet = [row for row in rows]
csvf.close()

#read in Transactions.csv
#csvf = open('Transactions.csv','rU')
#rows = csv.reader(csvf)
#transaction_sheet = [row for row in rows]
#csvf.close()

#first row of each spreadsheet is column headings, so we remove them
ecb_sheet_data = ecb_sheet[1:]
#print (ecb_sheet_data)
#transaction_sheet_data = transaction_sheet[1:]

K=4 #four clusters
#num_tickets = len(ecb_sheet_data) #assume listed offers are distinct

#find the sorted list of customer last names
times = []
tickets = []
for row in ecb_sheet_data:
    tickets.append(row[0])
    times.append(row[2])
tickets = list(set(tickets))
times = list(set(times))
#print (times)
#print (tickets)
times.sort()
num_times = len(times)

#create a num_deals x num_customers matrix of which customer took which deal
ticket_time_matrix = np.zeros((num_tickets,num_times))
for row in ecb_sheet_data:
    time_number = times.index(row[2])
    ticket_number = tickets.index(row[0])
    ticket_time_matrix[ticket_number-1,time_number] = 1
time_ticket_matrix = ticket_time_matrix.transpose()

#initialize and carry out clustering
km = KMeans(n_clusters = K)
km.fit(time_ticket_matrix)

#find center of clusters
centers = km.cluster_centers_
centers[centers<0] = 0 #the minimization function may find very small negative numbers, we threshold them to 0
centers = centers.round(2)
print('\n--------Centers of the four different clusters--------')
print('Deal\t Cent1\t Cent2\t Cent3\t Cent4')
for i in range(num_tickets):
    print(i+1,'\t',centers[0,i],'\t',centers[1,i],'\t',centers[2,i],'\t',centers[3,i])

#find which cluster each customer is in
prediction = km.predict(time_ticket_matrix)
print(prediction)
print('\n--------Which cluster each customer is in--------')
print('{:<15}\t{}'.format('Time','Cluster'))
for i in range(len(prediction)):
    print('{:<15}\t{}'.format(times[i],prediction[i]+1))
    

#determine which deals are most often in each cluster
time_cluster_matrix = np.zeros((num_tickets,K),dtype=np.int)
print('\n-----How many of each deal involve a customer in each cluster-----')
print('Time\t Clust1\t Clust2\t Clust3\t Clust4')            
for i in range(num_tickets):
    for j in range(num_times):
        if ticket_time_matrix[i,j] == 1:
            time_cluster_matrix[i,prediction[j]] += 1

for i in range(num_tickets):
    print(i+1,'\t',end='')
    for j in range(K):
        print(time_cluster_matrix[i,j],'\t',end='')
    print()
print()

print('The total distance of the solution found is',sum((km.transform(time_ticket_matrix)).min(axis=1)))



--------Centers of the four different clusters--------
Deal	 Cent1	 Cent2	 Cent3	 Cent4
1 	 0.0 	 0.0 	 0.0 	 1.0
2 	 0.0 	 0.0 	 1.0 	 0.0
3 	 0.0 	 0.14 	 0.0 	 0.0
4 	 0.0 	 0.14 	 0.0 	 0.0
5 	 0.0 	 0.0 	 0.0 	 1.0
6 	 1.0 	 0.0 	 0.0 	 0.0
7 	 0.0 	 0.0 	 1.0 	 0.0
8 	 0.0 	 0.14 	 0.0 	 0.0
9 	 0.0 	 0.0 	 1.0 	 0.0
10 	 0.0 	 0.14 	 0.0 	 0.0
11 	 0.0 	 0.14 	 0.0 	 0.0
12 	 0.0 	 0.14 	 0.0 	 0.0
13 	 1.0 	 0.0 	 0.0 	 0.0
14 	 0.0 	 0.14 	 0.0 	 0.0
15 	 0.0 	 0.0 	 1.0 	 0.0
16 	 0.0 	 0.14 	 0.0 	 0.0
[0 1 1 1 1 3 1 2 1 1]

--------Which cluster each customer is in--------
Time           	Cluster
10:03:00       	1
11:13:00       	2
11:45:00       	2
12:29:00       	2
13:30:00       	2
13:53:00       	4
14:45:00       	2
15:06:00       	3
15:39:00       	2
17:38:00       	2

-----How many of each deal involve a customer in each cluster-----
Time	 Clust1	 Clust2	 Clust3	 Clust4
1 	0 	0 	0 	1 	
2 	0 	0 	1 	0 	
3 	0 	1 	0 	0 	
4 	0 	1 	0 	0 	
5 	0 	0 	0 	1 	
6 	1 	0 	0 	0 	
7 

In [131]:
# -*- coding: utf-8 -*-
"""
A program to carry out Kmeans clustering where K=4
on data relating to wine marketing from book 
"Data Smart: Using Data Science to Transform Information into Insight"

Requires csv input file OfferInfo.csv with headings
'Campaign', 'Varietal', 'Minimum Qty (kg)', 'Discount (%)', 'Origin', 'Past Peak'
and input file Transactions.csv with headings
'Customer Last Name', 'Offer #'
"""

#make more similar to Python 3
from __future__ import print_function, division, absolute_import, unicode_literals

#other stuff we need to import
import csv
import numpy as np
from sklearn.cluster import KMeans

#beginning of main program

#read in OfferInfo.csv
csvf = open('EcbNoticeofViolations.csv','rU')
rows = csv.reader(csvf)
ecb_sheet = [row for row in rows]
csvf.close()

#read in Transactions.csv
#csvf = open('Transactions.csv','rU')
#rows = csv.reader(csvf)
#transaction_sheet = [row for row in rows]
#csvf.close()

#first row of each spreadsheet is column headings, so we remove them
ecb_sheet_data = ecb_sheet[1:]
#print (ecb_sheet_data)
#transaction_sheet_data = transaction_sheet[1:]

K=4 #four clusters
#num_tickets = len(ecb_sheet_data) #assume listed offers are distinct

#find the sorted list of customer last names
times = []
tickets = []
for row in ecb_sheet_data:
    tickets.append(row[0])
    times.append(row[2])
tickets = list(set(tickets))
times = list(set(times))
#print (times)
#print (tickets)
times.sort()
num_times = len(times)

#create a num_deals x num_customers matrix of which customer took which deal
ticket_time_matrix = np.zeros((num_tickets,num_times))
for row in ecb_sheet_data:
    time_number = times.index(row[2])
    ticket_number = tickets.index(row[0])
    ticket_time_matrix[ticket_number-1,time_number] = 1
time_ticket_matrix = ticket_time_matrix.transpose()
print (time_ticket_matrix)

#initialize and carry out clustering
km = KMeans(n_clusters = K)
km.fit(time_ticket_matrix)

#find center of clusters
centers = km.cluster_centers_
centers[centers<0] = 0 #the minimization function may find very small negative numbers, we threshold them to 0
centers = centers.round(2)
print('\n--------Centers of the four different clusters--------')
print('Deal\t Cent1\t Cent2\t Cent3\t Cent4')
for i in range(num_tickets):
    print(i+1,'\t',centers[0,i],'\t',centers[1,i],'\t',centers[2,i],'\t',centers[3,i])

#find which cluster each customer is in
prediction = km.predict(time_ticket_matrix)
print(prediction)
print('\n--------Which cluster each customer is in--------')
print('{:<15}\t{}'.format('Time','Cluster'))
for i in range(len(prediction)):
    print('{:<15}\t{}'.format(times[i],prediction[i]+1))

[[ 0.  0.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.  1.  0.  0.  0.]
 [ 0.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 1.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  0.  0.  0.  0.]
 [ 0.  1.  0.  0.  0.  0.  1.  0.  1.  0.  0.  0.  0.  0.  1.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  0.  1.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.]]

--------Centers of the four different clusters--------
Deal	 Cent1	 Cent2	 Cent3	 Cent4
1 	 0.0 	 0.0 	 0.0 	 1.0
2 	 0.0 	 0.0 	 1.0 	 0.0
3 	 0.14 	 0.0 	 0.0 	 0.0
4 	 0.14 	 0.0 	 0.0 	 0.0
5 	 0.0 	 0.0 	 0.0 	 1.0
6 	 0.0 	 1.0 	 0.0 	 0.0
7 	 0.0 	 0.0 	 1.0 	 0.0
8 	 0.14 	 0.0 	 0.0 	 0.0
9 	 0.0 	 0.0 	 1.0 	 0.0
10 

In [133]:
# -*- coding: utf-8 -*-
"""
A program to carry out Kmeans clustering where K=4
on data relating to wine marketing from book 
"Data Smart: Using Data Science to Transform Information into Insight"

Requires csv input file OfferInfo.csv with headings
'Campaign', 'Varietal', 'Minimum Qty (kg)', 'Discount (%)', 'Origin', 'Past Peak'
and input file Transactions.csv with headings
'Customer Last Name', 'Offer #'
"""

#make more similar to Python 3
from __future__ import print_function, division, absolute_import, unicode_literals

#other stuff we need to import
import csv
import numpy as np
from sklearn.cluster import KMeans

#beginning of main program

#read in OfferInfo.csv
csvf = open('ECB_Notice_of_Violations-3.csv','rU')
rows = csv.reader(csvf)
ecb_sheet = [row for row in rows]
csvf.close()

#read in Transactions.csv
#csvf = open('Transactions.csv','rU')
#rows = csv.reader(csvf)
#transaction_sheet = [row for row in rows]
#csvf.close()

#first row of each spreadsheet is column headings, so we remove them
ecb_sheet_data = ecb_sheet[1:]
#print (ecb_sheet_data)
#transaction_sheet_data = transaction_sheet[1:]

K=2 #four clusters
#num_countys = len(ecb_sheet_data) #assume listed offers are distinct

#find the sorted list of customer last names
times = []
countys = []
for row in ecb_sheet_data:
    countys.append(row[4])
    times.append(row[2])
countys = list(set(countys))
times = list(set(times))
#print (times)
#print (countys)
times.sort()
num_times = len(times)
num_countys = len(countys)

#create a num_deals x num_customers matrix of which customer took which deal
county_time_matrix = np.zeros((num_countys,num_times))
for row in ecb_sheet_data:
    time_number = times.index(row[2])
    county_number = countys.index(row[4])
    county_time_matrix[county_number-1,time_number] = 1
time_county_matrix = county_time_matrix.transpose()
print (time_county_matrix)

#initialize and carry out clustering
km = KMeans(n_clusters = K)
km.fit(time_county_matrix)

#find center of clusters
centers = km.cluster_centers_
centers[centers<0] = 0 #the minimization function may find very small negative numbers, we threshold them to 0
centers = centers.round(2)
print('\n--------Centers of the four different clusters--------')
print('Deal\t Cent1\t Cent2')
for i in range(num_countys):
    print(i+1,'\t',centers[0,i],'\t',centers[1,i])

#find which cluster each customer is in
prediction = km.predict(time_county_matrix)
print(prediction)
print('\n--------Which cluster each customer is in--------')
print('{:<15}\t{}'.format('Time','Cluster'))
for i in range(len(prediction)):
    print('{:<15}\t{}'.format(times[i],prediction[i]+1))

#determine which deals are most often in each cluster
time_cluster_matrix = np.zeros((num_countys,K),dtype=np.int)
print('\n-----How many of each deal involve a customer in each cluster-----')
print('County\t Clust1\t Clust2')            
for i in range(num_countys):
    for j in range(num_times):
        if county_time_matrix[i,j] == 1:
            time_cluster_matrix[i,prediction[j]] += 1

for i in range(num_countys):
    print(i+1,'\t',end='')
    for j in range(K):
        print(time_cluster_matrix[i,j],'\t',end='')
    print()
print()

print('The total distance of the solution found is',sum((km.transform(time_county_matrix)).min(axis=1)))

KeyboardInterrupt: 