In [1]:
#Import libraries
import numpy as np
import scipy as sp
import pandas as pd
import matplotlib.pyplot as plt



#Set display options to display all data
pd.set_option('display.max_columns', 1000000)
pd.set_option('display.max_rows', 1000000)

In [2]:
#Import csv and fill Na values 
data = pd.read_csv('Amazon.csv')
print(data.shape)
data.Summary = data.Summary.fillna('0')
data.ProfileName = data.ProfileName.fillna(' ')

(455000, 13)


In [3]:
#Create columns stating number of reviews written pre user, and per product
data['UserCounts'] = data.groupby('UserId')['UserId'].transform('count')
data['ProductCounts'] = data.groupby('ProductId')['ProductId'].transform('count')
data['Time'] = pd.to_datetime(data['Time'],unit='s')

In [4]:
#removing review outliears
data = data[data.UserCounts <= 150]
data = data[data.UserCounts >= 2]
#removing product outliers
data = data[data.ProductCounts <= 550]
data.shape

(300664, 15)

In [5]:
#grouping userIDs with their Products
product_by_pid = data.groupby('ProductId')['UserId'].apply(lambda x: ' '.join(x)).reset_index()
product_by_pid.shape

(46754, 2)

In [6]:
#transform to matrix
uidArray = product_by_pid.UserId.as_matrix()


In [7]:
from sklearn.feature_extraction.text import CountVectorizer
count = CountVectorizer()

In [8]:
#Count Vectorize
userFeatures = count.fit_transform(uidArray)
userFeatures.shape

(46754, 67205)

In [9]:
#Fit to Kmeans
from sklearn.cluster import KMeans

km = KMeans(n_clusters=11, max_iter=100, n_init=1000)
km.fit(userFeatures)

KMeans(copy_x=True, init='k-means++', max_iter=100, n_clusters=11,
    n_init=1000, n_jobs=1, precompute_distances='auto', random_state=None,
    tol=0.0001, verbose=0)

In [10]:
#Re-format
convert_km = pd.DataFrame(km.labels_)
results = pd.concat([product_by_pid, convert_km], axis=1, ignore_index=True)
results.columns = ['ProductId', 'UserIds', 'cluster']
clusters = pd.merge(results, data, how='left', on='ProductId')
#cluster_results = clusters.Summary.groupby(clusters.cluster)

In [11]:
#Show clustering results
clusters.cluster.value_counts()

1     277442
8       4097
2       3445
9       3270
7       2476
4       2058
0       1914
3       1756
6       1755
10      1290
5       1161
Name: cluster, dtype: int64

In [12]:
#Splitting up clusters for analyzing
cluster0 = clusters.loc[clusters['cluster'] == 0]
cluster1 = clusters.loc[clusters['cluster'] == 1]
cluster2 = clusters.loc[clusters['cluster'] == 2]
cluster3 = clusters.loc[clusters['cluster'] == 3]
cluster4 = clusters.loc[clusters['cluster'] == 4]
cluster5 = clusters.loc[clusters['cluster'] == 5]
cluster6 = clusters.loc[clusters['cluster'] == 6]
cluster7 = clusters.loc[clusters['cluster'] == 7]
cluster8 = clusters.loc[clusters['cluster'] == 8]
cluster9 = clusters.loc[clusters['cluster'] == 9]
cluster10 = clusters.loc[clusters['cluster'] == 10]



In [13]:
# set options to show more data
pd.set_option('max_colwidth',2000)

In [14]:
print(cluster0.Summary.head(15))
print(cluster1.Summary.head(15))
print(cluster2.Summary.head(15))
print(cluster3.Summary.head(15))
print(cluster4.Summary.head(15))
print(cluster5.Summary.head(15))
print(cluster6.Summary.head(15))
print(cluster7.Summary.head(15))
print(cluster8.Summary.head(15))
print(cluster9.Summary.head(15))
print(cluster10.Summary.head(15))


255972                                                                        Will reach for this product in the store...
255973                                                                                                          Very good
255974                                                                            Tap water for the price of spring water
255975                                                                      Zero difference from any other bottled water.
255976                                                                             Refreshing....tastes like water should
255977    This nutrition-nut/water  connoisseur agrees that Essentia Water delivers unmatched hydration, health and taste
255978                                                                                            Not for giraffe-rinsing
255979                                                                         Doesn't taste as good as some other waters
255980                  

In [15]:
# 0       1914 water
# 1     277442 books/dogtreats
# 2       3445 herb tea
# 3       1756 vitamins
# 4       2058 dogtreats
# 5       1161 jacklinks
# 6       1755 sweet Coffees
# 7       2476 tangy chips
# 8       4097 Popchips
# 9       3270 strong coffee
# 10      1290 hot chocolate


In [16]:
print(cluster0.Text.head(5))
print(cluster1.Text.head(5))
print(cluster2.Text.head(5))
print(cluster3.Text.head(5))
print(cluster4.Text.head(5))
print(cluster5.Text.head(5))
print(cluster6.Text.head(5))
print(cluster7.Text.head(5))
print(cluster8.Text.head(15))
print(cluster9.Text.head(15))
print(cluster10.Text.head(15))



255972                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  