## clustering items via kmeans

In [11]:
from sklearn.cluster import AgglomerativeClustering, KMeans
from sklearn.preprocessing import Normalizer
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline

from nltk.tokenize import word_tokenize
from pymongo import MongoClient

import pandas as pd

Helper functions

In [3]:
def preprocess(s, tokenize_words=False):
    """Clean up a product or category string before it is embedded.

    The text is lower-cased, ' - ' separators and a fixed set of punctuation
    characters are removed, and a few phrases are rewritten (for example
    'gold edition' becomes the single token 'gold_edition').

    Parameters
    ----------
    s : str
        Raw text to clean.
    tokenize_words : bool, optional
        When False (default) the cleaned string is returned. When True, the
        stop words listed below are dropped and the remaining text is passed
        through ``word_tokenize``.

    Returns
    -------
    The cleaned string, or whatever ``word_tokenize`` returns for it when
    ``tokenize_words`` is True.
    """
    # Characters that are deleted outright (mapped to None).
    deletions = str.maketrans(
        {ch: None for ch in ('(', ')', '[', ']', ',', '®', ':', '-', '+')}
    )
    # Substring rewrites; order matters, so they are applied in sequence.
    rewrites = (
        (' mm', 'mm'),
        ('/', ' '),
        ('gold edition', 'gold_edition'),
        ('premium edition', 'premium_edition'),
        ('standard edition', 'standard_edition'),
    )

    # ' - ' must be collapsed before '-' is deleted, otherwise the two
    # surrounding spaces would both survive.
    cleaned = s.lower().replace(' - ', ' ').translate(deletions)
    for old, new in rewrites:
        cleaned = cleaned.replace(old, new)

    if not tokenize_words:
        return cleaned

    stop_words = ('&', 'a', 'and', 'the', 'for', 'of', 'to', 'in', 'into')
    kept = [token for token in cleaned.split() if token not in stop_words]
    return word_tokenize(' '.join(kept))


def normalize_in_list(row, type='cat'):
    """Run ``preprocess`` over every string in ``row``.

    Parameters
    ----------
    row : iterable of str
        Strings to normalize.
    type : {'cat', 'item'}, optional
        'cat' (default) keeps each entry as one cleaned string;
        'item' additionally tokenizes each entry
        (``preprocess(..., tokenize_words=True)``).

    Returns
    -------
    list
        One preprocessed entry per element of ``row``, in order.

    Raises
    ------
    ValueError
        If ``type`` is neither 'cat' nor 'item'. Previously such a value
        matched no branch and every element was silently dropped.
    """
    # `type` shadows the builtin, but callers pass it by keyword, so the
    # parameter name has to stay.
    if type not in ('cat', 'item'):
        raise ValueError("type must be 'cat' or 'item', got {!r}".format(type))

    tokenize = type == 'item'
    return [preprocess(x, tokenize_words=tokenize) for x in row]


def flatten_lists(l):
    """Flatten one level of nesting: return the items of every sub-iterable
    of ``l`` concatenated, in order, into a single new list."""
    flat = []
    for sublist in l:
        flat.extend(sublist)
    return flat

load data

In [4]:
# MongoClient() is called with no arguments, so the driver's default
# connection settings are used.
client = MongoClient()
db = client['irwin']  # changed from amazonproducts
products = db['products']

# Only fetch documents that have both an `asin` and a `name` field.
query = {field: {'$exists': True} for field in ('asin', 'name')}
data = products.find(query)

# Drain the cursor into memory and build the working DataFrame from it.
records = list(data)
df = pd.DataFrame(records)


In [5]:
# Preview the first three rows to check which columns were loaded.
df.head(3)

Unnamed: 0,__v,_id,alsoBought,alsoViewed,asin,boughtAfterView,category,frequentlyBought,name
0,0,578814f0c23dfe317ef08b9c,[],"[B00PV515DU, B00F27JDUY, B0081O0NT0, B0071NH8B...",B0099HOHW2,"[Sony PlayStation Vita WiFi, PlayStation Vita ...","[Video Games, PlayStation Vita, Consoles]",[],Crystal White Sony Playstation PS Vita Portabl...
1,0,578814f2c23dfe317ef08baa,[],"[B00HLT0YT0, B00PV515DU, B003O6EATE, B014QAIBL...",B00AW930RE,[Borderlands 2 - Limited Edition - PlayStation...,"[Video Games, PlayStation Vita, Consoles]",[],Cosmic Red Sony PlayStation PS Vita Portable H...
2,0,578814f4c23dfe317ef08bad,"[B014G02C7I, B01D3K7SIK, B014G03PH4, B014FZZJ0...",[],B014FZYV3C,"[Pokémon Yellow Version - 3DS [Digital Code], ...","[Video Games, Kids & Family, Nintendo 3DS, Games]",[],Animal Crossing: New Leaf - 3DS [Digital Code]


In [9]:
# Exploratory feature building: this was written while probing the data, so
# not every derived column below is necessarily needed.


def _list_or_empty(value):
    """Return ``value`` unchanged unless it is a missing marker (None or a
    float such as NaN), in which case return an empty list."""
    if value is None or isinstance(value, float):
        return []
    return value


# `name` is a single string, so it is wrapped in a one-element list; the
# result is a list holding one token list.
df['name_n'] = df['name'].apply(lambda x: normalize_in_list([x], type='item'))

# The load query only requires `asin` and `name` to exist. A document that
# lacks `boughtAfterView` or `category` ends up as NaN/None in the frame,
# which cannot be iterated, so missing cells are treated as empty lists.
df['boughtAfterView_n'] = df['boughtAfterView'].apply(
    lambda x: normalize_in_list(_list_or_empty(x), type='item'))
df['category_n'] = df['category'].apply(
    lambda x: normalize_in_list(_list_or_empty(x), type='cat'))

# One flat token list per product: name tokens, then the tokens of every
# bought-after-view title, then the cleaned category strings.
df['to_embed'] = df.apply(
    lambda row: (flatten_lists(row['name_n'])
                 + flatten_lists(row['boughtAfterView_n'])
                 + row['category_n']),
    axis=1)
# Space-joined version of the token list, as one string per product.
df['to_embed2'] = df['to_embed'].apply(lambda tokens: ' '.join(tokens))


# Lookup tables that may be needed later.
# NOTE(review): despite its name, `asin` is keyed by the stringified Mongo
# `_id`, not by the `asin` column -- confirm which identifier is intended.
asin = {str(doc_id): name for doc_id, name in zip(df['_id'], df['name'])}
# Reverse lookup (name -> id string); if two documents share a name, the
# later one wins.
rev_asin = {name: doc_id for doc_id, name in asin.items()}

In [12]:
# Both TruncatedSVD and KMeans draw random numbers. Without a fixed seed the
# cluster ids (and possibly the groupings) change on every run, so the
# results discussed below could not be reproduced after a kernel restart.
RANDOM_STATE = 42

# TF-IDF features built from the space-joined text of each product.
vectorizer = TfidfVectorizer(max_df=0.5,
                             max_features=None,
                             stop_words='english',
                             use_idf=True)
X = vectorizer.fit_transform(df.to_embed2)

# Project the sparse TF-IDF matrix onto 10 components, then normalize each
# sample vector before clustering.
svd = TruncatedSVD(10, random_state=RANDOM_STATE)
normalizer = Normalizer(copy=False)
pipe_it_up = make_pipeline(svd, normalizer)
X2 = pipe_it_up.fit_transform(X)

# n_clusters=4 was chosen for this small sample and will need adjusting for
# other data. NOTE(review): n_init=1 / max_iter=10 are very low -- consider
# raising them on a larger dataset.
km = KMeans(n_clusters=4, max_iter=10, n_init=1, verbose=2,
            random_state=RANDOM_STATE)
km.fit(X2)

# predict() assigns each row to its nearest final centroid.
df['predicted_cluster'] = km.predict(X2)

Initialization complete
Iteration  0, inertia 7.670
Iteration  1, inertia 4.683
Converged at iteration 1


In [13]:
# Show each product name next to the cluster label it was assigned.
df[['name', 'predicted_cluster']]

Unnamed: 0,name,predicted_cluster
0,Crystal White Sony Playstation PS Vita Portabl...,2
1,Cosmic Red Sony PlayStation PS Vita Portable H...,2
2,Animal Crossing: New Leaf - 3DS [Digital Code],3
3,Kirby: Planet Robobot - 3DS [Digital Code],3
4,Nintendo 3DS XL Black/Black - Nintendo 3DS XL,3
5,Yoshi's Woolly World - Wii U [Digital Code],0
6,Super Smash Bros. - Wii U [Digital Code],0
7,Mario Party 10 - Wii U [Digital Code],0
8,The Legend of Zelda: Twilight Princess HD - Wi...,1
9,Star Fox Zero + Star Fox Guard - Wii U [Digita...,1


These make sense, and while the dataset is really small and skewed (you will need to adjust n_clusters and how you munge the data), the result looks right.  If you take an item like *Mario Party 10 - Wii U [Digital Code]*, it's in cluster 0, which also has items like *Yoshi's Woolly World - Wii U [Digital Code]*, so these would be the original clusters + names that you can then train on in a neural network of some sort.  There are other possible ways to cluster, and you can use word2vec, but then you need to cluster off the vectors and map the names to the clusters, which is more work.  Usually, try something simple first, just to understand what you are doing, before you start throwing complicated and new techniques at it.