In [53]:
import pandas as pd
import numpy as np
import nltk
from pprint import pprint

from nltk.corpus import stopwords
# if this is first time you use nltk
#   >>> import nltk
#   >>> nltk.download('stopwords')
#   >>> nltk.download('punkt')

stops = set(stopwords.words('english'))
import string
puns = string.punctuation

# Word2Vec
1. 簡介
    
    Word2Vec其實是Word to Vector的簡稱，意在將每一個字轉換成一條向量，並讓這字的語意透過這條向量描繪出來。早期做自然語言處理時，很難對讓電腦對詞背後的意思有更深一層的理解，因此詞與詞之間的關係很難被挖掘出來，像是相似詞、相反詞、對應詞等，因此Word2Vec在這樣的背景下產生就顯得極其珍貴。
    
    
2. 作用&賣點
    1. 它可以找到相似的字。
    2. 它可以加減，像是Taiwan-Taipei=Germany-Berlin。
    
3. 訓練方法: 參照[這篇網誌](https://goatwang.github.io/2017/06/06/Train-Wiki-Corpus-by-gensim-Word2vec/)

4. 已經訓練好的model:
    1. [Various Models](http://ahogrammer.com/2017/01/20/the-list-of-pretrained-word-embeddings/)
    2. 本課程使用之[GloVe Word2Vec](https://nlp.stanford.edu/projects/glove/)(時間考量已經壓縮過，只留下這個文件中會用到的字)。

5. 專案: 將e-commerce商品標籤分群

# Load Data

In [16]:
with open('all_categories.list', 'r', encoding='utf8') as f:
    all_categories = eval(f.read())
print("商品標籤個數:", len(all_categories))
print("前10個商品標籤:", all_categories[:10])


商品標籤個數: 910
前10個商品標籤: ['Small Animal', 'Kitchen', 'Fragrance', 'Track & Sweat Suits', 'Wallet', 'Favors', 'Quilts', 'Sticker', 'Pets', 'Skirt']


In [17]:
word_vec_mapping = {}
path = "glove.twitter.27B.50d.txt"
with open(path, 'r', encoding='utf8') as f:  ## 這個文檔的格式是一行一個字並配上他的向量，以空白鍵分隔
    count = 0
    for line in f:  
        tokens = line.split()
        token = tokens[0]  ## 第一個token就是詞彙
        vec = tokens[1:]  ## 後面的token向量
        word_vec_mapping[token] = np.array(vec, dtype=np.float32)  ## 把整個model做成一個字典，以利查找字對應的向量
        count += 1
vec_dimensions = len(word_vec_mapping.get('men'))  ## 記錄這個mdoel每一個字的維度
print("vec_dimensions:", vec_dimensions)
pprint(list(word_vec_mapping.items())[:10])

vec_dimensions 50
[('protection',
  array([-2.6041e-01,  9.3470e-03, -1.2779e+00,  9.3997e-01,  1.3464e-01,
        2.8652e-01,  4.7567e-01, -5.1847e-01,  6.8337e-01, -6.8621e-01,
        2.5913e-01,  2.9725e-01, -2.7242e+00,  3.8473e-01,  1.2560e+00,
        9.2542e-01, -1.0193e-01,  1.5966e-01,  2.2935e-03, -3.8759e-01,
       -1.0683e+00, -5.3661e-01,  6.1156e-03,  1.5376e-01,  3.5490e-01,
        5.9846e-01,  8.5329e-02,  8.7829e-01,  2.1870e-01,  1.0114e+00,
       -6.9214e-03, -5.1215e-01, -2.7296e-01, -8.3198e-01,  8.5664e-01,
       -5.2144e-01, -4.6561e-01,  9.8429e-01, -6.7122e-01, -9.6129e-01,
        7.8881e-01, -6.1323e-01,  2.6551e-01, -3.9457e-01,  5.8291e-01,
       -5.6443e-01,  3.0565e-01,  4.1577e-02,  8.3677e-01, -4.5295e-01],
      dtype=float32)),
 ('headsets',
  array([-0.026823 , -0.092859 , -1.0312   ,  0.90884  , -0.49068  ,
        0.3701   ,  0.63185  , -0.61496  ,  0.57759  , -1.0434   ,
       -0.21304  ,  1.0892   , -0.67756  ,  0.39059  ,  0.12729  ,
   

In [33]:
def tokenize(Doc):
    if pd.notnull(Doc):
        tokens = nltk.wordpunct_tokenize(Doc)
        words = [w.lower() for w in tokens if w not in stops and w not in puns]
        return words
    else:
        return None
    
print("before tokenize:", all_categories[0])
print("after tokenize:", tokenize(all_categories[0]))


before tokenize: Small Animal
after tokenize: ['small', 'animal']


# category to vec

In [34]:
def doc2vec(doc, word2vec=word_vec_mapping):
    docvec=np.zeros(vec_dimensions, )  ## 先處使劃一條向量，如果某個類別裡面的字都沒有在字典裡，那麼會回傳這條向量
    vec_count = 1
    
    if pd.notnull(doc):
        terms = tokenize(doc)  ## 把類別tokenize成一個個的詞彙
        for term in terms:
            termvec = word_vec_mapping.get(term, None)  ## 得到詞向量
            if termvec is not None:
                docvec += np.array(termvec, dtype=np.float32)  ## 把詞向量家道類別向量中
                vec_count += 1              
    return (docvec/vec_count)  ##  記得加了幾條向量，就要處以相應的數字取平均

all_categories_vecs = np.concatenate((pd.Series(all_categories).apply(doc2vec).values)).reshape(len(all_categories), -1)
all_categories_vecs[:10]

array([[-1.06568001e-01, -2.19300002e-01, -5.04960010e-01,
         7.89400041e-02,  4.33136672e-01, -2.04033335e-01,
         2.69650002e-01, -4.01289991e-02,  6.66566640e-02,
        -3.47369999e-01, -1.45897331e-01,  5.23586671e-02,
        -2.56810006e+00, -5.97266654e-02,  2.70169998e-01,
         3.27853332e-01,  1.49886663e-01, -3.41743320e-01,
         4.56019998e-01, -3.18910003e-01, -2.26503337e-01,
        -1.98499958e-02,  1.73773328e-01, -7.36466646e-02,
        -1.91801341e-01,  3.99000247e-02, -4.33019996e-01,
         1.95446660e-01,  3.02536656e-01, -2.64469996e-01,
         1.43448664e-01, -2.21346666e-01,  2.67573337e-01,
        -1.34714002e-01,  1.03713324e-01, -1.49645003e-01,
        -3.23186656e-01,  3.05600067e-02, -1.53926671e-01,
        -4.57656662e-01, -6.81633313e-01, -2.12436676e-01,
        -9.16280023e-03, -1.15173330e-01,  2.89599995e-01,
         2.76519994e-01,  6.86560015e-01,  5.35286645e-01,
        -6.97303365e-02, -1.31800175e-02],
       [-1.34

# Clustering

In [38]:
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import DBSCAN
from collections import Counter
X = all_categories_vecs
n_clusters= 15

## K means

In [39]:
kmeans = KMeans(n_clusters=n_clusters, random_state=0)
all_categories_labels_kmeans = kmeans.fit_predict(X)
Counter(all_categories_labels_kmeans)

Counter({7: 56,
         13: 62,
         9: 58,
         4: 66,
         1: 50,
         6: 82,
         2: 64,
         5: 47,
         14: 58,
         8: 77,
         0: 54,
         3: 81,
         11: 44,
         10: 55,
         12: 56})

## Hireachy(single link)

In [40]:
linkage = ['ward', 'average', 'complete']
all_categories_labels_single = AgglomerativeClustering(linkage=linkage[0], n_clusters=n_clusters).fit_predict(X)
Counter(all_categories_labels_single)

Counter({2: 74,
         9: 46,
         14: 43,
         3: 83,
         4: 68,
         0: 95,
         5: 109,
         10: 65,
         6: 46,
         12: 33,
         1: 93,
         8: 48,
         7: 49,
         13: 29,
         11: 29})

## Hireachy(average link)

In [41]:
linkage = ['ward', 'average', 'complete']
all_categories_labels_average = AgglomerativeClustering(linkage=linkage[1], n_clusters=n_clusters).fit_predict(X)
Counter(all_categories_labels_average)

Counter({0: 837,
         2: 48,
         6: 2,
         5: 2,
         9: 1,
         12: 1,
         13: 1,
         14: 2,
         4: 7,
         11: 1,
         1: 4,
         7: 1,
         10: 1,
         8: 1,
         3: 1})

## Hireachy(complete link)

In [42]:
linkage = ['ward', 'average', 'complete']
all_categories_labels_complete = AgglomerativeClustering(linkage=linkage[2], n_clusters=n_clusters).fit_predict(X)
Counter(all_categories_labels_complete)

Counter({4: 98,
         6: 111,
         8: 53,
         1: 178,
         12: 56,
         2: 40,
         9: 69,
         10: 4,
         5: 71,
         7: 25,
         0: 93,
         3: 63,
         11: 38,
         14: 3,
         13: 8})

## DBSCAN

In [43]:
all_categories_labels_dbscam = DBSCAN().fit_predict(X)
Counter(all_categories_labels_dbscam)

Counter({-1: 898, 0: 12})

In [44]:
df_cat = pd.DataFrame(all_categories_labels_dbscam, index=all_categories, columns=['label'])
print(list(df_cat[df_cat['label'] == 0].index))

['Teethers', 'Playards', 'Epilators', 'Sweatercoat', 'Rainwear', 'Needlecraft', 'Bedspreads & Coverlets', 'Dehumidifiers', 'Humidifiers', 'Paperweights', 'Papermaking', 'other']


# PCA

In [69]:
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn.decomposition import PCA

def draw_PCA(X, Y, title):
#     plt.rcParams["figure.figsize"] = (20,3)
    fig = plt.figure(1, figsize=(8, 6))
    fig.set_size_inches(40, 40)
    ax = Axes3D(fig, elev=-150, azim=110)

    X_reduced = PCA(n_components=3).fit_transform(X)
    ax.scatter(X_reduced[:, 0], X_reduced[:, 1], X_reduced[:, 2], c=Y,
               cmap=plt.cm.Set1, edgecolor='k', s=40)
    ax.set_title(title)
    ax.set_xlabel("1st eigenvector")
    ax.w_xaxis.set_ticklabels([])
    ax.set_ylabel("2nd eigenvector")
    ax.w_yaxis.set_ticklabels([])
    ax.set_zlabel("3rd eigenvector")
    ax.w_zaxis.set_ticklabels([])

    plt.show()

In [70]:
draw_PCA(X, all_categories_labels_kmeans, 'kmeans')
draw_PCA(X, all_categories_labels_single, 'single link')
draw_PCA(X, all_categories_labels_average, 'average link')
draw_PCA(X, all_categories_labels_complete, 'complete link')
draw_PCA(X, all_categories_labels_dbscam, 'dbscan')

MemoryError: In RendererAgg: Out of memory

<Figure size 2880x2880 with 1 Axes>

# Evaluation

In [122]:
df_cat = pd.DataFrame(all_categories_labels_kmeans, index=all_categories, columns=['label'])
for i in range(len(set(all_categories_labels_kmeans))):
    cats = list(df_cat[df_cat['label'] == i].index)    
    print("cluster " + str(i) + ": ")
    print(list(cats))
    print("=============================================")
    print("=============================================")


cluster 0: 
['Sticker', 'Laptop', 'Button', 'Patch', 'Full Zip', 'Pin', 'Magic', 'Print', 'Mug', 'Paper', 'Tablet', 'Camera', 'Headphones', 'Block', 'Pad', 'Case', 'Gadget', 'Poster', 'Tape', 'Box', 'Clock', 'Telephone', 'iPad', 'Mini', 'Screen Protectors', 'Blu-Ray', 'Notebook', 'Pinback Button', 'Tag', 'Standard']
cluster 1: 
['Track & Sweat Suits', 'Plate', 'Full-Length', 'Straight, Pencil', 'Feet', 'V-Neck', 'Ring', 'Golf Balls', 'Tops & T-Shirts', 'Three Button', 'Tank', 'Slim, Skinny', 'Change Purse', 'Button-Front', 'Scoop Neck', 'Training Pants', 'Loafers & Slip-Ons', 'Top & T-shirts', 'T-Shirts', 'Track Jacket', 'Leg Warmers', 'Clutch', 'Two Button', 'Cuff Links', 'Cross Stitch', 'Knit Top', 'Button Down Shirt', 'Rings', 'Boot Cut', 'Full Skirt', 'Belt', 'Above Knee, Mini', 'Backpack Style', 'Straight Leg', 'Track & Sweat Pants', 'Dress - Flat Front', 'Mid-Calf', 'Wrap', 'Wide Leg', 'Board Shorts', 'Classic, Straight Leg', 'Knee-Length', 'Shoulder Bag', 'Hip Bag', 'T-shirts', 

In [123]:
df_cat = pd.DataFrame(all_categories_labels_single, index=all_categories, columns=['label'])
for i in range(len(set(all_categories_labels_kmeans))):
    cats = list(df_cat[df_cat['label'] == i].index)    
    print("cluster " + str(i) + ": ")
    print(sorted(list(cats)))
    print("=============================================")
    print("=============================================")

cluster 0: 
['Batteries', 'Bicycle Child Seats & Trailers', 'Cables & Adapters', 'Camera', 'Camera & Photo Accessories', 'Cameras & Photography', 'Car', 'Car Care', 'Car Electronics & Accessories', 'Car Seats', 'Car Seats & Accessories', 'Car Security & Convenience', 'Car Speakers & Systems', 'Car Stereos & Components', 'Car Subwoofers', 'Cell Phone Accessories', 'Cell Phones & Accessories', 'Cell Phones & Smartphones', 'Computers & Tablets', 'Digital Cameras', 'Flashes & Flash Accessories', 'Highchairs & Booster Seats', 'Home Speakers & Subwoofers', 'Ink & Toner', 'Laptop', 'Laptops & Netbooks', 'Lenses & Filters', 'Mini', 'Notebook', 'Pad', 'Performance Parts & Accessories', 'Portable Audio & Accessories', 'RV Parts & Accessories', 'Replacement Parts', 'Replacement Parts & Tools', 'Tablet', 'iPad']
cluster 1: 
['Activity Centers & Entertainers', 'Automotive', 'Automotive Enthusiast Merchandise', 'Basic Supplies', 'Components & Parts', 'Educational', 'Electrical Safety', 'Electronic',