In [9]:
import numpy as np
import pandas as pd

In [5]:
# Load the raw e-commerce CSV; the file is decoded as cp1252 via the explicit encoding argument.
# NOTE(review): absolute local path — consider a configurable data directory.
df = pd.read_csv(
    filepath_or_buffer='C:/ML/download/ecommerce-data.csv',
    encoding='cp1252',
)

In [6]:
# Preview the first five rows (the default row count of head()).
df.head(n=5)

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,12/1/2010 8:26,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,12/1/2010 8:26,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,12/1/2010 8:26,3.39,17850.0,United Kingdom


In [70]:
# Build a customer x item indicator table: one row per CustomerID, one column
# per StockCode, 1 where the customer has at least one line for that item and
# 0 otherwise.
# pivot_table only invokes aggfunc for (customer, item) groups that exist, and
# the group it passes in is a Series (never None), so the aggregate is simply
# the constant 1. Pairs that never occur come back as NaN and are zero-filled.
df1 = (
    df.pivot_table(
        values='Quantity',
        index='CustomerID',
        columns='StockCode',
        aggfunc=lambda group: 1,
    )
    .fillna(0)
    .reset_index()  # expose CustomerID as a regular column with a default index
)

In [71]:
# Columns of df1 that are NOT item indicators. 'new' and 'cluster' are helper
# columns this notebook adds to df1 later; excluding them keeps this cell
# re-runnable after they exist.
NON_ITEM_COLS = {'StockCode', 'CustomerID', 'new', 'cluster'}

# An ordered list rather than a set: current pandas refuses a set as an
# indexer, and set iteration order would make the token order inside each
# document vary from run to run.
pivot_cols = [col for col in df1.columns if col not in NON_ITEM_COLS]


def find_column(row):
    """Return the item column labels whose value in this row is greater than 0.

    Args:
        row: One row of df1 (a Series indexed by df1's column labels).

    Returns:
        List of labels from pivot_cols, in df1 column order, with row value > 0.
    """
    items = row[pivot_cols]
    return items.index[items > 0].tolist()


# One "document" per customer: the list of stock codes that customer bought.
df1['new'] = df1.apply(find_column, axis=1)

In [74]:
# Plain Python list of per-customer token lists, in df1 row order.
corpus = df1['new'].tolist()

In [79]:
!pip install gensim --quiet

You should consider upgrading via the 'c:\python37\python.exe -m pip install --upgrade pip' command.


In [81]:
from gensim.models import Word2Vec
# Train item embeddings, treating each customer's list of stock codes as one
# sentence. min_count=1 keeps every stock code, however rarely it occurs.
# seed=1 is gensim's default, spelled out here; workers=1 removes the
# thread-scheduling jitter that otherwise makes repeated trainings differ.
model = Word2Vec(corpus, vector_size=100, min_count=1, seed=1, workers=1)

In [None]:
# gensim 4 removed `wv.vocab`; the vocabulary is exposed through
# `wv.index_to_key` / `wv.key_to_index` instead. Show its size and a short
# sample rather than dumping every entry.
len(model.wv.index_to_key), model.wv.index_to_key[:10]

In [99]:
from sklearn.decomposition import PCA
# Vocabulary of the trained model: a live keys view plus a list snapshot of it.
vocab = model.wv.key_to_index.keys()
words = list(vocab)
# Token -> integer index lookup, for locating a particular token inside the model.
w2v_indices = {token: model.wv.get_index(token) for token in words}

In [100]:
# Keep a reference to the trained model's `wv.vectors` embedding array.
# NOTE(review): presumably row i belongs to the token whose index is i in w2v_indices — confirm.
w2v_vectors = model.wv.vectors

In [127]:
# Embedding dimensionality of the trained model (Word2Vec was built with vector_size=100).
model.vector_size

100

In [110]:
def vectorize(list_of_docs, model=None):
    """Turn each document into one vector by averaging its token embeddings.

    Args:
        list_of_docs: Iterable of documents, each an iterable of tokens.
        model: Gensim word-embedding model exposing ``wv`` and ``vector_size``.
            When omitted, the notebook-level ``model`` is looked up at call
            time (not frozen when this cell ran), so a model retrained later
            is the one that gets used.

    Returns:
        List with one 1-D array of length ``model.vector_size`` per document.
        A document with no in-vocabulary token (including an empty document)
        gets an all-zero vector.

    Raises:
        NameError: If no model is passed and no global ``model`` exists.
    """
    if model is None:
        try:
            model = globals()["model"]
        except KeyError:
            raise NameError(
                "vectorize() needs a model: pass model=... or define a global `model`"
            ) from None
    wv = model.wv

    features = []
    for tokens in list_of_docs:
        # The membership test already guarantees the lookup succeeds, so no
        # KeyError handling is needed around wv[token].
        vectors = [wv[token] for token in tokens if token in wv]
        if vectors:
            features.append(np.asarray(vectors).mean(axis=0))
        else:
            # Nothing to average: fall back to the zero vector.
            features.append(np.zeros(model.vector_size))
    return features

In [111]:
# One averaged embedding per document in corpus.
vectorized_docs = vectorize(list_of_docs=corpus, model=model)

In [115]:
from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics import silhouette_score
from sklearn.metrics import silhouette_samples

def mbkmeans_clusters(
    X,
    k,
    mb,
    print_silhouette_values,
    random_state=None,
):
    """Fit MiniBatchKMeans on X and print silhouette / inertia diagnostics.

    Args:
        X: Matrix of features.
        k: Number of clusters.
        mb: Size of mini-batches.
        print_silhouette_values: If truthy, also print size and mean / min /
            max silhouette for every cluster, highest mean first.
        random_state: Forwarded to MiniBatchKMeans. The default (None) keeps
            the unseeded behaviour; pass an int for repeatable clusterings.

    Returns:
        Trained clustering model and labels based on X.
    """
    km = MiniBatchKMeans(
        n_clusters=k, batch_size=mb, random_state=random_state
    ).fit(X)
    labels = km.labels_
    print(f"For n_clusters = {k}")
    print(f"Silhouette coefficient: {silhouette_score(X, labels):0.2f}")
    print(f"Inertia:{km.inertia_}")

    if print_silhouette_values:
        sample_silhouette_values = silhouette_samples(X, labels)
        print("Silhouette values:")
        silhouette_values = []
        empty_clusters = []
        for i in range(k):
            cluster_silhouette_values = sample_silhouette_values[labels == i]
            if cluster_silhouette_values.size == 0:
                # No sample was assigned to this centroid; min()/max() would
                # raise on an empty array, so report it separately.
                empty_clusters.append(i)
                continue
            silhouette_values.append(
                (
                    i,
                    cluster_silhouette_values.shape[0],
                    cluster_silhouette_values.mean(),
                    cluster_silhouette_values.min(),
                    cluster_silhouette_values.max(),
                )
            )
        # Best-separated clusters (highest mean silhouette) first.
        silhouette_values.sort(key=lambda tup: tup[2], reverse=True)
        for s in silhouette_values:
            print(
                f"    Cluster {s[0]}: Size:{s[1]} | Avg:{s[2]:.2f} | Min:{s[3]:.2f} | Max: {s[4]:.2f}"
            )
        for i in empty_clusters:
            print(f"    Cluster {i}: Size:0 (empty)")
    return km, labels


# Cluster the document vectors into 10 groups (mini-batches of 500) and print
# the per-cluster silhouette report.
clustering, cluster_labels = mbkmeans_clusters(
    vectorized_docs, k=10, mb=500, print_silhouette_values=True
)

For n_clusters = 10
Silhouette coefficient: 0.08
Inertia:5548.513556591896
Silhouette values:
    Cluster 2: Size:9 | Avg:0.35 | Min:0.08 | Max: 0.49
    Cluster 5: Size:32 | Avg:0.31 | Min:-0.13 | Max: 0.54
    Cluster 3: Size:1958 | Avg:0.24 | Min:0.03 | Max: 0.39
    Cluster 0: Size:263 | Avg:0.09 | Min:-0.13 | Max: 0.34
    Cluster 1: Size:810 | Avg:-0.05 | Min:-0.22 | Max: 0.16
    Cluster 7: Size:755 | Avg:-0.06 | Min:-0.26 | Max: 0.15
    Cluster 9: Size:42 | Avg:-0.09 | Min:-0.32 | Max: 0.20
    Cluster 6: Size:183 | Avg:-0.09 | Min:-0.35 | Max: 0.22
    Cluster 4: Size:141 | Avg:-0.11 | Min:-0.39 | Max: 0.23
    Cluster 8: Size:179 | Avg:-0.15 | Min:-0.43 | Max: 0.15


In [128]:
# Pair every document's token list with its cluster label, row by row.
df_clusters = pd.DataFrame(dict(tokens=corpus, cluster=cluster_labels))

In [137]:
# Column assignment aligns on the index. df1 received a fresh default index from
# reset_index(), and df_clusters was built with the default index in corpus order
# (corpus itself was taken from df1['new']), so label i lands on row i of df1.
df1['cluster'] = df_clusters['cluster']

In [119]:
print("Most representative terms per cluster (based on centroids):")

wv = model.wv
# Walk the fitted centroids directly instead of a hard-coded range(10), so the
# loop always matches however many clusters `clustering` was trained with.
for i, centroid in enumerate(clustering.cluster_centers_):
    # most_similar also accepts a raw vector: ask for the 5 tokens whose
    # embeddings are most similar to this centroid.
    most_representative = wv.most_similar(positive=[centroid], topn=5)
    # Each hit is a (token, similarity) pair; only the token is printed.
    tokens_per_cluster = "".join(f"{token} " for token, _ in most_representative)
    print(f"Cluster {i}: {tokens_per_cluster}")

Most representative terms per cluster (based on centroids):
Cluster 0: 35817P 90122C 90214B 90180B 20666 
Cluster 1: 90185A 84824 90214U 90031 90210A 
Cluster 2: 22971 20723 22839 21497 22457 
Cluster 3: 90122C 35817P 90124B 90133 90176E 
Cluster 4: 90019A 90198A 84760S 17084A 90173 
Cluster 5: 22636 23211 22470 23306 23073 
Cluster 6: 90120B 90001D 90120C 47344B 35637C 
Cluster 7: 90133 84388 79323P 62097B 90122C 
Cluster 8: 79157V 90174 90040B 20821 90161D 
Cluster 9: 84804A 21348 21664 84661A 84660C 


In [168]:
from sklearn.cluster import DBSCAN
# Density-based clustering of the document vectors using cosine distance.
# eps and min_samples are example values and are meant to be tuned.
dbscan = DBSCAN(eps=0.01, min_samples=100, metric='cosine')
# Each row of vectorized_docs is one document to cluster.
# Note: this rebinds cluster_labels, which previously held the MiniBatchKMeans labels.
cluster_labels = dbscan.fit_predict(X=vectorized_docs)

In [169]:
# Tally how many documents carry each distinct value currently in cluster_labels.
# NOTE(review): per scikit-learn's DBSCAN docs, label -1 should be the noise bucket — confirm.
unique, counts = np.unique(cluster_labels, return_counts=True)
print(unique, counts)

[-1  0] [2369 2003]


In [171]:
# Project the document vectors onto 10 principal components.
# random_state is pinned because PCA may pick a randomized SVD solver, in which
# case repeated runs could otherwise give slightly different components; it has
# no effect when a deterministic solver is selected.
pca = PCA(n_components=10, random_state=0)
PCA_result = pca.fit_transform(vectorized_docs)

In [176]:
# Number of components per projected document (second axis of the result).
PCA_result.shape[1]

10