In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from joblib import Parallel, delayed
from Constants import Constants
from umap import UMAP
import plotly.express as px

# File name of the graph under analysis, read from the project-level Constants
# (accessed through `.value`, so presumably an Enum member).
GRAPH_FILE_NAME = Constants.GRAPH_FILE_NAME.value
# Echo it so the notebook output records which graph this run refers to.
print(GRAPH_FILE_NAME)

undirected_weighted_product_views_graph.parquet


In [2]:
# Only the product id and its category code are used from the raw table, so
# read just those two columns instead of materialising the whole file.
df_full = pd.read_parquet(
    '../Data/optimised_raw_data.parquet',
    columns=['product_id', 'category_code'],
)
# The embedding file is named after the graph file name up to its first dot.
df_node2vec = pd.read_parquet('../Data/Embedding_Data/node2vec_embedding_df_{}.parquet'.format(GRAPH_FILE_NAME.split('.')[0]))

In [3]:
# Build a one-row-per-product lookup: keep the first row seen for each
# product_id and rename the key so it matches the embedding frame's `pid`.
# Chaining (instead of inplace=True on a derived frame) avoids pandas'
# SettingWithCopyWarning and names the renamed column explicitly.
df_full = (
    df_full[['product_id', 'category_code']]
    .drop_duplicates(subset=['product_id'])
    .rename(columns={'product_id': 'pid'})
)
# Left join keeps every embedded product; those without a match get NaN.
df_node2vec = df_node2vec.merge(df_full, on=['pid'], how='left')

In [4]:
# Inspect the merged frame: one row per pid with its embedding vector and category code.
df_node2vec

Unnamed: 0,pid,embedding_vector,category_code
0,17301504,"[0.44851154, 0.40735474, -0.172185, 0.07374085...",apparel.shoes.sandals
1,17301505,"[0.47892618, 0.20487863, 0.021507299, -0.62510...",apparel.shoes.sandals
2,17301506,"[0.25423482, 0.47845665, -0.001093431, 0.03616...",apparel.shoes.sandals
3,17301507,"[-0.10483544, 0.22859468, -0.16854355, 0.21888...",apparel.jeans
4,17301508,"[0.83427334, 0.5198858, 0.011705197, 0.4507888...",apparel.shoes.sandals
...,...,...,...
211856,17301498,"[0.27920097, 0.5529921, -0.0058155376, -0.0065...",apparel.shoes.sandals
211857,17301499,"[0.53230333, 0.36346158, -0.28965843, 0.069170...",apparel.shoes.sandals
211858,17301500,"[0.32938805, 0.38010123, -0.07451511, 0.039433...",apparel.shoes.sandals
211859,17301501,"[0.31186602, 0.463752, 0.08176216, -0.19684744...",apparel.shoes.sandals


In [5]:
def level_split(category):
    """Split a dotted category code into its first three hierarchy levels.

    Parameters
    ----------
    category : object
        A category code such as ``'apparel.shoes.sandals'``. Any value that
        is not a non-empty string (``None``, ``NaN``, ``''``, numbers, ...)
        is treated as missing.

    Returns
    -------
    list
        Always three items ``[L1, L2, L3]``. Levels missing from the code
        are ``None``; levels beyond the third are dropped.
    """
    # isinstance (not an exact type comparison) so that str subclasses such
    # as numpy.str_ are split instead of being silently treated as missing.
    if not isinstance(category, str) or not category:
        return [None] * 3
    levels = category.split('.')[:3]
    # Right-pad with None so every result has exactly three entries.
    return levels + [None] * (3 - len(levels))

# Split every category code on all available cores. tqdm wraps the input
# iterable, so the bar reflects how fast items are handed to joblib.
category_codes = tqdm(df_node2vec.category_code.values)
split_jobs = (delayed(level_split)(code) for code in category_codes)
result_level = Parallel(n_jobs=-1, verbose=0)(split_jobs)

# One row per input code, one column per hierarchy level.
category_split_df = pd.DataFrame(result_level, columns=['L1', 'L2', 'L3'])
category_split_df.head()

100%|██████████| 211861/211861 [00:02<00:00, 96348.11it/s] 


Unnamed: 0,L1,L2,L3
0,apparel,shoes,sandals
1,apparel,shoes,sandals
2,apparel,shoes,sandals
3,apparel,jeans,
4,apparel,shoes,sandals


In [6]:
# Attach the three category levels to the embedding frame. Column assignment
# aligns rows by index label, so both frames must share the same index.
df_node2vec[['L1', 'L2', 'L3']] = category_split_df[['L1', 'L2', 'L3']]

### UMAP ( Uniform Manifold Approximation and Projection )
Uniform Manifold Approximation and Projection (UMAP), introduced in 2018 by Leland McInnes, John Healy, and James Melville, is a general-purpose manifold learning and dimension reduction algorithm.
<br>

<b>UMAP is a nonlinear dimensionality reduction method that is very effective for visualizing clusters or groups of data points and their relative proximities.</b> <br>

A significant difference from t-SNE is scalability: UMAP can be applied directly to sparse matrices, eliminating the need for a prior dimensionality-reduction pre-processing step such as PCA or Truncated SVD (Singular Value Decomposition).

In [7]:
%%time
# Only products that have a top-level category are projected.
has_category = df_node2vec.L1.notna()
embedding = np.stack(df_node2vec[has_category].embedding_vector.values.tolist())

# Settings shared by both reducers; only the output dimensionality differs.
umap_kwargs = dict(
    init='random',
    random_state=0,
    n_jobs=-1,
    verbose=True,
    metric='cosine',
    low_memory=False,
)
umap_2d = UMAP(n_components=2, **umap_kwargs)
umap_3d = UMAP(n_components=3, **umap_kwargs)

proj_2d = umap_2d.fit_transform(embedding)
proj_3d = umap_3d.fit_transform(embedding)


UMAP(angular_rp_forest=True, dens_frac=0.0, dens_lambda=0.0, init='random',
     low_memory=False, metric='cosine', random_state=0, verbose=True)
Construct fuzzy simplicial set
Sun Mar 13 23:35:16 2022 Finding Nearest Neighbors
Sun Mar 13 23:35:16 2022 Building RP forest with 26 trees
Sun Mar 13 23:35:18 2022 NN descent for 17 iterations
	 1  /  17
	 2  /  17
	 3  /  17
	 4  /  17
	 5  /  17
	 6  /  17
	Stopping threshold met -- exiting after 6 iterations
Sun Mar 13 23:35:37 2022 Finished Nearest Neighbor Search
Sun Mar 13 23:35:39 2022 Construct embedding
	completed  0  /  200 epochs
	completed  20  /  200 epochs
	completed  40  /  200 epochs
	completed  60  /  200 epochs
	completed  80  /  200 epochs
	completed  100  /  200 epochs
	completed  120  /  200 epochs
	completed  140  /  200 epochs
	completed  160  /  200 epochs
	completed  180  /  200 epochs
Sun Mar 13 23:37:31 2022 Finished embedding
UMAP(angular_rp_forest=True, dens_frac=0.0, dens_lambda=0.0, init='random',
     low_memo

In [11]:
# Plot only the first n_shown projected points (presumably to keep the
# interactive figures light).
n_shown = 30000
# Apply the same "L1 is present" filter used to build the projection input,
# so the colour labels correspond row-for-row to the projected points.
l1_labels = df_node2vec[~df_node2vec.L1.isna()].head(n_shown).L1

fig_2d = px.scatter(
    proj_2d[:n_shown],
    x=0,
    y=1,
    color=l1_labels,
    labels={'color': 'L1'},
)
fig_3d = px.scatter_3d(
    proj_3d[:n_shown],
    x=0,
    y=1,
    z=2,
    color=l1_labels,
    labels={'color': 'L1'},
)
# Fix the 3-D marker size to 5.
fig_3d.update_traces(marker_size=5)

fig_2d.show()
fig_3d.show()