In [1]:
# Required Libraries
from sklearn.metrics.pairwise import cosine_similarity
import seaborn as sns
import pickle
import matplotlib.pyplot as plt

In [2]:
import pandas as pd
import numpy as np

import plotly.graph_objects as go

# Seed NumPy's global random generator so that any NumPy-based randomness
# in this notebook gives the same values on every run.
seed = 58
np.random.seed(seed)

In [3]:
# Input vector, returns nearest customer(s)
def Cosine_Similarity(centroid,embeddedData,word_to_index,vocab_size,index_to_word,threshold):
    """Find the rows of ``embeddedData`` whose direction is close to ``centroid``.

    Parameters
    ----------
    centroid : 1-D array-like of numbers
        Reference vector; needs one entry per column of ``embeddedData``.
        Entries are matched to columns by position, not by label.
    embeddedData : pandas.DataFrame
        One numeric vector per row.
    word_to_index, vocab_size, index_to_word
        Not used by the computation; kept so existing calls keep working.
    threshold : float
        Minimum cosine similarity (inclusive) a row must reach to be returned.

    Returns
    -------
    similarity_index : list of int
        Positional (``iloc``) row numbers of the matching rows, ascending.
    similarity_theta : list of float
        Cosine similarity of each matching row, in the same order.

    Notes
    -----
    An all-zero row (or an all-zero centroid) has no direction, so its
    similarity is undefined (NaN) and that row is never returned.
    """
    vectors = embeddedData.to_numpy(dtype=float)
    reference = np.asarray(centroid, dtype=float)

    # A single matrix product replaces the per-row Python loop, and the
    # centroid norm is computed once instead of once per row.
    dots = vectors @ reference
    norms = np.linalg.norm(vectors, axis=1) * np.linalg.norm(reference)

    # 0/0 for zero-length vectors yields NaN; NaN >= threshold is False, so
    # those rows drop out. errstate keeps that from raising a RuntimeWarning.
    with np.errstate(divide='ignore', invalid='ignore'):
        theta = dots / norms
        keep = theta >= threshold

    return np.flatnonzero(keep).tolist(), theta[keep].tolist()

In [4]:
# Load the three CSV inputs from the local data/ folder.
# Later cells pick rows of `data` using row positions found in `embeddedData`,
# so row i of both tables is assumed to describe the same record — TODO confirm.
# `centroids` is read one row at a time, each row being compared against the
# rows of `embeddedData`.
data      = pd.read_csv('data/data4analytics.csv')
centroids = pd.read_csv('data/kmeans_centroids.csv')
embeddedData = pd.read_csv('data/project_dataset.csv')

In [5]:
# Load the two pickled lookup objects (presumably word -> index and
# index -> word mappings, judging by the names — verify against the producer).
# NOTE(review): pickle.load can run arbitrary code while reading a file, so
# these files must come from a trusted source.
with open('data/word_to_index_v3.pkl', 'rb') as fp:
    word_to_index = pickle.load(fp)
    
with open('data/index_to_word_v3.pkl', 'rb') as fp:
    index_to_word = pickle.load(fp)    

In [6]:
# Number of entries in index_to_word; passed to Cosine_Similarity below.
VOCAB_SIZE = len(index_to_word)

In [7]:
# Sanity check: (rows, columns) of the analytics table.
data.shape

(48842, 17)

In [8]:
# Preview the first five rows of the embedding table.
embeddedData.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
0,0.079357,-0.013016,-0.064891,0.164042,0.109075,0.367152,0.616238,-0.041863,0.281724,-0.073042,...,-0.345417,-0.15184,-0.057761,0.085861,-0.15763,-0.150261,-0.007651,0.14838,0.03127,-0.303689
1,0.065084,0.079508,0.003485,0.211437,-0.006988,0.095358,0.383679,-0.199834,0.217875,-0.109475,...,-0.566354,0.038567,-0.029137,0.413811,-0.343286,0.103419,-0.532482,0.248444,-0.127649,-0.202146
2,0.286153,0.138509,0.249087,0.356722,0.252658,0.572814,0.591843,-0.075903,0.224935,0.083812,...,-0.218306,-0.345675,-0.43851,-0.031946,-0.494963,0.240659,-0.375938,0.171001,-0.018633,-0.470645
3,0.093605,0.349311,0.114927,0.341508,0.034264,0.494125,0.306043,-0.364169,0.197562,0.166602,...,-0.182497,-0.463281,-0.16895,0.161085,-0.201562,0.363891,-0.611611,0.347564,0.15364,-0.257974
4,-0.004717,0.531995,-0.112232,0.45444,-0.023014,0.034041,0.002723,0.185025,0.136317,0.324738,...,0.167132,-0.346875,0.210444,0.029913,-0.037009,-0.390999,0.053203,-0.082046,0.241039,-0.411027


In [9]:
# Sanity check: (number of centroids, vector length).
centroids.shape

(2, 300)

In [10]:
# Pull each centroid out of the table as its own vector (first and second row).
cluster0, cluster1 = centroids.iloc[0], centroids.iloc[1]

In [14]:
# Rows whose embedding has cosine similarity >= 0.8 with centroid 0.
index, thetas = Cosine_Similarity(cluster0,embeddedData,word_to_index,VOCAB_SIZE,index_to_word,0.8)

# Select all matching rows of `data` in one positional lookup. Unlike a frame
# built row by row, this keeps the columns of `data` (and their dtypes) even
# when nothing matched, so the `cluster` filter below cannot fail on an empty
# result.
Similarity_cluster0 = data.iloc[index].assign(theta=thetas)

# Keep only the rows labelled as cluster 0, most similar first.
Similarity_cluster0 = (
    Similarity_cluster0[Similarity_cluster0.cluster == 0]
    .sort_values(by=['theta'], ascending=False)
)

In [15]:
# Save the cluster-0 matches; index=False leaves the row labels out of the file.
Similarity_cluster0.to_csv("data/cluster0.csv",index=False)

In [16]:
# Rows whose embedding has cosine similarity >= 0.8 with centroid 1.
index, thetas = Cosine_Similarity(cluster1,embeddedData,word_to_index,VOCAB_SIZE,index_to_word,0.8)

# Select all matching rows of `data` in one positional lookup. Unlike a frame
# built row by row, this keeps the columns of `data` (and their dtypes) even
# when nothing matched, so the `cluster` filter below cannot fail on an empty
# result.
Similarity_cluster1 = data.iloc[index].assign(theta=thetas)

# Keep only the rows labelled as cluster 1, most similar first.
Similarity_cluster1 = (
    Similarity_cluster1[Similarity_cluster1.cluster == 1]
    .sort_values(by=['theta'], ascending=False)
)

In [17]:
# Save the cluster-1 matches; index=False leaves the row labels out of the file.
Similarity_cluster1.to_csv("data/cluster1.csv",index=False)