In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import seaborn as sns
import matplotlib.pyplot as plt

from gensim.models import KeyedVectors
from sentence_transformers import SentenceTransformer
import zipfile

from narrative_to_vec import tokenize_column, add_column_average_genre_vector

  from tqdm.autonotebook import tqdm, trange


In [2]:
# Load the prepared dataset from the zipped CSV under ./data.
# The archive format is stated explicitly; it is the same format pandas
# would infer from the ".zip" extension.
data_path = os.path.join("data", "data_eda.zip")
data = pd.read_csv(data_path, compression="zip")
data.head()

Unnamed: 0,product,narrative,narrative_len,narrative_prep,text_lemma,narrative_prep_len,sentiment
0,credit_card,purchase order day shipping amount receive pro...,230,purchase order day shipping receive product we...,purchase order day shipping receive product we...,203,0.078905
1,credit_card,forwarded message date tue subject please inve...,132,forwarded message date tue subject investigate...,forward message date tue subject investigate c...,121,-0.016748
2,retail_banking,forwarded message cc sent friday pdt subject f...,173,forwarded message cc sent friday pdt subject f...,forward message cc send friday pdt subject fin...,147,-0.01
3,credit_reporting,payment history missing credit report speciali...,131,payment history missing credit report speciali...,payment history miss credit report specialized...,110,0.061483
4,credit_reporting,payment history missing credit report made mis...,123,payment history missing credit report mistake ...,payment history miss credit report mistake acc...,102,0.061483


***Goal: Try out different embeddings to see how (or whether) the product categories can be clustered by cosine similarity.***

In [3]:
# Helper to compute and plot the cosine similarity between embedding rows
# (e.g. one row per product category).
def plot_cosine_similarity(embeddings, name, ax=None):
    """Plot a heatmap of the pairwise cosine similarity between rows of ``embeddings``.

    Parameters
    ----------
    embeddings : pandas.DataFrame or array-like of shape (n_items, n_features)
        One embedding vector per row. If the object carries a pandas index
        (e.g. a DataFrame), its labels are used as tick labels on both axes;
        otherwise seaborn's automatic tick labels are used.
    name : str
        Text appended to the plot title.
    ax : matplotlib.axes.Axes, optional
        Axes to draw on. When omitted, a new figure is created so that
        repeated calls within one cell do not draw on top of each other.
    """
    cosine_sim = cosine_similarity(embeddings)

    # Only use `.index` when it is a real pandas index; plain lists expose an
    # unrelated `.index` method and numpy arrays have no such attribute.
    row_labels = getattr(embeddings, "index", None)
    tick_labels = row_labels if isinstance(row_labels, pd.Index) else "auto"

    if ax is None:
        # A fresh figure avoids stacking heatmaps/colorbars on the current axes.
        _, ax = plt.subplots()

    # plot the cosine similarity matrix
    sns.heatmap(cosine_sim, annot=True, cmap='coolwarm',
                xticklabels=tick_labels,
                yticklabels=tick_labels,
                ax=ax)
    ax.set_title("Cosine Similarity Matrix for " + name)

In [4]:
# Inner call: tokenize `narrative_prep` into the new `narrative_tokenized` column.
# Outer call: add a vector column built from those tokens using the model file
# "narrative_word2vec.model" (presumably the mean word vector per narrative —
# confirm in narrative_to_vec).
data_w2v = add_column_average_genre_vector(
    tokenize_column(data, text_col="narrative_prep", new_col="narrative_tokenized"),
    model_path="narrative_word2vec.model",
    col_name="narrative_tokenized",
)

In [5]:
# Preview the first rows, now including the token lists and their vectors.
data_w2v.head()

Unnamed: 0,product,narrative,narrative_len,narrative_prep,text_lemma,narrative_prep_len,sentiment,narrative_tokenized,narrative_tokenized_vector
0,credit_card,purchase order day shipping amount receive pro...,230,purchase order day shipping receive product we...,purchase order day shipping receive product we...,203,0.078905,"[purchase, order, day, shipping, receive, prod...","[-0.1932458, 0.30312586, 0.5581857, -0.0850606..."
1,credit_card,forwarded message date tue subject please inve...,132,forwarded message date tue subject investigate...,forward message date tue subject investigate c...,121,-0.016748,"[forwarded, message, date, tue, subject, inves...","[-0.39480367, 0.5977232, 0.8885377, -0.2920917..."
2,retail_banking,forwarded message cc sent friday pdt subject f...,173,forwarded message cc sent friday pdt subject f...,forward message cc send friday pdt subject fin...,147,-0.01,"[forwarded, message, cc, sent, friday, pdt, su...","[-0.04925495, -0.10232089, -0.025370974, -0.16..."
3,credit_reporting,payment history missing credit report speciali...,131,payment history missing credit report speciali...,payment history miss credit report specialized...,110,0.061483,"[payment, history, missing, credit, report, sp...","[-0.17455277, 0.8896315, 0.7093058, 0.28461203..."
4,credit_reporting,payment history missing credit report made mis...,123,payment history missing credit report mistake ...,payment history miss credit report mistake acc...,102,0.061483,"[payment, history, missing, credit, report, mi...","[-0.31288502, 0.97806394, 0.88697356, 0.400012..."


In [7]:
# Pairwise cosine similarity between the vectors stored in
# `narrative_tokenized_vector` (one vector per row of data_w2v).
# NOTE(review): the result is a dense n_rows x n_rows matrix, so memory grows
# quadratically with the number of rows — confirm the frame is small enough,
# or aggregate the vectors per product before comparing.
similarity_matrix = cosine_similarity(data_w2v['narrative_tokenized_vector'].tolist())