In [5]:
import torch
from torch.utils.data import DataLoader
import math
from sentence_transformers import models, losses
from sentence_transformers import LoggingHandler, SentenceTransformer, InputExample
import logging
from datetime import datetime
import gzip
import sys
import tqdm
import os
import random
from sklearn.decomposition import PCA
import numpy as np
import pandas as pd
import re
from transformers import AutoTokenizer, AutoModel

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
# If there's a GPU available...
if torch.cuda.is_available():    
# Tell PyTorch to use the GPU.    
    device = torch.device("cuda")
    n_gpu = torch.cuda.device_count()
    print('There are %d GPU(s) available.' % n_gpu)
    print('We will use the GPU:', [torch.cuda.get_device_name(i) for i in range(n_gpu)])


There are 1 GPU(s) available.
We will use the GPU: ['NVIDIA GeForce RTX 4060 Laptop GPU']


In [9]:
#### Just some code to print debug information to stdout
logging.basicConfig(format='%(asctime)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.INFO,
                    handlers=[LoggingHandler()])
#### /print debug information to stdout

In [4]:
# Training parameters
model_name = 'distilroberta-base'
train_batch_size = 32
max_seq_length = 128
num_epochs = 1

In [5]:
folder = 'data'
model_output_path = 'output/simcse-{}-{}-{}'.format(model_name, train_batch_size, datetime.now().strftime("%Y-%m-%d_%H-%M-%S"))

filepath = os.path.join(folder,'training_text.txt')

In [6]:
print(filepath)
print("Model path:", model_output_path)

data\training_text.txt
Model path: output/simcse-distilroberta-base-32-2023-10-27_03-04-16


In [7]:
# Use Huggingface/transformers model (like BERT, RoBERTa, XLNet, XLM-R) for mapping tokens to embeddings
word_embedding_model = models.Transformer(model_name, max_seq_length=max_seq_length)

In [8]:
# Apply mean pooling to get one fixed sized sentence vector
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

2023-10-27 03:04:22 - Use pytorch device: cuda


In [52]:
def create_train_samples(filepath = ''):
    ################# Read the train corpus  #################
    train_samples = []
    with gzip.open(filepath, 'rt', encoding='utf8') if filepath.endswith('.gz') else open(filepath, encoding='utf8') as fIn:
        for line in tqdm.tqdm(fIn, desc='Read file'):
            line = line.strip()
            if len(line) >= 10:
                train_samples.append(InputExample(texts=[line, line]))
    return train_samples

In [48]:
def load_sentence(filepath=''):
    sentences = []
    with open(filepath, encoding='utf8') as fIn:
        for line in tqdm.tqdm(fIn, desc='Read file'):
            line = line.strip()
            if len(line) >= 8:
                sentences.append(line)
    return sentences

In [28]:
train_samples = create_train_samples(filepath)
logging.info("Train sentences: {}".format(len(train_samples)))

# We train our model using the MultipleNegativesRankingLoss
train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=train_batch_size, drop_last=True)
train_loss = losses.MultipleNegativesRankingLoss(model)

Read file: 34123it [00:00, 734766.54it/s]

2023-10-27 03:50:44 - Train sentences: 34023





In [11]:
warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1)  # 10% of train data for warm-up
logging.info("Warmup-steps: {}".format(warmup_steps))

2023-10-27 03:04:31 - Warmup-steps: 107


In [12]:
# Train the model
model.fit(train_objectives=[(train_dataloader, train_loss)],
          epochs=num_epochs,
          warmup_steps=warmup_steps,
          optimizer_params={'lr': 5e-5},
          checkpoint_path=model_output_path,
          checkpoint_save_steps=5000,
          show_progress_bar=True,
          use_amp=True
          
          )

Iteration: 100%|██████████| 1063/1063 [03:32<00:00,  5.00it/s]
Epoch: 100%|██████████| 1/1 [03:32<00:00, 212.55s/it]


2023-10-27 03:08:15 - Save model to output/simcse-distilroberta-base-32-2023-10-27_03-04-16\1063


In [50]:
def pca(file, new_dimension = 128):
    sentences = load_sentence(filepath=file)
    random.shuffle(sentences)

    model = SentenceTransformer('./output/simcse-distilroberta-base-32-2023-10-27_03-04-16/1063')
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)

    embeddings = model.encode(sentences, convert_to_numpy=True, show_progress_bar=True)

    pca = PCA(n_components=new_dimension)
    pca.fit(embeddings)
    pca_comp = np.asarray(pca.components_)

    dense = models.Dense(in_features=model.get_sentence_embedding_dimension(), out_features=new_dimension, bias=False, activation_function=torch.nn.Identity())
    dense.linear.weight = torch.nn.Parameter(torch.tensor(pca_comp))
    model.add_module('dense', dense)

    model.save('simcse-distileroberta-base-pca-128')

In [51]:
pca(filepath)

Read file: 34123it [00:00, 1219441.88it/s]


2023-10-27 04:16:00 - Load pretrained SentenceTransformer: ./output/simcse-distilroberta-base-32-2023-10-27_03-04-16/1063
2023-10-27 04:16:01 - Use pytorch device: cuda


Batches: 100%|██████████| 1065/1065 [01:19<00:00, 13.34it/s]


2023-10-27 04:17:22 - Save model to simcse-distileroberta-base-pca-128


In [10]:
model = SentenceTransformer('./output/simcse-distileroberta-base-pca-128')
sentence = 'I am a sentence for which I would like to get its embedding.'
sentence_embedding = model.encode(sentence, convert_to_tensor=True)

2023-10-28 18:51:02 - Load pretrained SentenceTransformer: ./output/simcse-distileroberta-base-pca-128
2023-10-28 18:51:03 - Use pytorch device: cuda


Batches: 100%|██████████| 1/1 [00:01<00:00,  1.08s/it]


In [11]:
sentence_embedding.size()

torch.Size([128])

In [136]:
df  = pd.read_csv('data/merged_product_info_list.csv')

In [88]:
df.to_csv('data/merged_product_info_list.csv', index=False)

In [135]:
text_col = ['product_desc', 'expert_review', 'customer_review', 'varietal_description', 'taste_description']

In [114]:
for column_name in text_col:
    embedding_column_name = column_name + '_embedding'
    df[embedding_column_name] = None 

In [115]:
def encode_and_average_sentences(text):
    if pd.notna(text) and isinstance(text, str):
        text = text.replace('\n', ' ')
        sentences = re.split(r'[.!?]', text)
        sentences = [s.strip() for s in sentences if s.strip()]
        sentence_embeddings = model.encode(sentences)
        if sentence_embeddings is not None and len(sentence_embeddings) > 0:
            avg_embedding = np.mean(sentence_embeddings, axis=0)
            return avg_embedding
    return None


In [116]:
for column in text_col:
    df[column + '_embedding'] = df[column].apply(encode_and_average_sentences)

Batches: 100%|██████████| 1/1 [00:00<00:00,  1.55it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 13.94it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 17.11it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 11.02it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 12.58it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 47.69it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 57.58it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 103.37it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 54.15it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 55.94it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 44.01it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 76.84it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 52.86it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 65.47it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 55.11it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 101.93it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 70.94it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 107.22it/s]
Batches

In [117]:
df['pair_embedding'] = df['food_pair_description'].apply(encode_and_average_sentences)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches: 100%|██████████| 1/1 [00:00<00:00,  1.47it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 40.00it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 49.88it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 40.48it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 42.09it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 28.67it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 34.16it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 49.71it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 34.72it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 30.93it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 42.08it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 37.12it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 40.43it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 31.48it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 32.10it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 41.20it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 82.47it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 134.76it/s]
Batches: 

In [131]:
df.to_csv('data/merged_product_info_list_embedding.csv', index=False)

In [105]:
df = pd.read_csv('data/merged_product_info_list_embedding.csv')

In [129]:
text_columns = ['product_desc_embedding', 'expert_review_embedding', 'customer_review_embedding', 'varietal_description_embedding', 'taste_description_embedding','pair_embedding','average_embedding']


In [125]:
def convert_to_numpy_array(text):
    if isinstance(text, str):
        # Use a regular expression to find all float values in the string
        float_values = [float(match) for match in re.findall(r'-?\d+\.\d+', text)]
        # Convert the list of floats to a NumPy array
        embedding = np.array(float_values)
        return embedding
    else:
        # Handle NaN or empty values
        return None

In [126]:
for col in text_columns:
    df[col] = df[col].apply(convert_to_numpy_array)

In [127]:
def calculate_mean_embedding(row, columns):
    non_empty_columns = [col for col in columns if row[col] is not None]
    if len(non_empty_columns) == 0:
        return None
    embeddings = [row[col] for col in non_empty_columns]
    avg_embedding = np.nanmean(embeddings, axis=0)
    return avg_embedding



In [132]:
embeddings = df[text_columns]

In [133]:
embeddings.head()

Unnamed: 0,product_desc_embedding,expert_review_embedding,customer_review_embedding,varietal_description_embedding,taste_description_embedding,pair_embedding,average_embedding
0,,,,,,"[4.8636637, -0.4610777, 0.19725111, 0.01768136...",
1,,,,,,,
2,,,,,,,
3,,,,,,,
4,,,,,,,


In [82]:
embeddings = embeddings.reset_index()
embeddings = embeddings.rename(columns={'index': 'id'})
embeddings.head()

Unnamed: 0,id,Product_desc_embedding,Expert Review_embedding,Customer Review_embedding,varietal_description_embedding,taste_description_embedding,pair_embedding,average_embedding
0,0,"[-2.78680921, -4.4225297, -3.67019683, 2.94014...",,,"[1.0992494, 0.11980239, -1.1965942, -0.6213527...","[1.24974823, 6.60748124, -1.18105352, 3.253543...","[4.8636637, -0.4610777, 0.19725111, 0.01768136...","[-0.14593719, 0.76825131, -2.01594818, 1.85744..."
1,1,"[-0.34948108, -0.552049, -1.304156, 2.2099485,...","[2.3558176, -1.0680372, -0.5110016, 1.805713, ...","[0.8821483, 1.571991, -0.32468018, 0.59542143,...","[1.0992494, 0.11980239, -1.1965942, -0.6213527...",,,"[0.99693356, 0.0179268, -0.834108, 0.99743256,..."
2,2,"[-0.7318651, -0.8412946, -0.7144544, 0.1883505...",,,"[-4.7530478, 3.7818083, -1.0901152, -2.4799588...",,,"[-2.74245645, 1.47025685, -0.9022848, -1.14580..."
3,3,"[-2.01763719, 1.06042397, -9.38756168, 1.15159...","[-1.7358017, 3.25934291, -1.47061899, 1.144613...","[2.85526943, 5.63872218, -9.00960684, -2.79006...","[-6.64737105, 1.04634619, -1.9441855, -2.13883...",,,"[-1.88638513, 2.75120881, -5.45299325, -0.6581..."
4,4,"[0.00194785, 0.0703249, -1.1558341, 1.3818144,...","[9.8633367, -6.1955935, -1.200542, 9.519456, -...",,"[4.61960137, 1.24758649, -1.2608068, -2.549257...",,,"[4.82829531, -1.62589404, -1.20572763, 2.78400..."


In [128]:
df['average_embedding'] = df.apply(lambda row: calculate_mean_embedding(row, text_columns), axis=1)

In [37]:
print(df['average_embedding'])

0       [-0.14593719333333333, 0.7682513100000001, -2....
1       [0.996933555, 0.01792679749999997, -0.83410799...
2       [-2.74245645, 1.47025685, -0.9022848000000001,...
3       [-1.8863851275, 2.7512088125, -5.4529932525, -...
4       [4.828295306666667, -1.6258940366666668, -1.20...
                              ...                        
3432    [1.28107386, 0.22313031000000005, -3.191813363...
3433    [-1.92560435, -0.773119445, 0.47317768, -1.052...
3434    [-1.74217765, 0.561122865, -0.2898887500000000...
3435    [-1.146436515, -2.696524975, 4.84202403, -0.54...
3436    [-0.51374485, -3.526678946666667, 3.7025275033...
Name: average_embedding, Length: 3437, dtype: object


In [52]:
df = df.reset_index()


In [55]:
embeddings = df['average_embedding'].copy()
embeddings = embeddings.reset_index()
embeddings = embeddings.rename(columns={'index': 'id'})
embeddings.head()

Unnamed: 0,id,average_embedding
0,0,"[-0.14593719333333333, 0.7682513100000001, -2...."
1,1,"[0.996933555, 0.01792679749999997, -0.83410799..."
2,2,"[-2.74245645, 1.47025685, -0.9022848000000001,..."
3,3,"[-1.8863851275, 2.7512088125, -5.4529932525, -..."
4,4,"[4.828295306666667, -1.6258940366666668, -1.20..."


In [83]:
embeddings.to_pickle('data/embeddings.pkl')

In [15]:
from sklearn.metrics.pairwise import cosine_similarity
from scipy import spatial


In [43]:

def calc_similarity(input):
    data = df.copy()
    input_vector = model.encode(input)
    s = data['average_embedding'].apply(lambda x: 1 - spatial.distance.cosine(x, input_vector))
    data = data.assign(similarity = s)
    return(data.sort_values('similarity',ascending=False))

In [102]:
"A wine with a deep ruby red color and intense aromas of cherries and blackberries, with hints of eucalyptus and liquorice. On the palate, it is smooth and balanced, with a long and pleasant finish.sample_input_text = "

In [44]:
results = calc_similarity(sample_input_text)
results.head(20)

Batches: 100%|██████████| 1/1 [00:00<00:00, 19.25it/s]


Unnamed: 0,product_id,product_name,product_price,product_link,Product_desc,Expert Review,Customer Review,Appearance,Nose,Food Pairing,...,taste_description,food_pair_description,varietal_description,Product_desc_embedding,Expert Review_embedding,Customer Review_embedding,varietal_description_embedding,taste_description_embedding,average_embedding,similarity
1143,product_2478,reservado-rose-750ml,22.95,https://www.paneco.com.sg/products/reservado-r...,\n The Reservado Rose is made...,,,Nice light pink color,"Wild flowers, spices and red fruits","'Duck', 'Fish', 'Salmon', 'Lamb'",...,"It is Nice light pink color, smells like Wild ...","suitable serving for Duck, Fish, Salmon, Lamb","It's a Red Blends Wine from Chile ,made by Con...","[-5.4859123, -0.45781717, -1.6262776, -1.20931...",,,"[-3.2406397, 0.7799745, 0.05348945, -2.9278297...","[0.65287226, -1.9368249, -0.18129954, 0.022127...","[-2.69122658, -0.5382225233333333, -0.58469589...",0.486102
1177,product_3673,santa-rita-medalla-real-cabernet-sauvignon-750ml,29.95,https://www.paneco.com.sg/products/santa-rita-...,"\n Deep ruby-red in color, wi...",,,Deep ruby red,"Black currant, blueberries and cherries","'Beef', 'Cheese', 'Lamb', 'Venison'",...,"It is Deep ruby red, smells like Black currant...","suitable serving for Beef, Cheese, Lamb, Venison","It's a Cabernet Sauvignon Wine from Chile ,mad...","[-1.680761, -1.2877069, 0.27059203, 1.3738551,...",,,"[-4.2200613, 0.08830339, -0.873512, -3.160303,...","[-2.7810972, -1.2960175, 0.39842796, -0.498590...","[-2.8939731666666666, -0.8318070033333335, -0....",0.473398
1173,product_3524,santa-rita-120-cabernet-sauvignon-750ml,55.5,https://www.paneco.com.sg/products/santa-rita-...,\n 120 Cabernet Sauvignon is ...,,,Intense purplish red,Cherry with hints of vanilla and tobacco comin...,"'Beef', 'Chicken', 'Lamb', 'Venison'",...,"It is Intense purplish red, smells like Cherry...","suitable serving for Beef, Chicken, Lamb, Venison","It's a Cabernet Sauvignon Wine from Chile ,mad...","[-1.8615493, 1.298033, -0.2039566, -0.1012992,...",,,"[-4.2200613, 0.08830339, -0.873512, -3.160303,...","[-2.5990057, 2.6347723, 1.8159394, 0.88826096,...","[-2.893538766666667, 1.3403695633333335, 0.246...",0.455577
1089,product_2900,ornellaia-le-serre-nuove-750ml-w-gift-box,84.95,https://www.paneco.com.sg/products/ornellaia-l...,\n Made with the same passion...,"“Characterized by a lively ruby red colour, Le...",,Deep ruby red,"Aromas of berries, blackberry, raspberry and r...","'Beef', 'Pasta', 'Lamb', 'Venison'",...,"It is Deep ruby red, smells like Aromas of ber...","suitable serving for Beef, Pasta, Lamb, Venison","It's a Red Blends Wine from Italy ,made by Orn...","[-2.2723742, -2.1782522, -2.5633135, -0.309749...","[-2.441083, -0.38465324, -1.5533272, 1.6290039...",,"[-2.7382183, -0.52281475, -2.2716973, -2.73773...","[-1.1502156, -0.4732044, 0.4513827, 1.6088786,...","[-2.150472775, -0.8897311475, -1.4842388249999...",0.391614
1231,product_1075,torres-sangre-de-toro-original-750ml,33.9,https://www.paneco.com.sg/products/torres-sang...,"\nA family owned winery based in Catalunya, no...",,,Ruby red,Rich aromas of spices and wild blackberries,"'Beef', 'Chicken', 'Duck', 'Pasta'",...,"It is Ruby red, smells like Rich aromas of spi...","suitable serving for Beef, Chicken, Duck, Pasta","It's a Red Blends Wine from Spain ,made by Torres","[-2.2826958, -1.6003072, -2.140713, 0.3337516,...",,,"[-3.7205243, 1.7544336, -1.0633206, -2.8278708...","[-2.01938, 0.8432442, 0.27953333, 1.3604186, 3...","[-2.6742000333333333, 0.3324568666666667, -0.9...",0.37657
268,product_3466,chateau-la-cabanne-pomerol-2016-750ml,119.95,https://www.paneco.com.sg/products/chateau-la-...,\nThe 2016 Château La Cabanne checks in as 96%...,,,Dark ruby red,Red cherries and crushed strawberry laced with...,"'Beef', 'Chicken', 'Lamb', 'Venison'",...,"It is Dark ruby red, smells like Red cherries ...","suitable serving for Beef, Chicken, Lamb, Venison","It's a Merlot Wine from France ,made by Chatea...","[-2.3851905, -0.14757854, -0.9029401, 0.815986...",,,"[-2.8800623, 0.26828703, 0.35828483, -4.054077...","[-2.018568, 0.67021245, 0.74248195, 0.7493804,...","[-2.427940266666667, 0.2636403133333333, 0.065...",0.373721
210,product_2933,aristocratico-valpolicella-ripsasso-doc-750ml,42.4,https://www.paneco.com.sg/products/aristocrati...,\nAristocratico Ripasso in an exceptional red ...,,,Ruby red with purple hues,"Complex with notes of violet and spices, ripe ...","'Beef', 'Pasta', 'Lamb', 'Venison'",...,"It is Ruby red with purple hues, smells like C...","suitable serving for Beef, Pasta, Lamb, Venison","It's a Corvina Wine from Italy ,made by Aristo...","[-1.553513, -1.1298779, -1.7593825, 0.8388137,...",,,"[-1.2417691, 0.6267468, -2.0309124, -1.9817634...","[-1.027494, -1.1362634, -0.25760835, -0.168477...","[-1.2742586999999999, -0.5464648333333334, -1....",0.364523
650,product_2597,cantine-san-marzano-the-god-father-salento-igp...,64.95,https://www.paneco.com.sg/products/cantine-san...,\n The God Father displays an...,,,Very deep purple red,,"'Beef', 'Chicken', 'Duck', 'Cheese'",...,It is Very deep purple red,"suitable serving for Beef, Chicken, Duck, Cheese","It's a Cabernet Sauvignon Wine from Italy ,mad...","[-0.9281678, 1.4068283, 0.67409563, 2.050038, ...",,,"[-3.2997918, -0.83056486, -1.7592344, -2.82984...","[1.0018579, 2.4543407, -2.784124, -0.7943627, ...","[-1.0753672333333333, 1.01020138, -1.289754256...",0.350826
1041,product_2078,mezzacorona-cabernet-sauvignon-750ml,28.95,https://www.paneco.com.sg/products/mezzacorona...,\n Surrounded by Italian Alps...,,,Ruby red,"Black currant, blueberry and fresh red fruit n...","'Beef', 'Chicken', 'Duck', 'Cheese'",...,"It is Ruby red, smells like Black currant, blu...","suitable serving for Beef, Chicken, Duck, Cheese","It's a Cabernet Sauvignon Wine from Italy ,mad...","[-1.7628769, -1.6950258, -0.2868221, -0.855529...",,,"[-3.4281995, -1.1630175, -1.2062216, -3.305646...","[0.17817372, -0.09434402, -0.7502748, 0.832353...","[-1.67096756, -0.9841291066666668, -0.74777283...",0.348003
903,product_3934,hardys-hrb-cabernet-sauvignon,79.95,https://www.paneco.com.sg/products/hardys-hrb-...,\n History shows that Thomas ...,,,Brilliant dark red with purple hue,"Lifted mulberries, violets and blackberries wi...","'Beef', 'Chicken', 'Duck', 'Lamb'",...,"It is Brilliant dark red with purple hue, smel...","suitable serving for Beef, Chicken, Duck, Lamb",It's a Cabernet Sauvignon Wine from Australia ...,"[-1.9391962, -1.1473081, -1.8463756, 0.6604227...",,,"[-3.7955904, -1.4523511, -1.4337853, -3.713808...","[-1.0725117, -0.8436068, 2.2711086, 0.9632998,...","[-2.269099433333333, -1.1477553333333332, -0.3...",0.344236


In [137]:
embeddings = pd.read_pickle('data/embeddings.pkl')

In [138]:
product_df_embedding = pd.merge(df, embeddings, on='id', how='inner')

In [140]:
product_df_embedding.to_csv('data/merged_product_info_list_embedding.csv', index=False)