<a href="https://colab.research.google.com/github/KseniiaRyuma/notebooks/blob/main/ecom.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install cohere pinecone-client datasets

Collecting cohere
  Downloading cohere-4.51-py3-none-any.whl (52 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/52.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m52.0/52.0 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pinecone-client
  Downloading pinecone_client-3.1.0-py3-none-any.whl (210 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.0/211.0 kB[0m [31m13.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.17.1-py3-none-any.whl (536 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m536.7/536.7 kB[0m [31m38.4 MB/s[0m eta [36m0:00:00[0m
Collecting backoff<3.0,>=2.0 (from cohere)
  Downloading backoff-2.2.1-py3-none-any.whl (15 kB)
Collecting fastavro<2.0,>=1.8 (from cohere)
  Downloading fastavro-1.9.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
COHERE_KEY = "fXDGvwuPvSQnv8F6qGoUj8bfvG4YEvvtoAVrhf9z"
PINECONE_KEY = "8b7f687e-06e4-4d4c-b666-0bdac83c04d7"  # app.pinecone.io

## Create Embeddings

In [None]:
import cohere
import numpy as np

co = cohere.Client(COHERE_KEY)

In [None]:
import pandas as pd

products = pd.read_csv('/content/sample_product_descriptions.csv')

In [None]:
products

Unnamed: 0,Product Description
0,Elevate your kitchen tasks with our premium dr...
1,Maximize your beauty routines with the dynamic...
2,Embrace the art of pet care with our exquisite...
3,"Step into the future with our blender, a game-..."
4,Experience unparalleled comfort with these fic...
...,...
495,Embrace the art of home improvement projects w...
496,Revolutionize your sports training with our cu...
497,Maximize your beauty routines with the dynamic...
498,Elevate your athletic activities with our prem...


In [None]:
embeds = co.embed(
    texts=products['Product Description'].tolist(),
    model='embed-multilingual-v2.0',
    truncate='LEFT'
).embeddings

In [None]:
shape = np.array(embeds).shape
shape

(500, 768)

## Storing the Embeddings

In [None]:
from pinecone import Pinecone

pc = Pinecone(api_key=PINECONE_KEY)

index_name = 'products'

# connect to index
index = pc.Index(index_name)

Now we can begin populating the index with our embeddings. Pinecone expects us to provide a list of tuples in the format (id, vector, metadata), where the metadata field is an optional extra field where we can store anything we want in a dictionary format. For this example, we will store the original text of the embeddings.

While uploading our data, we will batch everything to avoid pushing too much data in one go.

In [None]:
ids = [str(id) for id in products.index.tolist()]

batch_size = 16

# create list of (id, vector, metadata) tuples to be upserted
to_upsert = list(zip(ids, embeds))

for i in range(0, shape[0], batch_size):
    i_end = min(i+batch_size, shape[0])
    index.upsert(vectors=to_upsert[i:i_end])

# let's view the index statistics
index.describe_index_stats()

{'dimension': 768,
 'index_fullness': 0.005,
 'namespaces': {'': {'vector_count': 500}},
 'total_vector_count': 500}

In [None]:
# STAGE 1: Run query

# Губная помада
query = "Губная помада"

# create the query embedding
xq = co.embed(
    texts=[query],
    model='embed-multilingual-v2.0',
    truncate='LEFT'
).embeddings

print(np.array(xq).shape)

# query, returning the top 10 most similar results
res = index.query(vector=xq, top_k=5, include_metadata=True)
res

(1, 768)


{'matches': [{'id': '140', 'score': 0.884399, 'values': []},
             {'id': '282', 'score': 0.883328199, 'values': []},
             {'id': '468', 'score': 0.880855203, 'values': []},
             {'id': '200', 'score': 0.876894951, 'values': []},
             {'id': '306', 'score': 0.875544429, 'values': []}],
 'namespace': '',
 'usage': {'read_units': 6}}

In [42]:
# Get the product description for each STAGE 1 result
docs = []
for m in res['matches']:
   docs.append(products.iloc[int(m['id'])]['Product Description'])



In [43]:
docs

['Revolutionize your reading enjoyment with our cutting-edge fiction novel. Experience innovation at its best, offering efficiency and versatility.',
 'Transform your reading enjoyment with the latest fiction novel. Featuring advanced technology, it sets new standards in the industry.',
 'Transform your reading enjoyment with the latest fiction novel. Featuring advanced technology, it sets new standards in the industry.']

In [44]:


import csv
import json

results = []

with open("/content/random_product_searches.csv", 'r') as file:
    csvreader = csv.reader(file)
    header = next(csvreader)

    for row in csvreader:

      d = {'query': row[0], 'relevant_passages': []}
      xq = co.embed(
        texts=[row[0]],
        model='embed-multilingual-v2.0',
        truncate='LEFT'
         ).embeddings

      # query, returning the top 10 most similar results
      res = index.query(vector=xq, top_k=3, include_metadata=True)
      for m in res['matches']:
            d['relevant_passages'].append(products.iloc[int(m['id'])]['Product Description'])

      results.append(d)

#Convert to list json

with open('/content/new_file.jsonl', 'w') as f:
  for ddict in results:
      jout = json.dumps(ddict) + '\n'
      f.write(jout)

In [None]:
# STAGE 1 results contain irrelevant results (e.g., products related to men)
docs

['Revolutionize your playtime with our cutting-edge lipstick. Experience innovation at its best, offering efficiency and versatility.',
 'Revolutionize your daily life with our cutting-edge lipstick. Experience innovation at its best, offering efficiency and versatility.',
 'Revolutionize your home improvement projects with our cutting-edge lipstick. Experience innovation at its best, offering efficiency and versatility.',
 'Elevate your playtime with our premium lipstick. Crafted with excellence, it provides superior quality and unmatched durability.',
 'Transform your home improvement projects with the latest lipstick. Featuring advanced technology, it sets new standards in the industry.']

In [None]:
# query, [docs]

In [None]:
# STAGE 2: Rerank the result
rerank_hits = co.rerank(query=query, documents=docs, top_n=3, model='rerank-multilingual-v2.0')

In [None]:
# As you can see, the rerank model applied semantic search and only kept women's results
rerank_hits

[RerankResult<document['text']: Revolutionize your daily life with our cutting-edge lipstick. Experience innovation at its best, offering efficiency and versatility., index: 1, relevance_score: 0.70801073>, RerankResult<document['text']: Elevate your playtime with our premium lipstick. Crafted with excellence, it provides superior quality and unmatched durability., index: 3, relevance_score: 0.66963685>, RerankResult<document['text']: Revolutionize your playtime with our cutting-edge lipstick. Experience innovation at its best, offering efficiency and versatility., index: 0, relevance_score: 0.53740555>]

In [51]:
with open('/content/productlist.jsonl', 'r') as f:
  import json

  data = json.load(f)


  rows = []

  for line in data["data"]:
    line['relevant_passages'] = [line.pop('relevant_passage')]
    rows.append(json.dumps(line))

with open('/content/new_file.jsonl', 'w') as f:
  f.write('\n'.join(rows))



In [None]:
from google.colab import drive
drive.mount('/content/drive')