# Preprocessing

In [None]:
# Extract the zip archive(s) found in the working directory; because of `&&`,
# the archives are deleted only if the unzip step succeeds.
!unzip *.zip && rm -rf *.zip

Archive:  marketing_sample_for_amazon_com-ecommerce__20200101_20200131__10k_data.csv.zip
  inflating: marketing_sample_for_amazon_com-ecommerce__20200101_20200131__10k_data.csv  


In [None]:
from pathlib import Path

import pandas as pd

# The archive extracted above produces the long "marketing_sample_..." file,
# not "products.csv". Accept either name so a fresh Restart & Run All works
# without a manual rename; "products.csv" keeps priority for setups that
# already renamed the file.
CSV_CANDIDATES = (
    "products.csv",
    "marketing_sample_for_amazon_com-ecommerce__20200101_20200131__10k_data.csv",
)
csv_path = next((name for name in CSV_CANDIDATES if Path(name).is_file()), None)
if csv_path is None:
    raise FileNotFoundError(
        "Dataset CSV not found; expected one of: " + ", ".join(CSV_CANDIDATES)
    )

data = pd.read_csv(csv_path)

In [None]:
# List the column names of the loaded dataset.
data.columns

Index(['Uniq Id', 'Product Name', 'Brand Name', 'Asin', 'Category',
       'Upc Ean Code', 'List Price', 'Selling Price', 'Quantity',
       'Model Number', 'About Product', 'Product Specification',
       'Technical Details', 'Shipping Weight', 'Product Dimensions', 'Image',
       'Variants', 'Sku', 'Product Url', 'Stock', 'Product Details',
       'Dimensions', 'Color', 'Ingredients', 'Direction To Use',
       'Is Amazon Seller', 'Size Quantity Variant', 'Product Description'],
      dtype='object')

In [None]:
# Keep only the five columns the rest of the notebook uses:
# product name, category, selling price, image URL(s) and product URL.
impColumns = data[['Product Name', 'Category', 'Selling Price', 'Image', 'Product Url']]

In [None]:
# Rename positionally, in the order the columns were selected:
# 'Product Name' -> 'prod_name', 'Category' -> 'category', 'Selling Price' -> 'price',
# 'Image' -> 'image', 'Product Url' -> 'product'.
impColumns.columns = ['prod_name', 'category', 'price', 'image', 'product']

In [None]:
# Convert the frame into a list of dicts, one per row, keyed by column name.
corpus = impColumns.to_dict(orient="records")

# Collecting Data

In [None]:
!pip install bs4 tiktoken openai langchain pinecone-client[grpc]

In [None]:
import os
from getpass import getpass

import pinecone

# Credentials come from the environment (with an interactive prompt as a
# fallback) so that real keys are never saved inside the notebook.
pinecone.init(
        api_key=os.environ.get("PINECONE_API_KEY") or getpass("Pinecone API key: "),
        environment=os.environ.get("PINECONE_ENVIRONMENT") or input("Pinecone environment: ")
)

  from tqdm.autonotebook import tqdm


In [None]:
# Create the index only when it does not exist yet, so re-running this cell
# (or the whole notebook) does not fail because the index is already there.
# NOTE: the spelling "recommedation" is kept on purpose - the cell that opens
# the index uses the same string, and the two must match.
if "recommedation" not in pinecone.list_indexes():
    pinecone.create_index(
        "recommedation",
        dimension=1536,  # length of text-embedding-ada-002 vectors
        metric='dotproduct'
    )

In [None]:
import os
from getpass import getpass
from uuid import uuid4

import openai
import pinecone
import tiktoken
from langchain.text_splitter import RecursiveCharacterTextSplitter
from tqdm.auto import tqdm

# Read the OpenAI key from the environment (prompting as a fallback) instead of
# hardcoding it, so the secret is never stored in the notebook file.
openai.api_key = os.environ.get("OPENAI_API_KEY") or getpass("OpenAI API key: ")

In [None]:
# Tokenizer used by tiktoken_len to count tokens.
# NOTE(review): the embedding calls in this notebook use text-embedding-ada-002,
# which OpenAI documents as using the cl100k_base encoding - confirm whether
# p50k_base is really intended here.
tokenizer = tiktoken.get_encoding('p50k_base')

In [None]:
# Open a handle to the "recommedation" index; it is used for the upserts and
# queries in the cells that follow.
index = pinecone.GRPCIndex("recommedation")

In [None]:
def tiktoken_len(self, text=None):
  """Return the number of tokens in a piece of text.

  This is a module-level function, yet its signature carried a stray ``self``
  parameter, so a plain one-argument call such as ``tiktoken_len("some text")``
  raised ``TypeError``. Both call styles are accepted now:

  * ``tiktoken_len(text)`` - the text arrives in the first positional slot.
  * ``tiktoken_len(anything, text)`` - the previous two-argument form; the
    first argument is ignored.

  Args:
    self: The text itself in the one-argument form; an ignored placeholder in
      the two-argument form.
    text: The string to tokenize (two-argument form only).

  Returns:
    int: Number of tokens produced by the module-level ``tokenizer``.
  """
  if text is None:
    # One-argument call: the text was bound to the first parameter.
    text = self
  tokens = tokenizer.encode(
      text,
      # No special-token strings are disallowed: they are encoded as ordinary text.
      disallowed_special=()
  )
  return len(tokens)

In [None]:
# Peek at the first record to check its fields.
corpus[0]

{'prod_name': 'DB Longboards CoreFlex Crossbow 41" Bamboo Fiberglass Longboard Complete',
 'category': 'Sports & Outdoors | Outdoor Recreation | Skates, Skateboards & Scooters | Skateboarding | Standard Skateboards & Longboards | Longboards',
 'price': '$237.68',
 'image': 'https://images-na.ssl-images-amazon.com/images/I/51j3fPQTQkL.jpg|https://images-na.ssl-images-amazon.com/images/I/31hKM3cSoSL.jpg|https://images-na.ssl-images-amazon.com/images/I/51WlHdwghfL.jpg|https://images-na.ssl-images-amazon.com/images/I/51FsyLRBzwL.jpg|https://images-na.ssl-images-amazon.com/images/G/01/x-locale/common/transparent-pixel.jpg',
 'product': 'https://www.amazon.com/DB-Longboards-CoreFlex-Fiberglass-Longboard/dp/B07KMVJJK7'}

In [None]:
def process(data):
  """Turn product records into chunk dicts ready for embedding.

  For every record in ``data`` the five product fields are copied over, a
  fresh random UUID string is stored under 'id', and the record's position in
  ``data`` is stored under 'chunk'. A tqdm progress bar tracks the iteration.

  Returns:
    list[dict]: One chunk dict per input record, in input order.
  """
  copied_fields = ('prod_name', 'category', 'price', 'image', 'product')
  return [
      {
          'id': str(uuid4()),
          **{field: item[field] for field in copied_fields},
          'chunk': position,
      }
      for position, item in enumerate(tqdm(data))
  ]

In [None]:
# Build the chunk dicts (one per product record) for the whole corpus.
chunks = process(corpus)

  0%|          | 0/10002 [00:00<?, ?it/s]

In [None]:
import time

def _embed_with_retry(texts, max_retries, retry_delay):
  """Embed `texts` with text-embedding-ada-002, retrying failed calls a bounded number of times."""
  for attempt in range(max_retries + 1):
    try:
      return openai.Embedding.create(input=texts, engine="text-embedding-ada-002")
    except Exception:
      # `Exception` (not a bare except) so Ctrl-C / SystemExit still stop the cell.
      if attempt == max_retries:
        # Permanent failures (bad key, invalid input, ...) surface instead of
        # looping forever.
        raise
      time.sleep(retry_delay)


def create_embeddings(chunks, batch_size=100, max_retries=10, retry_delay=5):
  """Embed each chunk's product name and upsert the vectors into Pinecone.

  Chunks are processed in batches: the 'prod_name' of every chunk in a batch is
  embedded with text-embedding-ada-002, and the resulting vectors are upserted
  into the module-level ``index`` together with the chunk's id and metadata.

  Args:
    chunks: List of chunk dicts as produced by ``process`` (keys 'id',
      'prod_name', 'category', 'price', 'image', 'product', 'chunk').
    batch_size: How many embeddings are created and upserted at once.
    max_retries: How many times a failed embedding request is retried (for
      example after a rate-limit error) before the error is re-raised.
    retry_delay: Seconds to wait between retries.

  Raises:
    ValueError: If ``batch_size`` is smaller than 1.
    Exception: The last embedding error once ``max_retries`` is exhausted.
  """
  if batch_size < 1:
    raise ValueError("batch_size must be at least 1")

  metadata_keys = ('id', 'prod_name', 'category', 'price', 'image', 'product', 'chunk')

  for start in tqdm(range(0, len(chunks), batch_size)):
    # Slicing past the end of the list is safe, so the last batch is simply shorter.
    batch = chunks[start:start + batch_size]
    ids = [chunk['id'] for chunk in batch]
    texts = [chunk['prod_name'] for chunk in batch]

    response = _embed_with_retry(texts, max_retries, retry_delay)
    embeds = [record['embedding'] for record in response['data']]

    # Store only the known fields as metadata.
    metadata = [{key: chunk[key] for key in metadata_keys} for chunk in batch]

    # Upsert (id, vector, metadata) triples to Pinecone.
    index.upsert(vectors=list(zip(ids, embeds, metadata)))

In [None]:
# Embed every chunk's product name and upsert the vectors, with their metadata,
# into the Pinecone index.
create_embeddings(chunks)

  0%|          | 0/101 [00:00<?, ?it/s]

In [None]:
def query(query, top_k=5):
  """Return the products in the Pinecone index that best match a text query.

  The query text is embedded with text-embedding-ada-002 and the resulting
  vector is used to search the module-level ``index``.

  Args:
    query: The user's search text. (Inside this function the name shadows the
      function itself; it is kept because it is part of the signature.)
    top_k: Number of matches to retrieve. Defaults to 5, the value that was
      previously hard-coded.

  Returns:
    The Pinecone query response; each entry of its 'matches' includes the
    metadata stored for that vector.
  """
  embedding_response = openai.Embedding.create(
      input=[query],
      engine="text-embedding-ada-002"
  )

  # A single input was sent, so the first (only) entry holds the query vector.
  query_vector = embedding_response['data'][0]['embedding']

  # Retrieve the nearest stored product vectors together with their metadata.
  return index.query(query_vector, top_k=top_k, include_metadata=True)

# Chat Interface

In [None]:
!pip install gradio

Installing collected packages: pydub, ffmpy, websockets, uc-micro-py, semantic-version, python-multipart, orjson, markdown-it-py, h11, aiofiles, uvicorn, starlette, mdit-py-plugins, linkify-it-py, huggingface-hub, httpcore, httpx, fastapi, gradio-client, gradio
  Attempting uninstall: markdown-it-py
    Found existing installation: markdown-it-py 3.0.0
    Uninstalling markdown-it-py-3.0.0:
      Successfully uninstalled markdown-it-py-3.0.0
Successfully installed aiofiles-23.1.0 fastapi-0.98.0 ffmpy-0.3.0 gradio-3.35.2 gradio-client-0.2.7 h11-0.14.0 httpcore-0.17.2 httpx-0.24.1 huggingface-hub-0.15.1 linkify-it-py-2.0.2 markdown-it-py-2.2.0 mdit-py-plugins-0.3.3 orjson-3.9.1 pydub-0.25.1 python-multipart-0.0.6 semantic-version-2.10.0 starlette-0.27.0 uc-micro-py-1.0.2 uvicorn-0.22.0 websockets-11.0.3


In [None]:
import gradio as gr

def add_text(history, text):
    """Append the user's message to the chat history as an unanswered turn.

    The input list is left untouched; a new list is returned whose last entry
    is ``(text, None)``. The second return value is an empty string, which the
    submit handler writes back to the textbox.
    """
    pending_turn = (text, None)
    return history + [pending_turn], ""

def add_file(history, file):
    """Append an uploaded file to the chat history as an unanswered turn.

    The file is recorded as a one-element tuple holding ``file.name``. The
    input list is not modified; a new list is returned.
    """
    file_turn = ((file.name,), None)
    return history + [file_turn]

def bot(history):
    """Answer the latest user message with product recommendations.

    The last turn's user message is sent to ``query``; the metadata of the
    returned matches is formatted as a numbered list (name, price, link) and
    stored as the bot's reply for that turn.

    Args:
        history: Chat history, a list of ``(user_message, bot_message)`` turns.

    Returns:
        The same history list with the last turn's reply filled in. An empty
        history is returned unchanged.
    """
    def get_response(queryy):
        results = query(queryy)
        recommendations = [_match['metadata'] for _match in results['matches']]
        prod_names = "\n".join([f"{idx+1}- {prod['prod_name']}. \n Price {prod['price']}. Product Link: {prod['product']}" for idx, prod in enumerate(recommendations)])
        return f"AI:\n{prod_names}"

    if not history:
        # Nothing to answer yet.
        return history

    user_message = history[-1][0]
    # Replace the whole turn rather than assigning to history[-1][1]: add_text
    # stores turns as tuples, which do not support item assignment.
    history[-1] = [user_message, get_response(user_message)]
    return history

# Assemble the chat UI: a chatbot panel with a text input beneath it.
with gr.Blocks() as demo:
    chatbot = gr.Chatbot([], elem_id="chatbot").style(
        height=350, max_height=500, overflow='scroll'
    )

    txt = gr.Textbox(
        show_label=False,
        placeholder="Enter text and press enter",
    ).style(container=False)

    # On submit: first record the message and reset the textbox, then let the
    # bot fill in its answer for that turn.
    message_added = txt.submit(fn=add_text, inputs=[chatbot, txt], outputs=[chatbot, txt])
    message_added.then(fn=bot, inputs=chatbot, outputs=chatbot)

# debug=True keeps the cell running so errors and logs stay visible;
# share=True exposes the app through a temporary public URL.
demo.launch(debug=True, share=True)




Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
Running on public URL: https://19c8fd27656673e947.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://19c8fd27656673e947.gradio.live


