<a href="https://colab.research.google.com/github/HUJameson/Colab/blob/main/aillm_7.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

import os
os.chdir("/content/drive/My Drive/Colab Notebooks")

Mounted at /content/drive


In [None]:
!pip install openai

In [5]:
import openai
from sk_utils import read_sk

openai.api_key = read_sk()

In [7]:
import openai

COMPLETION_MODEL = "text-davinci-003"

def generate_data_by_prompt(prompt):
    response = openai.Completion.create(
        engine=COMPLETION_MODEL,
        prompt=prompt,
        temperature=0.5,
        max_tokens=2048,
        top_p=1,
    )
    return response.choices[0].text

prompt = """请你生成50条淘宝网里的商品的标题，每条在30个字左右，品类是3C数码产品，标题里往往也会有一些促销类的信息，每行一条。"""
data = generate_data_by_prompt(prompt)

In [9]:
import pandas as pd

product_names = data.strip().split('\n')
df = pd.DataFrame({'product_name': product_names})
df.head()

df.product_name = df.product_name.apply(lambda x: x.split('.')[1].strip())
df.head()

Unnamed: 0,product_name
0,【新款】苹果AirPods Pro无线耳机限时特惠
1,【限量】小米电视4A 55英寸4K超清智能电视
2,【热销】华为MateBook X Pro笔记本电脑
3,【独家】荣耀MagicBook 14英寸轻薄笔记本
4,【抢购】OPPO Reno4 Pro 5G全网通手机


In [10]:
clothes_prompt = """请你生成50条淘宝网里的商品的标题，每条在30个字左右，品类是女性的服饰箱包等等，标题里往往也会有一些促销类的信息，每行一条。"""
clothes_data = generate_data_by_prompt(clothes_prompt)
clothes_product_names = clothes_data.strip().split('\n')
clothes_df = pd.DataFrame({'product_name': clothes_product_names})
clothes_df.product_name = clothes_df.product_name.apply(lambda x: x.split('.')[1].strip())
clothes_df.head()

Unnamed: 0,product_name
0,【新款】时尚拼接女士手提单肩包
1,【特惠】复古气质女士钱包
2,【限量】精致编织女士手提包
3,【热销】百搭水洗棉拎包
4,【新品】时尚简约女士双肩包


In [11]:
df = pd.concat([df, clothes_df], axis=0)
df = df.reset_index(drop=True)
display(df)

Unnamed: 0,product_name
0,【新款】苹果AirPods Pro无线耳机限时特惠
1,【限量】小米电视4A 55英寸4K超清智能电视
2,【热销】华为MateBook X Pro笔记本电脑
3,【独家】荣耀MagicBook 14英寸轻薄笔记本
4,【抢购】OPPO Reno4 Pro 5G全网通手机
...,...
95,【促销】百搭时尚女士斜挎包
96,【特惠】精致编织女士单肩包
97,【新款】百搭水洗棉双肩包
98,【热销】时尚简约女士斜挎包


In [13]:
!pip install backoff

Collecting backoff
  Downloading backoff-2.2.1-py3-none-any.whl (15 kB)
Installing collected packages: backoff
Successfully installed backoff-2.2.1


In [14]:
from openai.embeddings_utils import get_embeddings
import openai, backoff

embedding_model = "text-embedding-ada-002"

batch_size = 100

@backoff.on_exception(backoff.expo, openai.error.RateLimitError)
def get_embeddings_with_backoff(prompts, engine):
    embeddings = []
    for i in range(0, len(prompts), batch_size):
        batch = prompts[i:i+batch_size]
        embeddings += get_embeddings(list_of_text=batch, engine=engine)
    return embeddings

prompts = df.product_name.tolist()
prompt_batches = [prompts[i:i+batch_size] for i in range(0, len(prompts), batch_size)]

embeddings = []
for batch in prompt_batches:
    batch_embeddings = get_embeddings_with_backoff(prompts=batch, engine=embedding_model)
    embeddings += batch_embeddings

df["embedding"] = embeddings
df.to_parquet("data/taobao_product_title.parquet", index=False)

In [18]:
from openai.embeddings_utils import get_embedding, cosine_similarity

# search through the reviews for a specific product
def search_product(df, query, n=3, pprint=True):
    product_embedding = get_embedding(
        query,
        engine=embedding_model
    )
    df["similarity"] = df.embedding.apply(lambda x: cosine_similarity(x, product_embedding))

    results = (
        df.sort_values("similarity", ascending=False)
        .head(n)
        .product_name
    )
    if pprint:
        for r in results:
            print(r)
    return results

results = search_product(df, "新潮女士背包", n=3)

【新品】潮流时尚女士单肩包
【新品】潮流时尚女士手提包
【新品】潮流时尚女士手提包


In [20]:
def recommend_product(df, product_name, n=3, pprint=True):
    product_embedding = df[df['product_name'] == product_name].iloc[0].embedding
    df["similarity"] = df.embedding.apply(lambda x: cosine_similarity(x, product_embedding))

    results = (
        df.sort_values("similarity", ascending=False)
        .head(n)
        .product_name
    )
    if pprint:
        for r in results:
            print(r)
    return results

results = recommend_product(df, "荣耀V30 5G全网通手机", n=3)

IndexError: ignored