<a href="https://colab.research.google.com/github/HUJameson/Colab/blob/main/aillm_5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [9]:
from google.colab import drive
drive.mount('/content/drive')

import os
os.chdir("/content/drive/My Drive/Colab Notebooks")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [10]:
from sklearn.datasets import fetch_20newsgroups
import pandas as pd

def twenty_newsgroup_to_csv():
    newsgroups_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))

    df = pd.DataFrame([newsgroups_train.data, newsgroups_train.target.tolist()]).T
    df.columns = ['text', 'target']

    targets = pd.DataFrame( newsgroups_train.target_names, columns=['title'])

    out = pd.merge(df, targets, left_on='target', right_index=True)
    out.to_csv('data/20_newsgroup2.csv', index=False)

twenty_newsgroup_to_csv()

In [None]:
!pip install openai
!pip install tiktoken
!pip install backoff

In [None]:
import json
f = open('data/openai_sk.json', 'r')
content = f.read()
a = json.loads(content)
f.close()
openai_sk = a['sk']
print(openai_sk)

In [12]:
import tiktoken

embedding_encoding = "cl100k_base"  # this the encoding for text-embedding-ada-002
max_tokens = 8000  # the maximum for text-embedding-ada-002 is 8191

df = pd.read_csv('data/20_newsgroup.csv')
print("Number of rows before null filtering:", len(df))
df = df[df['text'].isnull() == False]
encoding = tiktoken.get_encoding(embedding_encoding)

df["n_tokens"] = df.text.apply(lambda x: len(encoding.encode(x)))
print("Number of rows before token number filtering:", len(df))
df = df[df.n_tokens <= max_tokens]
print("Number of rows data used:", len(df))
df = df.sample(100)
print("Number of rows data sampled:", len(df))
print(df)

Number of rows before null filtering: 11314
Number of rows before token number filtering: 11096
Number of rows data used: 11044
Number of rows data sampled: 100
                                                    text  target  \
6359   Your posting provoked me into checking my save...       0   
3897   I read about the development of EISA-2 some ti...       3   
1245   \n   Just because the 68070 can run upto 15Mhz...       1   
11303  I guess the cryptowranglers read this group to...      11   
9443   \nThis is a ridiculous argument for being a Ch...      15   
...                                                  ...     ...   
2126   Reported yesterday in the Washington Post (Kat...      14   
7576   Hi,\n\nI just compiled the X11R5 distribution ...       5   
2361   \n\nCNN just claimed he bought 104 "semi-autom...      16   
223    \nHow about those toneau covers? I've been thi...       7   
10514  # \n# #Are you saying that:\n# \n# #(1) People...      18   

                      

In [13]:
from openai.embeddings_utils import get_embeddings
import openai, backoff

@backoff.on_exception(backoff.expo, openai.error.RateLimitError)
def get_embeddings_with_backoff(prompts, engine):
    embeddings = []
    for i in range(0, len(prompts), batch_size):
        batch = prompts[i:i+batch_size]
        embeddings += get_embeddings(list_of_text=batch, engine=engine)

    return embeddings

batch_size = 50
prompts = df.text.tolist()
prompt_batches = [prompts[i:i+batch_size] for i in range(0, len(prompts), batch_size)]

openai.api_key = openai_sk
embedding_model = "text-embedding-ada-002"

embeddings = []
for batch in prompt_batches:
    batch_embeddings = get_embeddings_with_backoff(prompts=batch, engine=embedding_model)
    embeddings += batch_embeddings

df["embedding"] = embeddings
df.to_parquet("data/20_newsgroup_with_embedding2.parquet", index=False)

In [14]:
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans

embedding_df = pd.read_parquet("data/20_newsgroup_with_embedding2.parquet")

matrix = np.vstack(embedding_df.embedding.values)
num_of_clusters = 20

kmeans = KMeans(n_clusters=num_of_clusters, init="k-means++", n_init=10, random_state=42)
kmeans.fit(matrix)
labels = kmeans.labels_
embedding_df["cluster"] = labels

# 统计每个cluster的数量
new_df = embedding_df.groupby('cluster')['cluster'].count().reset_index(name='count')

# 统计这个cluster里最多的分类的数量
title_count = embedding_df.groupby(['cluster', 'title']).size().reset_index(name='title_count')
first_titles = title_count.groupby('cluster').apply(lambda x: x.nlargest(1, columns=['title_count']))
first_titles = first_titles.reset_index(drop=True)
new_df = pd.merge(new_df, first_titles[['cluster', 'title', 'title_count']], on='cluster', how='left')
new_df = new_df.rename(columns={'title': 'rank1', 'title_count': 'rank1_count'})

# 统计这个cluster里第二多的分类的数量
second_titles = title_count[~title_count['title'].isin(first_titles['title'])]
second_titles = second_titles.groupby('cluster').apply(lambda x: x.nlargest(1, columns=['title_count']))
second_titles = second_titles.reset_index(drop=True)
new_df = pd.merge(new_df, second_titles[['cluster', 'title', 'title_count']], on='cluster', how='left')
new_df = new_df.rename(columns={'title': 'rank2', 'title_count': 'rank2_count'})
new_df['first_percentage'] = (new_df['rank1_count'] / new_df['count']).map(lambda x: '{:.2%}'.format(x))
# 将缺失值替换为 0
new_df.fillna(0, inplace=True)
# 输出结果
from IPython.display import display
display(new_df)

Unnamed: 0,cluster,count,rank1,rank1_count,rank2,rank2_count,first_percentage
0,0,2,alt.atheism,1,sci.med,1.0,50.00%
1,1,1,talk.religion.misc,1,0,0.0,100.00%
2,2,7,sci.electronics,2,sci.med,2.0,28.57%
3,3,5,rec.sport.baseball,3,rec.sport.hockey,1.0,60.00%
4,4,6,rec.autos,2,sci.med,1.0,33.33%
5,5,9,comp.os.ms-windows.misc,3,comp.sys.mac.hardware,2.0,33.33%
6,6,5,rec.sport.baseball,4,rec.sport.hockey,1.0,80.00%
7,7,3,rec.motorcycles,3,0,0.0,100.00%
8,8,3,alt.atheism,1,talk.politics.misc,1.0,33.33%
9,9,2,comp.graphics,1,comp.sys.ibm.pc.hardware,1.0,50.00%


In [24]:
import time

items_per_cluster = 1
COMPLETIONS_MODEL = "text-davinci-003"

for i in range(num_of_clusters):
    cluster_name = new_df[new_df.cluster == i].iloc[0].rank1
    print(f"Cluster {i}, Rank 1: {cluster_name}, Theme:", end=" ")

    content = "\n".join(
        embedding_df[embedding_df.cluster == i].text.sample(items_per_cluster, random_state=42).values
    )
    response = openai.Completion.create(
        model=COMPLETIONS_MODEL,
        prompt=f'''我们想要给下面的内容，分组成有意义的类别，以便我们可以对其进行总结。请根据下面这些内容的共同点，总结一个50个字以内的新闻组的名称。比如 “PC硬件”\n\n内容:\n"""\n{content}\n"""新闻组名称：''',
        temperature=0,
        max_tokens=100,
        top_p=1,
    )
    print(response["choices"][0]["text"].replace("\n", ""))
    time.sleep(20)

Cluster 0, Rank 1: alt.atheism, Theme: 性别反应研究
Cluster 1, Rank 1: talk.religion.misc, Theme: 洛可可研究组
Cluster 2, Rank 1: sci.electronics, Theme: PC软件和硬件
Cluster 3, Rank 1: rec.sport.baseball, Theme: Hall of Fame候选人
Cluster 4, Rank 1: rec.autos, Theme: 女性摄影
Cluster 5, Rank 1: comp.os.ms-windows.misc, Theme: 农业和马术
Cluster 6, Rank 1: rec.sport.baseball, Theme: 巨人五人轮换投手
Cluster 7, Rank 1: rec.motorcycles, Theme: 汽车拥有者经验
Cluster 8, Rank 1: alt.atheism, Theme: 政府权力与自由主义
Cluster 9, Rank 1: comp.graphics, Theme: 屏幕截图技术
Cluster 10, Rank 1: sci.crypt, Theme: 加密技术与隐私保护
Cluster 11, Rank 1: soc.religion.christian, Theme: 宗教信仰与永恒生命
Cluster 12, Rank 1: rec.motorcycles, Theme: 友谊之间的互动
Cluster 13, Rank 1: soc.religion.christian, Theme: 接受自我：重新思考
Cluster 14, Rank 1: comp.windows.x, Theme: 实时支持X-Windows
Cluster 15, Rank 1: talk.politics.guns, Theme: 政治行动
Cluster 16, Rank 1: comp.graphics, Theme: 电话线路探测
Cluster 17, Rank 1: comp.graphics, Theme: PC性能提升
Cluster 18, Rank 1: rec.autos, Theme: 汽车配置选项
Cluster 19, 

In [26]:
import time

items_per_cluster = 1
COMPLETIONS_MODEL = "text-davinci-003"

for i in range(num_of_clusters):
    cluster_name = new_df[new_df.cluster == i].iloc[0].rank1
    print(f"Cluster {i}, Rank 1: {cluster_name}, 抽样翻译:", end=" ")

    content = "\n".join(
        #embedding_df[(embedding_df.cluster == i) & (embedding_df.n_tokens > 100)].text.sample(items_per_cluster, random_state=42).values
        embedding_df[embedding_df.cluster == i].text.sample(items_per_cluster, random_state=42).values
    )
    response = openai.Completion.create(
        model=COMPLETIONS_MODEL,
        prompt=f'''请把下面的内容翻译成中文\n\n内容:\n"""\n{content}\n"""翻译：''',
        temperature=0,
        max_tokens=2000,
        top_p=1,
    )
    print(response["choices"][0]["text"].replace("\n", ""))
    time.sleep(20)

Cluster 0, Rank 1: alt.atheism, 抽样翻译: 那很有趣。我想知道，在和男人一个沮丧的夜晚之后，女性的反应是否被记录下来？这被认为是重要的吗？
Cluster 1, Rank 1: talk.religion.misc, 抽样翻译: 肯特：尊重你的意见，你一开始就把名字搞错了，我怎么会把你当回事？例如：没有这种东西。正确的名字是古老神秘的十字玫瑰团，缩写为AMORC。也没有这种东西。它是罗斯德鲁西安会。他们明确表示，他们不声称自己是传说中的法玛兄弟会的后裔。Lectorium？还有谁？这些都不是罗斯德鲁西安“团体”。他们是共济会的学习小组，没有一个声称自己是原始秩序的后裔。ORC是什么？如果你指的是AMORC，你甚至都没有学会正确的名字？！
Cluster 2, Rank 1: sci.electronics, 抽样翻译: 出售三款IBM PC及兼容计算机软件：o Wing Commander豪华版，包括秘密任务1和2，所有原装包装、说明书和磁盘，还有注册卡（可以把它寄出去，以你的名字注册），原价Wing Commander：69.95美元，秘密任务1：29.95美元，秘密任务2：29.95美元，总原价：129.85美元，我出售这些游戏的价格是65.00美元。Wing Commander和秘密任务是一款太空战斗和飞行模拟器，包括所有关于飞行模拟器的标准乐趣，比如起飞和降落在航母上，飞行等，更棒的是，它也是一款战斗模拟器，非常有趣。需要IBM PC或兼容机，至少640K内存，双软盘或硬盘。o WinWay Resume for Windows，包括所有原装包装、说明书和磁盘，原价50.00美元，我的要价是35.00美元。WinWay Resume是一款Windows简历写作程序，是一款优秀的程序（它帮我找到了一份工作！），在Windows界面下运行，使用起来非常容易。只需回答几个问题，就可以打印出结果。只需几分钟，就可以拥有一份精美、专业的简历。需要IBM PC，安装Windows 3.0或更高版本，1MB硬盘空间。o More Typefaces，包括所有原装包装、说明书和磁盘，原价99.99美元，我的要价是30.00美元。More Typefaces是一款为Windows 3.1设计的三个TypeType字体族（共12种字体）的软件包，

In [27]:
history = """User : 你是谁？
Assistant : 我是一个人工智能语言模型，可以回答各种问题，包括做菜的问题。

User : 请问鱼香肉丝怎么做？
Assistant : 鱼香肉丝的做法：

1. 猪里脊肉切丝，加入盐、淀粉、料酒搅拌均匀，腌制10分钟
2. 胡萝卜、青椒、木耳切丝备用
3. 热锅凉油，放入蒜末、姜末煸炒出香味，加入肉丝翻炒至变色
4. 加入胡萝卜、青椒、木耳翻炒，加入酱油、糖、醋、盐、味精、水淀粉搅拌均匀，倒入锅中翻炒均匀即可。

提示：鱼香肉丝的关键是调料的配比，酱油、糖、醋、盐、味精的比例要适合自己的口味。

User : 那蚝油牛肉呢？
Assistant : 蚝油牛肉的做法：

1. 将牛肉切成薄片，加入盐、淀粉、料酒腌制10分钟
2. 葱姜切末，蒜切片，青红椒切丝备用
3. 热锅凉油，放入葱姜末、蒜片煸炒出香味，加入牛肉翻炒至变色
4. 加入蚝油、酱油、糖、水淀粉调味，加入青红椒翻炒均匀即可。

提示：蚝油牛肉的关键是牛肉要切薄，翻炒时火候要快，保证牛肉口感鲜嫩。调味时，蚝油和酱油的比例也要适合自己的口味。
"""

def summarize(text, max_tokens=200):
    response = openai.Completion.create(
        model=COMPLETIONS_MODEL,
        prompt=text + "\n\n请总结一下上面User和Assistant聊了些什么：\n",
        max_tokens=max_tokens,
    )
    return response["choices"][0]["text"]

summarized = summarize(history)
print(summarized)


User和Assistant聊了鱼香肉丝和蚝油牛肉的做法，并分别介绍了调料的配比和切肉的要求，还提示调料的比例和火候要适合自己的口味。


In [28]:
class Conversation:
    def __init__(self, prompt, num_of_round):
        self.prompt = prompt
        self.num_of_round = num_of_round
        self.messages = []
        self.messages.append({"role": "system", "content": self.prompt})

    def ask(self, question):
        try:
            self.messages.append({"role": "user", "content": question})
            response = openai.ChatCompletion.create(
                model="gpt-3.5-turbo",
                messages=self.messages,
                temperature=0.5,
                max_tokens=2048,
                top_p=1,
            )
        except Exception as e:
            print(e)
            return e

        message = response["choices"][0]["message"]["content"]
        self.messages.append({"role": "assistant", "content": message})

        if len(self.messages) > self.num_of_round*2 + 1:
            del self.messages[1:3]
        return message

In [29]:
prompt = summarized + "\n\n请你根据已经聊了的内容，继续对话："
conversation = Conversation(prompt, 5)

question = "那宫保鸡丁呢？"
answer = conversation.ask(question)
print("User : %s" % question)
print("Assistant : %s\n" % answer)

User : 那宫保鸡丁呢？
Assistant : 宫保鸡丁是一道非常经典的川菜，口味麻辣香酥，非常美味。它的主要调料包括郫县豆瓣酱、花椒、干辣椒、酱油、白糖、醋等。一般来说，郫县豆瓣酱和花椒是宫保鸡丁的关键调料，它们能给菜肴带来独特的麻辣味道。

在制作宫保鸡丁时，首先需要将鸡肉切成丁状，大小均匀。切鸡肉的时候，要先将鸡肉稍微冷冻一下，这样切起来更容易保持肉质的鲜嫩。然后，将切好的鸡肉用料酒、生抽、淀粉等腌制一段时间，使其入味。

接下来，将花椒和干辣椒用油炸香，然后捞出备用。锅中加入适量的油，放入葱姜蒜爆炒，再加入郫县豆瓣酱炒出红油。然后将腌制好的鸡肉倒入锅中快炒，待鸡肉变色后，加入适量的酱油、白糖、醋翻炒均匀。最后，加入炸好的花椒和干辣椒，翻炒均匀即可。

调料的配比和火候可以根据个人口味进行调整，如果喜欢麻辣重口味，可以多放一些郫县豆瓣酱和花椒；如果喜欢酸甜口味，可以适量增加醋和白糖的比例。记得根据自己的口味来调整哦！



In [30]:
conversation = Conversation("请你根据已经聊了的内容，继续对话：", 5)

question = "那宫保鸡丁呢？"
answer = conversation.ask(question)
print("User : %s" % question)
print("Assistant : %s\n" % answer)

User : 那宫保鸡丁呢？
Assistant : 宫保鸡丁是一道非常经典的中式菜肴。它由鸡肉丁、花生米、辣椒和葱姜蒜等调料炒制而成。这道菜以其麻辣味道和香脆口感而闻名。宫保鸡丁的做法也有一些变化，有些地方会加入木耳、胡萝卜等食材，以增加口感和营养价值。你喜欢吃宫保鸡丁吗？

