### 文本聚类与摘要，让AI帮你做个总结

In [1]:

from sklearn.datasets import fetch_20newsgroups
import pandas as pd

def twenty_newsgroup_to_csv():
    newsgroups_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))

    df = pd.DataFrame([newsgroups_train.data, newsgroups_train.target.tolist()]).T
    df.columns = ['text', 'target']

    targets = pd.DataFrame( newsgroups_train.target_names, columns=['title'])

    out = pd.merge(df, targets, left_on='target', right_index=True)
    out.to_csv('20_newsgroup.csv', index=False)
    
twenty_newsgroup_to_csv()

In [2]:
from openai.embeddings_utils import get_embeddings
import openai, os, tiktoken, backoff

openai.api_key = os.environ.get("OPENAI_API_KEY")
embedding_model = "text-embedding-ada-002"
embedding_encoding = "cl100k_base"  # this the encoding for text-embedding-ada-002
batch_size = 2000
max_tokens = 8000  # the maximum for text-embedding-ada-002 is 8191

df = pd.read_csv('20_newsgroup.csv')
print("Number of rows before null filtering:", len(df))
df = df[df['text'].isnull() == False]
encoding = tiktoken.get_encoding(embedding_encoding)

df["n_tokens"] = df.text.apply(lambda x: len(encoding.encode(x)))
print("Number of rows before token number filtering:", len(df))
df = df[df.n_tokens <= max_tokens]
print("Number of rows data used:", len(df))

Number of rows before null filtering: 11314
Number of rows before token number filtering: 11096
Number of rows data used: 11044


In [6]:

@backoff.on_exception(backoff.expo, openai.error.RateLimitError)
def get_embeddings_with_backoff(prompts, engine):
    embeddings = []
    for i in range(0, len(prompts), batch_size):
        batch = prompts[i:i+batch_size]
        embeddings += get_embeddings(list_of_text=batch, engine=engine)
    return embeddings

prompts = df.text.tolist()
prompt_batches = [prompts[i:i+batch_size] for i in range(0, len(prompts), batch_size)]

embeddings = []
for batch in prompt_batches:
    batch_embeddings = get_embeddings_with_backoff(prompts=batch, engine=embedding_model)
    embeddings += batch_embeddings

df["embedding"] = embeddings
df.to_parquet("data/20_newsgroup_with_embedding.parquet", index=False)

In [7]:
import numpy as np
from sklearn.cluster import KMeans

embedding_df = pd.read_parquet("data/20_newsgroup_with_embedding.parquet")

matrix = np.vstack(embedding_df.embedding.values)
num_of_clusters = 20

kmeans = KMeans(n_clusters=num_of_clusters, init="k-means++", n_init=10, random_state=42)
kmeans.fit(matrix)
labels = kmeans.labels_
embedding_df["cluster"] = labels

In [8]:

# 统计每个cluster的数量
new_df = embedding_df.groupby('cluster')['cluster'].count().reset_index(name='count')

# 统计这个cluster里最多的分类的数量
title_count = embedding_df.groupby(['cluster', 'title']).size().reset_index(name='title_count')
first_titles = title_count.groupby('cluster').apply(lambda x: x.nlargest(1, columns=['title_count']))
first_titles = first_titles.reset_index(drop=True)
new_df = pd.merge(new_df, first_titles[['cluster', 'title', 'title_count']], on='cluster', how='left')
new_df = new_df.rename(columns={'title': 'rank1', 'title_count': 'rank1_count'})

# 统计这个cluster里第二多的分类的数量
second_titles = title_count[~title_count['title'].isin(first_titles['title'])]
second_titles = second_titles.groupby('cluster').apply(lambda x: x.nlargest(1, columns=['title_count']))
second_titles = second_titles.reset_index(drop=True)
new_df = pd.merge(new_df, second_titles[['cluster', 'title', 'title_count']], on='cluster', how='left')
new_df = new_df.rename(columns={'title': 'rank2', 'title_count': 'rank2_count'})
new_df['first_percentage'] = (new_df['rank1_count'] / new_df['count']).map(lambda x: '{:.2%}'.format(x))
# 将缺失值替换为 0
new_df.fillna(0, inplace=True)
# 输出结果
display(new_df)

To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  second_titles = second_titles.groupby('cluster').apply(lambda x: x.nlargest(1, columns=['title_count']))


Unnamed: 0,cluster,count,rank1,rank1_count,rank2,rank2_count,first_percentage
0,0,556,misc.forsale,452,0,0.0,81.29%
1,1,387,rec.motorcycles,369,0,0.0,95.35%
2,2,477,sci.space,432,0,0.0,90.57%
3,3,1164,talk.politics.misc,152,talk.religion.misc,66.0,13.06%
4,4,84,comp.os.ms-windows.misc,8,talk.religion.misc,2.0,9.52%
5,5,463,comp.windows.x,419,0,0.0,90.50%
6,6,480,alt.atheism,218,talk.religion.misc,69.0,45.42%
7,7,891,comp.graphics,367,talk.religion.misc,1.0,41.19%
8,8,387,sci.crypt,372,0,0.0,96.12%
9,9,512,rec.sport.baseball,494,0,0.0,96.48%


In [10]:

items_per_cluster = 10
COMPLETIONS_MODEL = "text-davinci-003"

for i in range(num_of_clusters):
    cluster_name = new_df[new_df.cluster == i].iloc[0].rank1
    print(f"Cluster {i}, Rank 1: {cluster_name}, Theme:", end=" ")

    content = "\n".join(
        embedding_df[embedding_df.cluster == i].text.sample(items_per_cluster, random_state=42).values
    )
    response = openai.Completion.create(
        model=COMPLETIONS_MODEL,
        prompt=f'''我们想要给下面的内容，分组成有意义的类别，以便我们可以对其进行总结。请根据下面这些内容的共同点，总结一个50个字以内的新闻组的名称。比如 “PC硬件”\n\n内容:\n"""\n{content}\n"""新闻组名称：''',
        temperature=0,
        max_tokens=100,
        top_p=1,
    )
    print(response["choices"][0]["text"].replace("\n", ""))

Cluster 0, Rank 1: misc.forsale, Theme: 电脑软件和硬件
Cluster 1, Rank 1: rec.motorcycles, Theme: 骑行安全
Cluster 2, Rank 1: sci.space, Theme: 航天技术研究
Cluster 3, Rank 1: talk.politics.misc, Theme: 政治争议
Cluster 4, Rank 1: comp.os.ms-windows.misc, Theme: 科技产品"""
Cluster 5, Rank 1: comp.windows.x, Theme: PC硬件和窗口管理
Cluster 6, Rank 1: alt.atheism, Theme: 宗教信仰与经验
Cluster 7, Rank 1: comp.graphics, Theme: 软件分享
Cluster 8, Rank 1: sci.crypt, Theme: 数字加密安全
Cluster 9, Rank 1: rec.sport.baseball, Theme: 棒球讨论
Cluster 10, Rank 1: talk.politics.mideast, Theme: 

InvalidRequestError: This model's maximum context length is 4097 tokens, however you requested 5318 tokens (5218 in your prompt; 100 for the completion). Please reduce your prompt; or completion length.

In [11]:

items_per_cluster = 1
COMPLETIONS_MODEL = "text-davinci-003"

for i in range(num_of_clusters):
    cluster_name = new_df[new_df.cluster == i].iloc[0].rank1
    print(f"Cluster {i}, Rank 1: {cluster_name}, 抽样翻译:", end=" ")

    content = "\n".join(
        embedding_df[(embedding_df.cluster == i) & (embedding_df.n_tokens > 100)].text.sample(items_per_cluster, random_state=42).values
    )
    response = openai.Completion.create(
        model=COMPLETIONS_MODEL,
        prompt=f'''请把下面的内容翻译成中文\n\n内容:\n"""\n{content}\n"""翻译：''',
        temperature=0,
        max_tokens=2000,
        top_p=1,
    )
    print(response["choices"][0]["text"].replace("\n", ""))

Cluster 0, Rank 1: misc.forsale, 抽样翻译: 出售房屋！！！！新泽西州梅尔斯维尔16号布罗克顿路描述：美丽的3间卧室，1 1/2浴室的斗篷码头位于一个大型雅致的景观拐角处，有围栏。这个家有一个吃饭的厨房，有内置的角落瓷器橱柜，一个大客厅，地毯，硬木地板，新的瓷砖门厅，和新刷的中性色调装饰。这个家包括新的中央空调和暖气，新屋顶，新热水器，铝外墙，风暴窗户和门，以及Rockwell外墙隔热。还有一个新的部分完成的地下室，有外部入口和新的Duro棚屋。很多存储空间。靠近295号公路。额外：洗碗机，洗衣机和烘干机，吊扇，窗帘。请拨打（609）586-1946预约。
Cluster 1, Rank 1: rec.motorcycles, 抽样翻译: 下次让数字更加可信——这是一个很糟糕的煽动性言论。120分贝接近于从几码远的地方听到的巨型喷气式飞机发动机起飞时的声音。它肯定接近人类的痛苦阈值。如果他们有任何标准，那么没有办法允许110分贝。
Cluster 2, Rank 1: sci.space, 抽样翻译: 天文学符号（这是最后一个发布到sci.astro的FAQ部分）从sci.astro的各种符号发布中收集。光谱分类序列：O B A F G K M R N S噢，要成为一个美丽的女孩，现在就亲亲我，甜心。（经典）奥德尔的大天文学悲剧，现在肯定会杀死我肥胖的秃头天文学被判有罪，杀死了许多不情愿的非科学学生。octopus大脑，一个受欢迎的烹饪厨房菜单，不需要酱汁奇怪的天文学家发现一般都是怪诞的符号，真是太棒了噢，大而凶猛的大猩猩，下个星期杀死我的室友噢，天哪，一个F级杀死我在糟糕的下午，发酵的葡萄让理查德·尼克松太太微笑噢，向后的天文学家，忘记地心说；开普勒的运动揭示了自然的简单我们糟糕的天文学教授星期一被杀烤蚂蚁，慢慢煎，保湿，保持天然的鲜味海外广播：一个闪光！哥斯拉杀死莫斯拉！（罗丹被任命为继任者）超重的男孩和胖女孩一直在嚼只有无聊的天文学家才能找到知道符号的满足感噢，血腥的天文学！F级杀死我行星的顺序：太阳水星金星地球（泰拉）火星（小行星）木星土星天王星海王星冥王星我非常认真的母亲刚刚给我们九个披萨母亲非常体贴地做了一个果冻三明治，没有抗议我非常性感的伴侣兴致勃勃地满足了不寻常的需求男人很容易做壶服务有用的夜间用途男人很早就做了一个壶

ValueError: a must be greater than 0 unless no samples are taken