-
Notifications
You must be signed in to change notification settings - Fork 336
/
_llm.py
139 lines (115 loc) · 5.08 KB
/
_llm.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
from typing import List, Optional, Union
try:
from sentence_transformers import util
HAS_SBERT = True
except ModuleNotFoundError:
HAS_SBERT = False
class KeyLLM:
    """
    A minimal method for keyword extraction with Large Language Models (LLM).

    The keyword extraction is done by simply asking the LLM to extract a
    number of keywords from a single piece of text.
    """

    def __init__(self, llm):
        """KeyLLM initialization

        Arguments:
            llm: The Large Language Model to use
        """
        self.llm = llm

    def extract_keywords(
        self,
        docs: Union[str, List[str]],
        check_vocab: bool = False,
        candidate_keywords: Optional[List[List[str]]] = None,
        threshold: Optional[float] = None,
        embeddings=None,
    ) -> Union[List[str], List[List[str]]]:
        """Extract keywords and/or keyphrases

        To get the biggest speed-up, make sure to pass multiple documents
        at once instead of iterating over a single document.

        NOTE: The resulting keywords are expected to be separated by commas so
        any changes to the prompt will have to make sure that the resulting
        keywords are comma-separated.

        Arguments:
            docs: The document(s) for which to extract keywords/keyphrases
            check_vocab: Only return keywords that appear exactly in the documents
            candidate_keywords: Candidate keywords for each document
            threshold: Minimum similarity for grouping documents into a cluster;
                clustering is only performed when `sentence-transformers` is
                installed and `embeddings` are also passed.
            embeddings: Pre-computed document embeddings used to cluster similar
                documents so the LLM is queried only once per cluster.

        Returns:
            keywords: The top n keywords for a document with their respective distances
                      to the input document.

        Usage:

        To extract keywords from a single document:

        ```python
        import openai
        from keybert.llm import OpenAI
        from keybert import KeyLLM

        # Create your LLM
        client = openai.OpenAI(api_key=MY_API_KEY)
        llm = OpenAI(client)

        # Load it in KeyLLM
        kw_model = KeyLLM(llm)

        # Extract keywords
        document = "The website mentions that it only takes a couple of days to deliver but I still have not received mine."
        keywords = kw_model.extract_keywords(document)
        ```
        """
        # Check for a single, empty document
        if isinstance(docs, str):
            if not docs:
                return []
            docs = [docs]

        if HAS_SBERT and threshold is not None and embeddings is not None:
            # Find similar documents; each cluster of near-duplicates only
            # needs a single LLM call (keywords of its first document).
            clusters = util.community_detection(embeddings, min_community_size=2, threshold=threshold)
            keywords = self._extract_keywords_by_cluster(docs, candidate_keywords, clusters)
        else:
            # Extract keywords using a Large Language Model (LLM)
            keywords = self.llm.extract_keywords(docs, candidate_keywords)

        # Only extract keywords that appear in the input document
        # (exact, case-sensitive substring match — as before).
        if check_vocab:
            return [
                [keyword for keyword in keyword_set if keyword in document]
                for keyword_set, document in zip(keywords, docs)
            ]
        return keywords

    def _extract_keywords_by_cluster(self, docs, candidate_keywords, clusters):
        """Query the LLM once per unclustered document and once per cluster,
        then re-assemble the keywords in the original document order.

        Arguments:
            docs: All documents (list of str).
            candidate_keywords: Optional per-document candidate keywords,
                aligned with `docs`.
            clusters: Lists of document indices; every document in a cluster
                inherits the keywords extracted for the cluster's first member.

        Returns:
            One keyword list per document, in the order of `docs`.
        """
        in_cluster = {doc_id for cluster in clusters for doc_id in cluster}
        # Sorted for a deterministic index <-> result alignment below.
        out_cluster = sorted(set(range(len(docs))) - in_cluster)

        keyword_map = {}

        # Extract keywords for all documents not in a cluster
        if out_cluster:
            selected_docs = [docs[index] for index in out_cluster]
            selected_keywords = (
                [candidate_keywords[index] for index in out_cluster]
                if candidate_keywords is not None
                else None
            )
            out_cluster_keywords = self.llm.extract_keywords(selected_docs, selected_keywords)
            keyword_map.update(dict(zip(out_cluster, out_cluster_keywords)))

        # Extract keywords for only the first document in a cluster
        if in_cluster:
            selected_docs = [docs[cluster[0]] for cluster in clusters]
            selected_keywords = (
                [candidate_keywords[cluster[0]] for cluster in clusters]
                if candidate_keywords is not None
                else None
            )
            in_cluster_keywords = self.llm.extract_keywords(selected_docs, selected_keywords)
            for index, cluster in enumerate(clusters):
                for doc_id in cluster:
                    keyword_map[doc_id] = in_cluster_keywords[index]

        return [keyword_map[index] for index in range(len(docs))]