In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
import nltk
import spacy
from nltk.tokenize import sent_tokenize
import numpy as np
import re
from transformers import DebertaV2Tokenizer, DebertaV2Model
import torch
import nltk
from nltk.stem import WordNetLemmatizer
import json
from tqdm import tqdm


# -- Version 1: Use DeBERTa and Tf-Idf to calculate document and word similarity

Calculate the similarity between the context output and the keyword output of tf-idf, also give lower weight to common words

In [87]:
class BertKeywordExtractor:
    def __init__(self):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.tokenizer = DebertaV2Tokenizer.from_pretrained('microsoft/deberta-v3-base')
        self.model = DebertaV2Model.from_pretrained('microsoft/deberta-v3-base')


        self.model.to(self.device)

    def refine_keywords(self, text, candidate_keywords, num_keywords):
        inputs = self.tokenizer(text, return_tensors='pt', padding=True, truncation=True)

        inputs = {k: v.to(self.device) for k, v in inputs.items()}

        with torch.no_grad():
            outputs = self.model(**inputs)
        embeddings = outputs.last_hidden_state

        doc_embedding = embeddings[:,0,:]

        refined_keywords = {}
        for keyword, w in candidate_keywords.items():
            kw_inputs = self.tokenizer(keyword, return_tensors='pt', padding=True, truncation=True)
            kw_inputs = {k: v.to(self.device) for k, v in kw_inputs.items()}

            with torch.no_grad():
                kw_outputs = self.model(**kw_inputs)
            kw_embedding = kw_outputs.last_hidden_state[:,0,:]

            similarity = torch.cosine_similarity(doc_embedding, kw_embedding)
            if similarity > 0.15:  # 阈值可以调整
                refined_keywords[keyword] = similarity * w
            
        sorted_kw = list(dict(sorted(refined_keywords.items(), key = lambda item: item[1], reverse=True)))

        return sorted_kw[:num_keywords]


# Use TfidfVectorizer to extract keywords
Generate scores for unigram and bigrams for each document, and filter out invalid collocations and redundant words. The weights of words with different frequencies are inversely normalized.

In [88]:
nltk.download('punkt')
nlp = spacy.load("en_core_web_sm")


def pos_tag(sentence):
    doc = nlp(sentence)
    pos_dict = {token.text: token.pos_ for token in doc}
    return pos_dict

def softmax(x):
    """Compute softmax values for each sets of scores in x."""
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum()

def filter_bigrams(text, bigrams):
    pos_dic=pos_tag(text[0])
    sentences = sent_tokenize(text[0])
    # valid_unigrams = []
    valid_bigrams = {}
    
    for bigram in bigrams:
        #unigram
        if ' ' not in bigram and not re.match(r'\d{1,3}(,\d{3})*', bigram) and \
            (bigram not in pos_dic or pos_dic[bigram] in ('NOUN', 'PROPN', 'ADJ')):
                valid_bigrams[bigram] = bigrams[bigram] * 1.05
                
        elif ' ' in bigram and any(bigram in sentence for sentence in sentences):
            bigram_token=bigram.split()
            if bigram_token[0] not in pos_dic or bigram_token[1] not in pos_dic:
                if '-' in bigram_token[0]:
                    valid_bigrams[bigram] = bigrams[bigram]
            elif pos_dic[bigram_token[0]] in ('NOUN', 'PROPN') and pos_dic[bigram_token[1]] in ('NOUN', 'PROPN') or \
               pos_dic[bigram_token[0]] == 'ADJ' and pos_dic[bigram_token[1]] in ('NOUN', 'PROPN'):
                   valid_bigrams[bigram] = bigrams[bigram]
            
    
    return valid_bigrams


class TFIDFKeywordExtractor:
    def __init__(self, index, documents):
        self.documents = documents
        self.index = index
        # self.text = documents[index]
        vectorizer = TfidfVectorizer(lowercase=False, stop_words='english', ngram_range=(1,2),token_pattern=r"(?u)\b\w+[-\w]+\b")
        vectorizer.fit([self.documents[self.index]])
        vocab = vectorizer.vocabulary_
        self.vectorizer = TfidfVectorizer(lowercase=False, stop_words='english', ngram_range=(1,2),vocabulary=vocab,token_pattern=r"(?u)\b\w+[-\w]+\b")
        
        

    def extract_keywords(self):
        
        tfidf_matrix = self.vectorizer.fit_transform(self.documents)
        importance = np.array(np.sum(tfidf_matrix, axis=0).flatten())[0]
        normalized_tfidf = softmax(1 - importance / np.max(importance))
        # print(importance)
        feature_array = self.vectorizer.get_feature_names_out()
        # feature_array
        tfidf_sorting = normalized_tfidf.argsort()[::-1]
        
        
        keywords = feature_array[tfidf_sorting]
        weight = sorted(normalized_tfidf)[::-1]
        kw_dic = dict(zip(keywords,weight))
     
        valid_keywords = filter_bigrams(self.documents, kw_dic)
        valid_keywords = self.remove_redundant_keywords(valid_keywords)
        # print(f'here is my{valid_keywords}')
        # print()
        
        return valid_keywords

    def remove_redundant_keywords(self, keywords):
        refined_keywords = {}
        for kw in keywords:
            if len(kw.split()) == 1:
                if not any(kw in multi_kw for multi_kw in keywords if len(multi_kw.split()) > 1):
                    refined_keywords[kw] = keywords[kw]
            # elif kw not in list(ENGLISH_STOP_WORDS):
            else:
                refined_keywords[kw] = keywords[kw]
                
        return refined_keywords


[nltk_data] Downloading package punkt to /Users/ruzexi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# preprocessing
Normalisation, Lemmatisation, Stemming

In [89]:
nltk.download('wordnet')

def lemmatize_keywords(keywords):
    lemmatizer = WordNetLemmatizer()
    lemmatized_keywords = set()  # Using a set to avoid duplicates
    # print(keywords)
    for keyword in keywords:
        lemmatized_keyword = lemmatizer.lemmatize(keyword)
        lemmatized_keywords.add(lemmatized_keyword)
        # print(lemmatized_keywords)
    return list(lemmatized_keywords)


def extract_keywords_for_text(index, text, num_keywords):
    # Assuming TFIDFKeywordExtractor and BertKeywordExtractor are already implemented
    tfidf_extractor = TFIDFKeywordExtractor(index, text)
    candidate_keywords = tfidf_extractor.extract_keywords()

    bert_extractor = BertKeywordExtractor()
    refined_keywords = bert_extractor.refine_keywords(text[index], candidate_keywords, num_keywords)
    lemmatized_keywords = lemmatize_keywords(refined_keywords)
    
    return lemmatized_keywords





def read_json(file_path):
    # Function to read the JSON file
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)
    return data

def segment_text(id, text, max_length=512, step_size=256):
    """
    After exceeding the maximum bert input limit, use the slide window to truncate to size=256
    """
    segments = []
    start = 0
    while start < len(text):
        if start + max_length > len(text):
            segment = text[start:]
            segments.append(segment)
            id.append(id[-1]+1)
            break
        
        segment = text[start:start+max_length]
        segments.append(segment)
        id.append(id[-1])
        start += step_size

    return segments

    
def preprocess(file_path):
    data = read_json(file_path)
    preprocessed_data = []
    id_context=[0]
    
    
    for item in data:
        context = item['context']
        context = re.sub(r'\s+', ' ', context).strip()
        segmented_contexts = segment_text(id_context, context)
        preprocessed_data.extend(segmented_contexts) 
      

    return preprocessed_data, id_context

def main():
    FILE_PATH = "Extract.json"
    texts, id_context = preprocess(FILE_PATH)
    
    with open('new.txt', 'w') as file:
        num_keywords = int(len(texts[0])/30)
        keywords = extract_keywords_for_text(0, texts, num_keywords)
        for i in tqdm(range(1, len(texts))):
            if id_context[i] != id_context[i-1]:
                file.write(f"Text {id_context[i]-1} Keywords: {', '.join(list(set(keywords)))}\n\n")
                keywords = extract_keywords_for_text(i, texts, num_keywords)
            else:
                num_keywords = int(len(texts[i])/30)
                keywords += extract_keywords_for_text(i, texts, num_keywords)

    

if __name__ == "__main__":
    main()


[nltk_data] Downloading package wordnet to /Users/ruzexi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
  0%|          | 0/30042 [00:00<?, ?it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
  0%|          | 1/30042 [00:19<166:17:53, 19.93s/it]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
  0%|          | 2/30042 [00:37<153:34:18, 18.40s/it]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
  0%|          | 2/30042 [00:42<177:05:36, 21.22s/it]


KeyboardInterrupt: 

# Version 2, using a pre-trained model dedicated to keyword extraction: bert-uncased-keyword-extractor

In [17]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

tokenizer = AutoTokenizer.from_pretrained("yanekyuk/bert-uncased-keyword-extractor")
model = AutoModelForTokenClassification.from_pretrained("yanekyuk/bert-uncased-keyword-extractor")

In [None]:

kw_extractor = pipeline("token-classification", 
                        model="yanekyuk/bert-uncased-keyword-extractor",
                        tokenizer = tokenizer)

def KeyWords_generator(outputs, text):
    keywords = []
    current_keyword = None
    current_start = None
    current_end = None

    for item in outputs:
        if item['entity'] == 'B-KEY':
            if current_keyword:
                keyword = text[current_start:current_end]
                if keyword.count(' ') < 4:
                    keywords.append(keyword) 
                current_keyword = None
            current_start = item['start']
            current_keyword = item['word']
        elif item['entity'] == 'I-KEY' and current_keyword:
            current_keyword = item['word']
        current_end = item['end']


    if current_keyword:
        keyword = text[current_start:current_end]
        if keyword.count(' ') < 4:
            keywords.append(keyword)

    return keywords



In [85]:
FILE_PATH = "Extract.json"
texts, id_context = preprocess(FILE_PATH)
FILE_WRITE = "Keywords.json"
unpreprocess_data = read_json(FILE_PATH)


def main():
    doc_kw = []
    with open(FILE_WRITE, 'w') as file:
        keywords = KeyWords_generator(kw_extractor(texts[0]), texts[0])
        for i in tqdm(range(1, len(texts))):
            if id_context[i] != id_context[i-1]:
                kw_list = list(set(keywords))
                print(kw_list)
                data_dic = {
                    'context': unpreprocess_data[id_context[i] - 1],
                    'keywords': kw_list
                }
                # file.write(f"Text {id_context[i]-1} Keywords: {', '.join(list(set(keywords)))}\n\n")
                doc_kw.append(data_dic)
                keywords = KeyWords_generator(kw_extractor(texts[i]), texts[i])
            else:
                keywords += KeyWords_generator(kw_extractor(texts[i]), texts[i])
        json.dump(doc_kw, file, ensure_ascii=False, indent=4)
    
       
main()



  0%|          | 0/30042 [00:00<?, ?it/s]

['anonymous', 'editor-in-chief', 'Julian Assange', 'WikiLeaks', 'Sunshine Press', 'Kristinn Hrafnsson', 'Iceland', 'Internet']


  0%|          | 3/30042 [00:01<3:34:17,  2.34it/s]

['Soviet Union', 'Hiroshima', 'Nagasaki', 'Japan', 'war crimes', 'unconditional surrender', 'Adolf Hitler', 'atomic bombs', 'Potsdam Declaration', 'Western Allies', 'Soviet', 'United States']


  0%|          | 6/30042 [00:02<3:17:44,  2.53it/s]

['marriage equality', 'Loving v. Virginia', 'Due Process Clause', 'gay marriage', 'separate marriage', 'Same-sex marriage']


  0%|          | 6/30042 [00:03<4:17:08,  1.95it/s]
