In [35]:
# https://medium.com/towards-data-science/keyword-extraction-python-tf-idf-textrank-topicrank-yake-bert-7405d51cd839

Index
+ [x] TF-IDF  
+ [x] TextRank 
+ [x] YAKE!
+ [x] KeyBERT : 

In [36]:
# !pip install trafilatura -qqq
# !pip install summa -qqq
# !pip install git+https://github.com/smirnov-am/pytopicrank.git#egg=pytopicrank -qqq
# !pip install git+https://github.com/LIAAD/yake -qqq
# !pip install keyBERT -qqq
# !pip install konlpy -qqq

In [1]:
import numpy as np
import itertools
import pandas as pd
import re

from konlpy.tag import Okt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import nltk

In [2]:
# Data Load
df = pd.read_excel('./20230316_kurly_review.xlsx',0)

In [16]:
def listToString(s):  
    str1 = ""  
    for ele in s:  
        str1 += " " + ele.strip()  
    return str1

def clean_str(text):
    pattern = '([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)' # E-mail제거
    text = re.sub(pattern=pattern, repl=' ', string=text)
    pattern = '(http|ftp|https)://(?:[-\w.]|(?:%[\da-fA-F]{2}))+' # URL제거
    text = re.sub(pattern=pattern, repl='', string=text)
    pattern = '<[^>]*>'         # HTML 태그 제거
    text = re.sub(pattern=pattern, repl=' ', string=text)
    pattern = '[^\w\s]'         # 특수기호제거
    text = re.sub(pattern=pattern, repl=' ', string=text)
    return text   

In [31]:
df.PRODUCT.unique()

array(['[오트사이드] 초콜릿맛 귀리 우유 1L', '[오틀리] 식물성 귀리음료 초코 1L',
       '[오틀리] 식물성 귀리음료 초코 250ml'], dtype=object)

In [21]:
# Text Cleansing
text1 = clean_str(listToString(df[df['PRODUCT'] =='[오트사이드] 초콜릿맛 귀리 우유 1L'].REVIEW.sample(frac = 0.7)))
text2 = clean_str(listToString(df[df['PRODUCT'] =='[오트사이드] 초콜릿맛 귀리 우유 1L'].REVIEW.sample(frac = 0.7)))
text3 = clean_str(listToString(df[df['PRODUCT'] =='[오틀리] 식물성 귀리음료 초코 250ml'].REVIEW.sample(frac = 0.7)))
text_list = [text1, text2, text3]

In [22]:
# Text Cleansing
corpus = []
for text_ in text_list:
    corpus.append(clean_str(text_))

### 1.TF-IDF

In [53]:
from sklearn.feature_extraction.text import TfidfVectorizer
from numpy import array, log
from tqdm.notebook import tqdm

vectorizer = TfidfVectorizer().fit(corpus)
tf = vectorizer.fit_transform(corpus)
tf = tf.toarray()
tf = log(tf + 1)

In [33]:
tfidf = tf.copy()
words = array(vectorizer.get_feature_names_out())

for k in tqdm(vectorizer.vocabulary_.keys()):
    if k in words:
        tfidf[:, words == k] = tfidf[:, words == k] * vectorizer.vocabulary_[k]

  0%|          | 0/2905 [00:00<?, ?it/s]

In [40]:
for j in range(tfidf.shape[0]):
    print(str(j+1), df.PRODUCT.unique()[j], "Keywords : \n", words[tfidf[j, :].argsort()[-10:][::-1]])

1 [오트사이드] 초콜릿맛 귀리 우유 1L Keywords : 
 ['좋아요' '맛있어요' '초코우유' '진하고' '초코' '진짜' '자주' '진한' '초코맛이' '초콜릿맛']
2 [오틀리] 식물성 귀리음료 초코 1L Keywords : 
 ['좋아요' '맛있어요' '초코우유' '진하고' '초콜릿맛' '초코' '진짜' '진한' '자주' '초코맛']
3 [오틀리] 식물성 귀리음료 초코 250ml Keywords : 
 ['좋아요' '초코우유' '맛있어요' '초코맛' '초코' '자주' '오틀리' '아이가' '초코우유보다' '좋아해요']


### 2.TextRank

In [42]:
from summa import keywords

In [44]:
for j in range(len(text_list)):
    print(str(j+1), df.PRODUCT.unique()[j], "Keywords : \n", (keywords.keywords(text_list[j], words=5)).split("\n"))

1 [오트사이드] 초콜릿맛 귀리 우유 1L Keywords : 
 ['너무 맛있어요', '좋아요', '초코우유', '많이']
2 [오틀리] 식물성 귀리음료 초코 1L Keywords : 
 ['맛있어요', '좋아요', '초코우유', '너무', '자주']
3 [오틀리] 식물성 귀리음료 초코 250ml Keywords : 
 ['맛있어요', '좋아요', '초코우유', '초코맛', '가끔']


### 3.YAKE! 

In [51]:
from yake import KeywordExtractor
kw_extractor = KeywordExtractor(lan="ko", n=1, top=5)

In [52]:
for j in range(len(text_list)):
    keywords = kw_extractor.extract_keywords(text=text_list[j])
    keywords = [x for x, y in keywords]
    print(str(j+1), df.PRODUCT.unique()[j], "Keywords : \n", keywords)

1 [오트사이드] 초콜릿맛 귀리 우유 1L Keywords : 
 ['맛있어요', '좋아요', '초코우유', '진하고', '맛있어서']
2 [오틀리] 식물성 귀리음료 초코 1L Keywords : 
 ['맛있어요', '좋아요', '초코우유', '진하고', '맛있어서']
3 [오틀리] 식물성 귀리음료 초코 250ml Keywords : 
 ['맛있어요', '좋아요', '초코우유', '초코맛', '오틀리']


### 4.KeyBERT

In [49]:
from keybert import KeyBERT
kw_extractor = KeyBERT('distilbert-base-nli-mean-tokens')

In [None]:
for j in range(len(text_list)):
    keywords = kw_extractor.extract_keywords(text_list[j], keyphrase_length=1 )
    print(str(j+1), df.PRODUCT.unique()[j], "Keywords : \n", keywords)

In [55]:
for j in range(len(text_list)):
    keywords = kw_extractor.extract_keywords(text_list[j], keyphrase_length=1, stop_words='english')
    print("Keywords of article", str(j+1), "\n", keywords)

TypeError: KeywordExtractor.extract_keywords() got an unexpected keyword argument 'keyphrase_length'