<a href="https://colab.research.google.com/github/HahyunKang/Datamining_Project/blob/main/texkRank.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
from collections import defaultdict
from scipy.sparse import csr_matrix
from collections import Counter
import numpy as np
from sklearn.preprocessing import normalize
import pandas as pd
import spacy
import nltk
import pickle
from nltk.corpus import stopwords
import re

# Python packages for natural language processing.
# Downloads the complete NLTK data collection ('all'); nltk.pos_tag() in
# tokenize() presumably relies on the tagger data fetched here.
nltk.download('all')
# spaCy English pipeline; remove_protagonist_name() reads its named entities.
nlp = spacy.load("en_core_web_sm")


def remove_protagonist_name(text):
    """Return *text* with every PERSON entity found by spaCy removed.

    Args:
        text: The text to clean. Any object is accepted; it is converted with
            ``str()`` before processing.

    Returns:
        str: The string form of *text* with each detected person name replaced
        by the empty string.
    """
    # Coerce once so that the string spaCy analyses is the same string we
    # edit. Previously .replace() was called on the raw argument, which raised
    # AttributeError for non-string input (e.g. a float NaN from a CSV cell).
    text = str(text)
    doc = nlp(text)

    # Remove longer names first; otherwise deleting "Harry" would prevent the
    # later match of "Harry Potter" and leave " Potter" behind.
    person_names = {ent.text for ent in doc.ents if ent.label_ == 'PERSON'}
    for name in sorted(person_names, key=lambda n: (-len(n), n)):
        text = text.replace(name, '')

    return text

def scan_vocabulary(sents, tokenize, min_count=1):
    """Build a frequency-ranked vocabulary from tokenized sentences.

    Args:
        sents: Iterable of sentences.
        tokenize: Callable mapping one sentence to an iterable of tokens.
        min_count: Tokens seen fewer than this many times are dropped.

    Returns:
        A pair ``(idx_to_vocab, vocab_to_idx)``: the surviving tokens ordered
        by descending frequency, and the inverse token -> position mapping.
    """
    frequencies = Counter()
    for sent in sents:
        frequencies.update(tokenize(sent))

    # Stable sort: tokens with equal counts keep their first-seen order.
    ranked = sorted(
        ((word, freq) for word, freq in frequencies.items() if freq >= min_count),
        key=lambda pair: pair[1],
        reverse=True,
    )
    idx_to_vocab = [word for word, _ in ranked]
    vocab_to_idx = {word: position for position, word in enumerate(idx_to_vocab)}
    return idx_to_vocab, vocab_to_idx

def sentences_to_list(text):
    """Remove person names from *text* and split it into sentences.

    Args:
        text: The text to split. It is converted with ``str()`` first.

    Returns:
        list[str]: The pieces obtained by splitting the cleaned text on '.'.
    """
    # The cleaned text must be the one that gets split; previously the result
    # of remove_protagonist_name() was overwritten by a split of the raw text,
    # so names were never actually removed.
    cleaned = remove_protagonist_name(str(text))
    return str(cleaned).split('.')

# Stop words filtered out before POS tagging. Built once at import time as a
# frozenset so each membership test is O(1) instead of a list scan per token.
_STOP_WORDS = frozenset([
    "i", "me", "my", "myself",
    "we", "our", "ours", "ourselves",
    "you", "your", "yours", "yourself", "yourselves",
    "he", "him", "his", "himself",
    "she", "her", "hers", "herself",
    "it", "its", "itself",
    "they", "them", "their", "theirs", "themselves",
    "what", "which", "who", "whom",
    "this", "that", "these", "those",
    "am", "is", "are", "was", "were", "be", "been", "being",
    "have", "has", "had", "having",
    "do", "does", "did", "doing",
    "a", "an", "the",
    "and", "but", "if", "or", "because", "as", "until", "while",
    "of", "at", "by", "for", "with", "about", "against", "between", "into", "through", "during", "before", "after", "above", "below", "to", "from", "up", "down", "in", "out", "on", "off", "over", "under", "again", "further", "then", "once", "here", "there", "when", "where", "why", "how",
    "all", "any", "both", "each", "few", "more", "most", "other", "some", "such",
    "no", "nor", "not", "only", "own", "same", "so", "than", "too", "very",
    "s", "t",
    "can", "will", "just", "don", "should", "now", "?", "-",
])


def tokenize(sentence):
    """Return the noun tokens of *sentence*.

    The sentence is lower-cased and split on whitespace, stop words are
    dropped, and the remaining tokens are POS-tagged with ``nltk.pos_tag``.

    Args:
        sentence: A string containing one sentence.

    Returns:
        list[str]: Tokens whose POS tag contains ``'NN'``, in their original
        order. Empty when nothing survives stop-word filtering.
    """
    # Whitespace tokenization; punctuation attached to a word stays attached.
    words = [w for w in sentence.lower().split() if w not in _STOP_WORDS]
    if not words:
        # Nothing to tag, so skip the tagger call entirely.
        return []

    # Keep nouns only: any tag containing 'NN' (the noun tags of the tagger's
    # tag set).
    return [word for word, tag in nltk.pos_tag(words) if 'NN' in tag]

def dict_to_mat(d, n_rows, n_cols):
    """Convert a ``{(row, col): value}`` mapping into a CSR sparse matrix.

    Args:
        d: Mapping whose keys are ``(row, col)`` pairs and whose values are
            the matrix entries.
        n_rows: Number of rows of the resulting matrix.
        n_cols: Number of columns of the resulting matrix.

    Returns:
        scipy.sparse.csr_matrix of shape ``(n_rows, n_cols)``.
    """
    row_idx = [r for r, _ in d]
    col_idx = [c for _, c in d]
    values = list(d.values())
    return csr_matrix((values, (row_idx, col_idx)), shape=(n_rows, n_cols))

def cooccurrence(tokens, vocab_to_idx, window=2, min_cooccurrence=2):
    """Count windowed co-occurrences of vocabulary words within sentences.

    Args:
        tokens: Iterable of token lists, one per sentence.
        vocab_to_idx: Mapping token -> vocabulary index; tokens missing from
            it are ignored.
        window: Context size. A value <= 0 pairs every word with every other
            word of the same sentence.
        min_cooccurrence: Pairs counted fewer than this many times are dropped.

    Returns:
        The ``len(vocab_to_idx)`` x ``len(vocab_to_idx)`` matrix produced by
        ``dict_to_mat`` from the surviving pair counts.
    """
    pair_counts = defaultdict(int)
    for sent_tokens in tokens:
        ids = [vocab_to_idx[w] for w in sent_tokens if w in vocab_to_idx]
        size = len(ids)
        for pos, center in enumerate(ids):
            if window > 0:
                # NOTE(review): the right edge is exclusive, so the span is
                # [pos - window, pos + window - 1]; confirm the asymmetry is
                # intended.
                start = max(0, pos - window)
                stop = min(pos + window, size)
            else:
                start, stop = 0, size
            for other_pos in range(start, stop):
                if other_pos != pos:
                    other = ids[other_pos]
                    # Both directions are recorded for every visited pair.
                    pair_counts[(center, other)] += 1
                    pair_counts[(other, center)] += 1

    kept = {pair: cnt for pair, cnt in pair_counts.items() if cnt >= min_cooccurrence}
    dim = len(vocab_to_idx)
    return dict_to_mat(kept, dim, dim)

def word_graph(sents, tokenize, min_count=2, window=2, min_cooccurrence=2):
    """Build the word co-occurrence graph for a collection of sentences.

    Args:
        sents: Iterable of sentences.
        tokenize: Callable mapping one sentence to a list of tokens.
        min_count: Minimum frequency for a token to enter the vocabulary.
        window: Co-occurrence window passed to ``cooccurrence``.
        min_cooccurrence: Minimum pair count passed to ``cooccurrence``.

    Returns:
        A pair ``(g, idx_to_vocab)``: the co-occurrence matrix and the list
        mapping each matrix index back to its token.
    """
    # Tokenize each sentence exactly once and reuse the result for both the
    # vocabulary scan and the co-occurrence count. Tokenizing twice doubled
    # the tokenizer work and broke when `sents` was a one-shot iterator.
    tokens = [tokenize(sent) for sent in sents]
    idx_to_vocab, vocab_to_idx = scan_vocabulary(
        tokens, lambda sent_tokens: sent_tokens, min_count
    )
    g = cooccurrence(tokens, vocab_to_idx, window, min_cooccurrence)
    return g, idx_to_vocab

def pagerank(x, df=0.85, max_iter=30):
    """Run a fixed number of PageRank iterations on an adjacency matrix.

    Args:
        x: Square adjacency matrix (SciPy sparse matrix or dense array).
        df: Damping factor; must satisfy ``0 < df < 1``.
        max_iter: Number of power iterations to perform.

    Returns:
        numpy.ndarray of shape ``(n, 1)`` holding the rank of each node, where
        ``n`` is the number of rows of *x*. An empty graph yields a ``(0, 1)``
        array.

    Raises:
        AssertionError: If *df* is outside ``(0, 1)``.
    """
    assert 0 < df < 1

    n_nodes = np.shape(x)[0]
    if n_nodes == 0:
        # Nothing to rank; return early rather than hand an empty matrix to
        # normalize().
        return np.zeros((0, 1))

    # initialize: every column is scaled to unit L1 norm
    A = normalize(x, axis=0, norm='l1')
    R = np.ones((n_nodes, 1))
    bias = np.full((n_nodes, 1), 1 - df)

    # iteration
    for _ in range(max_iter):
        # `@` is a matrix product for sparse and dense inputs alike, whereas
        # `*` broadcasts element-wise on dense arrays.
        R = df * (A @ R) + bias

    return R


def textrank_keyword(sents, tokenize, min_count, window, min_cooccurrence, df=0.85, max_iter=30, topk=5):
    """Extract the top-ranked keywords from sentences using TextRank.

    Args:
        sents: Iterable of sentences.
        tokenize: Callable mapping one sentence to a list of tokens.
        min_count: Minimum token frequency for the vocabulary.
        window: Co-occurrence window size.
        min_cooccurrence: Minimum pair count kept in the graph.
        df: PageRank damping factor.
        max_iter: Number of PageRank iterations.
        topk: Maximum number of keywords to return.

    Returns:
        list: Up to *topk* vocabulary words ordered from highest to lowest
        rank. Empty when no word survives tokenization or *topk* <= 0.
    """
    g, idx_to_vocab = word_graph(sents, tokenize, min_count, window, min_cooccurrence)

    # An empty vocabulary gives a 0x0 graph with nothing to rank. topk <= 0
    # must not fall through to slicing: `[-0:]` would select every word.
    if not idx_to_vocab or topk <= 0:
        return []

    R = pagerank(g, df, max_iter).reshape(-1)
    # Descending rank order, truncated to the requested number of keywords.
    best = R.argsort()[::-1][:topk]
    return [idx_to_vocab[idx] for idx in best]


# Add the extracted keywords to book_data.
book_data = pd.read_csv("/content/drive/MyDrive/데이터마이닝/books_list.csv",encoding='cp949')

# Series.apply walks the descriptions by value, so this does not depend on the
# index labels being 0..n-1 and needs no placeholder column or per-cell
# assignment. Each cell of 'keywords' ends up holding a list of keywords.
# Positional arguments: min_count=1, window=2, min_cooccurrence=2.
book_data['keywords'] = book_data['description'].apply(
    lambda description: textrank_keyword(
        sentences_to_list(description), tokenize, 1, 2, 2
    )
)




[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_ru is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package basque_grammars to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package basque_grammars is already up-to-date!
[nltk_data]    | Downloading package bcp47 to /root/nltk_data...
[nltk_data]    |   Package bcp47 is already up-to-dat

In [2]:
# Colab-only cell: mount Google Drive at /content/drive. The CSV read by the
# keyword-extraction cell lives under this mount point, so run this first.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive
