## 순차 검색

In [1]:
# Mount Google Drive at /content/drive; the CSV files read by later cells live under that path.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
# Load the NeurIPS paper table from the mounted Drive.
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
df = pd.read_csv('/content/drive/Othercomputers/내 노트북/study/06_TextAnalytics/data/neurips.csv')
df.head()  # preview the first rows

Unnamed: 0,year,title,abstract
0,2007,Competition Adds Complexity,It is known that determinining whether a DEC-P...
1,2007,Efficient Principled Learning of Thin Junction...,We present the first truly polynomial algorith...
2,2007,Regularized Boost for Semi-Supervised Learning,Semi-supervised inductive learning concerns ho...
3,2007,Simplified Rules and Theoretical Analysis for ...,We show that under suitable assumptions (prima...
4,2007,Predicting human gaze using low-level saliency...,"Under natural viewing conditions, human observ..."


In [3]:
query = {'natural', 'language'}

import re
def tokenize(text):
    """Split text into lowercase word tokens of at least two characters.

    Args:
        text: The string to tokenize. Non-string values (for example ``None``
            or the float NaN that pandas uses for a missing cell) are treated
            as empty text.

    Returns:
        list[str]: Every run of two or more word characters found in the
        lowercased text, in order of appearance (duplicates are kept).
    """
    if not isinstance(text, str):
        # Missing values would otherwise fail on .lower(); they contain no words.
        return []
    # Lowercase first so matching is case-insensitive, then extract the words.
    return re.findall(r'\w{2,}', text.lower())

### 표의 각 행에서 순서대로 검색어가 있는지 확인

In [4]:
%%time
# Sequential scan: tokenize every abstract and collect the row labels of those
# that contain every query word.
results = []
for row in df.itertuples():
    words = set(tokenize(row.abstract))
    # `<=` is the subset test. The previous `<` (proper subset) would skip an
    # abstract whose vocabulary is exactly the set of query words.
    if query <= words:
        results.append(row.Index)

CPU times: user 231 ms, sys: 0 ns, total: 231 ms
Wall time: 230 ms


### 조건에 맞는 행 번호

In [5]:
results

[49,
 269,
 542,
 557,
 846,
 881,
 1068,
 1079,
 1219,
 1278,
 1503,
 1660,
 1805,
 1911,
 1917,
 1961,
 1980,
 2047,
 2138,
 2186,
 2187,
 2284,
 2393,
 2412,
 2516,
 2544,
 2581,
 2682,
 2700,
 2770,
 2864,
 2897,
 2900,
 2920,
 2931,
 3148,
 3266,
 3303,
 3333,
 3445,
 3543,
 3640,
 3727,
 3844,
 3878,
 3883]

### 조건에 맞는 행 보기

In [6]:
df.loc[results]

Unnamed: 0,year,title,abstract
49,2007,Discriminative Keyword Selection Using Support...,Many tasks in speech processing involve classi...
269,2008,Modeling the effects of memory on human online...,Language comprehension in humans is significan...
542,2009,Rethinking LDA: Why Priors Matter,Implementations of topic models typically use ...
557,2009,Conditional Neural Fields,Conditional random fields (CRF) are quite succ...
846,2010,Probabilistic Deterministic Infinite Automata,We propose a novel Bayesian nonparametric appr...
881,2011,Inverting Grice's Maxims to Learn Rules from N...,We consider the problem of learning rules from...
1068,2011,Kernel Embeddings of Latent Tree Graphical Models,Latent tree graphical models are natural tools...
1079,2011,Higher-Order Correlation Clustering for Image ...,For many of the state-of-the-art computer visi...
1219,2012,Learned Prioritization for Trading Off Accurac...,Users want natural language processing (NLP) s...
1278,2012,Tensor Decomposition for Fast Parsing with Lat...,We describe an approach to speed-up inference ...


### 리스트와 사전

In [7]:
# A list holding the integers 0 through 999,999, used below to time a lookup by value.
a = [*range(1_000_000)]

### 리스트에서 999999를 검색하는 데 걸리는 시간 측정: 리스트의 뒤로 갈수록 검색이 오래 걸림

In [8]:
%%time
a.index(999999)

CPU times: user 22.2 ms, sys: 1.53 ms, total: 23.7 ms
Wall time: 26.1 ms


999999

In [9]:
# Dictionary mapping every element of `a` to itself, for the keyed-lookup timing below.
b = {value: value for value in a}

### 검색 시간이 0에 가까움

In [10]:
%%time
b[999999]

CPU times: user 4 µs, sys: 1e+03 ns, total: 5 µs
Wall time: 9.54 µs


999999

### 인덱싱

In [11]:
from collections import defaultdict
# Inverted index: word -> set of row labels whose abstract contains that word.
index = defaultdict(set)

for doc_id, abstract in df['abstract'].items():
    for term in tokenize(abstract):
        index[term].add(doc_id)

In [12]:
index['language']

{15,
 27,
 49,
 51,
 90,
 161,
 197,
 269,
 278,
 347,
 423,
 450,
 505,
 542,
 557,
 626,
 634,
 747,
 781,
 846,
 881,
 1068,
 1079,
 1219,
 1243,
 1278,
 1420,
 1503,
 1570,
 1660,
 1805,
 1806,
 1807,
 1906,
 1911,
 1917,
 1961,
 1980,
 2047,
 2061,
 2138,
 2167,
 2186,
 2187,
 2216,
 2284,
 2312,
 2393,
 2412,
 2421,
 2481,
 2482,
 2501,
 2516,
 2536,
 2544,
 2581,
 2585,
 2682,
 2696,
 2700,
 2735,
 2770,
 2822,
 2835,
 2864,
 2877,
 2895,
 2897,
 2900,
 2920,
 2931,
 2934,
 3021,
 3034,
 3047,
 3082,
 3104,
 3105,
 3148,
 3207,
 3264,
 3266,
 3282,
 3296,
 3303,
 3333,
 3438,
 3445,
 3543,
 3554,
 3581,
 3640,
 3642,
 3702,
 3727,
 3765,
 3794,
 3808,
 3820,
 3844,
 3872,
 3878,
 3883,
 3885}

In [13]:
%%time
# Rows containing both words: intersect their posting sets from the inverted index.
results = list(index['natural'].intersection(index['language']))

CPU times: user 44 µs, sys: 0 ns, total: 44 µs
Wall time: 49.6 µs


## TF

In [16]:
from collections import Counter
# Candidate documents: rows whose abstract contains every query word, looked up
# in the inverted index. The terms come from `query` so that retrieval and
# scoring always use the same words. `.get` avoids inserting empty entries into
# the defaultdict for unknown words.
if query:
    idxs = sorted(set.intersection(*(index.get(w, set()) for w in query)))
else:
    idxs = []
results = []
# `idxs` holds index labels (they were collected from row.Index), so select
# rows by label with .loc rather than by position with .iloc.
for row in df.loc[idxs].itertuples():
    words = tokenize(row.abstract)
    cnt = Counter(words)
    # TF score: total number of occurrences of the query words in the abstract.
    tf = sum(cnt[w] for w in query)
    results.append((tf, row.Index))


### 점수의 역순으로 정렬

In [17]:
# Order the (score, row label) pairs from highest to lowest and keep only the labels.
idx = [pair[1] for pair in sorted(results, reverse=True)]

### 정렬된 문서 보기

In [18]:
# `idx` contains index labels (taken from row.Index), so look the rows up by
# label; .iloc would only be equivalent while the index is the default 0..n-1.
df.loc[idx]

Unnamed: 0,year,title,abstract
3445,2017,Emergence of Language with Multi-agent Games: ...,"Learning to communicate through interaction, r..."
3148,2016,LightRNN: Memory and Computation-Efficient Rec...,Recurrent neural networks (RNNs) have achieved...
1805,2013,A Novel Two-Step Method for Cross Language Rep...,Cross language text classi?cation is an import...
2920,2016,Latent Attention For If-Then Program Synthesis,Automatic translation from natural language de...
2900,2016,Dialog-based Language Learning,A long-term goal of machine learning research ...
2897,2016,Visual Question Answering with Question Repres...,Our method aims at reasoning over natural lang...
2544,2015,Testing Closeness With Unequal Sized Samples,We consider the problem of testing whether two...
2412,2015,Expressing an Image Stream with a Sequence of ...,We propose an approach for generating a sequen...
2186,2014,Convolutional Neural Network Architectures for...,Semantic matching is of central importance to ...
1660,2013,A Deep Architecture for Matching Short Texts,Many machine learning problems can be interpre...


## TF-IDF

### 문서 빈도

In [19]:
{k: len(v) for k, v in index.items()}

{'it': 1326,
 'is': 3158,
 'known': 443,
 'that': 3315,
 'determinining': 1,
 'whether': 84,
 'dec': 4,
 'pomdp': 14,
 'namely': 72,
 'cooperative': 16,
 'partially': 78,
 'observable': 49,
 'stochastic': 434,
 'game': 82,
 'posg': 1,
 'has': 932,
 'strategy': 144,
 'with': 2553,
 'positive': 117,
 'expected': 141,
 'reward': 96,
 'complete': 70,
 'for': 3261,
 'nexp': 1,
 'was': 130,
 'not': 870,
 'until': 23,
 'now': 36,
 'how': 551,
 'cooperation': 1,
 'affected': 12,
 'complexity': 372,
 'we': 3766,
 'show': 1762,
 'competitive': 148,
 'posgs': 2,
 'the': 3903,
 'of': 3875,
 'determining': 32,
 'one': 698,
 'team': 6,
 'class': 483,
 'an': 2170,
 'oracle': 72,
 'np': 56,
 'present': 797,
 'first': 675,
 'truly': 14,
 'polynomial': 140,
 'algorithm': 1471,
 'learning': 1775,
 'structure': 505,
 'bounded': 109,
 'treewidth': 12,
 'junction': 5,
 'trees': 78,
 'attractive': 40,
 'subclass': 15,
 'probabilistic': 297,
 'graphical': 172,
 'models': 1071,
 'permits': 16,
 'both': 875,
 '

### 전체 문서 수

In [20]:
# Total number of documents (rows). Do not unpack into `_`: assigning it at the
# top level shadows IPython's "last output" variable for the rest of the session.
N = len(df)

### 역문서빈도(inverse document frequency)

In [21]:
import numpy as np
# Inverse document frequency: log(N / number of documents containing the word).
idf = {k: np.log(N / len(v)) for k, v in index.items()}

# Candidate documents: rows whose abstract contains every query word. The terms
# come from `query` so that retrieval and scoring always use the same words.
if query:
    idxs = sorted(set.intersection(*(index.get(w, set()) for w in query)))
else:
    idxs = []
results = []

# `idxs` holds index labels (collected from row.Index), so select with .loc.
for row in df.loc[idxs].itertuples():
    words = tokenize(row.abstract)
    cnt = Counter(words)
    # TF-IDF score: each query word's count in the abstract weighted by its IDF.
    tfidf = sum(cnt[w] * idf[w] for w in query)
    results.append((tfidf, row.Index))

# Row labels ordered from the highest score to the lowest.
idx = [i for _, i in sorted(results, reverse=True)]

## BM25

In [22]:
!pip install rank_bm25 kiwipiepy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting rank_bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Collecting kiwipiepy
  Downloading kiwipiepy-0.15.0-cp39-cp39-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m18.3 MB/s[0m eta [36m0:00:00[0m
Collecting dataclasses
  Downloading dataclasses-0.6-py3-none-any.whl (14 kB)
Collecting kiwipiepy-model~=0.15
  Downloading kiwipiepy_model-0.15.0.tar.gz (30.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.5/30.5 MB[0m [31m28.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: kiwipiepy-model
  Building wheel for kiwipiepy-model (setup.py) ... [?25l[?25hdone
  Created wheel for kiwipiepy-model: filename=kiwipiepy_model-0.15.0-py3-none-any.whl size=30602642 sha256=e95132aa37

In [23]:
import pandas as pd
# Load the science book table; later cells read its 제목 (title) column.
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
books = pd.read_csv('/content/drive/Othercomputers/내 노트북/study/06_TextAnalytics/data/science_books.csv')

In [24]:
from kiwipiepy import Kiwi
kiwi = Kiwi()


def tokenize(sent):
    """Lazily yield ``(form, tag)`` pairs for the tokens of ``sent`` worth indexing.

    The sentence is analysed with the module-level ``kiwi`` instance and only
    tokens whose tag is one of NNG, NNP, SL, VV or VA are kept (presumably
    nouns, foreign-script words, verbs and adjectives in Kiwi's tag set —
    confirm against the Kiwi documentation).

    Note: this definition replaces the regex-based ``tokenize`` defined earlier
    in the notebook.
    """
    keep_tags = {'NNG', 'NNP', 'SL', 'VV', 'VA'}
    yield from (
        (tok.form, tok.tag)
        for tok in kiwi.tokenize(sent)
        if tok.tag in keep_tags
    )

In [25]:
# One list of (form, tag) tokens per book title, in the same order as `books`.
tokenized_corpus = [list(tokenize(title)) for title in books.제목]

In [26]:
from rank_bm25 import BM25Okapi
# Build the BM25 ranking model over the tokenized titles.
bm25 = BM25Okapi(tokenized_corpus)

In [27]:
import pandas as pd
# Tabulate the IDF weight BM25 holds for each token, displayed from lowest to highest.
idf_table = pd.DataFrame(list(bm25.idf.items()), columns=['token', 'idf'])
idf_table.sort_values(by='idf', ascending=True)

Unnamed: 0,token,idf
24,"(과학, NNG)",1.590378
169,"(수학, NNG)",2.076635
9,"(이야기, NNG)",2.226424
17,"(세상, NNG)",2.396806
31,"(양장, NNG)",2.396806
...,...,...
1249,"(아이디어, NNG)",6.501790
1248,"(보듬, VV)",6.501790
1247,"(이웃, NNG)",6.501790
1621,"(스타일링, NNG)",6.501790


In [30]:
list(tokenize('다정한 것이 살아남는다'))

[('다정', 'NNG'), ('살아남', 'VV')]

In [28]:
# Use a dedicated name: `query` already holds the English keyword set used by
# the TF / TF-IDF cells, and rebinding it to a list of (form, tag) tuples would
# break those cells if they are re-run.
bm25_query = list(tokenize('다정한 것이 살아남는다'))
# Pass the titles as a plain list so that selecting the top documents cannot be
# affected by the Series index.
bm25.get_top_n(bm25_query, books.제목.tolist(), n=5)

['다정한 것이 살아남는다 : 친화력으로 세상을 바꾸는 인류의 진화에 관하여(10만부 기념 스페셜 에디션, 저자 친필 사인 인쇄본)',
 '낙타는 왜 사막으로 갔을까 : 살아남은 동물들의 비밀',
 '무엇이 우리를 다정하게 만드는가 : 타인을 도우려 하는 인간 심리의 뇌과학적 비밀(양장)',
 '우주에서 기다릴게 : 한국 첫 우주인이 펼치는 다정한 호기심의 기록',
 '다정한 물리학 : 거대한 우주와 물질의 기원을 탐구하고 싶을 때']

### 임베딩을 이용한 검색

In [31]:
!pip install sentence-transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m49.3 MB/s[0m eta [36m0:00:00[0m
Collecting sentencepiece
  Downloading sentencepiece-0.1.98-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m62.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub>=0.4.0
  Downloading huggingface_hub-0.13.4-py3-none-any.whl (200 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.1

In [32]:
from sentence_transformers import SentenceTransformer
# Load the sentence-embedding model by name (its files are downloaded on first use).
sbert = SentenceTransformer(
    'snunlp/KR-SBERT-V40K-klueNLI-augSTS')

Downloading (…)635b2/.gitattributes:   0%|          | 0.00/1.23k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)3f97e635b2/README.md:   0%|          | 0.00/4.02k [00:00<?, ?B/s]

Downloading (…)97e635b2/config.json:   0%|          | 0.00/707 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/467M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)635b2/tokenizer.json:   0%|          | 0.00/967k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/394 [00:00<?, ?B/s]

Downloading (…)3f97e635b2/vocab.txt:   0%|          | 0.00/336k [00:00<?, ?B/s]

Downloading (…)7e635b2/modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

In [33]:
from sklearn.metrics.pairwise import cosine_similarity
# Embed every book title. A plain list is passed (as is done when adding the
# titles to the Chroma collection) so encoding does not depend on the Series index.
emb = sbert.encode(books.제목.tolist())

# Embed the query sentence; a one-element list yields a 2-D array with one row.
query_emb = sbert.encode(['다정한 것이 살아남는다'])

In [34]:
query_emb

array([[-1.31547019e-01,  3.03923875e-01,  4.85664159e-01,
        -2.16841251e-01,  1.36009288e+00, -1.20646286e+00,
         3.91339399e-02, -7.92704999e-01,  1.02010286e+00,
        -7.96528220e-01,  2.94336706e-01,  6.82330132e-01,
        -9.22739506e-01, -5.95071912e-01,  7.80481398e-01,
        -1.22301541e-01, -1.54480234e-01, -9.37348902e-01,
        -1.98144600e-01,  7.95778036e-02,  1.67816952e-01,
        -1.53957352e-01, -1.60212770e-01,  4.84559864e-01,
        -1.95721030e-01,  6.76929295e-01, -1.05160546e+00,
         9.15205657e-01,  1.18469134e-01,  1.01883256e+00,
         4.14323479e-01, -1.06345344e+00,  1.68637902e-01,
         7.66607523e-02, -4.79440182e-01,  1.50974477e-02,
         4.00385857e-01,  7.76839912e-01,  3.78833354e-01,
         4.29200590e-01, -7.29155838e-01, -1.31777322e+00,
        -5.59606194e-01, -3.51428121e-01,  3.79287869e-01,
        -2.03167032e-02,  1.27308264e-01,  7.41717935e-01,
        -6.60891891e-01, -6.95003867e-01,  1.97525725e-0

In [35]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [36]:
%%time
# Cosine similarity between the query embedding and every title embedding.
sims = cosine_similarity(query_emb, emb)
# Negating before argsort ranks the most similar titles first; keep the top five positions.
ids = (-sims[0]).argsort()[:5]

CPU times: user 7.75 ms, sys: 0 ns, total: 7.75 ms
Wall time: 16.5 ms


In [37]:
books.iloc[ids]

Unnamed: 0,제목
322,"다정함의 과학 : 친절, 신뢰, 공감 속에 숨어 있는 건강과 행복의 비밀"
6,다정한 것이 살아남는다 : 친화력으로 세상을 바꾸는 인류의 진화에 관하여(10만부 ...
618,모든 것은 그 자리에 : 첫사랑부터 마지막 이야기까지(양장)
392,이토록 다정한 기술 : 지구와 이웃을 보듬는 아이디어(〈희망의 이유〉 사쉐 증정 (...
111,무엇이 우리를 다정하게 만드는가 : 타인을 도우려 하는 인간 심리의 뇌과학적 비밀(양장)


### nmslib

In [38]:
!pip install nmslib

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting nmslib
  Downloading nmslib-2.1.1-cp39-cp39-manylinux2010_x86_64.whl (13.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.3/13.3 MB[0m [31m68.2 MB/s[0m eta [36m0:00:00[0m
Collecting pybind11<2.6.2
  Downloading pybind11-2.6.1-py2.py3-none-any.whl (188 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m188.5/188.5 kB[0m [31m15.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pybind11, nmslib
Successfully installed nmslib-2.1.1 pybind11-2.6.1


In [39]:
import nmslib
# NOTE(review): this rebinds `index`, which earlier held the inverted index
# (word -> set of row labels); the TF / TF-IDF cells cannot be re-run after
# this cell without rebuilding it.
# Build a nearest-neighbour index over the title embeddings with nmslib's
# default settings (presumably HNSW with cosine distance — confirm).
index = nmslib.init()
index.addDataPointBatch(emb)
index.createIndex()

In [40]:
%%time
# knnQuery takes a single query vector, so pass the only row of the (1, 768)
# query matrix rather than the 2-D array itself.
# Yields the positions of the 5 nearest titles and their distances.
ids, dist = index.knnQuery(query_emb[0], k=5)

CPU times: user 276 µs, sys: 0 ns, total: 276 µs
Wall time: 291 µs


In [41]:
books.iloc[ids]

Unnamed: 0,제목
322,"다정함의 과학 : 친절, 신뢰, 공감 속에 숨어 있는 건강과 행복의 비밀"
6,다정한 것이 살아남는다 : 친화력으로 세상을 바꾸는 인류의 진화에 관하여(10만부 ...
618,모든 것은 그 자리에 : 첫사랑부터 마지막 이야기까지(양장)
392,이토록 다정한 기술 : 지구와 이웃을 보듬는 아이디어(〈희망의 이유〉 사쉐 증정 (...
111,무엇이 우리를 다정하게 만드는가 : 타인을 도우려 하는 인간 심리의 뇌과학적 비밀(양장)


### chroma

In [43]:
!pip install chromadb

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [44]:
import chromadb

In [45]:
# Create a Chroma client with default settings
# (NOTE(review): presumably in-memory and non-persistent — confirm).
client = chromadb.Client()



In [46]:
# Create the collection and let Chroma embed documents and text queries with the SBERT model.
# NOTE(review): no distance metric is configured; the large distances in the
# query output suggest squared L2 rather than cosine — confirm whether the
# ranking is meant to match the cosine-based cells.
# NOTE(review): create_collection presumably fails if the collection already
# exists, so this cell may not be re-runnable within one session — confirm.
collection = client.create_collection(
    name="science_books", 
    embedding_function=sbert.encode)

In [47]:
# Store every title with its character count as metadata and its stringified row label as id.
metadatas = [{'length': len(title)} for title in books.제목]
ids = [str(label) for label in books.index]
collection.add(
    ids=ids,
    documents=books.제목.tolist(),
    metadatas=metadatas,
)

In [48]:
# Query by raw text and request the 5 closest titles; the collection's
# embedding function is expected to embed the query text.
results = collection.query(
    query_texts=["다정한 것이 살아남는다"],
    n_results=5
)

In [49]:
%%time
# Query with the precomputed embedding (first row of query_emb), so the timing
# excludes the cost of encoding the query text.
results = collection.query(
    query_embeddings=[query_emb[0]],
    n_results=5
)

CPU times: user 3.49 ms, sys: 0 ns, total: 3.49 ms
Wall time: 10.3 ms


In [50]:
query_emb.shape

(1, 768)

In [51]:
results

{'ids': [['6', '322', '392', '313', '323']],
 'embeddings': None,
 'documents': [['다정한 것이 살아남는다 : 친화력으로 세상을 바꾸는 인류의 진화에 관하여(10만부 기념 스페셜 에디션, 저자 친필 사인 인쇄본)',
   '다정함의 과학 : 친절, 신뢰, 공감 속에 숨어 있는 건강과 행복의 비밀',
   '이토록 다정한 기술 : 지구와 이웃을 보듬는 아이디어(〈희망의 이유〉 사쉐 증정 (포인트 차감))',
   'ADHD 2.0 : 산만하고 변덕스러운 ‘나’를 뛰어난 ‘창조자’로 바꾸는 특별한 여정!(포함 건강취미분야 2만원↑ 데일리 알약케이스 증정(택1, 포인트 차감))',
   'Friends 프렌즈 : 과학이 우정에 대해 알려줄 수 있는 가장 중요한 것(레더 티코스터 증정(포인트 차감) )']],
 'metadatas': [[{'length': 71},
   {'length': 40},
   {'length': 54},
   {'length': 90},
   {'length': 63}]],
 'distances': [[305.2731018066406,
   332.41082763671875,
   351.1950988769531,
   384.9884033203125,
   389.6494140625]]}

In [None]:
# Same text query with two filters: metadata `length` below 75, and the
# document text must contain "다정".
# NOTE(review): this cell has no execution count in the saved notebook, and the
# output shown after it is identical to the unfiltered query (it includes a
# title of length 90) — re-run before relying on the displayed results.
results = collection.query(
    query_texts=["다정한 것이 살아남는다"],
    n_results=5,
    where={"length": {'$lt': 75}},
    where_document={"$contains":"다정"}
)

In [52]:
results

{'ids': [['6', '322', '392', '313', '323']],
 'embeddings': None,
 'documents': [['다정한 것이 살아남는다 : 친화력으로 세상을 바꾸는 인류의 진화에 관하여(10만부 기념 스페셜 에디션, 저자 친필 사인 인쇄본)',
   '다정함의 과학 : 친절, 신뢰, 공감 속에 숨어 있는 건강과 행복의 비밀',
   '이토록 다정한 기술 : 지구와 이웃을 보듬는 아이디어(〈희망의 이유〉 사쉐 증정 (포인트 차감))',
   'ADHD 2.0 : 산만하고 변덕스러운 ‘나’를 뛰어난 ‘창조자’로 바꾸는 특별한 여정!(포함 건강취미분야 2만원↑ 데일리 알약케이스 증정(택1, 포인트 차감))',
   'Friends 프렌즈 : 과학이 우정에 대해 알려줄 수 있는 가장 중요한 것(레더 티코스터 증정(포인트 차감) )']],
 'metadatas': [[{'length': 71},
   {'length': 40},
   {'length': 54},
   {'length': 90},
   {'length': 63}]],
 'distances': [[305.2731018066406,
   332.41082763671875,
   351.1950988769531,
   384.9884033203125,
   389.6494140625]]}