In [82]:
%%bash
pip install Elasticsearch
pip install docker
wget https://s3.ap-northeast-2.amazonaws.com/bitcoin-kaggle-dataset/crypto-markets.csv # 캐글에 나온 암호화폐 데이터셋
wget https://s3.ap-northeast-2.amazonaws.com/bitcoin-kaggle-dataset/winemag-data-130k-v2.csv # 와인 정보 데이터셋



You are using pip version 9.0.1, however version 9.0.3 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.
You are using pip version 9.0.1, however version 9.0.3 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.
--2018-03-27 02:42:12--  https://s3.ap-northeast-2.amazonaws.com/bitcoin-kaggle-dataset/crypto-markets.csv
Resolving s3.ap-northeast-2.amazonaws.com (s3.ap-northeast-2.amazonaws.com)... 52.219.60.57
Connecting to s3.ap-northeast-2.amazonaws.com (s3.ap-northeast-2.amazonaws.com)|52.219.60.57|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 68957305 (66M) [text/csv]
Saving to: ‘crypto-markets.csv’

     0K .......... .......... .......... .......... ..........  0% 3.47M 19s
    50K .......... .......... .......... .......... ..........  0% 8.03M 14s
   100K .......... .......... .......... .......... ..........  0% 4.28M 14s
   150K .......... .......... .......... .......... ....

In [2]:
import json
import re 
import time

import pandas as pd
from elasticsearch import Elasticsearch
from elasticsearch import helpers
import docker

### 0. Sample data-set 세팅

In [67]:
wine_df = pd.read_csv("../winemag-data-130k-v2.csv",index_col=0)

In [68]:
wine_df.price = wine_df.price.fillna("0").astype(int)
wine_df = wine_df.fillna("")

### 1. initialize Elastic-search Client

In [69]:
es = Elasticsearch(host="172.17.0.2") # 이 또한, 도커 세팅할 때 적혀 있는 방법 대로 주소 값을 알아낸 후 채워준다
if es.ping():
    # Check Elasticsearch is operating
    print("Elasticsearch is Okay\nElasticsearch spec\n")

    print(json.dumps(es.info(),indent=4,sort_keys=True))

Elasticsearch is Okay
Elasticsearch spec

{
    "cluster_name": "docker-cluster",
    "cluster_uuid": "Seag_G9QRFmFauEUfB-k0A",
    "name": "-A2cndJ",
    "tagline": "You Know, for Search",
    "version": {
        "build_date": "2018-03-14T08:28:22.470Z",
        "build_hash": "d838f2d",
        "build_snapshot": false,
        "lucene_version": "7.1.0",
        "minimum_index_compatibility_version": "5.0.0",
        "minimum_wire_compatibility_version": "5.6.0",
        "number": "6.1.4"
    }
}


### 2. create the index with tokenizer setting

In [132]:
index_name = "wine_table" # ElasticSearch에 넣을 index 이름

setting = {
    "settings": {
        "index.mapping.ignore_malformed" : "true",
        "index":{
            "analysis" : {
                "tokenizer" :{
                    "my_tokenizer": {
                        "type": "standard",
                        "max_token_length" : 20
                    }
                },
                "filter": {
                    "english_stop": {
                      "type":       "stop",
                      "stopwords":  "_english_" 
                    },
                    "english_keywords": {
                      "type":       "keyword_marker",
                      "keywords":   ["example"] 
                    },
                    "english_stemmer": {
                      "type":       "stemmer",
                      "language":   "english"
                    },
                    "english_possessive_stemmer": {
                      "type":       "stemmer",
                      "language":   "possessive_english"
                    }
                  },
                "analyzer" : {
                    "my_analyzer" : {
                        "tokenizer" : "my_tokenizer",
                        "filter": [
                            "english_possessive_stemmer",
                            "lowercase",
                            "english_stop",
                            "english_keywords",
                            "english_stemmer"
                        ]
                    }
                }
            }
        }
    }
}

In [133]:
# create the index 
es.indices.create(index_name,body=json.dumps(setting))

{'acknowledged': True, 'index': 'wine_table', 'shards_acknowledged': True}

In [134]:
test_sentence = wine_df.iloc[0].description
body = {
    "analyzer" : "my_analyzer",
    "text": test_sentence
}
result = es.indices.analyze(index_name,body=json.dumps(body))

print("original sentence : \n", )
print("tokenizer results : \n")
print(json.dumps(result,indent=2))

original sentence : 

tokenizer results : 

{
  "tokens": [
    {
      "token": "aroma",
      "start_offset": 0,
      "end_offset": 6,
      "type": "<ALPHANUM>",
      "position": 0
    },
    {
      "token": "includ",
      "start_offset": 7,
      "end_offset": 14,
      "type": "<ALPHANUM>",
      "position": 1
    },
    {
      "token": "tropic",
      "start_offset": 15,
      "end_offset": 23,
      "type": "<ALPHANUM>",
      "position": 2
    },
    {
      "token": "fruit",
      "start_offset": 24,
      "end_offset": 29,
      "type": "<ALPHANUM>",
      "position": 3
    },
    {
      "token": "broom",
      "start_offset": 31,
      "end_offset": 36,
      "type": "<ALPHANUM>",
      "position": 4
    },
    {
      "token": "brimston",
      "start_offset": 38,
      "end_offset": 47,
      "type": "<ALPHANUM>",
      "position": 5
    },
    {
      "token": "dri",
      "start_offset": 52,
      "end_offset": 57,
      "type": "<ALPHANUM>",
      "position": 7
  

### 2. Put mapping in Index

In [135]:
wine_mapping = {
    "properties":{
        "country"   : {"type":"keyword","store":True},
        "description" : {"type":"text", "analyzer" :"my_analyzer"},
        "points"    : {"type":"integer"},
        "price"  : {"type":"integer"},
        "province"  : {"type":"text","analyzer":"my_analyzer"},
        "region_1"  : {"type":"keyword"},
        "region_2"     : {"type":"keyword"},
        "taster_name"   : {"type":"keyword"},
        "taster_twitter_handle" : {"type":"keyword","store":True},
        "title" : {"type":"text","analyzer":"my_analyzer","fields":{"raw":{"type":"keyword"}}},
        "variety" : {"type":"text","analyzer":"my_analyzer","fields":{"raw":{"type":"keyword"}},"store":True},
        "winery" : {"type":"keyword","store":True}
    }
}

es.indices.put_mapping("wine",body=json.dumps(wine_mapping),index=index_name)

{'acknowledged': True}

In [136]:
es.indices.get_mapping(index_name,"wine")

{'wine_table': {'mappings': {'wine': {'properties': {'country': {'store': True,
      'type': 'keyword'},
     'description': {'analyzer': 'my_analyzer', 'type': 'text'},
     'points': {'type': 'integer'},
     'price': {'type': 'integer'},
     'province': {'analyzer': 'my_analyzer', 'type': 'text'},
     'region_1': {'type': 'keyword'},
     'region_2': {'type': 'keyword'},
     'taster_name': {'type': 'keyword'},
     'taster_twitter_handle': {'store': True, 'type': 'keyword'},
     'title': {'analyzer': 'my_analyzer',
      'fields': {'raw': {'type': 'keyword'}},
      'type': 'text'},
     'variety': {'analyzer': 'my_analyzer',
      'fields': {'raw': {'type': 'keyword'}},
      'store': True,
      'type': 'text'},
     'winery': {'store': True, 'type': 'keyword'}}}}}}

### 4. put data into index

In [137]:
def generate_action(_index,_type):
    def _generate_action(_source,_id=None):
        if _id:
            return {
                "_index"  : _index,
                "_type"   : _type,
                "_id"     : _id,
                "_source" : _source
            }
        else:
            return {
                "_index"  : _index,
                "_type"   : _type,
                "_source" : _source
            }
    return _generate_action

In [138]:
action = generate_action(_index=index_name,_type="wine")
actions = [action(row) for row in wine_df.to_dict(orient='records')]

start = time.time()
helpers.bulk(es,actions,stats_only=False,chunk_size=500,raise_on_error=False)
end = time.time()
print("consumed time --- {}".format(end-start));

In [139]:
start = time.time()
helpers.bulk(es,actions,stats_only=False,chunk_size=500,raise_on_error=False)
end = time.time()
print("consumed time --- {}".format(end-start));

consumed time --- 14.062002658843994


### 5. Search

In [153]:
body = {
    "sort" : [
        {"price" : {"order" :"desc"}}
    ],
    "_source" : ["country","description","price"],
    "query" :{ 
        "match":{
            "country":"Spain" # 지정한 필드에 대해 전문 검색을 수행
        }},
    "from":1,
    "size":5
}
es.search(index=index_name,body=json.dumps(body))

{'_shards': {'failed': 0, 'skipped': 0, 'successful': 5, 'total': 5},
 'hits': {'hits': [{'_id': 'uiVmZWIBkpnhHYeRUSOP',
    '_index': 'wine_table',
    '_score': None,
    '_source': {'country': 'Spain',
     'description': 'Extremely ripe aromas of prune, blackberry, cassis and cinnamon announce a jammy loaded palate that weighs nothing short of a ton. The flavor profile is a black-fruit bonanza accented by graphite and blackened toast. Drink this monster wine from a dry, hot vintage from 2019–2034.',
     'price': 600},
    '_type': 'wine',
    'sort': [600]},
   {'_id': '2SVmZWIBkpnhHYeRXZpO',
    '_index': 'wine_table',
    '_score': None,
    '_source': {'country': 'Spain',
     'description': "This blend of the 1996, '98 and '02 vintages is mature and browning in hue. The nose is superripe, with prune, brandied cherry, tobacco and molasses notes. A soft, creamy palate holds vanilla, tobacco, baking spice, prune and raisin flavors, while the finish is an echo of what came before.

### 쿼리와 필터의 구분
    
    쿼리와 필터는 둘 다 문서를 걸러내고 선택하는 용도이므로 비슷하지만, 구체적인 쓰임새가 다르다.


| 쿼리 | 필터 |
|----|----|
|연관성| yes/no |
|캐시불가|캐시 가능|
|느림|빠름|

루씬은 아래와 같은 형태로 역 색인표를 만든다.

|필드|텀 |문서1|문서2|문서3|문서N|
|---|---|---|---|---|---|
|title|민주노총|1|0|0|...|
|title|한상균|0|1|0|...|
|title|편지|1|1|1|...|
|genre|편지|1|0|1|...|

캐시는 필터 전용 역색인표라고 할 수 있다. 마치 역색인표의 일부를 뽑아낸 것과 비슷한 모양으로 저장된다.

필터 종류, 필드, 텀에 의해 캐시의 키를 정하고, 필터의 결과를 비트벡터 형태로 저장해둔다. 예를 들어, 텀 필터의 결과는 다음과 같이 캐시된다.

필터는 Bool Query에 속하는 개념.

Bool Query type에는 
    
    - filter : filter 내 항목 모두를 밪아야 Okay 
    - must : must 내 항목 모두를 맞아야 Okay
    - should : should 내 항목 중에 하나라도 맞으면 Okay
    - must_not : must_not 내 항목 모두 없어야 Okay
    
    filter vs must : 점수를 계산하냐 안하냐의 차이

In [157]:
body = {
    "query": {
        "bool" : {
            "filter" : [
                { "range" : {
                    "price" :{
                        "gte" :300,
                        "lte" :600
                        }
                    }
                },
                {"terms" : {
                    "country":["Spain"]
                 }
                }
            ]            
        }
    }
}
es.search(index=index_name,body=json.dumps(body))

{'_shards': {'failed': 0, 'skipped': 0, 'successful': 5, 'total': 5},
 'hits': {'hits': [{'_id': 'UiVmZWIBkpnhHYeRVlNg',
    '_index': 'wine_table',
    '_score': 0.0,
    '_source': {'country': 'Spain',
     'description': "Jammy, creamy and superripe, with raspberry, burnt toast and coffee on the thick, meaty nose. Flavors reminiscent of an ice cream sundae work the palate, which means berry syrup, chocolate sauce and vanilla are highly prominent. Texturally speaking, this is a smooth, soft and plush wine. It pushes the envelope on ripeness, but that's become the Cirsion way. An awesome wine that deserves a proper decanting.",
     'designation': 'Cirsion',
     'points': 96,
     'price': 303,
     'province': 'Northern Spain',
     'region_1': 'Rioja',
     'region_2': '',
     'taster_name': 'Michael Schachner',
     'taster_twitter_handle': '@wineschach',
     'title': 'Bodegas Roda 2005 Cirsion  (Rioja)',
     'variety': 'Tempranillo',
     'winery': 'Bodegas Roda'},
    '_type'