# Elasticsearch Practice
__Author__: Jingkai Sun

In [1]:
from elasticsearch import Elasticsearch as es

In [2]:
# Connect to the local Elasticsearch node.
# The client is configured through `hosts`; the upper-case HOST/PORT keywords
# used previously are not client parameters, so the address has to be given
# here explicitly for the intent (localhost:9200) to actually take effect.
# Note: this rebinds `es` from the imported class alias to the client instance.
es = es(hosts = ['http://localhost:9200'])

In [3]:
# Create the index "first_index".
# ignore = 400 asks the client not to raise on an HTTP 400 response
# (presumably the "index already exists" case — confirm against the client docs).
es.indices.create(index = 'first_index', ignore = 400)

{'acknowledged': True, 'shards_acknowledged': True, 'index': 'first_index'}

In [4]:
# Check whether the index "first_index" exists.
es.indices.exists(index = 'first_index')

True

In [5]:
# Delete the index "first_index".
es.indices.delete(index = 'first_index')

{'acknowledged': True}

In [6]:
# Check again whether the index "first_index" exists.
es.indices.exists(index = 'first_index')

False

# Insert and get query

In [7]:
# Sample city documents (one city/country pair each) for the insert and get examples.
doc1 = dict(city="New Delhi", country="India")
doc2 = dict(city="London", country="England")
doc3 = dict(city="Los Angeles", country="USA")
doc4 = dict(city="Beijing", country="China")

In [8]:
# Store doc1 under id 1 in the "cities" index (doc_type "places").
# Writing to an id that already exists replaces the document, which is why the
# recorded response reports 'updated' rather than 'created'.
es.index(index = "cities", doc_type = "places", id = 1, body = doc1)



{'_index': 'cities',
 '_type': 'places',
 '_id': '1',
 '_version': 6,
 'result': 'updated',
 '_shards': {'total': 2, 'successful': 1, 'failed': 0},
 '_seq_no': 8,
 '_primary_term': 1}

In [9]:
# Store doc2 under id 2 in the "cities" index (doc_type "places").
es.index(index = "cities", doc_type = "places", id = 2, body = doc2)

{'_index': 'cities',
 '_type': 'places',
 '_id': '2',
 '_version': 2,
 'result': 'updated',
 '_shards': {'total': 2, 'successful': 1, 'failed': 0},
 '_seq_no': 9,
 '_primary_term': 1}

In [10]:
# Store doc3 and doc4 under ids 3 and 4.
# Only the response of the last call is echoed by the notebook.
es.index(index = "cities", doc_type = "places", id = 3, body = doc3)
es.index(index = "cities", doc_type = "places", id = 4, body = doc4)

{'_index': 'cities',
 '_type': 'places',
 '_id': '4',
 '_version': 2,
 'result': 'updated',
 '_shards': {'total': 2, 'successful': 1, 'failed': 0},
 '_seq_no': 11,
 '_primary_term': 1}

In [11]:
# Fetch the document with id 3 from "cities"; the stored fields are under res['_source'].
res = es.get(index = 'cities', doc_type = 'places', id = 3)



In [12]:
res

{'_index': 'cities',
 '_type': 'places',
 '_id': '3',
 '_version': 2,
 '_seq_no': 10,
 '_primary_term': 1,
 'found': True,
 '_source': {'city': 'Los Angeles', 'country': 'USA'}}

In [13]:
res['_source']

{'city': 'Los Angeles', 'country': 'USA'}

# Different search query for matching documents

In [14]:
# Two sample sentences used to compare the different query types.
doc1 = dict(sentence="Today is a sunny day.")
doc2 = dict(sentence="Today is a bright-sunny day")

In [15]:
# Store the first sentence under id 1 in the "english" index (doc_type "sentences").
es.index(index = "english", doc_type = "sentences", id = 1, body = doc1)

{'_index': 'english',
 '_type': 'sentences',
 '_id': '1',
 '_version': 3,
 'result': 'updated',
 '_shards': {'total': 2, 'successful': 1, 'failed': 0},
 '_seq_no': 4,
 '_primary_term': 1}

In [16]:
# Store the second sentence under id 2 in the "english" index (doc_type "sentences").
es.index(index = "english", doc_type = "sentences", id = 2, body = doc2)

{'_index': 'english',
 '_type': 'sentences',
 '_id': '2',
 '_version': 2,
 'result': 'updated',
 '_shards': {'total': 2, 'successful': 1, 'failed': 0},
 '_seq_no': 5,
 '_primary_term': 1}

In [17]:
# match query on the "sentence" field with an upper-case search word.
# "size": 0 requests no documents back, so only the hit count is of interest.
res = es.search(index = 'english', body = {'from': 0, "size": 0, "query": {
    "match": {"sentence": "SUNNY"}
    }})

In [18]:
res
# The match query is case-insensitive here: the upper-case query "SUNNY" still counts both "sunny" sentences.

{'took': 526,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 2, 'relation': 'eq'},
  'max_score': None,
  'hits': []}}

In [19]:
# match_phrase query: search for the two words "bright SUNNY" as a phrase.
# "size": 0 requests no documents back, so only the hit count is of interest.
res_match_phrase = es.search(index = 'english', body = {'from': 0, "size": 0, "query": {
    "match_phrase": {"sentence": "bright SUNNY"}
    }})

In [20]:
res_match_phrase
# match_phrase: not only must the individual tokens match, their relative order must match as well

{'took': 2,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 1, 'relation': 'eq'},
  'max_score': None,
  'hits': []}}

In [21]:
# term query with the same text as the match_phrase example.
# The recorded response reports 0 hits for this query.
# NOTE(review): term queries look up the given value as-is without analyzing it,
# which is presumably why "bright SUNNY" finds nothing — confirm against the docs.
res_term = es.search(index = 'english', body = {'from': 0, "size": 1, "query": {
    "term": {"sentence": "bright SUNNY"}
    }})

In [22]:
res_term

{'took': 1,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 0, 'relation': 'eq'},
  'max_score': None,
  'hits': []}}

# Combining queries

In [23]:
# A bool query combines clauses of kind must, must_not and should.
# Here: exclude sentences matching "bright" (must_not) and look for
# sentences matching "sunny" (should); at most one hit is returned.
res = es.search(
    index = "english",
    body = {
        "from": 0,
        "size": 1,
        "query": {
            "bool": {
                "must_not": {"match": {"sentence": "bright"}},
                "should": {"match": {"sentence": "sunny"}},
            }
        },
    },
)

In [24]:
res

{'took': 3,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 1, 'relation': 'eq'},
  'max_score': 0.4823361,
  'hits': [{'_index': 'english',
    '_type': 'sentences',
    '_id': '1',
    '_score': 0.4823361,
    '_source': {'sentence': 'Today is a sunny day.'}}]}}

# Regular Expressions in Elasticsearch

In [25]:
# A third sentence, without the word "sunny", for the regular-expression examples.
doc3 = dict(sentence="Today is a rainy day")

In [26]:
# Store the third sentence under id 3 in the "english" index.
es.index(index = 'english', doc_type = 'sentences', id = 3, body  = doc3)

{'_index': 'english',
 '_type': 'sentences',
 '_id': '3',
 '_version': 2,
 'result': 'updated',
 '_shards': {'total': 2, 'successful': 1, 'failed': 0},
 '_seq_no': 6,
 '_primary_term': 1}

In [27]:
# Read the document with id 3 back to confirm it was stored.
es.get(index = 'english', doc_type = "sentences", id = 3)

{'_index': 'english',
 '_type': 'sentences',
 '_id': '3',
 '_version': 2,
 '_seq_no': 6,
 '_primary_term': 1,
 'found': True,
 '_source': {'sentence': 'Today is a rainy day'}}

In [28]:
# regexp query with the pattern ".*", which matches everything; return up to 5 hits.
es.search(index = 'english', body = {"from": 0, "size": 5, "query":{"regexp":{"sentence": ".*"}}})

{'took': 3,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 3, 'relation': 'eq'},
  'max_score': 1.0,
  'hits': [{'_index': 'english',
    '_type': 'sentences',
    '_id': '1',
    '_score': 1.0,
    '_source': {'sentence': 'Today is a sunny day.'}},
   {'_index': 'english',
    '_type': 'sentences',
    '_id': '2',
    '_score': 1.0,
    '_source': {'sentence': 'Today is a bright-sunny day'}},
   {'_index': 'english',
    '_type': 'sentences',
    '_id': '3',
    '_score': 1.0,
    '_source': {'sentence': 'Today is a rainy day'}}]}}

In [29]:
# regexp query with the pattern "sun.*"; the recorded run returns both the
# "sunny" and the "bright-sunny" sentence.
# NOTE(review): the pattern is presumably applied to individual indexed terms
# rather than to the whole sentence — confirm against the regexp query docs.
es.search(index = 'english', body = {"from": 0, "size": 5, "query":{"regexp":{"sentence": "sun.*"}}})

{'took': 4,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 2, 'relation': 'eq'},
  'max_score': 1.0,
  'hits': [{'_index': 'english',
    '_type': 'sentences',
    '_id': '1',
    '_score': 1.0,
    '_source': {'sentence': 'Today is a sunny day.'}},
   {'_index': 'english',
    '_type': 'sentences',
    '_id': '2',
    '_score': 1.0,
    '_source': {'sentence': 'Today is a bright-sunny day'}}]}}

# Mapping

## What is Mapping?
As per the Elasticsearch Reference, "Mapping is the process of defining how a document, and the fields it contains, are stored and indexed."

## How does it help?
It enables faster search retrieval and aggregations. Hence, your mapping determines how effectively you can handle your data. A bad mapping can have severe consequences for the performance of your system.

In [30]:
# Documents to insert into the Elasticsearch index; each carries a
# comma-separated "datetime" string (year,month,day,hour,minute,second).
doc1 = dict(city="Bangalore", country="India", datetime="2018,01,01,10,20,00")
doc2 = dict(city="London", country="England", datetime="2018,01,02,03,12,00")
doc3 = dict(city="Los Angeles", country="USA", datetime="2018,04,19,12,02,00")
doc4 = dict(city="Shanghai", country="China", datetime="2018,05,12,03,30,00")

In [31]:
# Store the four documents in the "travel" index (doc_type "cities") under ids 1-4.
# Only the response of the last call is echoed by the notebook.
es.index(index = 'travel', doc_type = 'cities', id = 1, body = doc1)
es.index(index = 'travel', doc_type = 'cities', id = 2, body = doc2)
es.index(index = 'travel', doc_type = 'cities', id = 3, body = doc3)
es.index(index = 'travel', doc_type = 'cities', id = 4, body = doc4)

{'_index': 'travel',
 '_type': 'cities',
 '_id': '4',
 '_version': 3,
 'result': 'updated',
 '_shards': {'total': 2, 'successful': 1, 'failed': 0},
 '_seq_no': 14,
 '_primary_term': 1}

In [32]:
# Fetch the mapping currently held for the "cities" type of the "travel" index.
es.indices.get_mapping(index = 'travel', doc_type = 'cities', include_type_name=True) #POSTMAN: http://127.0.0.1:9200/travel/_mapping/cities



{'travel': {'mappings': {'cities': {'properties': {'city': {'type': 'text',
      'fields': {'keyword': {'type': 'keyword', 'ignore_above': 256}}},
     'country': {'type': 'text',
      'fields': {'keyword': {'type': 'keyword', 'ignore_above': 256}}},
     'datetime': {'type': 'date', 'format': 'yyyy,MM,dd,hh,mm,ss'}}}}}}

In [33]:
es.indices.delete(index = "travel") # MUST delete the automatically created mapping (with its index) before customising the mappings

{'acknowledged': True}

In [34]:
# Create an empty "travel" index.
es.indices.create(index = "travel")

{'acknowledged': True, 'shards_acknowledged': True, 'index': 'travel'}

In [35]:
# REST equivalent: http://127.0.0.1:9200/travel/_mapping/cities
# Define the mapping for the "cities" type of the "travel" index explicitly:
# "city" and "country" are full-text fields with an additional exact-match
# "keyword" sub-field, and "datetime" is a date in a custom comma-separated format.
es.indices.put_mapping(
    index = "travel",
    doc_type = "cities",
    body = {
        "properties": {
            "city": {
                "type": "text",
                "fields": {
                    "keyword": {
                        "type": "keyword",
                        "ignore_above": 256
                    }
                }
            },
            "country": {
                "type": "text",
                "fields": {
                    "keyword": {
                        "type": "keyword",
                        "ignore_above": 256
                    }
                }
            },
            "datetime": {
                "type": "date",
                # HH is the 24-hour hour-of-day (00-23). The lower-case "hh"
                # used before is the 12-hour clock-hour (01-12), which cannot
                # represent values such as 00 or 13-23.
                "format": "yyyy,MM,dd,HH,mm,ss"
            }
        }
    },
    include_type_name = True
)



{'acknowledged': True}

# Date Histogram Aggregations

Aggregations are one of the most important applications of Elasticsearch. They provide quick, powerful analysis of your data! Below we discuss aggregations over date values.

A lot of analysis happens on time-series scales, for example the quarterly iPhone sales across the world. It is therefore essential to have fast aggregations over large datasets at different levels of granularity. ES provides this via the date histogram aggregation. The granularities over which you can aggregate are:

1. year
2. quarter
3. month
4. week
5. day
6. hour
7. minute
8. second
9. millisecond

In [36]:
# List up to 5 documents of the "travel" index (match_all matches every document).
es.search(index = "travel", body = {"from": 0, "size": 5, "query": {"match_all": {}}})

{'took': 1,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 0, 'relation': 'eq'},
  'max_score': None,
  'hits': []}}

In [37]:
# Date histogram aggregation named "country" over the "datetime" field with
# monthly buckets. "size": 0 requests no documents back, so only the
# aggregation result is of interest.
# NOTE(review): "interval" is reportedly deprecated in newer Elasticsearch
# releases in favour of "calendar_interval" — confirm against the server version.
es.search(index = "travel", 
          body = {"from": 0, "size": 0, "query":{"match_all":{}}, "aggs": {
            "country": {
                "date_histogram": {"field": "datetime", "interval": "month"}}}})
# Counts the number of documents per bucket (the original note said "per year",
# but the interval used above is month).
# summarization



{'took': 2,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 0, 'relation': 'eq'},
  'max_score': None,
  'hits': []},
 'aggregations': {'country': {'buckets': []}}}

In [38]:
# Store one more document under id 5 in the "travel" index (doc_type "cities").
doc4 = {"city":"Sydney", "country": "Australia", "datetime":"2019,01,01,10,20,00"}
es.index(index= "travel", doc_type = "cities", id = 5, body = doc4)

{'_index': 'travel',
 '_type': 'cities',
 '_id': '5',
 '_version': 1,
 'result': 'created',
 '_shards': {'total': 2, 'successful': 1, 'failed': 0},
 '_seq_no': 0,
 '_primary_term': 1}

In [39]:
# List up to 5 documents of the "travel" index again to see the newly stored one.
es.search(index = "travel", body = {"from": 0, "size": 5, "query": {"match_all": {}}})

{'took': 1,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 1, 'relation': 'eq'},
  'max_score': 1.0,
  'hits': [{'_index': 'travel',
    '_type': 'cities',
    '_id': '5',
    '_score': 1.0,
    '_source': {'city': 'Sydney',
     'country': 'Australia',
     'datetime': '2019,01,01,10,20,00'}}]}}

# Bulk Insert and Scan

In [40]:
from datetime import datetime
from elasticsearch import helpers as hprs
import time

In [41]:
# Bulk insert: build 10000 actions up front, send them through the bulk
# helper, and measure the wall-clock time the helper call takes.
action = [
    dict(
        _index="chapter8",
        _type="doc",
        _id=j,
        _source=dict(any="data" + str(j), timestamp=datetime.now()),
    )
    for j in range(0, 10000)
]

st = time.time()
hprs.bulk(es, action)
end = time.time()
print("total time: ", end - st)




total time:  1.0638988018035889


In [42]:
# Iterative insertion: one index request per document, which is far less
# efficient than the bulk helper. Note that this loop writes only 100
# documents (ids 0-99), not 10000, yet the recorded run still takes longer
# than the bulk insert of 10000 documents.
st = time.time()
for j in range(100):
    doc = dict(any="data" + str(j), timestamp=datetime.now())
    es.index(index = 'chapter8', doc_type = "doc", id = j, body = doc)
end = time.time()
print("total time", end - st)

total time 3.2384350299835205


In [43]:
# List up to 5 documents of the "chapter8" index; the total hit count shows how many were stored.
es.search(index = "chapter8", body = {"from": 0, "size": 5, "query": {"match_all": {}}})

{'took': 448,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 10000, 'relation': 'eq'},
  'max_score': 1.0,
  'hits': [{'_index': 'chapter8',
    '_type': 'doc',
    '_id': '100',
    '_score': 1.0,
    '_source': {'any': 'data100', 'timestamp': '2020-08-09T10:38:05.055040'}},
   {'_index': 'chapter8',
    '_type': 'doc',
    '_id': '101',
    '_score': 1.0,
    '_source': {'any': 'data101', 'timestamp': '2020-08-09T10:38:05.055041'}},
   {'_index': 'chapter8',
    '_type': 'doc',
    '_id': '102',
    '_score': 1.0,
    '_source': {'any': 'data102', 'timestamp': '2020-08-09T10:38:05.055043'}},
   {'_index': 'chapter8',
    '_type': 'doc',
    '_id': '103',
    '_score': 1.0,
    '_source': {'any': 'data103', 'timestamp': '2020-08-09T10:38:05.055044'}},
   {'_index': 'chapter8',
    '_type': 'doc',
    '_id': '104',
    '_score': 1.0,
    '_source': {'any': 'data104',
     'timestamp': '2020-08-09T10:38:05.055045'}

In [44]:
# Scanning over large data (very quickly!!!)
# The scan helper returns an iterable over every document matching the query;
# each item exposes the document id under "_id" and its fields under "_source".
results = hprs.scan(es, index = "chapter8", doc_type = "doc", query = {"query": {"match_all": {}}})
for item in results:
    print(item["_id"], item["_source"])



100 {'any': 'data100', 'timestamp': '2020-08-09T10:38:05.055040'}
101 {'any': 'data101', 'timestamp': '2020-08-09T10:38:05.055041'}
102 {'any': 'data102', 'timestamp': '2020-08-09T10:38:05.055043'}
103 {'any': 'data103', 'timestamp': '2020-08-09T10:38:05.055044'}
104 {'any': 'data104', 'timestamp': '2020-08-09T10:38:05.055045'}
105 {'any': 'data105', 'timestamp': '2020-08-09T10:38:05.055047'}
106 {'any': 'data106', 'timestamp': '2020-08-09T10:38:05.055048'}
107 {'any': 'data107', 'timestamp': '2020-08-09T10:38:05.055050'}
108 {'any': 'data108', 'timestamp': '2020-08-09T10:38:05.055053'}
109 {'any': 'data109', 'timestamp': '2020-08-09T10:38:05.055054'}
110 {'any': 'data110', 'timestamp': '2020-08-09T10:38:05.055056'}
111 {'any': 'data111', 'timestamp': '2020-08-09T10:38:05.055057'}
112 {'any': 'data112', 'timestamp': '2020-08-09T10:38:05.055058'}
113 {'any': 'data113', 'timestamp': '2020-08-09T10:38:05.055060'}
114 {'any': 'data114', 'timestamp': '2020-08-09T10:38:05.055061'}
115 {'any'

933 {'any': 'data933', 'timestamp': '2020-08-09T10:38:05.056588'}
934 {'any': 'data934', 'timestamp': '2020-08-09T10:38:05.056589'}
935 {'any': 'data935', 'timestamp': '2020-08-09T10:38:05.056593'}
936 {'any': 'data936', 'timestamp': '2020-08-09T10:38:05.056596'}
937 {'any': 'data937', 'timestamp': '2020-08-09T10:38:05.056597'}
938 {'any': 'data938', 'timestamp': '2020-08-09T10:38:05.056599'}
939 {'any': 'data939', 'timestamp': '2020-08-09T10:38:05.056600'}
940 {'any': 'data940', 'timestamp': '2020-08-09T10:38:05.056601'}
941 {'any': 'data941', 'timestamp': '2020-08-09T10:38:05.056603'}
942 {'any': 'data942', 'timestamp': '2020-08-09T10:38:05.056604'}
943 {'any': 'data943', 'timestamp': '2020-08-09T10:38:05.056605'}
944 {'any': 'data944', 'timestamp': '2020-08-09T10:38:05.056607'}
945 {'any': 'data945', 'timestamp': '2020-08-09T10:38:05.056608'}
946 {'any': 'data946', 'timestamp': '2020-08-09T10:38:05.056609'}
947 {'any': 'data947', 'timestamp': '2020-08-09T10:38:05.056611'}
948 {'any'

1683 {'any': 'data1683', 'timestamp': '2020-08-09T10:38:05.058780'}
1684 {'any': 'data1684', 'timestamp': '2020-08-09T10:38:05.058781'}
1685 {'any': 'data1685', 'timestamp': '2020-08-09T10:38:05.058783'}
1686 {'any': 'data1686', 'timestamp': '2020-08-09T10:38:05.058787'}
1687 {'any': 'data1687', 'timestamp': '2020-08-09T10:38:05.058789'}
1688 {'any': 'data1688', 'timestamp': '2020-08-09T10:38:05.058790'}
1689 {'any': 'data1689', 'timestamp': '2020-08-09T10:38:05.058793'}
1690 {'any': 'data1690', 'timestamp': '2020-08-09T10:38:05.058794'}
1691 {'any': 'data1691', 'timestamp': '2020-08-09T10:38:05.058795'}
1692 {'any': 'data1692', 'timestamp': '2020-08-09T10:38:05.058797'}
1693 {'any': 'data1693', 'timestamp': '2020-08-09T10:38:05.058798'}
1694 {'any': 'data1694', 'timestamp': '2020-08-09T10:38:05.058799'}
1695 {'any': 'data1695', 'timestamp': '2020-08-09T10:38:05.058800'}
1696 {'any': 'data1696', 'timestamp': '2020-08-09T10:38:05.058802'}
1697 {'any': 'data1697', 'timestamp': '2020-08-0

2683 {'any': 'data2683', 'timestamp': '2020-08-09T10:38:05.060409'}
2684 {'any': 'data2684', 'timestamp': '2020-08-09T10:38:05.060410'}
2685 {'any': 'data2685', 'timestamp': '2020-08-09T10:38:05.060413'}
2686 {'any': 'data2686', 'timestamp': '2020-08-09T10:38:05.060415'}
2687 {'any': 'data2687', 'timestamp': '2020-08-09T10:38:05.060416'}
2688 {'any': 'data2688', 'timestamp': '2020-08-09T10:38:05.060417'}
2689 {'any': 'data2689', 'timestamp': '2020-08-09T10:38:05.060418'}
2690 {'any': 'data2690', 'timestamp': '2020-08-09T10:38:05.060420'}
2691 {'any': 'data2691', 'timestamp': '2020-08-09T10:38:05.060421'}
2692 {'any': 'data2692', 'timestamp': '2020-08-09T10:38:05.060422'}
2693 {'any': 'data2693', 'timestamp': '2020-08-09T10:38:05.060424'}
2694 {'any': 'data2694', 'timestamp': '2020-08-09T10:38:05.060429'}
2695 {'any': 'data2695', 'timestamp': '2020-08-09T10:38:05.060430'}
2696 {'any': 'data2696', 'timestamp': '2020-08-09T10:38:05.060431'}
2697 {'any': 'data2697', 'timestamp': '2020-08-0

3433 {'any': 'data3433', 'timestamp': '2020-08-09T10:38:05.061692'}
3434 {'any': 'data3434', 'timestamp': '2020-08-09T10:38:05.061693'}
3435 {'any': 'data3435', 'timestamp': '2020-08-09T10:38:05.061695'}
3436 {'any': 'data3436', 'timestamp': '2020-08-09T10:38:05.061696'}
3437 {'any': 'data3437', 'timestamp': '2020-08-09T10:38:05.061697'}
3438 {'any': 'data3438', 'timestamp': '2020-08-09T10:38:05.061699'}
3439 {'any': 'data3439', 'timestamp': '2020-08-09T10:38:05.061700'}
3440 {'any': 'data3440', 'timestamp': '2020-08-09T10:38:05.061701'}
3441 {'any': 'data3441', 'timestamp': '2020-08-09T10:38:05.061704'}
3442 {'any': 'data3442', 'timestamp': '2020-08-09T10:38:05.061706'}
3443 {'any': 'data3443', 'timestamp': '2020-08-09T10:38:05.061707'}
3444 {'any': 'data3444', 'timestamp': '2020-08-09T10:38:05.061708'}
3445 {'any': 'data3445', 'timestamp': '2020-08-09T10:38:05.061710'}
3446 {'any': 'data3446', 'timestamp': '2020-08-09T10:38:05.061711'}
3447 {'any': 'data3447', 'timestamp': '2020-08-0

4416 {'any': 'data4416', 'timestamp': '2020-08-09T10:38:05.063311'}
4417 {'any': 'data4417', 'timestamp': '2020-08-09T10:38:05.063313'}
4418 {'any': 'data4418', 'timestamp': '2020-08-09T10:38:05.063314'}
4419 {'any': 'data4419', 'timestamp': '2020-08-09T10:38:05.063317'}
4420 {'any': 'data4420', 'timestamp': '2020-08-09T10:38:05.063318'}
4421 {'any': 'data4421', 'timestamp': '2020-08-09T10:38:05.063320'}
4422 {'any': 'data4422', 'timestamp': '2020-08-09T10:38:05.063321'}
4423 {'any': 'data4423', 'timestamp': '2020-08-09T10:38:05.063322'}
4424 {'any': 'data4424', 'timestamp': '2020-08-09T10:38:05.063323'}
4425 {'any': 'data4425', 'timestamp': '2020-08-09T10:38:05.063327'}
4426 {'any': 'data4426', 'timestamp': '2020-08-09T10:38:05.063328'}
4427 {'any': 'data4427', 'timestamp': '2020-08-09T10:38:05.063329'}
4428 {'any': 'data4428', 'timestamp': '2020-08-09T10:38:05.063331'}
4429 {'any': 'data4429', 'timestamp': '2020-08-09T10:38:05.063332'}
4430 {'any': 'data4430', 'timestamp': '2020-08-0

5100 {'any': 'data5100', 'timestamp': '2020-08-09T10:38:05.064581'}
5101 {'any': 'data5101', 'timestamp': '2020-08-09T10:38:05.064582'}
5102 {'any': 'data5102', 'timestamp': '2020-08-09T10:38:05.064587'}
5103 {'any': 'data5103', 'timestamp': '2020-08-09T10:38:05.064588'}
5104 {'any': 'data5104', 'timestamp': '2020-08-09T10:38:05.064589'}
5105 {'any': 'data5105', 'timestamp': '2020-08-09T10:38:05.064591'}
5106 {'any': 'data5106', 'timestamp': '2020-08-09T10:38:05.064592'}
5107 {'any': 'data5107', 'timestamp': '2020-08-09T10:38:05.064593'}
5108 {'any': 'data5108', 'timestamp': '2020-08-09T10:38:05.064594'}
5109 {'any': 'data5109', 'timestamp': '2020-08-09T10:38:05.064597'}
5110 {'any': 'data5110', 'timestamp': '2020-08-09T10:38:05.064599'}
5111 {'any': 'data5111', 'timestamp': '2020-08-09T10:38:05.064600'}
5112 {'any': 'data5112', 'timestamp': '2020-08-09T10:38:05.064603'}
5113 {'any': 'data5113', 'timestamp': '2020-08-09T10:38:05.064604'}
5114 {'any': 'data5114', 'timestamp': '2020-08-0

5932 {'any': 'data5932', 'timestamp': '2020-08-09T10:38:05.066403'}
5933 {'any': 'data5933', 'timestamp': '2020-08-09T10:38:05.066404'}
5934 {'any': 'data5934', 'timestamp': '2020-08-09T10:38:05.066405'}
5935 {'any': 'data5935', 'timestamp': '2020-08-09T10:38:05.066407'}
5936 {'any': 'data5936', 'timestamp': '2020-08-09T10:38:05.066408'}
5937 {'any': 'data5937', 'timestamp': '2020-08-09T10:38:05.066411'}
5938 {'any': 'data5938', 'timestamp': '2020-08-09T10:38:05.066412'}
5939 {'any': 'data5939', 'timestamp': '2020-08-09T10:38:05.066413'}
5940 {'any': 'data5940', 'timestamp': '2020-08-09T10:38:05.066415'}
5941 {'any': 'data5941', 'timestamp': '2020-08-09T10:38:05.066416'}
5942 {'any': 'data5942', 'timestamp': '2020-08-09T10:38:05.066419'}
5943 {'any': 'data5943', 'timestamp': '2020-08-09T10:38:05.066420'}
5944 {'any': 'data5944', 'timestamp': '2020-08-09T10:38:05.066421'}
5945 {'any': 'data5945', 'timestamp': '2020-08-09T10:38:05.066422'}
5946 {'any': 'data5946', 'timestamp': '2020-08-0

6688 {'any': 'data6688', 'timestamp': '2020-08-09T10:38:05.067673'}
6689 {'any': 'data6689', 'timestamp': '2020-08-09T10:38:05.067675'}
6690 {'any': 'data6690', 'timestamp': '2020-08-09T10:38:05.067676'}
6691 {'any': 'data6691', 'timestamp': '2020-08-09T10:38:05.067677'}
6692 {'any': 'data6692', 'timestamp': '2020-08-09T10:38:05.067678'}
6693 {'any': 'data6693', 'timestamp': '2020-08-09T10:38:05.067681'}
6694 {'any': 'data6694', 'timestamp': '2020-08-09T10:38:05.067682'}
6695 {'any': 'data6695', 'timestamp': '2020-08-09T10:38:05.067684'}
6696 {'any': 'data6696', 'timestamp': '2020-08-09T10:38:05.067685'}
6697 {'any': 'data6697', 'timestamp': '2020-08-09T10:38:05.067686'}
6698 {'any': 'data6698', 'timestamp': '2020-08-09T10:38:05.067689'}
6699 {'any': 'data6699', 'timestamp': '2020-08-09T10:38:05.067690'}
6700 {'any': 'data6700', 'timestamp': '2020-08-09T10:38:05.067691'}
6701 {'any': 'data6701', 'timestamp': '2020-08-09T10:38:05.067693'}
6702 {'any': 'data6702', 'timestamp': '2020-08-0

7681 {'any': 'data7681', 'timestamp': '2020-08-09T10:38:05.069261'}
7682 {'any': 'data7682', 'timestamp': '2020-08-09T10:38:05.069262'}
7683 {'any': 'data7683', 'timestamp': '2020-08-09T10:38:05.069264'}
7684 {'any': 'data7684', 'timestamp': '2020-08-09T10:38:05.069265'}
7685 {'any': 'data7685', 'timestamp': '2020-08-09T10:38:05.069266'}
7686 {'any': 'data7686', 'timestamp': '2020-08-09T10:38:05.069267'}
7687 {'any': 'data7687', 'timestamp': '2020-08-09T10:38:05.069268'}
7688 {'any': 'data7688', 'timestamp': '2020-08-09T10:38:05.069270'}
7689 {'any': 'data7689', 'timestamp': '2020-08-09T10:38:05.069272'}
7690 {'any': 'data7690', 'timestamp': '2020-08-09T10:38:05.069273'}
7691 {'any': 'data7691', 'timestamp': '2020-08-09T10:38:05.069275'}
7692 {'any': 'data7692', 'timestamp': '2020-08-09T10:38:05.069276'}
7693 {'any': 'data7693', 'timestamp': '2020-08-09T10:38:05.069277'}
7694 {'any': 'data7694', 'timestamp': '2020-08-09T10:38:05.069278'}
7695 {'any': 'data7695', 'timestamp': '2020-08-0

8681 {'any': 'data8681', 'timestamp': '2020-08-09T10:38:05.070999'}
8682 {'any': 'data8682', 'timestamp': '2020-08-09T10:38:05.071000'}
8683 {'any': 'data8683', 'timestamp': '2020-08-09T10:38:05.071002'}
8684 {'any': 'data8684', 'timestamp': '2020-08-09T10:38:05.071003'}
8685 {'any': 'data8685', 'timestamp': '2020-08-09T10:38:05.071006'}
8686 {'any': 'data8686', 'timestamp': '2020-08-09T10:38:05.071008'}
8687 {'any': 'data8687', 'timestamp': '2020-08-09T10:38:05.071009'}
8688 {'any': 'data8688', 'timestamp': '2020-08-09T10:38:05.071011'}
8689 {'any': 'data8689', 'timestamp': '2020-08-09T10:38:05.071012'}
8690 {'any': 'data8690', 'timestamp': '2020-08-09T10:38:05.071013'}
8691 {'any': 'data8691', 'timestamp': '2020-08-09T10:38:05.071014'}
8692 {'any': 'data8692', 'timestamp': '2020-08-09T10:38:05.071015'}
8693 {'any': 'data8693', 'timestamp': '2020-08-09T10:38:05.071017'}
8694 {'any': 'data8694', 'timestamp': '2020-08-09T10:38:05.071018'}
8695 {'any': 'data8695', 'timestamp': '2020-08-0

9681 {'any': 'data9681', 'timestamp': '2020-08-09T10:38:05.072855'}
9682 {'any': 'data9682', 'timestamp': '2020-08-09T10:38:05.072856'}
9683 {'any': 'data9683', 'timestamp': '2020-08-09T10:38:05.072857'}
9684 {'any': 'data9684', 'timestamp': '2020-08-09T10:38:05.072858'}
9685 {'any': 'data9685', 'timestamp': '2020-08-09T10:38:05.072859'}
9686 {'any': 'data9686', 'timestamp': '2020-08-09T10:38:05.072861'}
9687 {'any': 'data9687', 'timestamp': '2020-08-09T10:38:05.072862'}
9688 {'any': 'data9688', 'timestamp': '2020-08-09T10:38:05.072863'}
9689 {'any': 'data9689', 'timestamp': '2020-08-09T10:38:05.072864'}
9690 {'any': 'data9690', 'timestamp': '2020-08-09T10:38:05.072865'}
9691 {'any': 'data9691', 'timestamp': '2020-08-09T10:38:05.072867'}
9692 {'any': 'data9692', 'timestamp': '2020-08-09T10:38:05.072868'}
9693 {'any': 'data9693', 'timestamp': '2020-08-09T10:38:05.072871'}
9694 {'any': 'data9694', 'timestamp': '2020-08-09T10:38:05.072874'}
9695 {'any': 'data9695', 'timestamp': '2020-08-0

# Analyser

### Types of Analysers:
- Standard Analyzer
- Simple Analyzer
- Whitespace Analyzer
- Stop Analyzer
- Keyword Analyzer
- Pattern Analyzer
- Language Analyzer
- Fingerprint Analyzer

### NOTE OF FINGERPRINT ALGORITHM:
It implements the fingerprint algorithm, which will:

- remove leading and trailing whitespace
- change all characters to their lowercase representation
- remove all punctuation and control characters
- normalize extended western characters to their ASCII representation (for example "gödel" -> "godel")
- split the string into whitespace-separated tokens
- sort the tokens, remove duplicates, and join the tokens back together. For example, "Mishra Amogh" will be represented as "amogh mishra"

In [45]:
# Run the "standard" analyzer over a sample sentence to inspect the tokens it produces.
es.indices.analyze(body={
    "analyzer": "standard",
    "text": ['HELLO today is A GREAT DAY']
})
# The standard analyzer lowercases the tokens (e.g. "HELLO" -> "hello"); it does not remove them.

{'tokens': [{'token': 'hello',
   'start_offset': 0,
   'end_offset': 5,
   'type': '<ALPHANUM>',
   'position': 0},
  {'token': 'today',
   'start_offset': 6,
   'end_offset': 11,
   'type': '<ALPHANUM>',
   'position': 1},
  {'token': 'is',
   'start_offset': 12,
   'end_offset': 14,
   'type': '<ALPHANUM>',
   'position': 2},
  {'token': 'a',
   'start_offset': 15,
   'end_offset': 16,
   'type': '<ALPHANUM>',
   'position': 3},
  {'token': 'great',
   'start_offset': 17,
   'end_offset': 22,
   'type': '<ALPHANUM>',
   'position': 4},
  {'token': 'day',
   'start_offset': 23,
   'end_offset': 26,
   'type': '<ALPHANUM>',
   'position': 5}]}

In [46]:
# Compare how each built-in analyzer tokenizes the same sample sentence:
# for every analyzer, print a header line followed by one token per line.
analyzer = [
    'standard',
    'simple',
    'whitespace',
    'stop',
    'keyword',
    'pattern',
    'fingerprint',
]

for analyzer_name in analyzer:
    res = es.indices.analyze(
        body = {
            "analyzer": analyzer_name,
            "text": ["HELLO WORLD. Today is the 2nd day of the week!!!!        it is Monday."],
        }
    )
    print("====================", analyzer_name, '==================')
    for token_info in res['tokens']:
        print(token_info['token'])
    print("\n")

hello
world
today
is
the
2nd
day
of
the
week
it
is
monday


hello
world
today
is
the
nd
day
of
the
week
it
is
monday


HELLO
WORLD.
Today
is
the
2nd
day
of
the
week!!!!
it
is
Monday.


hello
world
today
nd
day
week
monday


HELLO WORLD. Today is the 2nd day of the week!!!!        it is Monday.


hello
world
today
is
the
2nd
day
of
the
week
it
is
monday


2nd day hello is it monday of the today week world




In [57]:
# Specify an analyzer while creating an index.
# The index gets one shard and no replicas; its "text" field is additionally
# indexed as the sub-field "text.english", which uses the "english" analyzer.
body = dict(
    settings=dict(number_of_shards=1, number_of_replicas=0),
    mappings=dict(
        properties=dict(
            text=dict(
                type="text",
                fields=dict(
                    english=dict(type="text", analyzer="english"),
                ),
            ),
        ),
    ),
)

# Create the index with the settings and mappings defined above.
es.indices.create(index = "chapter9-analyzer", ignore = 400, body = body)

{'acknowledged': True,
 'shards_acknowledged': True,
 'index': 'chapter9-analyzer'}

In [60]:
# Analyze a sample sentence using the analyzer attached to the "text.english"
# field of the "chapter9-analyzer" index, then print each resulting token.
res = es.indices.analyze(index = "chapter9-analyzer", body = {
    "field": "text.english",
    "text": "The quick Brown Foxes."
})
for i in res['tokens']:
    print(i['token'])

The
quick
Brown
Foxes.


In [56]:
# Delete the "chapter9-analyzer" index.
es.indices.delete(index = "chapter9-analyzer")

{'acknowledged': True}