# Elasticsearch 在 Python requests 中的应用

网址: https://es.xiaoleilu.com/

# basic

In [183]:
import requests, json
from pprint import pprint

In [184]:
# Query the node's root endpoint and display the decoded JSON reply.
root_url = "http://localhost:9200"
response = requests.get(root_url)
json.loads(response.text)

{'name': 'xgm_Node',
 'cluster_name': 'xgm_Cluster',
 'cluster_uuid': 'P_42AvtNT8WDvCtjIhKErQ',
 'version': {'number': '7.5.1',
  'build_flavor': 'default',
  'build_type': 'tar',
  'build_hash': '3ae9ac9a93c95bd0cdc054951cf95d88e1e18d96',
  'build_date': '2019-12-16T22:57:37.835892Z',
  'build_snapshot': False,
  'lucene_version': '8.3.0',
  'minimum_wire_compatibility_version': '6.8.0',
  'minimum_index_compatibility_version': '6.0.0-beta1'},
 'tagline': 'You Know, for Search'}

### health

In [185]:
# Fetch the _cat/health report and print it as the plain-text table it is.
health_url = "http://localhost:9200/_cat/health?v"
response = requests.get(health_url)
print(response.text)

epoch      timestamp cluster     status node.total node.data shards pri relo init unassign pending_tasks max_task_wait_time active_shards_percent
1579799482 17:11:22  xgm_Cluster yellow          1         1      4   4    0    0        4             0                  -                 50.0%



# PUT 增加数据

In [None]:
# Index employee document 1 under megacorp/employee and show the server's reply.
employee_doc = {
    "first_name": "John",
    "last_name": "Smith",
    "age": 25,
    "about": "I love to go rock climbing",
    "interests": ["sports", "music"],
}
response = requests.put(
    'http://localhost:9200/megacorp/employee/1',
    data=json.dumps(employee_doc),
    headers={'Content-Type': 'application/json'},
)
json.loads(response.text)

In [29]:
# Index employee document 2 under megacorp/employee and show the server's reply.
employee_doc = {
    "first_name": "Jane",
    "last_name": "Smith",
    "age": 32,
    "about": "I like to collect rock albums",
    "interests": ["music"],
}
response = requests.put(
    'http://localhost:9200/megacorp/employee/2',
    data=json.dumps(employee_doc),
    headers={'Content-Type': 'application/json'},
)
json.loads(response.text)

{'_id': '2',
 '_index': 'megacorp',
 '_primary_term': 1,
 '_seq_no': 2,
 '_shards': {'failed': 0, 'successful': 1, 'total': 2},
 '_type': 'employee',
 '_version': 1,
 'result': 'created'}


In [32]:
# Index employee document 3 under megacorp/employee and show the server's reply.
employee_doc = {
    "first_name": "Douglas",
    "last_name": "Fir",
    "age": 35,
    "about": "I like to build cabinets",
    "interests": ["forestry"],
}
response = requests.put(
    'http://localhost:9200/megacorp/employee/3',
    data=json.dumps(employee_doc),
    headers={'Content-Type': 'application/json'},
)
json.loads(response.text)

{'_index': 'megacorp',
 '_type': 'employee',
 '_id': '3',
 '_version': 1,
 'result': 'created',
 '_shards': {'total': 2, 'successful': 1, 'failed': 0},
 '_seq_no': 5,
 '_primary_term': 1}

In [42]:
# Read employee document 1 back and display the decoded JSON reply.
employee_url = "http://localhost:9200/megacorp/employee/1"
response = requests.get(employee_url)
json.loads(response.text)

{'_index': 'megacorp',
 '_type': 'employee',
 '_id': '1',
 '_version': 3,
 '_seq_no': 6,
 '_primary_term': 1,
 'found': True,
 '_source': {'first_name': 'John',
  'last_name': 'Smith',
  'age': 25,
  'about': 'I love to go rock climbing',
  'interests': ['sports', 'music']}}

# Indexing documents in bulk

`curl -H "Content-Type: application/json" -XPOST "localhost:9200/bank/_bulk?pretty&refresh" --data-binary "@accounts.json"`

`curl "localhost:9200/_cat/indices?v"`


In [80]:
# Python equivalent of the curl command above: POST the contents of
# accounts.json to the bank index's _bulk endpoint.
#
# Note: curl's `--data-binary "@accounts.json"` is curl syntax. Passing
# {"data-binary": "@accounts.json"} as the request data does NOT upload the
# file; the file has to be read and its contents sent as the request body.
#
# The file is opened in binary mode so the exact bytes on disk are sent,
# just like --data-binary. Reading it as text would decode it with the
# platform's default encoding and the str body would then be re-encoded as
# ISO-8859-1 by the HTTP layer, which breaks on non-Latin-1 characters.
with open('accounts.json', 'rb') as f1:
    data = f1.read()
url = 'http://localhost:9200/bank/_bulk?pretty&refresh'
headers = {'Content-Type': 'application/json'}

r = requests.post(url = url, headers = headers, data=data)
json.loads(r.text)

{'took': 236,
 'errors': False,
 'items': [{'index': {'_index': 'bank',
    '_type': '_doc',
    '_id': '1',
    '_version': 2,
    'result': 'updated',
    'forced_refresh': True,
    '_shards': {'total': 2, 'successful': 1, 'failed': 0},
    '_seq_no': 1000,
    '_primary_term': 1,
    'status': 200}},
  {'index': {'_index': 'bank',
    '_type': '_doc',
    '_id': '6',
    '_version': 2,
    'result': 'updated',
    'forced_refresh': True,
    '_shards': {'total': 2, 'successful': 1, 'failed': 0},
    '_seq_no': 1001,
    '_primary_term': 1,
    'status': 200}},
  {'index': {'_index': 'bank',
    '_type': '_doc',
    '_id': '13',
    '_version': 2,
    'result': 'updated',
    'forced_refresh': True,
    '_shards': {'total': 2, 'successful': 1, 'failed': 0},
    '_seq_no': 1002,
    '_primary_term': 1,
    'status': 200}},
  {'index': {'_index': 'bank',
    '_type': '_doc',
    '_id': '18',
    '_version': 2,
    'result': 'updated',
    'forced_refresh': True,
    '_shards': {'total

In [186]:
# Fetch the _cat/indices listing and print the plain-text table.
indices_url = "http://localhost:9200/_cat/indices?v"
response = requests.get(indices_url)
print(response.text)

health status index    uuid                   pri rep docs.count docs.deleted store.size pri.store.size
yellow open   bank     U8Pq9fF_SO-qmNwuVGYzLg   1   1       1000            0    414.3kb        414.3kb
yellow open   megacorp thAuJyrPQW6XLloDmsJrCw   1   1          3            0      6.5kb          6.5kb
yellow open   bank2    gzKAZhcPSkmnAb9P1J3bpA   1   1         18            0     40.1kb         40.1kb
yellow open   acl      cw7p1xwXTyyNGD12--8xUw   1   1         11            0    462.2kb        462.2kb



In [182]:
# Experiment: leave the index/id information out of the JSON file's action
# lines and see whether Elasticsearch assigns ids automatically.
#
# The file is opened in binary mode so the exact bytes on disk are sent
# (the equivalent of curl's --data-binary). A text-mode read would decode
# with the platform's default encoding and the str body would then be
# re-encoded as ISO-8859-1 by the HTTP layer.
with open('accounts2.json', 'rb') as f1:
    data = f1.read()
url = 'http://localhost:9200/bank2/test1/_bulk?pretty&refresh'
headers = {'Content-Type': 'application/json'}

r = requests.post(url = url, headers = headers, data=data)
json.loads(r.text)

{'took': 29,
 'errors': False,
 'items': [{'index': {'_index': 'bank2',
    '_type': 'test1',
    '_id': '9f6e6800cfae7749eb6c486619254b9c',
    '_version': 1,
    'result': 'created',
    'forced_refresh': True,
    '_shards': {'total': 2, 'successful': 1, 'failed': 0},
    '_seq_no': 15,
    '_primary_term': 1,
    'status': 201}},
  {'index': {'_index': 'bank2',
    '_type': 'test1',
    '_id': 'T78o028BwZMdlpIKYiMN',
    '_version': 1,
    'result': 'created',
    'forced_refresh': True,
    '_shards': {'total': 2, 'successful': 1, 'failed': 0},
    '_seq_no': 16,
    '_primary_term': 1,
    'status': 201}},
  {'index': {'_index': 'bank2',
    '_type': 'test1',
    '_id': 'UL8o028BwZMdlpIKYiMN',
    '_version': 1,
    'result': 'created',
    'forced_refresh': True,
    '_shards': {'total': 2, 'successful': 1, 'failed': 0},
    '_seq_no': 17,
    '_primary_term': 1,
    'status': 201}}]}

# Start searching

源数据: 

```json
{"account_number":1,"balance":39225,"firstname":"Amber","lastname":"Duke","age":32,"gender":"M","address":"880 Holmes Lane","employer":"Pyrami","email":"amberduke@pyrami.com","city":"Brogan","state":"IL"}
```

In [96]:
# Match every document and sort the hits by account_number, ascending.
search_body = {
    "query": {"match_all": {}},
    "sort": [{"account_number": "asc"}],
}
response = requests.get(
    "http://localhost:9200/bank/_search",
    headers={'Content-Type': 'application/json'},
    data=json.dumps(search_body),
)
json.loads(response.text)

{'took': 78,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 1000, 'relation': 'eq'},
  'max_score': None,
  'hits': [{'_index': 'bank',
    '_type': '_doc',
    '_id': '0',
    '_score': None,
    '_source': {'account_number': 0,
     'balance': 16623,
     'firstname': 'Bradshaw',
     'lastname': 'Mckenzie',
     'age': 29,
     'gender': 'F',
     'address': '244 Columbus Place',
     'employer': 'Euron',
     'email': 'bradshawmckenzie@euron.com',
     'city': 'Hobucken',
     'state': 'CO'},
    'sort': [0]},
   {'_index': 'bank',
    '_type': '_doc',
    '_id': '1',
    '_score': None,
    '_source': {'account_number': 1,
     'balance': 39225,
     'firstname': 'Amber',
     'lastname': 'Duke',
     'age': 32,
     'gender': 'M',
     'address': '880 Holmes Lane',
     'employer': 'Pyrami',
     'email': 'amberduke@pyrami.com',
     'city': 'Brogan',
     'state': 'IL'},
    'sort': [1]},
   {'_index': 'ban

In [97]:
# Match every document, sort by account_number ascending, and return
# hits 10 through 19 (from=10, size=10).
search_body = {
    "query": {"match_all": {}},
    "sort": [{"account_number": "asc"}],
    "from": 10,
    "size": 10,
}
response = requests.get(
    "http://localhost:9200/bank/_search",
    headers={'Content-Type': 'application/json'},
    data=json.dumps(search_body),
)
json.loads(response.text)

{'took': 8,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 1000, 'relation': 'eq'},
  'max_score': None,
  'hits': [{'_index': 'bank',
    '_type': '_doc',
    '_id': '10',
    '_score': None,
    '_source': {'account_number': 10,
     'balance': 46170,
     'firstname': 'Dominique',
     'lastname': 'Park',
     'age': 37,
     'gender': 'F',
     'address': '100 Gatling Place',
     'employer': 'Conjurica',
     'email': 'dominiquepark@conjurica.com',
     'city': 'Omar',
     'state': 'NJ'},
    'sort': [10]},
   {'_index': 'bank',
    '_type': '_doc',
    '_id': '11',
    '_score': None,
    '_source': {'account_number': 11,
     'balance': 20203,
     'firstname': 'Jenkins',
     'lastname': 'Haney',
     'age': 20,
     'gender': 'M',
     'address': '740 Ferry Place',
     'employer': 'Qimonk',
     'email': 'jenkinshaney@qimonk.com',
     'city': 'Steinhatchee',
     'state': 'GA'},
    'sort': [11]},
   {

In [98]:
# Search a specific field: `match` query for "mill lane" on address.
search_body = {
    "query": {
        "match": {"address": "mill lane"},
    },
}
response = requests.get(
    "http://localhost:9200/bank/_search",
    headers={'Content-Type': 'application/json'},
    data=json.dumps(search_body),
)
json.loads(response.text)

{'took': 22,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 19, 'relation': 'eq'},
  'max_score': 9.507477,
  'hits': [{'_index': 'bank',
    '_type': '_doc',
    '_id': '136',
    '_score': 9.507477,
    '_source': {'account_number': 136,
     'balance': 45801,
     'firstname': 'Winnie',
     'lastname': 'Holland',
     'age': 38,
     'gender': 'M',
     'address': '198 Mill Lane',
     'employer': 'Neteria',
     'email': 'winnieholland@neteria.com',
     'city': 'Urie',
     'state': 'IL'}},
   {'_index': 'bank',
    '_type': '_doc',
    '_id': '970',
    '_score': 5.4032025,
    '_source': {'account_number': 970,
     'balance': 19648,
     'firstname': 'Forbes',
     'lastname': 'Wallace',
     'age': 28,
     'gender': 'M',
     'address': '990 Mill Road',
     'employer': 'Pheast',
     'email': 'forbeswallace@pheast.com',
     'city': 'Lopezo',
     'state': 'AK'}},
   {'_index': 'bank',
    '_type': '_d

In [102]:
# Search for a phrase: `match_phrase` query for "mill lane" on address.
search_body = {
    "query": {
        "match_phrase": {"address": "mill lane"},
    },
}
response = requests.get(
    "http://localhost:9200/bank/_search",
    headers={'Content-Type': 'application/json'},
    data=json.dumps(search_body),
)
json.loads(response.text)

{'took': 15,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 1, 'relation': 'eq'},
  'max_score': 9.507477,
  'hits': [{'_index': 'bank',
    '_type': '_doc',
    '_id': '136',
    '_score': 9.507477,
    '_source': {'account_number': 136,
     'balance': 45801,
     'firstname': 'Winnie',
     'lastname': 'Holland',
     'age': 38,
     'gender': 'M',
     'address': '198 Mill Lane',
     'employer': 'Neteria',
     'email': 'winnieholland@neteria.com',
     'city': 'Urie',
     'state': 'IL'}}]}}

In [104]:
# Combined filter: age must match 40 and state must not match "ID" (Idaho).
age_clause = {"match": {"age": "40"}}
state_clause = {"match": {"state": "ID"}}
search_body = {
    "query": {
        "bool": {
            "must": [age_clause],
            "must_not": [state_clause],
        },
    },
}
response = requests.get(
    "http://localhost:9200/bank/_search",
    headers={'Content-Type': 'application/json'},
    data=json.dumps(search_body),
)
json.loads(response.text)

{'took': 3,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 43, 'relation': 'eq'},
  'max_score': 1.0,
  'hits': [{'_index': 'bank',
    '_type': '_doc',
    '_id': '474',
    '_score': 1.0,
    '_source': {'account_number': 474,
     'balance': 35896,
     'firstname': 'Obrien',
     'lastname': 'Walton',
     'age': 40,
     'gender': 'F',
     'address': '192 Ide Court',
     'employer': 'Suremax',
     'email': 'obrienwalton@suremax.com',
     'city': 'Crucible',
     'state': 'UT'}},
   {'_index': 'bank',
    '_type': '_doc',
    '_id': '479',
    '_score': 1.0,
    '_source': {'account_number': 479,
     'balance': 31865,
     'firstname': 'Cameron',
     'lastname': 'Ross',
     'age': 40,
     'gender': 'M',
     'address': '904 Bouck Court',
     'employer': 'Telpod',
     'email': 'cameronross@telpod.com',
     'city': 'Nord',
     'state': 'MO'}},
   {'_index': 'bank',
    '_type': '_doc',
    '_id': '54

In [105]:
# Combined filter with a range: balance between 20000 and 30000 (inclusive).
balance_range = {"range": {"balance": {"gte": 20000, "lte": 30000}}}
search_body = {
    "query": {
        "bool": {
            "must": {"match_all": {}},
            "filter": balance_range,
        },
    },
}
response = requests.get(
    "http://localhost:9200/bank/_search",
    headers={'Content-Type': 'application/json'},
    data=json.dumps(search_body),
)
json.loads(response.text)

{'took': 7,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 217, 'relation': 'eq'},
  'max_score': 1.0,
  'hits': [{'_index': 'bank',
    '_type': '_doc',
    '_id': '49',
    '_score': 1.0,
    '_source': {'account_number': 49,
     'balance': 29104,
     'firstname': 'Fulton',
     'lastname': 'Holt',
     'age': 23,
     'gender': 'F',
     'address': '451 Humboldt Street',
     'employer': 'Anocha',
     'email': 'fultonholt@anocha.com',
     'city': 'Sunriver',
     'state': 'RI'}},
   {'_index': 'bank',
    '_type': '_doc',
    '_id': '102',
    '_score': 1.0,
    '_source': {'account_number': 102,
     'balance': 29712,
     'firstname': 'Dena',
     'lastname': 'Olson',
     'age': 27,
     'gender': 'F',
     'address': '759 Newkirk Avenue',
     'employer': 'Hinway',
     'email': 'denaolson@hinway.com',
     'city': 'Choctaw',
     'state': 'NJ'}},
   {'_index': 'bank',
    '_type': '_doc',
    '_id': '1

### 摸索

In [111]:
# Return only selected fields ("firstname", "lastname", "address") for the
# first 5 hits. A query_string query can express OR / AND relations.
#
# Source filtering uses the documented "includes" key; the singular
# "include" spelling is a deprecated alias.
data = {
    "_source": {
        "includes": [
            "firstname",
            "lastname",
            "address"
        ]
    },
    "from": 0,
    "size": 5,
    "query": {
        "query_string": {
            "query": "(Mill Street) OR (Mill Avenue)",
            "default_field": "address"
        }
    }
}
headers = {'Content-Type': 'application/json'}

r = requests.get("http://localhost:9200/bank/_search", headers = headers, data = json.dumps(data))
json.loads(r.text)

{'took': 16,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 601, 'relation': 'eq'},
  'max_score': 12.34643,
  'hits': [{'_index': 'bank',
    '_type': '_doc',
    '_id': '345',
    '_score': 12.34643,
    '_source': {'firstname': 'Parker',
     'address': '715 Mill Avenue',
     'lastname': 'Hines'}},
   {'_index': 'bank',
    '_type': '_doc',
    '_id': '472',
    '_score': 11.760358,
    '_source': {'firstname': 'Lee',
     'address': '288 Mill Street',
     'lastname': 'Long'}},
   {'_index': 'bank',
    '_type': '_doc',
    '_id': '970',
    '_score': 10.806405,
    '_source': {'firstname': 'Forbes',
     'address': '990 Mill Road',
     'lastname': 'Wallace'}},
   {'_index': 'bank',
    '_type': '_doc',
    '_id': '136',
    '_score': 10.806405,
    '_source': {'firstname': 'Winnie',
     'address': '198 Mill Lane',
     'lastname': 'Holland'}},
   {'_index': 'bank',
    '_type': '_doc',
    '_id': '25',
   

In [133]:
# Return only selected fields (first hit only) and run a value_count
# aggregation on balance.
# Author's note: the count only worked on numeric fields in this experiment
# (presumably because analyzed text fields cannot be aggregated directly;
# TODO confirm).
#
# Source filtering uses the documented "includes" key; the singular
# "include" spelling is a deprecated alias.
data = {
    "_source": {
        "includes": [
            "balance",
            "address"
        ]
    },
    "from": 0,
    "size": 1,

    "aggs": {
        "xxxx1": {"value_count": {"field": "balance"}}
    }
}
headers = {'Content-Type': 'application/json'}

r = requests.get("http://localhost:9200/bank/_search", headers = headers, data = json.dumps(data))
json.loads(r.text)

{'took': 4,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 1000, 'relation': 'eq'},
  'max_score': 1.0,
  'hits': [{'_index': 'bank',
    '_type': '_doc',
    '_id': '1',
    '_score': 1.0,
    '_source': {'address': '880 Holmes Lane', 'balance': 39225}}]},
 'aggregations': {'xxxx1': {'value': 1000}}}

In [187]:
# Build the script-field definition: a single Painless script that yields 1
# when the document's address.keyword value contains the phrase passed in
# params.phrase, and 0 otherwise.
key_word = "Holmes Lane"

painless_source = (
    "\n"
    "int count = 0;\n"
    "if(doc['address.keyword'].size() > 0 && doc['address.keyword'].value.indexOf(params.phrase)!=-1) count++;\n"
    "return count;        \n"
)
base_script = {
    "嘻嘻嘻嘻嘻嘻": {
        "script": {
            "lang": "painless",
            "source": painless_source,
            "params": {"phrase": key_word},
        }
    }
}
base_script

{'嘻嘻嘻嘻嘻嘻': {'script': {'lang': 'painless',
   'source': "\nint count = 0;\nif(doc['address.keyword'].size() > 0 && doc['address.keyword'].value.indexOf(params.phrase)!=-1) count++;\nreturn count;        \n",
   'params': {'phrase': 'Holmes Lane'}}}}

In [188]:
# Count occurrences of a specific phrase: run the script field(s) defined in
# `base_script` against the first 5 documents ordered by _id, returning only
# balance and address from _source.
#
# Source filtering uses the documented "includes" key; the singular
# "include" spelling is a deprecated alias.
data = {
    "_source": {
        "includes": [
            "balance",
            "address"
        ]
    },
    "from": 0,
    "size": 5,
    "sort": {"_id": {"order": "asc"}},
    "script_fields": base_script
}
headers = {'Content-Type': 'application/json'}

r = requests.get("http://localhost:9200/bank/_search", headers = headers, data = json.dumps(data))
json.loads(r.text)

{'took': 3,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 1000, 'relation': 'eq'},
  'max_score': None,
  'hits': [{'_index': 'bank',
    '_type': '_doc',
    '_id': '0',
    '_score': None,
    '_source': {'address': '244 Columbus Place', 'balance': 16623},
    'fields': {'嘻嘻嘻嘻嘻嘻': [0]},
    'sort': ['0']},
   {'_index': 'bank',
    '_type': '_doc',
    '_id': '1',
    '_score': None,
    '_source': {'address': '880 Holmes Lane', 'balance': 39225},
    'fields': {'嘻嘻嘻嘻嘻嘻': [1]},
    'sort': ['1']},
   {'_index': 'bank',
    '_type': '_doc',
    '_id': '10',
    '_score': None,
    '_source': {'address': '100 Gatling Place', 'balance': 46170},
    'fields': {'嘻嘻嘻嘻嘻嘻': [0]},
    'sort': ['10']},
   {'_index': 'bank',
    '_type': '_doc',
    '_id': '100',
    '_score': None,
    '_source': {'address': '696 Ryder Avenue', 'balance': 29869},
    'fields': {'嘻嘻嘻嘻嘻嘻': [0]},
    'sort': ['100']},
   {'_index': 'bank',


In [189]:
# Flatten each hit of the last search response into one row:
# [address, balance, <first value of every script field>...].
j = json.loads(r.text)
lines = [
    [
        hit['_source']['address'],
        hit['_source']['balance'],
        *(values[0] for values in hit['fields'].values()),
    ]
    for hit in j['hits']['hits']
]


In [190]:
# Display the rows collected from the search hits.
lines

[['244 Columbus Place', 16623, 0],
 ['880 Holmes Lane', 39225, 1],
 ['100 Gatling Place', 46170, 0],
 ['696 Ryder Avenue', 29869, 0],
 ['972 Lincoln Place', 43400, 0]]

## head

In [29]:
# Search the `acl` index with a `match` query on the Name field for the given
# title, returning only Name in each hit's _source.
#
# Source filtering uses the documented "includes" key; the singular
# "include" spelling is a deprecated alias.
data = {
    "_source": {
        "includes": [
            "Name",
        ]
    },
    "query": {
        "bool": {
            "must": [
                { "match": { "Name": "Single-Agent vs. Multi-Agent Techniques for Concurrent Reinforcement Learning of Negotiation Dialogue Policies" } }
            ],
        }
    }
}
headers = {'Content-Type': 'application/json'}

r = requests.get("http://localhost:9200/acl/_search", headers = headers, data = json.dumps(data))
res_json = json.loads(r.text)
res_json

{'took': 10,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 2139, 'relation': 'eq'},
  'max_score': 52.84863,
  'hits': [{'_index': 'acl',
    '_type': '_doc',
    '_id': 'ad4456a737a69002df2b1b2ee4980da6',
    '_score': 52.84863,
    '_source': {'Name': ' Multi-Agent Techniques for Concurrent Reinforcement Learning of Negotiation Dialogue Policies'}},
   {'_index': 'acl',
    '_type': '_doc',
    '_id': '6edd87f77ff1abe602815ade3ecb5f6e',
    '_score': 13.71889,
    '_source': {'Name': 'Word-order Biases in Deep-agent Emergent Communication'}},
   {'_index': 'acl',
    '_type': '_doc',
    '_id': 'b0e82af8978f5ae927fcf10ce15d5742',
    '_score': 13.71889,
    '_source': {'Name': 'Insights from Building an Open-Ended Conversational Agent'}},
   {'_index': 'acl',
    '_type': '_doc',
    '_id': 'd3feebe9ae18beb1431ba58c714d412b',
    '_score': 13.451605,
    '_source': {'Name': 'Improving social relationships in fa

In [None]:
# Scratch check: splitting on the '.pdf' suffix gives the stem followed by an
# empty trailing string.
'sad sad sa d.pdf'.split(sep='.pdf')