### Model: Generalized Significant Terms

Here, we model the recommendations accounting for:
1. Multiple signals (`visit`, `share`, `purchase`, `like`)
2. Signal interaction counts

We use a custom scoring function to do this with the significant terms aggregation.

#### Assumptions / Constraints

1. We're not accounting for the timestamps of interactions (perhaps post-POC, this can be done at upsert time by only rolling up counts that fall within the timeframe under consideration),
2. We will have as many queries as there are types of interaction (4 currently), one to recommend the next items for each of these four categories,
3. We can't pass additional parameters to the script scoring in aggregations (e.g. I would have liked to pass signal weights and values),
4. A limitation is that we have to pass the interactions for which we want the next set of recommendations in the query (as a foreground filter). Since term queries aren't scored, the foreground filter can either:  
  A. Require at least one match - the current approach, using a should clause,  
  B. Be too restrictive - using a must clause,  
  C. Be somewhere in between, which needs to be defined - using a terms_set query with a minimum-should-match setting (`minimum_should_match_field` or `minimum_should_match_script`).  
 Combined with 3. (even if the query could be scored, the aggs don't allow passing custom params while scoring), I think we need to look at something beyond generalized significant terms to do this.
  


#### Data Model Change

To accommodate the above, we are using an updated shape of data:

```json
{
   "user_id":"1",
   "items":[
      {
         "item": {
            "id":"forrest gump",
            "visit":2,
            "like":1
         }
      },
      {
         "item": {
            "id":"terminator",
            "share":1
         }
      }
   ]
}
```


In [2]:
# We will define some persistant variables that we will use everywhere over here. Always run this script first

# you may want to update the value below to something like 'http://localhost:9200/search_recommendations' for testing locally
url = 'https://vYBoRZTxv:cafabdca-c61c-4b70-9c19-f7f7a5e27258@es-cluster-dc-test-2-b5c555.searchbase.io/recommendations_generalized'

headers = {
    'Content-Type': 'application/json'
}
%store url
%store headers

Stored 'url' (str)
Stored 'headers' (dict)


In [9]:
# (optional) Drop the index so the notebook can start again from a clean slate.
import requests

# `url` points at the index itself, so a DELETE on it removes the whole index.
response = requests.delete(url)
raw_body = response.text.encode('utf8')
print(raw_body)

b'{"acknowledged":true}'


In [10]:
# Create a fresh index with an explicit mapping.
import json
import requests

# "items" is mapped as `nested`; the query cell addresses it through nested
# queries and aggregations with path "items".
payload = {"mappings": {"properties": {"items": {"type": "nested"}}}}

# A PUT on the index URL creates the index with the mapping above.
response = requests.put(url, headers=headers, data=json.dumps(payload))
raw_body = response.text.encode('utf8')
print(raw_body)

b'{"acknowledged":true,"shards_acknowledged":true,"index":"recommendations_generalized"}'


In [11]:
# (only run once) Populate the index with some qualitative sample data.
# Every action line is `{"index": {}}` with no explicit id, so re-running this
# cell indexes the same four users again as new documents.
import requests
import json

# Newline-delimited bulk body: each `{"index": {}}` action line is followed by
# the document to index (one document per user, with per-item signal counts).
data = '''
{"index": {}}
{ "user_id": "1", "items": [ { "item": { "id": "forrest gump", "visit": 2, "like": 1 } }, { "item": { "id": "terminator", "visit": 1 } }, { "item": { "id": "rambo", "purchase": 1 } }, { "item": { "id": "rocky", "visit": 1 } }, { "item": { "id": "good will hunting", "visit": 5 } } ] }
{"index": {}}
{ "user_id": "2", "items": [ { "item": { "id": "forrest gump", "visit": 1, "purchase": 1 } }, { "item": { "id": "terminator", "visit": 10 } }, { "item": { "id": "rocky iv", "visit": 1, "share": 1 } }, { "item": { "id": "rocky", "visit": 2 } }, { "item": { "id": "rocky ii", "visit": 5 } }, { "item": { "id": "predator", "purchase": 1 } } ] }
{"index": {}}
{ "user_id": "3", "items": [ { "item": { "id": "forrest gump", "visit": 1 } }, { "item": { "id": "the thin red line", "visit": 10, "share": 10, "purchase": 1 } }, { "item": { "id": "good will hunting", "visit": 15, "purchase": 1 } },  { "item": { "id": "rocky ii", "visit": 1 } }, { "item": { "id": "predator", "visit": 1 } }, { "item": { "id": "batman", "visit": 2, "like": 1 } } ] }
{"index": {}}
{ "user_id": "4", "items": [ { "item": { "id": "forrest gump", "visit": 1, "like": 1 } }, { "item": { "id": "something about mary", "visit": 1 } }, { "item": { "id": "sixteen candles", "visit": 1 } } ] }
'''

# Use a cell-local name for the bulk headers. Rebinding the notebook-wide
# `headers` here would leave every cell run afterwards (e.g. the search cell)
# sending 'application/x-ndjson' for a plain JSON body.
bulk_headers = {
    'Content-Type': 'application/x-ndjson'
}

response = requests.request("POST", url+'/_bulk', headers=bulk_headers, data = data)
print(json.dumps(response.json(), indent=2))

{
  "took": 160,
  "errors": false,
  "items": [
    {
      "index": {
        "_index": "recommendations_generalized",
        "_type": "_doc",
        "_id": "JaiUOnIB7SgwU7KadphZ",
        "_version": 1,
        "result": "created",
        "_shards": {
          "total": 2,
          "successful": 2,
          "failed": 0
        },
        "_seq_no": 0,
        "_primary_term": 1,
        "status": 201
      }
    },
    {
      "index": {
        "_index": "recommendations_generalized",
        "_type": "_doc",
        "_id": "JqiUOnIB7SgwU7KadphZ",
        "_version": 1,
        "result": "created",
        "_shards": {
          "total": 2,
          "successful": 2,
          "failed": 0
        },
        "_seq_no": 1,
        "_primary_term": 1,
        "status": 201
      }
    },
    {
      "index": {
        "_index": "recommendations_generalized",
        "_type": "_doc",
        "_id": "J6iUOnIB7SgwU7KadphZ",
        "_version": 1,
        "result": "created",
       

In [None]:
# Let's take a scenario where this data needs to be updated via a script insert.

## TBD

In [18]:
# Recommendations query.
# WIP - still needs to account for the event score.
import json
import requests

# Items we want "next" recommendations for; they define the foreground set.
seed_items = ["terminator", "predator"]
item_id_field = "items.item.id.keyword"

# Foreground filter: match documents whose nested items contain at least one
# of the seed items (bool/should over term queries).
foreground_query = {
    "nested": {
        "path": "items",
        "query": {
            "bool": {
                "should": [{"term": {item_id_field: seed}} for seed in seed_items]
            }
        },
    }
}

# Custom significance score: the term's frequency in the foreground subset
# plus its frequency in the background superset, equally weighted.
scoring_script = {
    "lang": "painless",
    "source": "params._subset_freq*1.0 + params._superset_freq*1.0",
}

# Step into the nested "items" objects, then rank item ids with the
# significant_terms aggregation using the script heuristic above.
recommendation_aggs = {
    "urls": {
        "nested": {"path": "items"},
        "aggs": {
            "items_recommendation": {
                "significant_terms": {
                    "field": item_id_field,
                    "min_doc_count": 1,
                    "script_heuristic": {"script": scoring_script},
                }
            }
        },
    }
}

# size=0: only the aggregation buckets are of interest, not the hits.
payload = {"size": 0, "query": foreground_query, "aggs": recommendation_aggs}

response = requests.get(url + "/_search", headers=headers, data=json.dumps(payload))
print(json.dumps(response.json(), indent=2))

{
  "took": 10,
  "timed_out": false,
  "_shards": {
    "total": 1,
    "successful": 1,
    "skipped": 0,
    "failed": 0
  },
  "hits": {
    "total": {
      "value": 3,
      "relation": "eq"
    },
    "max_score": null,
    "hits": []
  },
  "aggregations": {
    "urls": {
      "doc_count": 17,
      "items_recommendation": {
        "doc_count": 17,
        "bg_count": 24,
        "buckets": [
          {
            "key": "forrest gump",
            "doc_count": 3,
            "score": 7.0,
            "bg_count": 4
          },
          {
            "key": "rocky ii",
            "doc_count": 2,
            "score": 4.0,
            "bg_count": 2
          },
          {
            "key": "predator",
            "doc_count": 2,
            "score": 4.0,
            "bg_count": 2
          },
          {
            "key": "terminator",
            "doc_count": 2,
            "score": 4.0,
            "bg_count": 2
          },
          {
            "key": "rocky",
    