### Model: Significant Terms

Here, we model recommendations using Elasticsearch's significant terms aggregation.

In [10]:
# Persistent settings shared by every cell below. Always run this cell first.

# Base URL of the Elasticsearch index every request in this notebook targets.
# Change the host or the index name (e.g. 'http://localhost:9200/search_recommendations') to point elsewhere.
url = 'http://localhost:9200/events'

# Default headers for requests that carry a JSON body.
headers = {
    'Content-Type': 'application/json'
}
# %store persists both values so they can be restored later with `%store -r`.
%store url
%store headers

Stored 'url' (str)
Stored 'headers' (dict)


In [11]:
# (optional) Drop the index so the rest of the notebook starts from a clean slate.
import requests

response = requests.delete(url)
# Show the raw body as bytes, exactly as Elasticsearch returned it.
print(response.text.encode('utf8'))

b'{"acknowledged":true}'


In [14]:
# Create a fresh index with an explicit mapping: `agent` and `buy.target`
# are both indexed as keywords.
import json
import requests

data = {
    "mappings": {
        "properties": {
            "agent": {"type": "keyword"},
            "buy": {
                "type": "object",
                "properties": {"target": {"type": "keyword"}},
            },
        }
    }
}

response = requests.put(url, headers=headers, data=json.dumps(data))
print(json.dumps(response.json(), indent=2))

{
  "acknowledged": true,
  "shards_acknowledged": true,
  "index": "events"
}


In [17]:
# (only run once) Populate the index with a small qualitative data set.
#
# The documents below encode the following agent/target purchases:
#
#| agent~target | a | b | c | d | e |
#|--------------|---|---|---|---|---|
#| 1            | + |   | + | + |   |
#| 2            | + |   |   | + |   |
#| 3            | + |   |   | + | + |
#| 4            |   | + |   |   | + |
#| 5            | + |   |   |   | + |


import requests
import json

# Bulk payload: one action line followed by one document line per agent.
data = '''
{"index": {}}
{ "agent": "1", "buy.target": [ "a", "c", "d" ] }
{"index": {}}
{ "agent": "2", "buy.target": [ "a", "d" ] }
{"index": {}}
{ "agent": "3", "buy.target": [ "a", "d", "e" ] }
{"index": {}}
{ "agent": "4", "buy.target": [ "b", "e" ] }
{"index": {}}
{ "agent": "5", "buy.target": [ "a", "e" ] }
'''

# The bulk endpoint takes newline-delimited JSON. Keep these headers under their
# own name so the shared JSON `headers` defined in the first cell is not
# overwritten for the search cells that run afterwards.
bulk_headers = {
    'Content-Type': 'application/x-ndjson'
}

response = requests.request("POST", url+'/_bulk', headers=bulk_headers, data=data)
print(json.dumps(response.json(), indent=2))

{
  "took": 48,
  "errors": false,
  "items": [
    {
      "index": {
        "_index": "events",
        "_type": "_doc",
        "_id": "W5wxhHIByTqzuCc4QxQj",
        "_version": 1,
        "result": "created",
        "_shards": {
          "total": 2,
          "successful": 1,
          "failed": 0
        },
        "_seq_no": 0,
        "_primary_term": 1,
        "status": 201
      }
    },
    {
      "index": {
        "_index": "events",
        "_type": "_doc",
        "_id": "XJwxhHIByTqzuCc4QxQj",
        "_version": 1,
        "result": "created",
        "_shards": {
          "total": 2,
          "successful": 1,
          "failed": 0
        },
        "_seq_no": 1,
        "_primary_term": 1,
        "status": 201
      }
    },
    {
      "index": {
        "_index": "events",
        "_type": "_doc",
        "_id": "XZwxhHIByTqzuCc4QxQj",
        "_version": 1,
        "result": "created",
        "_shards": {
          "total": 2,
          "successful": 1,
 

In [80]:
# Recommend the next targets to buy

import requests
import json

# Input: the agent to recommend for; it is matched against the `agent` field of the index.
agent="1"

# Returns the targets the agent has interacted with in the past. They are used twice:
# to select the foreground set (i.e. documents containing correlated targets) and to
# filter already-bought targets out of the final recommendations.
def get_past_interactions(agent_id):
    """Return the targets that `agent_id` has already bought.

    Sends a `term` query on the `agent` field to `url + "/_search"` and collects
    the `buy.target` values found in the `_source` of every returned hit.

    Args:
        agent_id: Value matched against the `agent` field.

    Returns:
        A list of targets in first-seen order, without duplicates. The list is
        empty when no document matches the agent.

    Raises:
        requests.HTTPError: If the search request returns an error status.
    """
    query = {
        "query": {
            "term": {
                "agent": agent_id
            }
        }
    }
    response = requests.request("GET", url+"/_search", headers=headers, data=json.dumps(query))
    # Fail with the HTTP error itself rather than a KeyError on the missing 'hits' key.
    response.raise_for_status()
    response_hits = response.json()['hits']['hits']

    # Merge across all hits: zero hits yields [], and an agent that was indexed
    # more than once (e.g. the populate cell was re-run) still resolves cleanly.
    targets = []
    for hit in response_hits:
        hit_targets = hit['_source'].get('buy.target', [])
        if isinstance(hit_targets, str):
            # A single-valued field may be stored as a bare string.
            hit_targets = [hit_targets]
        for target in hit_targets:
            if target not in targets:
                targets.append(target)
    print(json.dumps(targets, indent=2))
    return targets

# Returns recommended interactions. We use the significant terms (S.F.T.) aggregation here to mimic
# Collaborative Filtering.
# Foreground Set = documents that contain correlated targets
# Background Set = all the documents
# S.F.T. uses these two sets to find statistically significant targets, i.e. targets that behave in a statistically interesting way in the foreground relative to the background.
# The default scoring algorithm looks like (foreground_freq / background_freq) * (foreground_freq - background_freq).
# We modify it slightly so that targets with foreground_freq <= background_freq (zero or negative scores) are still returned, albeit with a very low score.
def recommended_interactions(agent_id, past_interactions):
    """Recommend new targets via a significant_terms aggregation.

    The foreground set is every document whose `buy.target` contains at least
    one of `past_interactions`. A custom painless heuristic scores each target
    from its foreground and background frequencies and floors the difference
    at 0.0001, so targets that are not over-represented in the foreground are
    still returned with a very low score.

    Args:
        agent_id: Agent being recommended for. Not used by the query; kept so
            existing callers keep working.
        past_interactions: Targets the agent already bought.

    Returns:
        Targets from the aggregation buckets that are not in
        `past_interactions`, in the order Elasticsearch returned the buckets.
        Empty when the aggregation returns no buckets.

    Raises:
        requests.HTTPError: If the search request returns an error status.
    """
    # NOTE: a `required_matches=1` local was declared here but never wired into
    # the query, so it has been dropped.
    query = {
        "size": 0,
        "query": {
            "bool": {
                "should": {
                    "terms": {
                        "buy.target": past_interactions
                    }
                }
            }
        },
        "aggs": {
            "sft": {
                "significant_terms": {
                    "field": "buy.target",
                    "min_doc_count": 1,
                    "script_heuristic": {
                        "script": {
                            "lang": "painless",
                            "source": """
                                double ff = 1.0*(params._subset_freq/Math.max(1, 1.0*params._subset_size));
                                double bf = 1.0*(params._superset_freq/Math.max(1, 1.0*params._superset_size));
                                return (ff/bf)*Math.max((ff - bf), 0.0001)"""
                        }
                    }
                }
            }
        }
    }

    response = requests.request("GET", url+'/_search', headers=headers, data=json.dumps(query))
    # Fail with the HTTP error itself rather than a KeyError on 'aggregations'.
    response.raise_for_status()
    recommended_target_list = response.json()['aggregations']['sft']['buckets']
    print(f'recommended target list via S.F.T (debug): {json.dumps(recommended_target_list, indent=2)}')

    # Always bind the result (an empty bucket list used to leave it undefined)
    # and walk the buckets in order so the ranking is not lost in a set.
    already_bought = set(past_interactions)
    new_recommended_targets = []
    for bucket in recommended_target_list:
        key = bucket['key']
        if key not in already_bought and key not in new_recommended_targets:
            new_recommended_targets.append(key)
    return new_recommended_targets

# Returns the most popular targets, excluding those the agent has already interacted with
# and those already recommended by S.F.T.
def get_popular_items(agent, past_interactions, sft_recommendations):
    """Return popular fallback targets the agent has not seen or been offered.

    Runs a `terms` aggregation (top 10 buckets) on `buy.target` over the
    documents that contain none of `past_interactions`, then drops any bucket
    key found in `past_interactions` or `sft_recommendations`.

    Args:
        agent: Agent being recommended for. Not used by the query; kept so
            existing callers keep working.
        past_interactions: Targets the agent already bought.
        sft_recommendations: Targets already recommended by the significant
            terms step.

    Returns:
        The remaining bucket keys, in the order Elasticsearch returned the
        buckets.

    Raises:
        requests.HTTPError: If the search request returns an error status.
    """
    query = {
        # Only the aggregation is read, so skip fetching the hits.
        "size": 0,
        "query": {
            "bool": {
                "must_not": {
                    "terms": {
                        "buy.target": past_interactions
                    }
                }
            }
        },
        "aggs": {
            "popular_targets": {
                "terms": {
                    "field": "buy.target",
                    "size": 10
                }
            }
        }
    }
    response = requests.request("GET", url+'/_search', headers=headers, data=json.dumps(query))
    # Fail with the HTTP error itself rather than a KeyError on 'aggregations'.
    response.raise_for_status()
    popular_buckets = response.json()['aggregations']['popular_targets']['buckets']
    print(f'recommended target list via terms agg (debug): {json.dumps(popular_buckets, indent=2)}')

    # Filter in bucket order instead of via set arithmetic, so the popularity
    # ranking returned by the aggregation is preserved.
    excluded = set(past_interactions) | set(sft_recommendations)
    popular_targets = []
    for bucket in popular_buckets:
        key = bucket['key']
        if key not in excluded and key not in popular_targets:
            popular_targets.append(key)
    return popular_targets

# Driver: S.F.T. recommendations first, topped up with popular targets when
# fewer than 3 were found.
past_interactions = get_past_interactions(agent)
sft_recommendations = recommended_interactions(agent, past_interactions)

# Bind the fallback list up front; otherwise the final print raises a NameError
# whenever S.F.T. alone returns 3 or more recommendations.
popular_recommendations = []
if len(sft_recommendations) < 3:
    popular_recommendations = get_popular_items(agent, past_interactions, sft_recommendations)

print(f'final recommendations: {sft_recommendations + popular_recommendations}')


[
  "a",
  "c",
  "d"
]
recommended target list via S.F.T (debug): [
  {
    "key": "a",
    "doc_count": 4,
    "score": 0.24999999999999994,
    "bg_count": 4
  },
  {
    "key": "d",
    "doc_count": 3,
    "score": 0.18750000000000003,
    "bg_count": 3
  },
  {
    "key": "c",
    "doc_count": 1,
    "score": 0.062499999999999986,
    "bg_count": 1
  },
  {
    "key": "e",
    "doc_count": 2,
    "score": 8.333333333333334e-05,
    "bg_count": 3
  }
]
recommended target list via terms agg (debug): [
  {
    "key": "b",
    "doc_count": 1
  },
  {
    "key": "e",
    "doc_count": 1
  }
]
final recommendations: ['e', 'b']
