### Model: Significant Terms

Here, we model the recommendations using significant terms aggregation.

In [1]:
# Define the persistent variables that are used throughout this notebook. Always run this cell first.

# Base URL of the index used by every request below. Point it at another
# index, e.g. 'http://localhost:9200/search_recommendations', when testing locally.
url = "http://localhost:9200/events"

# Request bodies in this notebook are always sent as JSON.
headers = {"Content-Type": "application/json"}
# Persist both variables with IPython's %store magic so they can be
# restored later with `%store -r`.
%store url
%store headers

Stored 'url' (str)
Stored 'headers' (dict)


In [14]:
# (optional) deletes the index
import requests

# Issue an HTTP DELETE against the index URL and echo the raw body
# (encoded to UTF-8 bytes, so it prints with a b'...' prefix).
response = requests.delete(url)
print(response.text.encode('utf8'))

b'{"acknowledged":true}'


In [15]:
# Here's the script to create a fresh index with the mapping
import requests
import json

# Index body: a single dynamic template named "string" that maps every
# field detected as a string to the `keyword` type.
data = {
    "mappings": {
        "dynamic_templates": [
            {"string": {"match_mapping_type": "string", "mapping": {"type": "keyword"}}}
        ]
    }
}

# PUT the body to the index URL and pretty-print the JSON reply.
response = requests.put(url, headers=headers, data=json.dumps(data))
print(json.dumps(response.json(), indent=2))

{
  "acknowledged": true,
  "shards_acknowledged": true,
  "index": "events"
}


In [70]:
# (only run once) Here's the script to populate generic data of actions

import requests
import json
import random

def event_generator(num, seed=None):
    """Send `num` randomly generated interaction events to the index.

    Every event picks a random agent, action, target and referrer and issues
    a scripted upsert against ``<url>/_update/<agent>``. The Painless script
    sets ``agent`` on the document and keeps one entry per target in the list
    stored under the action name: an existing entry has its ``count``
    incremented and its ``referred_by`` overwritten, otherwise a new entry
    with ``count`` 1 is appended.

    Args:
        num: number of events to send.
        seed: optional seed for a private random generator so the same event
            sequence can be reproduced. When None (the default) the
            module-level ``random`` state is used, exactly as before.

    Returns:
        None. Failed updates are reported on stdout instead of being
        silently dropped; they do not raise.
    """
    ev_agents = ['neal','billg','sid','ethanh','donatello','leonardo','michaelangelo','raphael','mada','john','rolo','eminem']
    ev_actions = ['buy','view','returned']
    ev_targets = ['prod:1','prod:2','prod:3','prod:4','prod:5','prod:6','prod:9','prod:8','prod:7','prod:10','prod:11','prod:12']
    ev_referredby = ['sid', 'leonardo', 'mada']

    # The script text never changes between events, so collapse it onto a
    # single line once instead of on every loop iteration.
    source = " ".join('''
        ctx._source["agent"] = params.agent;
        if (ctx._source[params.action] != null) {
            boolean found = false;
            for (item in ctx._source[params.action]) {
                if (item.target == params.target) {
                    found = true;
                    item.count += 1;
                    item.referred_by = params.referred_by;
                }
            }
            if (found == false) {
                ctx._source[params.action].add(['target': params.target, 'count': 1, 'referred_by': params.referred_by])
            }
        } else {
            ctx._source[params.action] = [[
                "target": params.target,
                "count": 1,
                "referred_by": params.referred_by
            ]]
        }
    '''.splitlines())

    # A private generator keeps seeded runs reproducible without touching
    # the global random state.
    rng = random if seed is None else random.Random(seed)

    for _ in range(num):
        agent = rng.choice(ev_agents)
        action = rng.choice(ev_actions)
        target = rng.choice(ev_targets)
        referred_by = rng.choice(ev_referredby)
        script = {
            "upsert": {},
            "scripted_upsert": True,
            "script": {
                "lang": "painless",
                "source": source,
                "params": {
                    "agent": agent,
                    "target": target,
                    "referred_by": referred_by,
                    "action": action
                }
            }
        }
        response = requests.request("POST", url + f"/_update/{agent}", headers=headers, data=json.dumps(script))
        if not response.ok:
            # Surface failures (e.g. missing index, script error) but keep going.
            print(f"update for agent '{agent}' failed ({response.status_code}): {response.text}")
        
# Send 100 random events. Re-running this cell sends 100 more on top of the
# existing data, because each update increments counts or appends entries.
event_generator(100)

In [75]:
import requests
import json

# Fetch the stored document for agent "sid" and pretty-print it.
response = requests.get(f"{url}/_doc/sid")
print(json.dumps(response.json(), indent=2))

{
  "_index": "events",
  "_type": "_doc",
  "_id": "sid",
  "_version": 27,
  "_seq_no": 216,
  "_primary_term": 1,
  "found": true,
  "_source": {
    "agent": "sid",
    "buy": [
      {
        "count": 2,
        "referred_by": "leonardo",
        "target": "prod:4"
      },
      {
        "count": 1,
        "referred_by": "sid",
        "target": "prod:7"
      },
      {
        "count": 3,
        "referred_by": "leonardo",
        "target": "prod:8"
      },
      {
        "count": 1,
        "referred_by": "mada",
        "target": "prod:2"
      },
      {
        "count": 2,
        "referred_by": "sid",
        "target": "prod:3"
      },
      {
        "count": 1,
        "referred_by": "leonardo",
        "target": "prod:11"
      },
      {
        "count": 2,
        "referred_by": "sid",
        "target": "prod:6"
      },
      {
        "count": 2,
        "referred_by": "mada",
        "target": "prod:9"
      },
      {
        "count": 1,
        "referred_by

In [112]:
# Recommend the next targets for the specified action, optionally restricted to interactions referred by a given referrer.

import requests
import json

# Inputs for this recommendation run
agent = "sid"
action = "buy"
referred_by = "mada"  # optional: a falsy value disables referrer filtering

# returns past interaction targets. They help in informing both
# the foreground set (i.e. correlated targets) as well as how to
# finally account for the recommendations
# referred_by is optional: when passed it filters interactions by the referrer
def get_past_targets(agent_id, action, referred_by):
    """Return the targets the agent has already interacted with via `action`.

    Searches for the agent's document with a term query on ``agent`` and
    reads the interaction list stored under the ``action`` key of its
    ``_source``.

    Args:
        agent_id: value matched against the ``agent`` field.
        action: name of the ``_source`` key that holds the interaction list
            (e.g. "buy").
        referred_by: optional referrer; when truthy, only interactions whose
            ``referred_by`` equals this value are kept.

    Returns:
        A list of target strings in stored order. The list is empty when the
        search does not yield exactly one hit, when the agent has no
        interactions for `action`, or when the referrer filter removes them all.

    Raises:
        requests.HTTPError: if the search request returns an error status.
    """
    query = {"query": {"term": {"agent": agent_id}}}
    response = requests.request("GET", url+"/_search", headers=headers, data=json.dumps(query))
    # Fail with the HTTP error itself rather than an opaque KeyError on 'hits'.
    response.raise_for_status()
    response_hits = response.json()['hits']['hits']

    interactions = []
    if len(response_hits) == 1:
        # An agent that never performed `action` has no such key in _source.
        interactions = response_hits[0]['_source'].get(action) or []

    targets = [
        item['target']
        for item in interactions
        if not referred_by or item['referred_by'] == referred_by
    ]
    print('past targets: ', json.dumps(targets, indent=2))
    return targets

# returns recommended interactions. We use the S.F.T. aggs here to mimic
# Collaborative Filtering.
# Foreground Set = documents that contain correlated targets
# Background Set = all the documents
# S.F.T. uses these two sets to find statistically significant targets -> i.e. targets that behave in a statistically interesting way in foreground relative to the background.
# The default scoring algorithm looks like (foreground_freq / background_freq) * (foreground_freq - background_freq).
# We will modify it slightly so that targets with foreground_freq < background_freq (which would otherwise get negative scores) are still returned, albeit with a very low score.
def recommended_targets(agent_id, action, past_targets, referred_by):
    """Recommend new targets with a significant_terms aggregation.

    The foreground set is every document that matches the optional referrer
    filter and contains at least one of `past_targets` in
    ``<action>.target``. The background set is every document that matches
    the referrer filter. Each candidate target is scored by a Painless
    heuristic, ``(ff / bf) * max(ff - bf, 0.0001)``, where ``ff`` and ``bf``
    are the foreground and background frequencies.

    Args:
        agent_id: identifier of the agent; currently unused by the query.
        action: action name used to build the ``<action>.target`` and
            ``<action>.referred_by`` field names.
        past_targets: targets the agent has already interacted with; they
            define the foreground set and are excluded from the result.
        referred_by: optional referrer; when truthy it restricts both the
            foreground query and the background filter.

    Returns:
        A list of bucket keys not present in `past_targets`, in the order the
        aggregation returned them, so the score ranking is kept. Empty when
        the aggregation returns no buckets.

    Raises:
        requests.HTTPError: if the search request returns an error status.
    """
    referred_by_query = {"match_all": {}}
    if referred_by:
        referred_by_query = {"term": { f"{action}.referred_by": referred_by } }
    target_field = f"{action}.target"
    query = {
        "size": 0,
        "query": {
            "bool": {
                "must": [
                    referred_by_query,
                    {
                        "bool": {
                            "should": {
                                "terms": {
                                    target_field: past_targets
                                }
                            }
                        }
                    }
                ]
            }
        },
        "aggs": {
            "sft": {
                "significant_terms": {
                    "field": target_field,
                    "background_filter": referred_by_query,
                    "min_doc_count": 1,
                    "script_heuristic": {
                        "script": {
                            "lang": "painless",
                            "source": """
                                double ff = 1.0*(params._subset_freq/Math.max(1, 1.0*params._subset_size));
                                double bf = 1.0*(params._superset_freq/Math.max(1, 1.0*params._superset_size));
                                return (ff/bf)*Math.max((ff - bf), 0.0001)"""
                        }
                    }
                }
            }
        }
    }

    response = requests.request("GET", url+'/_search', headers=headers, data=json.dumps(query))
    # Fail with the HTTP error itself rather than an opaque KeyError on 'aggregations'.
    response.raise_for_status()
    buckets = response.json()['aggregations']['sft']['buckets']
    print(f'recommended target list via S.F.T (debug): {json.dumps(buckets, indent=2)}')
    if not buckets:
        return []

    # Filter with a list comprehension instead of set arithmetic so the
    # ranking of the returned buckets is preserved.
    already_seen = set(past_targets)
    new_recommended_targets = [
        bucket['key'] for bucket in buckets if bucket['key'] not in already_seen
    ]
    print('S.F.T. recommendations: ', new_recommended_targets)
    return new_recommended_targets

# returns the most popular targets among those outside of the agent's past interactions
# that are not already recommended by S.F.T.
def get_popular_targets(agent, action, past_targets, sft_recommendations, referred_by):
    """Return popular targets to top up the S.F.T. recommendations.

    Runs a terms aggregation (up to 10 buckets) on ``<action>.target`` over
    documents that match the optional referrer filter and contain none of
    `past_targets`.

    Args:
        agent: identifier of the agent; currently unused by the query.
        action: action name used to build the ``<action>.target`` and
            ``<action>.referred_by`` field names.
        past_targets: targets the agent has already interacted with; used
            both to exclude documents and to filter the result.
        sft_recommendations: targets already recommended by
            significant terms; removed from the result to avoid duplicates.
        referred_by: optional referrer; when truthy only documents matching
            it are aggregated.

    Returns:
        A list of bucket keys that are in neither `past_targets` nor
        `sft_recommendations`, in the order the aggregation returned them.

    Raises:
        requests.HTTPError: if the search request returns an error status.
    """
    referred_by_query = {"match_all": {}}
    if referred_by:
        referred_by_query = {"term": { f"{action}.referred_by": referred_by } }
    target_field = f"{action}.target"
    query = {
        # Only the aggregation is read, so do not fetch any hits.
        "size": 0,
        "query": {
            "bool": {
                "must": [
                    referred_by_query,
                    {
                        "bool": {
                            "must_not": {
                                "terms": {
                                    target_field: past_targets
                                }
                            }
                        }
                    }
                ]
            }
        },
        "aggs": {
            "popular_targets": {
                "terms": {
                    "field": target_field,
                    "size": 10
                }
            }
        }
    }
    response = requests.request("GET", url+'/_search', headers=headers, data=json.dumps(query))
    # Fail with the HTTP error itself rather than an opaque KeyError on 'aggregations'.
    response.raise_for_status()
    popular_buckets = response.json()['aggregations']['popular_targets']['buckets']
    print(f'popular target list via terms agg (debug): {json.dumps(popular_buckets, indent=2)}')

    # Filter with a list comprehension instead of set arithmetic so the
    # aggregation's bucket order is preserved.
    excluded = set(past_targets) | set(sft_recommendations)
    popular_targets = [
        bucket['key'] for bucket in popular_buckets if bucket['key'] not in excluded
    ]
    print('popular target recommendations: ', popular_targets)
    return popular_targets

# 1. Look up what the agent has already interacted with.
past_targets = get_past_targets(agent, action, referred_by)

# 2. Ask the significant-terms model for correlated targets.
sft_recommendations = recommended_targets(agent, action, past_targets, referred_by)

# 3. Fall back to popular targets only when S.F.T. produced fewer than 10 suggestions.
popular_recommendations = (
    get_popular_targets(agent, action, past_targets, sft_recommendations, referred_by)
    if len(sft_recommendations) < 10
    else []
)

print(f'final recommendations: {sft_recommendations + popular_recommendations}')


past targets:  [
  "prod:2",
  "prod:9"
]
recommended target list via S.F.T (debug): [
  {
    "key": "prod:2",
    "doc_count": 7,
    "score": 0.7142857142857142,
    "bg_count": 7
  },
  {
    "key": "prod:7",
    "doc_count": 5,
    "score": 0.30612244897959184,
    "bg_count": 6
  },
  {
    "key": "prod:9",
    "doc_count": 3,
    "score": 0.3061224489795918,
    "bg_count": 3
  },
  {
    "key": "prod:3",
    "doc_count": 5,
    "score": 0.05102040816326537,
    "bg_count": 8
  },
  {
    "key": "prod:6",
    "doc_count": 2,
    "score": 0.04081632653061223,
    "bg_count": 3
  },
  {
    "key": "prod:1",
    "doc_count": 2,
    "score": 0.04081632653061223,
    "bg_count": 3
  },
  {
    "key": "prod:4",
    "doc_count": 3,
    "score": 0.01224489795918363,
    "bg_count": 5
  },
  {
    "key": "prod:12",
    "doc_count": 4,
    "score": 9.795918367346938e-05,
    "bg_count": 7
  },
  {
    "key": "prod:10",
    "doc_count": 4,
    "score": 9.795918367346938e-05,
    "bg_count"