## How this works

1. We define an update_by_query to update the documents by removing actions whose timestamps are older than the provided input date
2. We verify this with a search request using a `range: lt: $date` filter, which should return no hits, confirming that all the remaining timestamps are in fact greater than or equal to the provided input date.


In [41]:
# Shared settings used by every request in this notebook. Always run this cell first.

# Base URL that the other cells append their endpoint to. You may want to
# change it to something like 'http://localhost:9200/search_recommendations'
# for testing locally.
url = "http://localhost:9200/events"

# Request bodies are sent as JSON.
headers = {"Content-Type": "application/json"}
# IPython `%store` magic: persist both variables (the cell output below
# confirms each one is stored, together with its type).
%store url
%store headers

Stored 'url' (str)
Stored 'headers' (dict)


In [37]:
# We will define an update_by_query clause that will delete all the events prior to a provided timestamp

import datetime
import time
import requests
import json


#input
# Cutoff date: events strictly older than this date are removed.
date = datetime.date(2020, 6, 10)
# Names of the per-document fields that hold lists of timestamped events.
actions = ['view', 'buy', 'returned']

# Cutoff in epoch milliseconds. time.mktime interprets the date as midnight in
# the local timezone of the machine running this cell.
# NOTE(review): assumes `<action>.timestamp` is stored as epoch milliseconds - confirm.
timestamp = int(time.mktime(date.timetuple()) * 1000)
print("timestamp is: ", timestamp)

# Select only documents holding at least one event older than the cutoff.
# "lt" (rather than "lte") mirrors the strict `<` comparison in the script
# below, so a document whose oldest event sits exactly on the cutoff is not
# selected and re-indexed without anything being removed from it.
queryArray = [
    {"range": {f"{action}.timestamp": {"lt": timestamp}}}
    for action in actions
]

# Painless script run against every selected document: for each action field,
# collect the events older than the cutoff and remove them from the list.
# A document is selected when ANY one action matches, so it may lack some of
# the other action fields; the null check skips those instead of failing.
script_literal = '''
    for (action in params.actions) {
        if (ctx._source[action] != null) {
            List found = new ArrayList();
            for (item in ctx._source[action]) {
                if (item.timestamp < params.timestamp) {
                    found.add(item);
                }
            }
            ctx._source[action].removeAll(found);
        }
    }
'''
query = {
    "query": {
        "bool": {
            # With only "should" clauses, a document must match at least one.
            "should": queryArray
        }
    },
    "script": {
        # Collapse the multi-line literal into a single-line script source.
        "source": " ".join(script_literal.splitlines()),
        "lang": "painless",
        "params": {
            "actions": actions,
            "timestamp": timestamp
        }
    }
}

# Notebook cells execute as __main__, so the request is still sent whenever
# this cell is run. The guard only prevents the destructive update from firing
# if this code is ever imported as a module.
if __name__ == "__main__":
    response = requests.request("POST", url + '/_update_by_query', headers=headers, data=json.dumps(query))
    print(json.dumps(response.json(), indent=2))


timestamp is:  1591727400000
{
  "took": 37,
  "timed_out": false,
  "total": 5,
  "updated": 5,
  "deleted": 0,
  "batches": 1,
  "version_conflicts": 0,
  "noops": 0,
  "retries": {
    "bulk": 0,
    "search": 0
  },
  "throttled_millis": 0,
  "requests_per_second": -1.0,
  "throttled_until_millis": 0,
  "failures": []
}


In [40]:
# This is a verification query and should have the same date input. If the above query is successful, this should return no hits.

import datetime
import time
import requests
import json


# Inputs: use the same date and actions as the update_by_query cell.
date = datetime.date(2020, 6, 10)
actions = ['view', 'buy', 'returned']

# Cutoff in epoch milliseconds, taken at local midnight of `date`.
timestamp = int(1000 * time.mktime(date.timetuple()))

# One range clause per action field, matching events strictly older than the cutoff.
queryArray = [
    {"range": {f"{action}.timestamp": {"lt": timestamp}}}
    for action in actions
]

# A document is a hit if at least one of the clauses matches it.
query = {"query": {"bool": {"should": queryArray}}}

# Send the search and pretty-print the JSON answer.
response = requests.request(
    "GET",
    url + '/_search',
    headers=headers,
    data=json.dumps(query),
)
print(json.dumps(response.json(), indent=2))


{
  "took": 2,
  "timed_out": false,
  "_shards": {
    "total": 1,
    "successful": 1,
    "skipped": 0,
    "failed": 0
  },
  "hits": {
    "total": {
      "value": 0,
      "relation": "eq"
    },
    "max_score": null,
    "hits": []
  }
}
