In [1]:
import requests
import json
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from sequence_mining.spam import SpamAlgo

In [2]:
# Fetch the raw visit log from the public Matomo demo instance.
# API reference: https://developer.matomo.org/api-reference
res = requests.post(
    "https://demo.matomo.cloud",
    params={
        "module": "API",
        "method": "Live.getLastVisitsDetails",
        "format": "json",
        "idSite": "1",
        "period": "day",
        "date": "today",
        "filter_limit": "2000",  # take the logs of 2000 visits.
    },
    # requests applies no timeout by default, so an unresponsive server would
    # block this cell forever.
    timeout=60,
)
# Surface HTTP-level failures here instead of as a JSON parsing error later on.
res.raise_for_status()

In [3]:
# Decode the response body; the rest of the notebook treats `data` as a list of
# visit dicts (each one is indexed with "actionDetails").
data = json.loads(res.content)

# A JSON object instead of a list (for instance an API error payload) would
# otherwise only fail later with an unrelated-looking error, so stop early.
if not isinstance(data, list):
    raise ValueError(
        "Expected a JSON list of visits, got: " + repr(data)[:200]
    )

In [4]:
def filter_dict(dict_, keys):
    """Return a new dict holding only the entries of ``dict_`` named in ``keys``.

    Keys that are absent from ``dict_`` are skipped silently. The result
    follows the order of ``keys``, not the order of ``dict_``.
    """
    selected = {}
    for name in keys:
        if name in dict_:
            selected[name] = dict_[name]
    return selected

In [5]:
def classify_action(url):
    """Map a page-view URL to a coarse event dict.

    The two site home pages are recognised by exact match. Any other URL is
    labelled with the first keyword (in the order listed below) that occurs
    in it as a substring, or with "other" when none does.

    Returns a dict with keys "type" (always "action"), "details" (the label)
    and "e" ("action__" followed by the label).
    """
    if url == "https://dive-shop.net/":
        label = "shop-home"
    elif url == "https://divezone.net/":
        label = "zone-home"
    else:
        # Order matters: the first keyword found in the URL wins.
        keywords = (
            "jobs", "products", "cart", "checkout", "faq", "diving",
            "best-dive-sites", "best-of-the-best", "my-account", "liveaboard",
            "divesite", "blog", "resumes", "forum", "travel", "guides",
            "buying-guide",
        )
        label = next((word for word in keywords if word in url), "other")

    return {"type": "action", "details": label, "e": "action__" + label}

In [6]:
d = []            # one list of event dicts per kept visit
ol_urls = []      # distinct outlink URLs, in first-seen order
action_urls = []  # distinct page-view URLs, in first-seen order

# For each non-page-view action type handled here: the extra field copied onto
# the event (None for no extra field) and whether that field's value is
# appended to the event label as "<type>__<value>".
_EXTRA_FIELD_BY_TYPE = {
    "outlink": ("url", False),
    "search": ("searchKeyword", True),
    "form": ("formName", True),
    "event": ("eventAction", True),
    "ecommerceAbandonedCart": (None, False),
    "goal": ("goalName", True),
    "ecommerceOrder": (None, False),
}


def _action_to_event(action):
    """Convert one entry of a visit's "actionDetails" into an event dict.

    Page views (type "action") are delegated to ``classify_action``. The
    other known types keep their "type" plus the field listed in
    ``_EXTRA_FIELD_BY_TYPE`` and receive a label under "e". A missing label
    field contributes an empty string, for every type alike.

    Returns None when the action type is not one this notebook handles.
    """
    kind = action["type"]
    if kind == "action":
        return classify_action(action["url"])
    if kind not in _EXTRA_FIELD_BY_TYPE:
        return None

    field, in_label = _EXTRA_FIELD_BY_TYPE[kind]
    event = filter_dict(action, ["type"] if field is None else ["type", field])
    if in_label:
        event["e"] = kind + "__" + event.get(field, "")
    else:
        event["e"] = kind
    return event


for visit in data:
    events = []
    faulty_visit = False

    for action in visit["actionDetails"]:
        event = _action_to_event(action)
        if event is None:
            # An unknown action type disqualifies the whole visit, but the
            # remaining actions are still scanned so URL collection is complete.
            faulty_visit = True
            continue
        events.append(event)

        # Deduplicate on the URL itself: these lists hold URLs, so comparing
        # any other field (such as the page title) against them never matches.
        if action["type"] == "action" and action["url"] not in action_urls:
            action_urls.append(action["url"])
        elif action["type"] == "outlink" and action["url"] not in ol_urls:
            ol_urls.append(action["url"])

    # Keep only visits that are non-empty and contain no unknown action type.
    if events and not faulty_visit:
        d.append(events)

In [7]:
# A short hand-written browsing session used to try out classify_action.
example = [
    "https://divezone.net/diving/maldives",
    "https://divezone.net/diving/florida",
    "https://divezone.net/diving/red-sea",
    "https://dive-shop.net/products/diving-accessory-starter-kit/",
    "https://dive-shop.net/products/distance-line-reel/",
]

In [8]:
# Event dicts for the sample session, one per URL and in the same order.
example_transf = list(map(classify_action, example))

In [9]:
# Number of events in the longest kept visit, for use if sequences ever need
# padding to a common length. `default=0` keeps the cell from raising when `d`
# is empty (no visit was kept).
longest_seq = max((len(visit) for visit in d), default=0)

In [10]:
# A visit is mined only if it has at least this many events (i.e. more than 5).
MIN_SEQUENCE_LENGTH = 6

sequences = []   # per kept visit: a list of single-item lists, [[id], [id], ...]
type_to_id = {}  # event label -> integer id
id_to_type = {}  # integer id -> event label

for visit in d:
    events = [event["e"] for event in visit]

    # Ids are handed out in first-seen order, starting at 0. Labels that only
    # occur in short (unmined) visits still receive an id.
    for label in events:
        if label not in type_to_id:
            new_id = len(type_to_id)
            type_to_id[label] = new_id
            id_to_type[new_id] = label

    if len(events) >= MIN_SEQUENCE_LENGTH:
        # Each event id is wrapped in its own one-element list.
        sequences.append([[type_to_id[label]] for label in events])

In [15]:
# Run the miner from sequence_mining.spam over the encoded visits and keep the
# patterns it exposes as `frequent_items`.
# NOTE(review): 0.1 is presumably the minimum support as a fraction of the input
# sequences (the recorded run printed "Min support 25") - confirm against the library.
algo = SpamAlgo(0.1)
# Workaround: the library sometimes appears to be buggy on the full input. If
# algo.spam(sequences) does not work, drop trailing sequences until it does:
# algo.spam(sequences[:-1]), algo.spam(sequences[:-2]), ...

algo.spam(sequences[:-1]) 

# Read the result only after spam() has run on this instance.
frequent_items = algo.frequent_items

Min support 25


In [12]:
# Translate every mined pattern from integer ids back to event labels.
# An element of a pattern is either a bare int id or an indexable whose first
# entry is the id; both forms are looked up in id_to_type.
frequent_items_types = [
    [
        id_to_type[element if isinstance(element, int) else element[0]]
        for element in pattern
    ]
    for pattern in frequent_items
]
        

In [None]:
# Histogram of how many actions each raw visit contains (taken from the
# unfiltered `data`, not from the cleaned sequences).
interaction_lengths = [len(visit["actionDetails"]) for visit in data]

fig, ax = plt.subplots(figsize=(10, 5))
sns.histplot(interaction_lengths, bins=100, ax=ax)
# No on-figure title or grid is drawn; caption text: Distribution of interaction lengths
ax.set_xlabel("Interaction length")
ax.set_ylabel("Frequency")