In [75]:
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
import datetime

In [76]:
# Import the data from the json file
# Data is in format {},{},{},... and in a text file

# Read the data from the file
with open('../../Dataset/build_dataset.txt') as f:
    data = f.readlines()

# Convert the data to a list of dictionaries
data = [json.loads(x.strip()) for x in data]

In [77]:
def get_readers_by_document(data, doc_uuid):
    all_readers = []
    for record in data:
        if 'env_doc_id' in record and  record['env_doc_id'] == doc_uuid:
            all_readers.append(record['visitor_uuid'])
    return all_readers


In [78]:
def get_document_by_readers(data, visitor_uuid):
    all_docs = []
    for record in data:
        if 'visitor_uuid' in record and record['visitor_uuid'] == visitor_uuid:
            all_docs.append(record['env_doc_id'])
    return all_docs


In [79]:
def also_likes(data, doc_uuid, visitor_uuid=None, sorting_function=None):
    all_readers = get_readers_by_document(data, doc_uuid)
    liked_documents = {}

    for reader in all_readers:
        if reader != visitor_uuid:
            for doc in get_document_by_readers(data, reader):
                liked_documents[doc] = liked_documents.get(doc, 0) + 1

    sorted_docs = sorted(liked_documents.items(), key=sorting_function, reverse=True)
    return [doc[0] for doc in sorted_docs]

In [80]:
def sort_by_readers(item):
    return item[1]

In [81]:
doc_uuid = "140204115519-f5fa6ce8b288c9f10e0c8bc7e1a456a0"  # Replace with actual document UUID
visitor_uuid = "8fd99d4cbfb9b8d8"  # Replace with actual visitor UUID

# Get the top 10 'also likes' documents
top_liked_docs = also_likes(data, doc_uuid, visitor_uuid, sort_by_readers)
print(top_liked_docs)

['140228202800-6ef39a241f35301a9a42cd0ed21e5fb0', '140204115519-f5fa6ce8b288c9f10e0c8bc7e1a456a0', '140219141540-c900b41f845c67cc08b58911155c681c']


In [82]:
def get_unique_visitors(data):
    unique_visitors = set()
    for record in data:
        if 'visitor_uuid' in record:
            unique_visitors.add(record['visitor_uuid'])
    return unique_visitors

def get_unique_documents(data):
    unique_documents = set()
    for record in data:
        if 'env_doc_id' in record:
            unique_documents.add(record['env_doc_id'])
    return unique_documents



distinct_user_count = len(get_unique_visitors(data))
distinct_doc_count = len(get_unique_documents(data))

print(f"Count of distinct users: {distinct_user_count}")
print(f"Count of distinct documents: {distinct_doc_count}")

Count of distinct users: 1317
Count of distinct documents: 1007


In [83]:
def most_popular_documents(data):
    from collections import Counter

    doc_impressions = Counter(record['env_doc_id'] for record in data if 'env_doc_id' in record)
    return doc_impressions.most_common()


In [84]:
print(most_popular_documents(data))

[('140228202800-6ef39a241f35301a9a42cd0ed21e5fb0', 264), ('140224101516-e5c074c3404177518bab9d7a65fb578e', 186), ('140228101942-d4c9bd33cc299cc53d584ca1a4bf15d9', 157), ('111114223935-a39e830a44fa40099a28f587673c4663', 86), ('110704213724-43e4d4195daa434ca5d21d27b360f464', 84), ('140227170505-3f2a61697e872609b7e5fa39ee27c8b0', 79), ('121005130815-84fd9c18031a48d3b842ca7f3dda4a2f', 75), ('140228055617-e02bf54d34f27eadb187d5609354b97d', 70), ('140203223002-4e399bf7aa834060afb9e19ce3237c5a', 69), ('140220182246-a781d17fb18fa53a7c0ae34242d71d3d', 68), ('140224093301-60151c849f742e45bfb63d18ab9ded78', 68), ('120612175518-55d4e7622fae4ee88a787efa2e7c920f', 63), ('140227183313-12474c3737c08861fc81b7c426d49126', 62), ('131017132845-471a4eb3685475663890ccdb4c1b5996', 60), ('120904224614-3ba76c94abae44419eda04b07017ed72', 60), ('140217071905-b3b7d787c2eef33b3d42d8e014b606d2', 59), ('140206010823-b14c9d966be950314215c17923a04af7', 58), ('140226165207-0000000081ae8f2b9acdf0324d892435', 58), ('1312

In [85]:
def average_reading_time_per_document(data):
    total_time = {}
    counts = {}

    for record in data:
        if 'event_readtime' in record and 'env_doc_id' in record:
            doc_id = record['env_doc_id']
            read_time = record['event_readtime']

            total_time[doc_id] = total_time.get(doc_id, 0) + read_time
            counts[doc_id] = counts.get(doc_id, 0) + 1

    average_time = {doc: total_time[doc] / counts[doc] for doc in total_time}
    return average_time

In [86]:
print(average_reading_time_per_document(data))

{'140222143932-91796b01f94327ee809bd759fd0f6c76': 2508.875, '140211154215-0f1d8b14a65ebfbc5f0a9ec478d47119': 1831.2666666666667, '140206010823-b14c9d966be950314215c17923a04af7': 455.95, '140228200319-2e42a83d7bcf9c386123877f640505f6': 486053.0, '130722115624-7d5acdf3f4554d0d890fea074686400d': 929.9, '140228202800-6ef39a241f35301a9a42cd0ed21e5fb0': 7362.5424836601305, '110322220408-aadc46d7849e4c38bd0392de0e4a7605': 1558.375, '130621202640-754a9683afea4bb8ace329763d4a92a9': 3019.3333333333335, '140228170447-b29252d9c1e464cbf83019dcaec91e30': 2265.222222222222, '140103105022-d0c7d706a1df5106cf88686fb67092ed': 1349.1818181818182, '130906093612-00000000e976612072abfdd0e95e7cb1': 1452.2727272727273, '110727005030-000000009cca70787e5fba1fda005c85': 2825.75, '140228030434-d5c063c15739f1060d8146d5f19160d1': 4743.291666666667, '140226195757-5c0986853713e5c1ad82a1de54dd5f08': 15670.5, '140224185207-00000000ae9c45d83841e38fe87a1307': 1809.25, '140226194346-101c05c7d744b2644f916d0d76c2c110': 2272.

In [87]:
def reader_engagement_analysis(data):
    engagement_data = [record for record in data if 'event_readtime' in record]
    df = pd.DataFrame(engagement_data)
    avg_read_time = df.groupby('env_doc_id')['event_readtime'].mean()
    return avg_read_time.sort_values(ascending=False)

In [88]:
print(reader_engagement_analysis(data))

env_doc_id
140228063003-d050cbfb8e1f49b4bbfce0b27419e0eb    1.798883e+06
140228200319-2e42a83d7bcf9c386123877f640505f6    4.860530e+05
130820224820-fcbd041abbe25191beb411e1196977dd    3.241590e+05
130805005838-486e788d702cf9213625e07d358733c2    1.613810e+05
140205095752-094c2347a6590d839407e8011b7dc09c    6.825100e+04
                                                     ...     
130110235810-b613403455e9481c9b9e2fb1d0b3c9f4    7.045000e+02
140203223002-4e399bf7aa834060afb9e19ce3237c5a    6.814828e+02
140228055617-e02bf54d34f27eadb187d5609354b97d    5.796333e+02
140206010823-b14c9d966be950314215c17923a04af7    4.559500e+02
140227170940-34de9e35bf48cc823639123aeb06939b    3.360000e+02
Name: event_readtime, Length: 654, dtype: float64


In [89]:
def peak_access_times(data):
    times = [datetime.datetime.fromtimestamp(record['ts']).hour for record in data]
    peak_times = Counter(times)
    return peak_times.most_common()

In [90]:
peak_access_times(data)

[(4, 5567), (3, 4436)]

In [91]:
# Analyze the number of pages per document accessed by users.
def document_page_count_analysis(data):
    from collections import defaultdict

    page_count_per_doc = defaultdict(set)

    for record in data:
        if 'subject_page' in record and 'env_doc_id' in record:
            doc_id = record['env_doc_id']
            page_count_per_doc[doc_id].add(record['subject_page'])

    page_count = {doc: len(pages) for doc, pages in page_count_per_doc.items()}
    return page_count

In [92]:
document_page_count_analysis(data)

{'140224195414-e5a9acedd5eb6631bb6b39422fba6798': 2,
 '140222143932-91796b01f94327ee809bd759fd0f6c76': 2,
 '140107093723-c8ea2ac1d6ac55ba74c5c6243201b1ce': 3,
 '130325130327-d5889c2cf2e642b6867cb9005e12297f': 1,
 '140211154215-0f1d8b14a65ebfbc5f0a9ec478d47119': 9,
 '140206010823-b14c9d966be950314215c17923a04af7': 23,
 '140228200319-2e42a83d7bcf9c386123877f640505f6': 1,
 '130722115624-7d5acdf3f4554d0d890fea074686400d': 9,
 '140223165942-2f8f5368b8fe5223eb8a2c6db7f8d0d9': 3,
 '140228202800-6ef39a241f35301a9a42cd0ed21e5fb0': 33,
 '131108170128-a751118da2a8be01672dd9bdaab254aa': 7,
 '120111003737-ff0d62c2f9e64064b73f058095e4f081': 28,
 '100608165922-d5c05908b7044d97a13a952e808ccba7': 6,
 '101222231929-89f477d342c8441fbc4921422568dd0a': 6,
 '110322220408-aadc46d7849e4c38bd0392de0e4a7605': 4,
 '131127172259-125c42d114eb8458b3f19c348823a625': 7,
 '130621202640-754a9683afea4bb8ace329763d4a92a9': 5,
 '111114003638-5c1524d5306e4feb901639193560d2e7': 1,
 '120423113954-db9fae68b608474f9fef4ec8797c