In [None]:
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
import datetime

In [None]:
# Import the data from the json file
# The text file is parsed as one JSON object per line.

# Stream the file line by line and parse each line straight into a dict.
# - encoding is pinned to UTF-8 so the result does not depend on the
#   platform's default locale encoding.
# - blank / whitespace-only lines (e.g. a trailing newline at end of file)
#   are skipped instead of crashing json.loads with a JSONDecodeError.
# - `data` is bound exactly once, to the list of parsed records, rather than
#   first to raw strings and then to dictionaries.
with open('../../Dataset/build_dataset.txt', encoding='utf-8') as f:
    data = [json.loads(line) for line in f if line.strip()]

In [None]:
def get_unique_visitors(data):
    """Return the set of distinct ``visitor_uuid`` values found in ``data``.

    Args:
        data: Iterable of records supporting ``in`` and item access
            (the notebook passes a list of dicts).

    Returns:
        A ``set`` with one entry per distinct visitor id. Records that do
        not contain the ``'visitor_uuid'`` key are ignored.
    """
    return {entry['visitor_uuid'] for entry in data if 'visitor_uuid' in entry}

def get_unique_documents(data):
    """Return the set of distinct ``env_doc_id`` values found in ``data``.

    Args:
        data: Iterable of records supporting ``in`` and item access
            (the notebook passes a list of dicts).

    Returns:
        A ``set`` with one entry per distinct document id. Records that do
        not contain the ``'env_doc_id'`` key are ignored.
    """
    doc_ids = set()
    doc_ids.update(
        entry['env_doc_id'] for entry in data if 'env_doc_id' in entry
    )
    return doc_ids



# Headline dataset statistics: how many distinct visitors and documents occur
# in the loaded records. Records lacking the relevant key are not counted,
# because the helper functions skip them.
distinct_user_count = len(get_unique_visitors(data))
distinct_doc_count = len(get_unique_documents(data))

# Report both totals as this cell's output.
print(f"Count of distinct users: {distinct_user_count}")
print(f"Count of distinct documents: {distinct_doc_count}")

In [None]:
def most_popular_documents(data, top_n=None):
    """Rank documents by the number of records that reference them.

    Every record carrying an ``'env_doc_id'`` key counts once towards that
    document, regardless of any other field on the record.

    Args:
        data: Iterable of records supporting ``in`` and item access
            (the notebook passes a list of dicts). Records without an
            ``'env_doc_id'`` key are ignored.
        top_n: Optional maximum number of documents to return. ``None``
            (the default) returns the full ranking, which is the original
            behaviour of this function.

    Returns:
        A list of ``(env_doc_id, count)`` tuples ordered from the most to
        the least frequently referenced document.
    """
    # Counter comes from the notebook's top-level import cell; the redundant
    # function-scope re-import has been dropped.
    doc_impressions = Counter(
        record['env_doc_id'] for record in data if 'env_doc_id' in record
    )
    return doc_impressions.most_common(top_n)


In [None]:
def average_reading_time_per_document(data):
    """Compute the mean ``event_readtime`` for each document.

    Args:
        data: Iterable of records supporting ``in`` and item access
            (the notebook passes a list of dicts). Only records that contain
            both ``'event_readtime'`` and ``'env_doc_id'`` contribute.

    Returns:
        A dict mapping each ``env_doc_id`` to the arithmetic mean of the
        ``event_readtime`` values recorded for it. Keys appear in order of
        first occurrence in ``data``.
    """
    # doc_id -> [running sum of read times, number of contributing records]
    tallies = {}

    for entry in data:
        # Guard clause: skip records missing either required field.
        if 'event_readtime' not in entry or 'env_doc_id' not in entry:
            continue

        tally = tallies.setdefault(entry['env_doc_id'], [0, 0])
        tally[0] = tally[0] + entry['event_readtime']
        tally[1] = tally[1] + 1

    return {
        doc_id: summed / seen for doc_id, (summed, seen) in tallies.items()
    }

In [None]:
def reader_engagement_analysis(data):
    """Rank documents by mean ``event_readtime``, highest first.

    Args:
        data: Iterable of records supporting ``in`` and item access
            (the notebook passes a list of dicts). Only records that contain
            both ``'event_readtime'`` and ``'env_doc_id'`` contribute.

    Returns:
        A ``pandas.Series`` indexed by ``env_doc_id`` whose values are the
        mean ``event_readtime`` per document, sorted in descending order.
        When no record qualifies, an empty float Series with the same name
        and index name is returned instead of raising ``KeyError``.
    """
    # Keep only the two fields the aggregation needs. Building the frame from
    # whole records would create a column for every key in the dataset.
    rows = [
        (record['env_doc_id'], record['event_readtime'])
        for record in data
        if 'event_readtime' in record and 'env_doc_id' in record
    ]

    # An empty frame has no 'env_doc_id' column to group on, so return an
    # explicit empty result rather than letting groupby raise KeyError.
    if not rows:
        return pd.Series(
            dtype='float64',
            name='event_readtime',
            index=pd.Index([], name='env_doc_id'),
        )

    df = pd.DataFrame(rows, columns=['env_doc_id', 'event_readtime'])
    avg_read_time = df.groupby('env_doc_id')['event_readtime'].mean()
    return avg_read_time.sort_values(ascending=False)

In [None]:
def peak_access_times(data, tz=None):
    """Count records per hour of day, busiest hour first.

    Args:
        data: Iterable of records supporting ``in`` and item access
            (the notebook passes a list of dicts). Each record's ``'ts'``
            value is passed to ``datetime.datetime.fromtimestamp``, i.e. it
            is interpreted as a POSIX timestamp. Records without a ``'ts'``
            key are ignored, matching how the other analysis functions treat
            records that lack the field they need.
        tz: Optional ``datetime.tzinfo`` used to convert timestamps before
            extracting the hour. ``None`` (the default) keeps the original
            behaviour of using the machine's local timezone; pass
            ``datetime.timezone.utc`` for results that do not depend on
            where the notebook is run.

    Returns:
        A list of ``(hour, count)`` tuples, with ``hour`` in ``0..23``,
        ordered from the most to the least frequent hour.
    """
    hour_counts = Counter(
        datetime.datetime.fromtimestamp(record['ts'], tz).hour
        for record in data
        if 'ts' in record
    )
    return hour_counts.most_common()

In [None]:
# Analyze the number of pages per document accessed by users.
def document_page_count_analysis(data):
    """Count the distinct pages seen for each document.

    Args:
        data: Iterable of records supporting ``in`` and item access
            (the notebook passes a list of dicts). Only records that contain
            both ``'subject_page'`` and ``'env_doc_id'`` contribute.

    Returns:
        A dict mapping each ``env_doc_id`` to the number of distinct
        ``subject_page`` values recorded for it. Keys appear in order of
        first occurrence in ``data``.
    """
    # doc_id -> set of distinct page identifiers observed for that document
    pages_seen = {}

    for entry in data:
        # Guard clause: skip records missing either required field.
        if 'subject_page' not in entry or 'env_doc_id' not in entry:
            continue
        pages_seen.setdefault(entry['env_doc_id'], set()).add(entry['subject_page'])

    return {doc_id: len(page_set) for doc_id, page_set in pages_seen.items()}