In [14]:
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
import datetime

In [15]:
# Import the data from the json file
# Data is in format {},{},{},... and in a text file

# Read the data from the file
with open('../../Dataset/build_dataset.txt') as f:
    data = f.readlines()

# Convert the data to a list of dictionaries
data = [json.loads(x.strip()) for x in data]

In [16]:
def get_readers_by_document(data, doc_uuid):
    all_readers = []
    for record in data:
        if 'subject_doc_id' in record and  record['subject_doc_id'] == doc_uuid:
            all_readers.append(record['visitor_uuid'])
    return all_readers


In [23]:
def get_document_by_readers(data, visitor_uuid):
    all_docs = []
    for record in data:
        if 'visitor_uuid' in record and record['visitor_uuid'] == visitor_uuid and 'subject_doc_id' in record:
            all_docs.append(record['subject_doc_id'])
    return all_docs


In [39]:
def also_likes(data, doc_uuid, sorting_function=None):

    if sorting_function is None:
        sorting_function = lambda x: x[1]

    all_readers = get_readers_by_document(data, doc_uuid)
    unique_readers = set(all_readers)

    liked_documents = {}

    for reader in unique_readers:
            docs = get_document_by_readers(data, reader)
            unique_docs = set(docs)

            for doc in unique_docs:
                if doc != doc_uuid:
                    liked_documents[doc] = liked_documents.get(doc, 0) + 1

    sorted_docs = sorted(liked_documents.items(), key=sorting_function, reverse=True)
    return [doc[0] for doc in sorted_docs[:10]]

In [40]:
def sort_by_readers(item):
    return item[1]

In [41]:
doc_uuid = "100713205147-2ee05a98f1794324952eea5ca678c026"  # Replace with actual document UUID
visitor_uuid = "8fd99d4cbfb9b8d8"  # Replace with actual visitor UUID

# Get the top 10 'also likes' documents
top_liked_docs = also_likes(data, doc_uuid, sort_by_readers)
print(top_liked_docs)

['140218233015-c848da298ed6d38b98e18a85731a83f4', '131202094202-a4ae3185bc84368f14bff266d276eb4b', '131218101426-7fe24377c762b8fe53d21b65fcfa9b25', '131121160738-0db9af9dbc996be9fce9d81638868dde', '130902223509-8fed6b88ae0937c1c43fb30cb9f87ad8', '140227101855-42de650464f91d12c6a5644f999c6287', '140128221921-4c304a111b9e0a3425f419563ad6e29a', '140228142350-9f269f7cc77eed6045f5930e276d280d', '140227005600-2a71e9e5c5780a23f540112bd0038467', '131026155107-acf6375f44f1648870c9a12396618a62']


In [44]:
from graphviz import Digraph

# Assuming the also_likes function and other necessary functions are defined
# Here's an implementation of the graph generation:

def generate_graph(data, doc_uuid):
    # Create a Digraph object
    dot = Digraph(comment='Also Likes Graph')

    # Highlight the input document and visitor
    dot.node(doc_uuid[-4:], doc_uuid[-4:], style='filled', fillcolor='green')
    if visitor_uuid:
        dot.node(visitor_uuid[-4:], visitor_uuid[-4:], style='filled', fillcolor='green')

    # Get the list of "also likes" documents using the also_likes function
    also_likes_docs = also_likes(data, doc_uuid)
    print
    # For each "also likes" document, get the readers and create edges
    for also_likes_doc in also_likes_docs:
        dot.node(also_likes_doc[-4:], also_likes_doc[-4:])
        readers = get_readers_by_document(data, also_likes_doc)
        for reader in readers:
            reader_id = reader[-4:]
            # Add the reader node
            dot.node(reader_id, reader_id)
            # Add an edge from reader to the document
            dot.edge(reader_id, also_likes_doc[-4:])

    # Generate and save the graph
    print(dot.source)  # Optionally print the dot script
    dot.render('also_likes_graph', format='png', cleanup=True)  # Save the graph as a PNG file


# Call the function with the specific document UUID and visitor UUID
generate_graph(data, "100713205147-2ee05a98f1794324952eea5ca678c026")


// Also Likes Graph
digraph {
	c026 [label=c026 fillcolor=green style=filled]
	b8d8 [label=b8d8 fillcolor=green style=filled]
	"83f4" [label="83f4"]
	"05a1" [label="05a1"]
	"05a1" -> "83f4"
	c199 [label=c199]
	c199 -> "83f4"
	"3d35" [label="3d35"]
	"3d35" -> "83f4"
	f2da [label=f2da]
	f2da -> "83f4"
	eb4b [label=eb4b]
	"05a1" [label="05a1"]
	"05a1" -> eb4b
	c199 [label=c199]
	c199 -> eb4b
	a4c5 [label=a4c5]
	a4c5 -> eb4b
	"08a5" [label="08a5"]
	"08a5" -> eb4b
	f2da [label=f2da]
	f2da -> eb4b
	"3d35" [label="3d35"]
	"3d35" -> eb4b
	"9b25" [label="9b25"]
	"05a1" [label="05a1"]
	"05a1" -> "9b25"
	c199 [label=c199]
	c199 -> "9b25"
	a4c5 [label=a4c5]
	a4c5 -> "9b25"
	f2da [label=f2da]
	f2da -> "9b25"
	"8dde" [label="8dde"]
	"3d35" [label="3d35"]
	"3d35" -> "8dde"
	"7ad8" [label="7ad8"]
	"3d35" [label="3d35"]
	"3d35" -> "7ad8"
	3822 [label=3822]
	3822 -> "7ad8"
	3822 [label=3822]
	3822 -> "7ad8"
	"3d35" [label="3d35"]
	"3d35" -> "7ad8"
	"3d35" [label="3d35"]
	"3d35" -> "7ad8"
	"3d35" [label=