In [15]:
import json
import os
import pickle as pkl
import urllib
import urllib.parse
from collections import defaultdict

import numpy as np
import pandas as pd

# Input/output locations (relative paths)
data_file = 'data/wiki_data.pkl'  # pickled visit data, loaded through read_data
links_file = 'data/links.tsv'  # tab-separated hyperlink list (source, target)
out_dir = 'data/daily_visitors'  # same directory the per-day JSON files are written to

First, read the visit data.

In [28]:
def read_data(file, n_days=365):
    """Load the pickled visit data and expand the traffic series into daily columns.

    The pickle is expected to hold a DataFrame with a 'categories' column
    (dropped) and a 'traffic' column whose entries are sequences of
    ``(visits, timestamp)`` pairs, the timestamp being a string such as
    ``'2018-01-31T00:00:00'``.

    Parameters
    ----------
    file : str or path-like
        Path to the pickle file.
    n_days : int, default 365
        Number of entries a traffic series must have for the article to be
        kept; also the number of day columns produced.

    Returns
    -------
    pandas.DataFrame
        The remaining columns of the input plus one 'DD/MM' column per day
        holding the visit count. The 'traffic' column is removed. If no
        article has a complete series, the (empty) frame is returned without
        any day columns.
    """
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open(file, 'rb') as fh:
        data = pkl.load(fh)

    data = data.drop('categories', axis=1)

    # Remove data for which there are not enough days
    has_all_days = data['traffic'].apply(len) == n_days
    data = data[has_all_days]
    traffic = data['traffic']

    if traffic.empty:
        # Nothing to expand: there is no reference series to read the dates from
        return data.drop('traffic', axis=1)

    # The first remaining article supplies the dates used as column labels
    reference = traffic.iloc[0]

    # Separate data by days
    day_series = {}
    for i in range(n_days):
        date_parts = reference[i][1].split('-')
        month = date_parts[1]
        day_of_month = date_parts[2].split('T')[0]
        # Bind i at definition time so the lambda never sees a later value
        day_series[f'{day_of_month}/{month}'] = traffic.apply(
            lambda series, i=i: series[i][0]
        )

    # Add visit by day columns to the dataframe
    data = pd.concat([data, pd.DataFrame(day_series)], axis=1)
    return data.drop('traffic', axis=1)


# Load the visit data (one row per article, one column per day) and preview it
visit_data = read_data(data_file)
display(visit_data.head())

Unnamed: 0,article,01/01,02/01,03/01,04/01,05/01,06/01,07/01,08/01,09/01,...,22/12,23/12,24/12,25/12,26/12,27/12,28/12,29/12,30/12,31/12
0,Áedán mac Gabráin,35,42,63,46,70,54,43,48,44,...,28,38,26,43,54,44,36,45,29,84
1,Åland,136,184,162,163,209,168,131,157,203,...,110,95,95,109,123,112,141,135,122,99
2,Édouard Manet,703,895,951,1081,1047,977,984,1121,1221,...,553,643,638,745,793,773,763,725,737,630
3,Éire,320,368,396,378,414,438,354,394,405,...,364,289,326,372,360,389,389,368,346,390
4,Óengus I of the Picts,28,32,22,18,33,18,22,32,25,...,21,26,12,34,47,26,24,23,35,28


Now, we can find the links between the articles.

In [29]:
# Read the hyperlinks dataset: tab-separated (source, target) pairs with no
# header row; lines starting with '#' are treated as comments.
links = pd.read_csv(links_file,
                    sep='\t',
                    encoding='utf-8',
                    engine='python',
                    header=None,
                    comment='#',
                    names=['source', 'target'])

# Decode percent-escapes in the article names
for column in ('source', 'target'):
    links[column] = links[column].map(urllib.parse.unquote)

display(len(links))

119882

In [30]:
# Inspect the outgoing links of a single article
links.loc[links['source'].eq('Darth_Vader')]

Unnamed: 0,source,target
29326,Darth_Vader,Binoculars
29327,Darth_Vader,Clone_Wars_(Star_Wars)
29328,Darth_Vader,Darth_Vader
29329,Darth_Vader,Dutch_language
29330,Darth_Vader,Frankenstein
29331,Darth_Vader,German_language
29332,Darth_Vader,Japan
29333,Darth_Vader,King_Arthur
29334,Darth_Vader,Natalie_Portman
29335,Darth_Vader,Obi-Wan_Kenobi


We can now start generating the data. We first generate the edges, which are common to all the data files.

In [36]:
# Generate mapping between article name and its ID ('n<row position>') for fast
# retrieval. Iterating the column directly avoids building a row Series per
# article, which is what `.iloc[i]` does.
article_dict = {
    article: f'n{position}'
    for position, article in enumerate(visit_data['article'])
}

# Keep only the links whose two endpoints both have visit data. Link names use
# underscores where the visit data uses spaces, so normalise before the lookup.
edges = []
for source, target in zip(links['source'], links['target']):
    source = source.replace('_', ' ')
    target = target.replace('_', ' ')
    if source in article_dict and target in article_dict:
        edges.append({
            'id': f'e{len(edges)}',  # consecutive IDs over the kept edges only
            'source': article_dict[source],
            'target': article_dict[target],
            'type': 'arrow'
        })

Now we generate the node data.

In [86]:
# Labels and IDs do not depend on the day, so resolve them once
node_labels = visit_data['article'].tolist()
node_ids = [article_dict[label] for label in node_labels]

# One list of nodes per day column (every column after the first), in article
# order. Reading a whole column at a time avoids a per-cell `.iloc` lookup.
daily_nodes = defaultdict(list)
for day in visit_data.columns[1:]:
    day_visits = visit_data[day].to_numpy()
    for node_id, label, visits in zip(node_ids, node_labels, day_visits):
        daily_nodes[day].append({
            'id': node_id,
            'label': label,
            'size': visits
        })

In [37]:
# Save the edge and the node data in pickle files. The context managers make
# sure each file is flushed and closed once it has been written.
with open('data/edges.pkl', 'wb') as edges_file:
    pkl.dump(edges, edges_file)
with open('data/nodes.pkl', 'wb') as nodes_file:
    pkl.dump(daily_nodes, nodes_file)

In [57]:
# Reload the saved edge and node data.
# NOTE: unpickling can execute arbitrary code; only load files you produced.
with open('data/edges.pkl', 'rb') as edges_file:
    edges = pkl.load(edges_file)
with open('data/nodes.pkl', 'rb') as nodes_file:
    nodes = pkl.load(nodes_file)

In [14]:
# Smallest and largest node size in the '01/01' snapshot
sizes = []
for node in nodes['01/01']:
    sizes.append(node['size'])
smallest, largest = min(sizes), max(sizes)
print(smallest, largest)

0 112460


In [19]:
# Generate GEXF file to load in Gephi in order to generate a layout for the nodes
# (xml.etree.cElementTree was removed in Python 3.9; ElementTree is the
# supported module and exposes the same API.)
import xml.etree.ElementTree as ET

# Initialize XML
# NOTE(review): the viz namespace below is the 1.3draft one while the document
# itself declares 1.2draft / version 1.2 - confirm this mix is intended.
root = ET.Element('gexf', {
    'xmlns': 'http://www.gexf.net/1.2draft',
    'xmlns:viz': 'http://www.gexf.net/1.3draft/viz',
    'version': '1.2',
})

# Add the graph structure
graph = ET.SubElement(root, 'graph', {
    'mode': 'static',
    'defaultedgetype': 'directed',
})

# Add nodes (the '01/01' snapshot supplies the sizes)
xml_nodes = ET.SubElement(graph, 'nodes')
for node in nodes['01/01']:
    xml_node = ET.SubElement(xml_nodes, 'node', {
        'id': node['id'],
        'label': node['label'],
    })
    # Map number of visits to a reasonable node size: 10 * floor(log2(visits)),
    # with visits clamped to at least 2 so the smallest size is 10.
    node_size = int(np.log2(max(2, node['size']))) * 10
    ET.SubElement(xml_node, 'viz:size', {'value': str(node_size)})

# Add edges
xml_edges = ET.SubElement(graph, 'edges')
for edge in edges:
    ET.SubElement(xml_edges, 'edge', {
        'id': edge['id'],
        'source': edge['source'],
        'target': edge['target'],
    })

# Write data to the XML file
tree = ET.ElementTree(root)
tree.write('sample_data.gexf')

In [28]:
# Extract the data from the GEXF file generated by Gephi with the desired layout
tree = ET.parse('graph2.gexf')
root = tree.getroot()


def _local_name(element):
    """Return the element's tag without its '{namespace}' prefix."""
    return element.tag.rsplit('}', 1)[-1]


def _first_child(parent, name):
    """Return the first direct child of `parent` whose local tag is `name`, or None."""
    for child in parent:
        if _local_name(child) == name:
            return child
    return None


# Locate the elements by tag name instead of by position, and fail loudly when
# the file does not have the expected shape (rather than silently reusing
# variables left over from an earlier cell).
graph = _first_child(root, 'graph')
if graph is None:
    raise ValueError("no <graph> element found in 'graph2.gexf'")

nodes_xml = _first_child(graph, 'nodes')
if nodes_xml is None:
    raise ValueError("no <nodes> element found in 'graph2.gexf'")

# Extract the node coordinates
coords = {}
for xml_node in nodes_xml:
    position = _first_child(xml_node, 'position')
    if position is None:
        # Nodes without a position are skipped and get no entry in coords
        continue
    coords[xml_node.attrib['id']] = {
        'x': float(position.attrib['x']),
        'y': float(position.attrib['y'])
    }

In [29]:
DATA_DIR = 'data/daily_visitors'
# exist_ok avoids the check-then-create race of testing for the directory first
os.makedirs(DATA_DIR, exist_ok=True)

# Generate the data files: one JSON file per day, each holding every node with
# its size for that day and its layout coordinates.
for day, day_nodes in nodes.items():
    data = [
        {
            'id': node['id'],
            'label': node['label'],
            'size': int(node['size']),  # cast to a plain int before serialising
            'x': coords[node['id']]['x'],
            'y': coords[node['id']]['y']
        }
        for node in day_nodes
    ]
    # '/' cannot appear in a file name, so 'DD/MM' becomes 'DD_MM'
    file = os.path.join(DATA_DIR, f"data{day.replace('/', '_')}.json")
    with open(file, 'w') as json_file:
        json.dump(data, json_file)