In [3]:
import os
import json
import urllib
import datetime
import numpy as np
import pandas as pd
import pickle as pkl
from collections import defaultdict
import xml.etree.cElementTree as ET

# Pickled article table that is passed to read_data()
data_file = '../data/wiki_data.pkl'
# Tab-separated hyperlinks dataset (presumably source/target article pairs -- confirm against its use)
links_file = '../data/links.tsv'
# Presumably the directory for the generated per-day files -- confirm against its use
out_dir = '../data/daily_visitors'

First, we read the visit data.

In [4]:
def read_data(file):
    """Load the pickled article table and expand its traffic into daily columns.

    Parameters
    ----------
    file : str
        Path to a pickle holding a DataFrame with at least the columns
        'categories' and 'traffic'. Each 'traffic' cell is indexed as a
        sequence of entries where ``entry[0]`` is the visit count and
        ``entry[1]`` is a timestamp string of the form 'YYYY-MM-DDT...'.

    Returns
    -------
    pandas.DataFrame
        The rows whose traffic has exactly 365 entries, without the
        'categories' and 'traffic' columns, plus one 'DD/MM' column per day
        holding that day's visit count. The original index is kept.
    """
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    # The context manager guarantees the handle is closed, even on failure.
    with open(file, 'rb') as handle:
        data = pkl.load(handle)

    data = data.drop('categories', axis=1)
    traffic = data['traffic']

    # Remove data for which there are not enough days
    day_count = traffic.apply(len)
    traffic = traffic[day_count == 365]

    # Separate data by days. The day labels are taken from the first remaining
    # row, so every row is assumed to cover the same dates in the same order.
    day_series = {}
    for i in range(365):
        day = traffic.iloc[0][i][1].split('-')
        day[2] = day[2].split('T')[0]
        day = f'{day[2]}/{day[1]}'
        day_series[day] = traffic.apply(lambda x: x[i][0])

    # Add visit by day columns to the dataframe
    data = data[day_count == 365]
    data = pd.concat([data, pd.DataFrame(day_series)], axis=1)
    data = data.drop('traffic', axis=1)
    return data


# Build the per-article daily visit table and preview its first rows
visit_data = read_data(data_file)
visit_preview = visit_data.head()
display(visit_preview)

Unnamed: 0,article,01/01,01/02,01/03,01/04,01/05,01/06,01/07,01/08,01/09,...,30/10,30/11,30/12,31/01,31/03,31/05,31/07,31/08,31/10,31/12
0,Áedán mac Gabráin,35,35,62,59,70,63,42,47,44,...,67,34,29,51,44,45,45,35,27,84
1,Åland,136,141,192,120,135,153,124,165,116,...,135,131,122,164,124,143,111,102,136,99
2,Édouard Manet,703,1201,1306,848,1385,823,672,768,659,...,1012,952,737,1281,1106,1030,771,756,962,630
3,Éire,320,430,471,396,433,419,386,405,444,...,407,459,346,410,454,407,375,415,400,390
4,Óengus I of the Picts,28,16,24,19,21,8,17,23,20,...,14,75,35,31,18,27,27,19,26,28


Now, we can find the links between the articles.

In [29]:
# Read hyperlinks dataset
# `import urllib` alone does not guarantee that the `parse` submodule is
# loaded, so import it explicitly before calling urllib.parse.unquote.
import urllib.parse

# Use the path declared in the configuration cell instead of a second,
# diverging literal.
links = pd.read_csv(links_file,
                    sep='\t',
                    encoding='utf-8',
                    engine='python',
                    header=None,
                    comment='#',
                    names=['source', 'target'])

# Article names are percent-encoded in the file; decode both endpoints
links['source'] = links['source'].map(urllib.parse.unquote)
links['target'] = links['target'].map(urllib.parse.unquote)

display(len(links))

119882

In [30]:
# Inspect the outgoing links of a single article as a sanity check
is_darth_vader = links['source'] == 'Darth_Vader'
links[is_darth_vader]

Unnamed: 0,source,target
29326,Darth_Vader,Binoculars
29327,Darth_Vader,Clone_Wars_(Star_Wars)
29328,Darth_Vader,Darth_Vader
29329,Darth_Vader,Dutch_language
29330,Darth_Vader,Frankenstein
29331,Darth_Vader,German_language
29332,Darth_Vader,Japan
29333,Darth_Vader,King_Arthur
29334,Darth_Vader,Natalie_Portman
29335,Darth_Vader,Obi-Wan_Kenobi


We can now start generating the data. We first generate the edges, which are common to all the data files.

In [36]:
# Generate mapping between article name and its ID for fast retrieval.
# The ID encodes the article's row position in visit_data ('n0', 'n1', ...).
# Iterating the column directly avoids building a row Series per article.
article_dict = {
    article: f'n{position}'
    for position, article in enumerate(visit_data['article'])
}

# Keep only the links whose two endpoints are both known articles.
# Link names use underscores where article names use spaces.
edges = []
for source, target in zip(links['source'], links['target']):
    source = source.replace('_', ' ')
    target = target.replace('_', ' ')
    if source in article_dict and target in article_dict:
        edges.append({
            # Edge IDs are consecutive over the kept edges: 'e0', 'e1', ...
            'id': f'e{len(edges)}',
            'source': article_dict[source],
            'target': article_dict[target],
            'type': 'arrow'
        })

Now we generate the node data.

In [86]:
# Build, for every day column, the list of nodes (one per article) sized by
# that day's number of visits. Article names and IDs do not depend on the day,
# so they are resolved once instead of re-reading each row for every day.
article_names = visit_data['article'].tolist()
node_ids = [article_dict[name] for name in article_names]

daily_nodes = defaultdict(list)
for day in visit_data.columns[1:]:
    # `.values` yields the same NumPy scalars that `.iloc[i]` returned
    day_visits = visit_data[day].values
    for node_id, label, visits in zip(node_ids, article_names, day_visits):
        daily_nodes[day].append({
            'id': node_id,
            'label': label,
            'size': visits
        })

In [37]:
# Save the edge and the node data in pickle files.
# Context managers make sure each file is flushed and closed once written.
with open('../data/edges.pkl', 'wb') as edges_out:
    pkl.dump(edges, edges_out)
with open('../data/nodes.pkl', 'wb') as nodes_out:
    pkl.dump(daily_nodes, nodes_out)

In [6]:
# Reload the edge and node data saved earlier.
# NOTE: unpickling can execute arbitrary code; only load trusted files.
with open('../data/edges.pkl', 'rb') as edges_in:
    edges = pkl.load(edges_in)
with open('../data/nodes.pkl', 'rb') as nodes_in:
    nodes = pkl.load(nodes_in)

In [7]:
# Range of the visit counts in the '01/01' column (labels are DD/MM)
sizes = []
for entry in nodes['01/01']:
    sizes.append(entry['size'])
print(min(sizes), max(sizes))

0 112460


We need a way to arrange the nodes, so we will use Gephi to distribute the nodes with a force-directed layout. We first generate a file to load in Gephi and apply the layout.

In [19]:
# Build a GEXF document describing the graph so that Gephi can compute a
# layout for the nodes.

# Root element with the GEXF namespaces and version
root = ET.Element('gexf', {
    'xmlns': 'http://www.gexf.net/1.2draft',
    'xmlns:viz': 'http://www.gexf.net/1.3draft/viz',
    'version': '1.2',
})

# Static, directed graph container
graph = ET.SubElement(root, 'graph', {
    'mode': 'static',
    'defaultedgetype': 'directed',
})

# One <node> per article, sized from its '01/01' visit count
xml_nodes = ET.SubElement(graph, 'nodes')
for node in nodes['01/01']:
    xml_node = ET.SubElement(xml_nodes, 'node', {
        'id': node['id'],
        'label': node['label'],
    })
    # Map the number of visits to a reasonable node size: log scale, with
    # counts below 2 clamped to 2 so the logarithm is always at least 1
    visits = max(2, node['size'])
    node_size = int(np.log2(visits)) * 10
    ET.SubElement(xml_node, 'viz:size', {'value': str(node_size)})

# One <edge> per hyperlink between known articles
xml_edges = ET.SubElement(graph, 'edges')
for edge in edges:
    ET.SubElement(xml_edges, 'edge', {
        'id': edge['id'],
        'source': edge['source'],
        'target': edge['target'],
    })

# Serialise the document to disk
tree = ET.ElementTree(root)
tree.write('sample_data.gexf')

Having generated the node coordinates, we now parse the Gephi file and associate the coordinates with our node data.

In [11]:
# Extract the data from the GEXF file generated by Gephi with the desired layout
def _local_name(element):
    """Return the element's tag without its '{namespace}' prefix."""
    return element.tag.rsplit('}', 1)[-1]


def _first_child(parent, name):
    """Return the first direct child of `parent` whose local tag is `name`.

    Raises ValueError when there is no such child, so that a stale variable
    from an earlier cell can never be picked up silently.
    """
    for child in parent:
        if _local_name(child) == name:
            return child
    raise ValueError(f'<{name}> not found under <{_local_name(parent)}>')


tree = ET.parse('graph.gexf')
root = tree.getroot()

# Locate the elements by tag name rather than by their position among their
# siblings, which depends on what else the exported file contains.
graph = _first_child(root, 'graph')
nodes_xml = _first_child(graph, 'nodes')

# Extract the node coordinates from each node's position element.
# Nodes without a position element are skipped, as before.
coords = {}
for node_xml in nodes_xml:
    for detail in node_xml:
        if _local_name(detail) == 'position':
            coords[node_xml.attrib['id']] = {
                'x': float(detail.attrib['x']),
                'y': float(detail.attrib['y'])
            }
            break

In [12]:
# Spot-check the coordinates parsed for one node
sample_node_id = "n1090"
coords[sample_node_id]

{'x': -575.2387, 'y': 789.6351}

In [13]:
# Save the coordinates in a file: one record per node with its ID, label and
# layout position. The '01/01' list is used only as the source of IDs and labels.
json_nodes = [
    {
        'id': node['id'],
        'label': node['label'],
        'x': coords[node['id']]['x'],
        'y': coords[node['id']]['y']
    }
    for node in nodes['01/01']
]
# The context manager makes sure the file is flushed and closed once written
with open('../data/nodes.json', 'w') as nodes_out:
    json.dump(json_nodes, nodes_out)

So far, we have only inspected the overall popularity of the articles on each day. It would be more interesting to see, for each day, which articles experienced a significant change in the number of visitors. We begin by computing two quantities for each article: the change in the number of visitors from one day to the next, and the difference between each day's visits and the article's average number of visits.

In [14]:
# Copy the visit_data dataframe
visit_daily_change = visit_data.copy()
visit_average_change = visit_data.copy()

# Compute the change in the number of visitors from the average.
# Only the day columns are averaged: including the text 'article' column in
# mean() raises a TypeError on pandas >= 2.0. The number of day columns is
# read from the frame instead of being hard-coded.
day_visits = visit_data.iloc[:, 1:]
average_visits = np.repeat(
    day_visits.mean(axis=1).astype(int).values.reshape(-1, 1),
    day_visits.shape[1],
    axis=1
)
visit_average_change.iloc[:, 1:] -= average_visits

# Compute the everyday change. Column labels are 'DD/MM' and are interpreted
# as dates in 2017. The first day of the year has no previous day in the data,
# so its column keeps the absolute number of visits.
cols = visit_data.columns[1:]
for col in cols:
    day, month = col.split('/')
    previous_date = datetime.datetime(2017, int(month), int(day))
    previous_date -= datetime.timedelta(days=1)
    if previous_date.year == 2017:
        prev_col = previous_date.strftime('%d/%m')
        # Always subtract from the untouched visit_data so that the order in
        # which the columns are processed does not matter
        visit_daily_change[col] -= visit_data[prev_col]

In [15]:
# Preview the first rows of the absolute, day-to-day and
# relative-to-average visit tables, in that order
for frame in (visit_data, visit_daily_change, visit_average_change):
    display(frame.head())

Unnamed: 0,article,01/01,01/02,01/03,01/04,01/05,01/06,01/07,01/08,01/09,...,30/10,30/11,30/12,31/01,31/03,31/05,31/07,31/08,31/10,31/12
0,Áedán mac Gabráin,35,35,62,59,70,63,42,47,44,...,67,34,29,51,44,45,45,35,27,84
1,Åland,136,141,192,120,135,153,124,165,116,...,135,131,122,164,124,143,111,102,136,99
2,Édouard Manet,703,1201,1306,848,1385,823,672,768,659,...,1012,952,737,1281,1106,1030,771,756,962,630
3,Éire,320,430,471,396,433,419,386,405,444,...,407,459,346,410,454,407,375,415,400,390
4,Óengus I of the Picts,28,16,24,19,21,8,17,23,20,...,14,75,35,31,18,27,27,19,26,28


Unnamed: 0,article,01/01,01/02,01/03,01/04,01/05,01/06,01/07,01/08,01/09,...,30/10,30/11,30/12,31/01,31/03,31/05,31/07,31/08,31/10,31/12
0,Áedán mac Gabráin,35,-16,5,15,37,18,0,2,9,...,36,-30,-16,0,-25,-12,5,-29,-40,55
1,Åland,136,-23,45,-4,-1,10,6,54,14,...,1,24,-13,4,-8,1,-13,15,1,-23
2,Édouard Manet,703,-80,-110,-258,247,-207,-41,-3,-97,...,185,-41,12,-16,-71,48,79,16,-50,-107
3,Éire,320,20,5,-58,101,12,-68,30,29,...,25,-7,-22,-40,2,1,23,-43,-7,44
4,Óengus I of the Picts,28,-15,-6,1,2,-19,-14,-4,1,...,3,52,12,8,-5,-27,-2,-10,12,-7


Unnamed: 0,article,01/01,01/02,01/03,01/04,01/05,01/06,01/07,01/08,01/09,...,30/10,30/11,30/12,31/01,31/03,31/05,31/07,31/08,31/10,31/12
0,Áedán mac Gabráin,-8,-8,19,16,27,20,-1,4,1,...,24,-9,-14,8,1,2,2,-8,-16,41
1,Åland,-3,2,53,-19,-4,14,-15,26,-23,...,-4,-8,-17,25,-15,4,-28,-37,-3,-40
2,Édouard Manet,-314,184,289,-169,368,-194,-345,-249,-358,...,-5,-65,-280,264,89,13,-246,-261,-55,-387
3,Éire,-121,-11,30,-45,-8,-22,-55,-36,3,...,-34,18,-95,-31,13,-34,-66,-26,-41,-51
4,Óengus I of the Picts,6,-6,2,-3,-1,-14,-5,1,-2,...,-8,53,13,9,-4,5,5,-3,4,6


Finally, we can generate the files we need for each of the days of the year.

In [16]:
# NOTE(review): the configuration cell declares out_dir = '../data/daily_visitors',
# but this cell writes to 'data/daily_visitors' -- confirm which one is intended.
DATA_DIR = 'data/daily_visitors'
# exist_ok avoids the check-then-create race of a separate exists() test
os.makedirs(DATA_DIR, exist_ok=True)

# Generate the data files: one JSON file per day, holding for every node its
# absolute number of visits, its change from the previous day and its
# difference from the article's average.
for key in nodes:
    daily_change = visit_daily_change[key]
    average_change = visit_average_change[key]

    data = []
    for x in nodes[key]:
        # Node IDs look like 'n<row>'; the number is the article's row position
        row = int(x['id'][1:])
        data.append({
            'id': x['id'],
            # int() converts NumPy integers, which json cannot serialise
            'absolute_size': int(x['size']),
            'daily_change_size': int(daily_change.iloc[row]),
            'average_change_size': int(average_change.iloc[row])
        })

    # 'DD/MM' is not a valid file name, so the slash becomes an underscore
    file_key = key.replace('/', '_')
    file = os.path.join(DATA_DIR, f'data{file_key}.json')
    # The context manager makes sure each file is flushed and closed
    with open(file, 'w') as day_out:
        json.dump(data, day_out)