In [69]:
import os
import pickle as pkl
import urllib
import urllib.parse
from collections import defaultdict

import pandas as pd

# Paths used by the notebook, relative to the working directory.
# Pickled article table that is passed to read_data().
data_file = 'data/wiki_data.pkl'
# Tab-separated hyperlink list (source article, target article).
links_file = 'data/links.tsv'
# NOTE(review): presumably the destination for per-day output files; it is not
# referenced in the cells shown here -- confirm before removing.
out_dir = 'data/daily_visitors'

First, we read the visit data.

In [57]:
def read_data(file, n_days=365):
    """Load the pickled article table and expand daily traffic into columns.

    The pickle must contain a DataFrame with at least the columns
    ``categories`` and ``traffic``. Each ``traffic`` cell is indexed as a
    sequence of ``[visits, timestamp]`` entries, where ``timestamp`` is a
    string laid out like ``YYYY-MM-DDT...``.

    Parameters
    ----------
    file : str or path-like
        Path to the pickle file.
    n_days : int, default 365
        Number of daily entries an article must have to be kept. Articles
        whose traffic series has a different length are dropped.

    Returns
    -------
    pandas.DataFrame
        The input table without ``categories`` and ``traffic``, restricted to
        articles with exactly ``n_days`` entries, plus one ``'DD/MM'`` column
        per day holding the visit counts. If no article qualifies, the
        (empty) table is returned without any day columns.
    """
    # NOTE(security): unpickling can execute arbitrary code -- only load files
    # from a trusted source.
    with open(file, 'rb') as handle:
        data = pkl.load(handle)

    data = data.drop('categories', axis=1)
    traffic = data['traffic']

    # Remove data for which there are not enough days
    has_all_days = traffic.apply(len) == n_days
    traffic = traffic[has_all_days]
    data = data[has_all_days].drop('traffic', axis=1)

    # Nothing left to derive day labels from: return the empty table instead
    # of failing on traffic.iloc[0].
    if traffic.empty:
        return data

    # Separate data by days. Labels are taken from the first article's
    # timestamps; every kept article has the same number of entries.
    reference = traffic.iloc[0]
    day_series = {}
    for i in range(n_days):
        date_parts = reference[i][1].split('-')
        day_of_month = date_parts[2].split('T')[0]
        label = f'{day_of_month}/{date_parts[1]}'
        # Bind i explicitly so the lambda never depends on the loop variable.
        day_series[label] = traffic.apply(lambda x, i=i: x[i][0])

    # Add visit by day columns to the dataframe
    return pd.concat([data, pd.DataFrame(day_series)], axis=1)


# Load the visit table (one row per article, one column per day) and preview
# its first rows.
visit_data = read_data(data_file)
display(visit_data.head())

Unnamed: 0,article,01/01,01/02,01/03,01/04,01/05,01/06,01/07,01/08,01/09,...,30/10,30/11,30/12,31/01,31/03,31/05,31/07,31/08,31/10,31/12
0,Áedán mac Gabráin,35,35,62,59,70,63,42,47,44,...,67,34,29,51,44,45,45,35,27,84
1,Åland,136,141,192,120,135,153,124,165,116,...,135,131,122,164,124,143,111,102,136,99
2,Édouard Manet,703,1201,1306,848,1385,823,672,768,659,...,1012,952,737,1281,1106,1030,771,756,962,630
3,Éire,320,430,471,396,433,419,386,405,444,...,407,459,346,410,454,407,375,415,400,390
4,Óengus I of the Picts,28,16,24,19,21,8,17,23,20,...,14,75,35,31,18,27,27,19,26,28


Now, we can find the links between the articles.

In [62]:
# Read hyperlinks dataset: a headerless, tab-separated list of
# (source, target) pairs; anything after a '#' is ignored as a comment.
# The path comes from the `links_file` setting defined in the configuration
# cell instead of a duplicated literal.
links = pd.read_csv(links_file,
                    sep='\t',
                    encoding='utf-8',
                    engine='python',
                    header=None,
                    comment='#',
                    names=['source', 'target'])

# Decode %xx URL escapes in the article names.
links['source'] = links['source'].map(urllib.parse.unquote)
links['target'] = links['target'].map(urllib.parse.unquote)

display(links.head())

Unnamed: 0,source,target
0,Áedán_mac_Gabráin,Bede
1,Áedán_mac_Gabráin,Columba
2,Áedán_mac_Gabráin,Dál_Riata
3,Áedán_mac_Gabráin,Great_Britain
4,Áedán_mac_Gabráin,Ireland


We can start generating the data. We will first generate the edges, which are shared by all the data files.

In [66]:
# Generate mapping between article name and its ID ('n<row position>') for
# fast retrieval. Iterating the column directly avoids building a row Series
# for every article.
article_dict = {
    name: f'n{position}'
    for position, name in enumerate(visit_data['article'])
}


def _node_id_for(name):
    """Return the node ID for a link endpoint, or None if it is unknown.

    The previews in this notebook show article names written with spaces
    ('Áedán mac Gabráin') while link endpoints use underscores
    ('Áedán_mac_Gabráin'), so an exact match alone silently drops every edge
    that touches a multi-word article. Try the exact name first, then the
    name with underscores replaced by spaces.
    """
    node_id = article_dict.get(name)
    if node_id is None and isinstance(name, str):
        node_id = article_dict.get(name.replace('_', ' '))
    return node_id


# Keep only links whose both endpoints are known articles; edge IDs are
# assigned consecutively ('e0', 'e1', ...) in file order.
edges = []
cnt = 0
for source, target in zip(links['source'], links['target']):
    source_id = _node_id_for(source)
    target_id = _node_id_for(target)
    if source_id is None or target_id is None:
        continue
    edges.append({
        'id': f'e{cnt}',
        'source': source_id,
        'target': target_id,
        'type': 'arrow'
    })
    cnt += 1

Now we generate the node data.

In [86]:
# Build, for every day column, the list of nodes (one per article) whose
# 'size' is that day's visit count.
daily_nodes = defaultdict(list)

# The ID and label of an article do not depend on the day, so resolve them
# once instead of rebuilding a row Series for every (article, day) pair.
node_labels = list(visit_data['article'])
node_ids = [article_dict[label] for label in node_labels]

# Every column after the first ('article') is treated as a day column.
for day in visit_data.columns[1:]:
    # .values keeps the same scalar types that .iloc[i] would return.
    day_sizes = visit_data[day].values
    for node_id, label, size in zip(node_ids, node_labels, day_sizes):
        daily_nodes[day].append({
            'id': node_id,
            'label': label,
            'size': size
        })

In [91]:
# Save the edge and the node data in pickle files.
# The context managers guarantee each file is flushed and closed as soon as
# the dump finishes, rather than whenever the handle is garbage-collected.
with open('data/edges.pkl', 'wb') as edges_out:
    pkl.dump(edges, edges_out)
with open('data/nodes.pkl', 'wb') as nodes_out:
    pkl.dump(daily_nodes, nodes_out)