In [114]:
import ast
import json
import os
import pickle as pkl
import urllib
import urllib.parse
from collections import defaultdict

import numpy as np
import pandas as pd

# Pickle file passed to read_data() below
data_file = 'data/wiki_data.pkl'
# Tab-separated hyperlink list (source, target); same path the links cell reads
links_file = 'data/links.tsv'
# NOTE(review): presumably the output directory for per-day files; it is not
# referenced in the visible cells, so confirm it is still needed
out_dir = 'data/daily_visitors'

First, read the visit data.

In [28]:
def read_data(file, n_days=365):
    """Load the pickled visit data and expand it into one column per day.

    Parameters
    ----------
    file : str or path-like
        Pickle file holding a DataFrame with a 'categories' column and a
        'traffic' column. Each 'traffic' cell is a sequence of
        ``(visits, timestamp)`` pairs whose timestamp looks like
        ``'YYYY-MM-DDT...'``.
    n_days : int, default 365
        Articles whose traffic sequence does not contain exactly this many
        entries are dropped.

    Returns
    -------
    pandas.DataFrame
        The input frame without 'categories' and 'traffic', restricted to the
        articles with a complete series, plus one 'DD/MM' column per day
        holding the visit counts.
    """
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open(file, 'rb') as f:
        data = pkl.load(f)

    data = data.drop('categories', axis=1)

    # Remove data for which there are not enough days
    complete = data['traffic'].apply(len) == n_days
    data = data[complete]
    traffic = data['traffic']

    # Separate data by days
    day_series = {}
    if len(traffic) > 0:
        # Column labels come from the first article's timestamps; this assumes
        # every article covers the same dates in the same order.
        reference = traffic.iloc[0]
        for i in range(n_days):
            parts = reference[i][1].split('-')
            label = f"{parts[2].split('T')[0]}/{parts[1]}"
            day_series[label] = traffic.apply(lambda x, i=i: x[i][0])

    # Add visit by day columns to the dataframe
    data = pd.concat([data, pd.DataFrame(day_series)], axis=1)
    data = data.drop('traffic', axis=1)
    return data


# Build the per-article frame with one 'DD/MM' column per day
visit_data = read_data(data_file)
# Preview the first rows only
display(visit_data.head())

Unnamed: 0,article,01/01,02/01,03/01,04/01,05/01,06/01,07/01,08/01,09/01,...,22/12,23/12,24/12,25/12,26/12,27/12,28/12,29/12,30/12,31/12
0,Áedán mac Gabráin,35,42,63,46,70,54,43,48,44,...,28,38,26,43,54,44,36,45,29,84
1,Åland,136,184,162,163,209,168,131,157,203,...,110,95,95,109,123,112,141,135,122,99
2,Édouard Manet,703,895,951,1081,1047,977,984,1121,1221,...,553,643,638,745,793,773,763,725,737,630
3,Éire,320,368,396,378,414,438,354,394,405,...,364,289,326,372,360,389,389,368,346,390
4,Óengus I of the Picts,28,32,22,18,33,18,22,32,25,...,21,26,12,34,47,26,24,23,35,28


Now, we can find the links between the articles.

In [29]:
# Read hyperlinks dataset: two tab-separated columns without a header row;
# anything after a '#' on a line is ignored as a comment.
links = pd.read_csv(links_file,
                    sep='\t',
                    encoding='utf-8',
                    engine='python',
                    header=None,
                    comment='#',
                    names=['source', 'target'])

# Decode percent-escapes (e.g. '%C3%89' -> 'É') in both article-name columns
for column in ('source', 'target'):
    links[column] = links[column].map(urllib.parse.unquote)

# Number of links read
display(len(links))

119882

In [30]:
# Show every link whose source article is 'Darth_Vader'
links.loc[links['source'].eq('Darth_Vader')]

Unnamed: 0,source,target
29326,Darth_Vader,Binoculars
29327,Darth_Vader,Clone_Wars_(Star_Wars)
29328,Darth_Vader,Darth_Vader
29329,Darth_Vader,Dutch_language
29330,Darth_Vader,Frankenstein
29331,Darth_Vader,German_language
29332,Darth_Vader,Japan
29333,Darth_Vader,King_Arthur
29334,Darth_Vader,Natalie_Portman
29335,Darth_Vader,Obi-Wan_Kenobi


We can now start generating the data. We first generate the edges, which are shared by all the data files.

In [36]:
# Generate mapping between article name and its ID for fast retrieval.
# The ID is 'n' followed by the article's row position in visit_data.
# Iterating the column directly avoids building a Series per row with .iloc[i].
article_dict = {
    article: f'n{position}'
    for position, article in enumerate(visit_data['article'])
}

# Keep only links whose two endpoints are known articles. Link names use
# underscores where article names use spaces, so normalise before the lookup.
edges = []
for raw_source, raw_target in zip(links['source'], links['target']):
    source = raw_source.replace('_', ' ')
    target = raw_target.replace('_', ' ')
    if source in article_dict and target in article_dict:
        edges.append({
            'id': f'e{len(edges)}',  # consecutive ids: e0, e1, ...
            'source': article_dict[source],
            'target': article_dict[target],
            'type': 'arrow'
        })

Now we generate the node data.

In [86]:
# Build, for every day column, the list of nodes (one per article) in row order.
# Labels and ids are resolved once instead of re-slicing the frame with
# .iloc[i] for every (article, day) pair.
daily_nodes = defaultdict(list)
node_labels = visit_data['article'].tolist()
node_ids = [article_dict[label] for label in node_labels]
for day in visit_data.columns[1:]:
    # to_numpy() keeps the sizes as numpy scalars, exactly as .iloc[i] returned
    day_sizes = visit_data[day].to_numpy()
    daily_nodes[day].extend(
        {'id': node_id, 'label': label, 'size': size}
        for node_id, label, size in zip(node_ids, node_labels, day_sizes)
    )

In [37]:
# Save the edge and the node data in pickle files.
# Context managers make sure both files are flushed and closed.
with open('data/edges.pkl', 'wb') as f:
    pkl.dump(edges, f)
with open('data/nodes.pkl', 'wb') as f:
    pkl.dump(daily_nodes, f)

In [2]:
# Reload the saved edge and node data.
# NOTE: unpickling can execute arbitrary code; only load files written by this notebook.
with open('data/edges.pkl', 'rb') as f:
    edges = pkl.load(f)
with open('data/nodes.pkl', 'rb') as f:
    nodes = pkl.load(f)

FileNotFoundError: [Errno 2] No such file or directory: 'data/edges.pkl'

In [40]:
# Bundle the nodes of 01/01 with the shared edge list.
# Each size is cast to a built-in int (presumably so the result can be dumped to JSON).
first_day_nodes = [
    {
        'id': node['id'],
        'label': node['label'],
        'size': int(node['size'])
    }
    for node in nodes['01/01']
]
sample_data = {'nodes': first_day_nodes, 'edges': edges}

# Export left disabled:
# json.dump(sample_data, open('sample_data.json', 'w'))

116230

Group data by month.

In [2]:
# Group data by month
data_dir = 'data'
files = sorted(os.listdir(data_dir))
monthly_data = defaultdict(list)

# Read data from each file and distribute it accordingly.
# Only names that start with 'data' and contain an underscore are considered;
# the integer between the first underscore and the first dot is the month.
for file in files:
    if not file.startswith('data') or '_' not in file:
        continue
    month = int(file.split('.')[0].split('_')[1])
    # Files are appended in sorted-name order; close each one after reading
    with open(os.path.join(data_dir, file), 'r') as f:
        monthly_data[month].append(json.load(f))

In [3]:
# For each month find the maximum size for each id
for i in range(1, 13):
    month_data = monthly_data[i]

    # January's first file is left out of the daily-change ranking (presumably
    # because 1 January has no previous-day change - confirm), so its reported
    # day numbers are shifted by one extra position.
    if i == 1:
        daily_source, daily_diff = month_data[1:], 2
    else:
        daily_source, daily_diff = month_data, 1

    max_month_data = []
    # Entry j of every file is taken to describe the same article
    for j in range(len(month_data[0])):
        absolute = [d[j]['absolute_size'] for d in month_data]
        average = [d[j]['average_change_size'] for d in month_data]
        daily = [d[j]['daily_change_size'] for d in daily_source]

        # argmax is 0-based over the files of the month; the offset turns it
        # into a day number (assumes one file per day, in day order)
        max_absolute, max_absolute_day = int(np.max(absolute)), int(np.argmax(absolute)) + 1
        max_daily, max_daily_day = int(np.max(daily)), int(np.argmax(daily)) + daily_diff
        max_average, max_average_day = int(np.max(average)), int(np.argmax(average)) + 1

        max_month_data.append({
            'id': month_data[0][j]['id'],
            'absolute_size': max_absolute,
            'absolute_day': max_absolute_day,
            'daily_change_size': max_daily,
            'daily_change_day': max_daily_day,
            'average_change_size': max_average,
            'average_change_day': max_average_day,
            'all_visits': absolute
        })

    # Write data to file; the context manager guarantees it is flushed and closed
    with open(os.path.join(data_dir, f'data{i}.json'), 'w') as f:
        json.dump(max_month_data, f)

In [6]:
# Map ids to labels
with open(os.path.join(data_dir, 'nodes.json'), 'r') as f:
    nodes = json.load(f)
id_to_label = {node['id']: node['label'] for node in nodes}

# Select the most interesting articles for each month: the NO_ARTICLES entries
# with the largest daily change (ties fall back to label, then id, then day,
# because whole rows are compared).
NO_ARTICLES = 30
articles = {}
for i in range(1, 13):
    with open(os.path.join(data_dir, f'data{i}.json'), 'r') as f:
        json_data = json.load(f)
    # Row layout: [daily_change_size, label, id, daily_change_day]
    data = [[d['daily_change_size'], id_to_label[d['id']], d['id'], d['daily_change_day']] for d in json_data]
    data = sorted(data, reverse=True)
    articles[i] = data[:NO_ARTICLES]

In [7]:
# Save the articles: one '<label> <day>/<month>' line per selected article
article_lines = [
    f'{art[1]} {art[3]}/{month}\n'
    for month in articles
    for art in articles[month]
]
news_data = ''.join(article_lines)

with open('articles.txt', 'w') as f:
    f.write(news_data)

Compute the visit counts of each category for each day.

In [81]:
data_dir = 'data'
categories = os.path.join(data_dir, 'article_subject.csv')

# Read categories file in a dataframe, discarding the 'Unnamed: 0' column
categories_df = pd.read_csv(categories).drop(columns='Unnamed: 0')

# Normalise the names: decode percent-escapes, then turn '__' into ': '
# and every remaining '_' into a space
categories_df['name'] = (
    categories_df['name']
    .map(urllib.parse.unquote)
    .map(lambda title: title.replace('__', ': ').replace('_', ' '))
)

In [87]:
# Filter dataframe elements: keep only the nodes whose label has a categories row
with open(os.path.join(data_dir, 'nodes.json'), 'r') as f:
    nodes_data = json.load(f)

# Index the categories by article name once instead of scanning the whole
# dataframe twice per node. setdefault keeps the first row for a repeated
# name, matching the previous `.iloc[0]` selection.
first_categories = {}
for cat_name, cat_value in zip(categories_df['name'], categories_df['categories']):
    first_categories.setdefault(cat_name, cat_value)

# label -> [node id, categories value]
nodes_dict = {}
for node in nodes_data:
    name = node['label']
    if name in first_categories:
        nodes_dict[name] = [node['id'], first_categories[name]]

In [117]:
# Count for each category the number of visits per day.
#
# Categories are looked up by node id. The previous two-pointer walk advanced
# its key index *before* reading the categories, so every article's visits
# were credited to the next article's categories, and its `- 1` loop bounds
# skipped the last entry. A direct lookup also removes the requirement that
# the files and nodes_dict list articles in the same order.

# Parse each article's category list once (stored as a Python-literal string)
categories_by_id = {
    node_id: ast.literal_eval(categories_repr)
    for node_id, categories_repr in nodes_dict.values()
}

day_visits = {}
for file in os.listdir(data_dir):
    # Same filter as the monthly grouping cell: daily files start with 'data'
    # and contain an underscore. A substring test would also accept unrelated
    # files such as 'wiki_data.pkl'.
    if not file.startswith('data') or '_' not in file:
        continue
    day = file[4:9]  # the five characters following the 'data' prefix
    with open(os.path.join(data_dir, file), 'r') as f:
        data = json.load(f)
    day_dict = defaultdict(int)
    for entry in data:
        cat_list = categories_by_id.get(entry['id'])
        if cat_list is None:
            # Article without category information
            continue
        visits = entry['absolute_size']
        for category in cat_list:
            day_dict[category] += visits
    day_visits[day] = day_dict

In [124]:
# Replace every per-day defaultdict with a plain dict
day_visits = {day: dict(counts) for day, counts in day_visits.items()}

# Write data to json file
categories_path = os.path.join(data_dir, 'categories.json')
with open(categories_path, 'w') as f:
    json.dump(day_visits, f)