In [None]:
# Reload imported modules automatically before each cell execution, so edits
# to the project package are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Turn off the jedi backend of the IPython tab completer.
%config Completer.use_jedi = False

In [None]:
import itertools
import json
from pathlib import Path
from datetime import timedelta, datetime
from scipy.sparse import csr_matrix, vstack

from sklearn.metrics import pairwise_distances
from sklearn.neighbors import NearestNeighbors
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
from wikipedia_cleanup.data_processing import get_data
from wikipedia_cleanup.data_filter import KeepAttributesDataFilter, generate_default_filters

In [None]:
# Directory that is handed to get_data when the change records are loaded.
input_path = Path("../../../data/custom-format-default-filtered/")
# Alternative, machine-specific location (kept for reference only):
#input_path = Path("/run/media/secret/manjaro-home/secret/mp-data/custom-format-default-filtered")

In [None]:
# Loader settings that are forwarded to get_data.
n_files = 4
n_jobs = 0

# Project default filters, followed by one more filter that is given the
# attribute names this notebook works with.
# NOTE(review): presumably KeepAttributesDataFilter drops every other attribute - confirm.
filters = generate_default_filters()
keep_filter = KeepAttributesDataFilter(
    [
        'page_id',
        'infobox_key',
        'page_title',
        'property_name',
        'previous_value',
        'current_value',
        'value_valid_from',
        'value_valid_to',
    ]
)
filters.append(keep_filter)

df = get_data(input_path, n_files=n_files, n_jobs=n_jobs, filters=filters)

In [None]:
# `data` starts out as the very same object as `df`; no copy is made here.
data = df

In [None]:
# Keep only changes where neither the previous nor the current value is the empty string.
has_previous_value = data['previous_value'] != ''
has_current_value = data['current_value'] != ''
data = data[has_previous_value & has_current_value]

In [None]:
# Keep only changes where both values are present, then detach the result from
# the loaded frame so that columns can be added to it later.
both_values_present = data['previous_value'].notnull() & data['current_value'].notnull()
data = data[both_values_present].copy()

# Wikilinks on infoboxes

In [None]:
import re

# Matches a wikilink such as [[Target]] or [[Target|label]]; group 1 is the link
# target, including an optional "prefix:" part.
regex_str = "\\[\\[((?:\\w+:)?[^<>\\[\\]\"\\|]+)(?:\\|[^\\n\\]]+)?\\]\\]"
regex = re.compile(regex_str)

# infobox_key -> list of distinct link targets found in any current value of that infobox.
infobox_key_to_related_page_titles = {}

grouped_infoboxes = data.groupby('infobox_key')['current_value'].unique()

# Series.items() replaces Series.iteritems(), which no longer exists in pandas >= 2.0.
for key, values in tqdm(grouped_infoboxes.items(), total=len(grouped_infoboxes)):
    linked_titles = {
        match.group(1)
        for value in values
        if value
        for match in regex.finditer(value)
        # Image and file links are skipped.
        if not match.group(1).startswith(("Image:", "File:"))
    }
    infobox_key_to_related_page_titles[key] = list(linked_titles)

In [None]:
# Number of distinct link targets per infobox, followed by summary statistics and a histogram.
num_links = list(map(len, infobox_key_to_related_page_titles.values()))
print(f"mean: {np.mean(num_links)}")
print(f"median: {np.median(num_links)}")
print(f"std: {np.std(num_links)}")
plt.hist(num_links, bins=100)

# Working with sparse arrays

In [None]:
def create_time_series(a, duration):
    """Turn a collection of bin indices into a sparse per-bin count vector.

    Parameters
    ----------
    a : array-like of int
        Bin indices; an index that occurs k times contributes a count of k.
    duration : int
        Length of the resulting vector.

    Returns
    -------
    scipy.sparse.csr_matrix
        Matrix of shape (1, duration) holding the count of every bin index.
    """
    bin_indices, occurrences = np.unique(a, return_counts=True)
    dense_counts = np.zeros(duration)
    dense_counts[bin_indices] = occurrences
    return csr_matrix(dense_counts)

def create_bool_time_series(a, duration):
    """Turn a collection of bin indices into a dense boolean occupancy vector.

    Parameters
    ----------
    a : array-like of int
        Bin indices; duplicates are allowed.
    duration : int
        Length of the resulting vector.

    Returns
    -------
    numpy.ndarray
        Boolean array of length ``duration`` that is True exactly at the
        indices contained in ``a``.
    """
    series = np.zeros(duration, dtype=bool)
    # Only the distinct indices matter here, so no occurrence counts are computed.
    series[np.unique(a)] = True
    return series

In [None]:
%%time
# Daily bin edges: midnight of the first change day up to midnight after the last change day.
day_edges = pd.date_range(
    data['value_valid_from'].min().date(),
    data['value_valid_from'].max().date() + timedelta(1),
)
# Number of edges (one more than the number of day bins); used as the series length.
total_days = len(day_edges)
# right=False makes every bin cover [00:00, 24:00) of one day. With the default
# right-closed bins, a change stamped exactly at midnight of the first day falls
# outside all bins and becomes NaN, which turns bin_idx into floats and breaks
# the integer indexing in create_time_series; other exact-midnight changes would
# be counted for the previous day.
bins = pd.cut(data['value_valid_from'], day_edges, labels=False, right=False)
data['bin_idx'] = bins

# A property is kept only if it has MORE than this many changes.
num_required_changes = 5
groups = data.groupby(['infobox_key', 'property_name'])
has_min_support = groups['bin_idx'].transform('count') > num_required_changes
# One sparse daily change-count row per (infobox_key, page_id, property_name).
min_support_groups = (
    data[has_min_support]
    .groupby(['infobox_key', 'page_id', 'property_name'])['bin_idx']
    .apply(create_time_series, duration=total_days)
)

In [None]:
# Number of (infobox_key, property_name) groups, counted before the minimum-support filter.
print(f"Number of properties: {len(groups)}")
# Row i of the stacked sparse matrix `x` belongs to the group key group_index[i].
group_index = min_support_groups.index
#x = groups.loc["100593997-0"].values
x = vstack(min_support_groups.to_numpy())
print(f"Len min support groups : {x.shape[0]}")
#x = csr_matrix(x)

## Nearest Neighbors

In [None]:
def percentage_manhatten_adaptive_time_lag(arr1, arr2, delay_range=3):
    """Fraction of the changes in ``arr1`` that ``arr2`` cannot explain within a time lag.

    Every change counted in ``arr1`` is matched greedily against changes of
    ``arr2`` that lie at most ``delay_range`` bins before or after it. A change
    of ``arr2`` can be used only once. The result is the number of unmatched
    ``arr1`` changes divided by the total number of ``arr1`` changes. The
    measure is not symmetric in its arguments.

    Parameters
    ----------
    arr1, arr2 : sparse matrix of shape (1, n)
        Change counts per time bin. Both must provide ``toarray()``. The inputs
        are not modified, because the work is done on the dense copies.
    delay_range : int, default 3
        Maximum lag, in bins, that is tolerated in either direction.

    Returns
    -------
    float
        Value in [0, 1]; 0.0 means every change of ``arr1`` found a partner.
        Returns 0.0 when ``arr1`` contains no changes at all (nothing is left
        unexplained), instead of dividing by zero.
    """
    source = arr1.toarray()
    remaining_target = arr2.toarray()  # dense copy that is consumed while matching

    total_changes = source.sum()
    if total_changes == 0:
        return 0.0

    last_bin = remaining_target.shape[1] - 1
    unmatched = 0
    for idx in np.nonzero(source)[1]:
        needed = source[0, idx]
        first = max(idx - delay_range, 0)
        # The window is inclusive on both sides, i.e. it reaches delay_range
        # bins forward as well as backward.
        last = min(idx + delay_range, last_bin)
        for pos in range(first, last + 1):
            if needed == 0:
                break
            used = min(needed, remaining_target[0, pos])
            remaining_target[0, pos] -= used
            needed -= used
        unmatched += needed

    return unmatched / total_changes

def percentage_manhatten_adaptive_time_lag_symmetric(arr1, arr2):
    """Symmetric variant: the larger of the two directed time-lag distances.

    Evaluates ``percentage_manhatten_adaptive_time_lag`` once with the arguments
    as given and once with them swapped, and returns the maximum of both results.
    """
    forward = percentage_manhatten_adaptive_time_lag(arr1, arr2)
    backward = percentage_manhatten_adaptive_time_lag(arr2, arr1)
    return max(forward, backward)

In [None]:
# One row per page; every cell holds a list with one entry per time series of that page.
page_id_groups = (
    min_support_groups
    .reset_index()
    .groupby(['page_id'])[['property_name', 'bin_idx', 'infobox_key']]
    .agg(list)
)

In [None]:
%%time

# Two time series count as a match when their symmetric distance is at most this value.
max_dist = 0.05

# same_infoboxes[k]: boolean array, True where a matched neighbour shares the infobox of entry k.
# matches[k]: (infobox_key, property_name) pairs of the neighbours, followed by the entry itself.
same_infoboxes = []
matches = []
for key, row in tqdm(page_id_groups.iterrows(), total=len(page_id_groups)):
    # Columns are addressed by label; positional integer access on a
    # string-labelled Series is deprecated in pandas.
    series_rows = row['bin_idx']
    n_series = len(series_rows)
    if n_series <= 1:
        continue

    # Built once per page instead of once per matched entry.
    property_names = np.array(row['property_name'])
    infobox_keys = np.array(row['infobox_key'])

    for i in range(n_series):
        # Only later entries (j > i) are compared, so every pair is evaluated once.
        neighbors = [
            j
            for j in range(i + 1, n_series)
            if percentage_manhatten_adaptive_time_lag_symmetric(series_rows[i], series_rows[j]) <= max_dist
        ]
        if not neighbors:
            continue

        infobox = row['infobox_key'][i]
        same_infoboxes.append(infobox_keys[neighbors] == infobox)

        match = list(zip(infobox_keys[neighbors], property_names[neighbors]))
        match.append((infobox, row['property_name'][i]))
        matches.append(match)

In [None]:
def rule_to_data_entries(data, info_keys, property_names):
    """Return the change records that belong to the given (infobox, property) pairs.

    ``info_keys[i]`` and ``property_names[i]`` together identify one property.
    Only rows whose ``infobox_key`` / ``property_name`` combination equals one
    of these pairs are returned. A row that merely combines a listed infobox
    with a property name listed for a different infobox is not returned.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with the columns ``infobox_key``, ``property_name`` and
        ``value_valid_from``.
    info_keys : iterable
        Infobox keys, aligned element-wise with ``property_names``.
    property_names : iterable
        Property names, aligned element-wise with ``info_keys``.

    Returns
    -------
    pandas.DataFrame
        The matching rows of ``data``, sorted by ``value_valid_from``.
    """
    info_keys = list(info_keys)
    property_names = list(property_names)
    wanted_pairs = set(zip(info_keys, property_names))

    # Cheap vectorised pre-filter. It still lets cross combinations through,
    # so the exact pair check below runs on the (small) candidate set.
    candidates = data[data['infobox_key'].isin(info_keys) & data['property_name'].isin(property_names)]
    is_wanted = np.fromiter(
        (pair in wanted_pairs for pair in zip(candidates['infobox_key'], candidates['property_name'])),
        dtype=bool,
        count=len(candidates),
    )
    return candidates[is_wanted].sort_values('value_valid_from')

# Show one match: first the entry the match was built for (stored last in the
# list), then all change records selected for the matched properties.
match = matches[5]
match_infobox_keys, match_property_names = zip(*match)
display_options = ('display.max_rows', None, 'display.max_columns', None, 'display.precision', 3)
with pd.option_context(*display_options):
    display(match[-1])
    display(rule_to_data_entries(data, match_infobox_keys, match_property_names))

In [None]:
# Match counts 1 .. max_num_matches - 1 are shown on the x axis.
max_num_matches = 100
match_counts = range(1, max_num_matches)

# Per entry: how many neighbours it matched and which fraction of them share its
# infobox. Both are computed once instead of once per bar.
match_sizes = np.fromiter((len(entry) for entry in same_infoboxes), dtype=int, count=len(same_infoboxes))
same_fractions = np.fromiter((np.mean(entry) for entry in same_infoboxes), dtype=float, count=len(same_infoboxes))

# Number of entries per match count.
num_matches = np.bincount(match_sizes, minlength=max_num_matches)[1:max_num_matches]
# Summed same-infobox fractions per match count; this is the height of the lower
# bar and equals num_matches * mean fraction. It is 0 for unused match counts.
percent_same_infobox = np.bincount(
    match_sizes, weights=same_fractions, minlength=max_num_matches
)[1:max_num_matches]
# Mean same-infobox fraction per match count; NaN marks match counts without any
# entry (no mean over an empty list is taken, so no RuntimeWarning is raised).
percent_same_matched = np.divide(
    percent_same_infobox,
    num_matches,
    out=np.full(len(num_matches), np.nan),
    where=num_matches > 0,
).tolist()

fig, ax = plt.subplots()
ax.bar(match_counts, percent_same_infobox, label='Same infobox')
ax.bar(match_counts, num_matches - percent_same_infobox, bottom=percent_same_infobox, label='Different Infobox')
ax.legend(loc='upper right')
ax.set_title(f"Number of properties with #matches == x (#considered: {len(groups)})")
fig.text(0.1, 0, "Color represents the % of matches that are not from the same infobox")

# Old nearest neighbor code

In [None]:
# Radius for the neighbour search on the stacked matrix x. This reassigns
# max_dist, which the per-page matching loop set to 0.05.
max_dist = 2
#max_dist = 0.2
# p=1 selects the Manhattan (L1) form of the default Minkowski metric.
neigh = NearestNeighbors(radius=max_dist, p=1)
#neigh = NearestNeighbors(radius=max_dist, metric=percentage_eucledian)
neigh.fit(x)

In [None]:
%%time
# Without query points every fitted sample is queried against the others; a
# sample is not reported as its own neighbour. ind[i] holds the row indices of
# the neighbours of sample i and dist[i] the corresponding distances.
dist, ind = neigh.radius_neighbors()

In [None]:
# Level 0 of group_index is the infobox_key of every time series (row of x).
infobox_keys = group_index.get_level_values(0).to_numpy()
# For every series with at least one neighbour: boolean array that is True
# where the neighbour belongs to the same infobox as the series itself.
same_infobox = [
    infobox_keys[i] == infobox_keys[indices]
    for i, indices in enumerate(ind)
    if len(indices) > 0
]
n_total = len(ind)
n_matched = len(same_infobox)

In [None]:
# Distinct page ids contained in the filtered change records.
print(f"number of pages considered: {data['page_id'].nunique()}")

In [None]:
# Match counts 1 .. max_num_matches - 1 are shown on the x axis.
max_num_matches = 100
match_counts = range(1, max_num_matches)

# Per series: how many neighbours it has and which fraction of them share its
# infobox. Both are computed once instead of once per bar.
match_sizes = np.fromiter((len(entry) for entry in same_infobox), dtype=int, count=len(same_infobox))
same_fractions = np.fromiter((np.mean(entry) for entry in same_infobox), dtype=float, count=len(same_infobox))

# Number of series per match count.
num_matches = np.bincount(match_sizes, minlength=max_num_matches)[1:max_num_matches]
# Summed same-infobox fractions per match count; this is the height of the lower
# bar and equals num_matches * mean fraction. It is 0 for unused match counts.
percent_same_infobox = np.bincount(
    match_sizes, weights=same_fractions, minlength=max_num_matches
)[1:max_num_matches]
# Mean same-infobox fraction per match count; NaN marks match counts without any
# series (no mean over an empty list is taken, so no RuntimeWarning is raised).
percent_same_matched = np.divide(
    percent_same_infobox,
    num_matches,
    out=np.full(len(num_matches), np.nan),
    where=num_matches > 0,
).tolist()

fig, ax = plt.subplots()
ax.bar(match_counts, percent_same_infobox, label='Same infobox')
ax.bar(match_counts, num_matches - percent_same_infobox, bottom=percent_same_infobox, label='Different Infobox')
ax.legend(loc='upper right')
ax.set_title(f"Number of properties with #matches == x (#considered: {len(groups)})")
fig.text(0.1, 0, "Color represents the % of matches that are not from the same infobox")

In [None]:
# Numbers noted down by hand.
# NOTE(review): presumably same-infobox percentages from earlier runs; the
# settings that produced them are not recorded here - confirm before relying on them.
#0.963084495488105
# 95 93 87 87 95
# 93 91 84 93 87
# 94 91 80 90 88
# Mean same-infobox fraction per match count (index 0 corresponds to one match).
percent_same_matched

Observations from looking at some of the examples:
 - Many examples have just creation / deletion
 - When filtering these out, some examples are:
     - batting averages (247568784-0), box2..., Statistics in general
     - Temperature reading for cities (infoboxes that are basically plots) - https://de.wikipedia.org/wiki/Ottawa
     - Uniform (kit) definitions for soccer teams and similar - https://en.wikipedia.org/wiki/FC_Bayern_Munich_(women)
     - Perfect synchronous data https://en.wikipedia.org/wiki/Delta_County_Airport
     - Career listings and other list elements - https://en.wikipedia.org/wiki/Akaki_Khubutia
     - Data that is updated in real time during a one-off event (e.g. elections - https://en.wikipedia.org/wiki/2019_European_Parliament_election_in_the_United_Kingdom)
     - Railway stations, e.g. passenger numbers (https://en.wikipedia.org/wiki/Windsor_station_(Vermont))
     - Vandalism

In [None]:
# Take the 71st series that has more than two neighbours and list the group keys of its neighbours.
a = group_index[ind[np.array([len(g) for g in ind]) > 2][70]]
print(a)
# Every group key is an (infobox_key, page_id, property_name) tuple; show the
# change records of the 7th neighbour. The frame's columns are named
# 'infobox_key' and 'property_name'.
a = a[6]
data[(data['property_name'] == a[2]) & (data['infobox_key'] == a[0])]

In [None]:
# All change records of one infobox; the key column of the frame is 'infobox_key'.
data[data['infobox_key'] == '131458332-1']

In [None]:
# All change records of one infobox; the key column of the frame is 'infobox_key'.
data[data['infobox_key'] == '131458332-2']

In [None]:
# Change records of a single property of one infobox, addressed through the
# frame's 'property_name' and 'infobox_key' columns.
data[(data['property_name'] == 'ability') & (data['infobox_key'] == '131458332-2')]

In [None]:
# Neighbour index arrays of all series that have more than 20 neighbours.
ind[np.array([len(g) for g in ind]) > 20]

In [None]:
# Group keys of the neighbours of one series with more than 20 neighbours.
# display() is needed because only the last expression of a cell is rendered.
display(group_index[ind[np.array([len(g) for g in ind]) > 20][123]])
# Change records of one property, addressed through the frame's 'infobox_key'
# and 'property_name' columns.
data[(data['infobox_key'] == '110474295-0') & (data['property_name'] == 'area_rank')]