In [None]:
# Reload edited modules automatically before each cell is executed.
%load_ext autoreload
%autoreload 2
# Turn off jedi-based tab completion in IPython.
%config Completer.use_jedi = False

In [None]:
# Scratch check: `None` is falsy, so this branch never runs and nothing is printed.
if None:
    print("True")

In [None]:
import itertools
import json
from pathlib import Path
from datetime import timedelta, datetime
from scipy.sparse import csr_matrix, vstack


import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
from wikipedia_cleanup.data_processing import get_data
from wikipedia_cleanup.data_filter import KeepAttributesDataFilter, generate_default_filters

In [None]:
# Shared drive mount (earlier location: //FS23/projekte$/MP2021/MPWS2021/MPWS2021FN1).
mp_drive_dir = Path("/media/hpi_share/")
mp_plot_dir = mp_drive_dir.joinpath("plots")
input_path = Path("../../../data")
# Every *.json entry below the data directory, searched recursively.
input_data = list(input_path.rglob("*.json"))
# Keep regular files only; a directory whose name ends in .json would match the pattern too.
files = list(filter(Path.is_file, input_data))
len(files)  # total 580

In [None]:
n_files = 40
n_jobs = 0
# Start from the project's default filters and additionally restrict the
# frame to the columns this notebook works with.
filters = generate_default_filters()
keep_filter = KeepAttributesDataFilter([
    'page_id', 'infobox_key', 'page_title', 'property_name',
    'previous_value', 'current_value', 'value_valid_from', 'value_valid_to',
])
filters.append(keep_filter)
# Pass the extended list so the keep-attributes filter actually takes effect;
# handing over a fresh generate_default_filters() would silently drop it.
df = get_data(input_path, n_files=n_files, n_jobs=n_jobs, filters=filters)

In [None]:
# Alias (not a copy) of the loaded frame; the filtering cells below rebind `data` to new frames.
data = df

In [None]:
# Keep only rows whose previous value is not the empty string.
data = data.loc[data['previous_value'].ne('')]

In [None]:
# Keep only rows whose current value is not the empty string.
data = data.loc[data['current_value'].ne('')]

In [None]:
# Discard rows missing either side of the change, then take an independent
# copy so later column assignments do not write into a slice of the source frame.
data = data.dropna(subset=['previous_value', 'current_value']).copy()

In [None]:
num_edits = 0
change_tuples = []
#np.random.seed(42)
# Read three randomly chosen raw files; every line of a file is one JSON-encoded edit.
for file in tqdm(np.random.choice(files, 3, replace=False)):
    with open(file, 'r', encoding='utf-8') as f:
        for jsonObj in f:
            single_edit = json.loads(jsonObj)
            key = single_edit['key']
            pid = single_edit['pageID']
            # The username is optional in the raw records.
            user = single_edit.get('username')
            title = single_edit['pageTitle']
            changes = single_edit['changes']
            valid_from = single_edit['validFrom']
            edit_type = single_edit['type']
            for change in changes:
                name = change['property']['name']
                # Only changes carrying both an old and a new value are of interest.
                if 'currentValue' not in change or 'previousValue' not in change:
                    continue
                current_value = change['currentValue']
                previous_value = change['previousValue']
                valid_to = change.get('valueValidTo')
                # An empty string on either side is skipped as well.
                if current_value == '' or previous_value == '':
                    continue
                change_tuples.append(
                    (key, pid, title, user, name, previous_value, current_value, valid_from, valid_to)
                )

In [None]:
# NOTE(review): this frame uses 'key' / 'name' / 'pageID', while the grouping cells
# further down use 'infobox_key' / 'property_name' / 'page_id' — confirm which loader is intended.
change_columns = [
    'key', 'pageID', 'title', 'user', 'name',
    'previous_value', 'current_value', 'value_valid_from', 'valid_to',
]
data = pd.DataFrame(change_tuples, columns=change_columns)
# Parse both timestamp columns and strip the timezone so they are tz-naive.
for ts_col in ('value_valid_from', 'valid_to'):
    data[ts_col] = pd.to_datetime(data[ts_col]).dt.tz_localize(None)

print(f"Num data points: {len(data)}")

# Working with sparse arrays

In [None]:
def create_time_series(a, duration):
    """Build a sparse row holding, per bin, how often that bin occurs in ``a``.

    Parameters
    ----------
    a : array-like
        Bin indices (one entry per change). Integer-valued floats are
        accepted and NaN entries are ignored, because a bin-index column
        containing missing values is stored as float.
    duration : int
        Length of the series, i.e. the number of bins.

    Returns
    -------
    scipy.sparse.csr_matrix
        Matrix of shape ``(1, duration)``; entry ``k`` is the number of
        times ``k`` appears in ``a``.

    Raises
    ------
    IndexError
        If an index lies outside ``[-duration, duration)``.
    """
    bin_idx = np.asarray(a, dtype=float)
    # Float indices cannot be used for indexing; drop NaN and cast to int.
    bin_idx = bin_idx[~np.isnan(bin_idx)].astype(np.intp)
    series = np.zeros(duration)
    uniques, counts = np.unique(bin_idx, return_counts=True)
    series[uniques] = counts
    return csr_matrix(series)

def create_bool_time_series(a, duration):
    """Build a dense boolean series marking every bin that occurs in ``a``.

    Parameters
    ----------
    a : array-like
        Bin indices (one entry per change). Integer-valued floats are
        accepted and NaN entries are ignored.
    duration : int
        Length of the series, i.e. the number of bins.

    Returns
    -------
    numpy.ndarray
        Boolean array of length ``duration``; ``True`` where the bin index
        appears at least once in ``a``.

    Raises
    ------
    IndexError
        If an index lies outside ``[-duration, duration)``.
    """
    bin_idx = np.asarray(a, dtype=float)
    # Float indices cannot be used for indexing; drop NaN and cast to int.
    bin_idx = bin_idx[~np.isnan(bin_idx)].astype(np.intp)
    series = np.zeros(duration, dtype=bool)
    # Repeated indices are harmless here, so no de-duplication is needed.
    series[bin_idx] = True
    return series

In [None]:
%%time
bins = pd.date_range(data['value_valid_from'].min().date(), data['value_valid_from'].max().date()+timedelta(1))
total_days = len(bins)
bins = pd.cut(data['value_valid_from'], bins, labels=False)
data['bin_idx'] = bins

num_required_changes = 5
groups = data.groupby(['infobox_key', 'property_name'])
min_support_groups = data[groups['bin_idx'].transform('count') > num_required_changes].groupby(['infobox_key', 'property_name'])
min_support_groups = min_support_groups['bin_idx'].apply(create_time_series, duration=total_days)

print(f"Number of properties: {len(groups)}")
group_index = min_support_groups.index
#x = groups.loc["100593997-0"].values
x = vstack(min_support_groups.to_numpy())
print(f"Len min support groups : {x.shape[0]}")
#x = csr_matrix(x)

## Working just with lists

In [None]:
groups = data.groupby(['infobox_key', 'property_name'])
# A property is kept only if it has MORE than this many changes.
num_required_changes = 5
has_min_support = groups['value_valid_from'].transform('count') > num_required_changes
# assign() returns a new frame, so the date conversion never writes into a
# slice of `data` (which would raise SettingWithCopyWarning).
min_support_groups = data[has_min_support].assign(
    value_valid_from=lambda frame: frame['value_valid_from'].dt.date
)
# One list of change dates per (infobox_key, property_name) pair.
min_support_groups = min_support_groups.groupby(['infobox_key', 'property_name'])['value_valid_from'].apply(list)

In [None]:
# NOTE(review): this rebinds `x`; presumably `min_support_groups` is the per-property
# list series built in the preceding cell, not the sparse rows — confirm.
x = min_support_groups.to_numpy()

In [None]:
%%timeit
# NOTE(review): `func`, `i` and `j` are not defined in any visible cell — confirm where they come from.
func(x[i], x[j])

In [None]:
%load_ext Cython

In [None]:
# NOTE(review): `func`, `i` and `j` are not defined in any visible cell — confirm where they come from.
func(x[i], x[j])

## Nearest Neighbors
Probably more suited than clustering methods

In [None]:
from sklearn.metrics import pairwise_distances

In [None]:
def percentage_eucledian(arr1, arr2):
    """Relative L1 distance between two change-count series.

    Despite the name, this is the sum of absolute differences (Manhattan
    distance) divided by the larger of the two series totals.

    Parameters
    ----------
    arr1, arr2 : scipy sparse matrix or array-like
        The two series to compare. Sparse inputs are densified; dense
        inputs are used as they are. Values are assumed to be non-negative
        counts.

    Returns
    -------
    float
        ``sum(|arr1 - arr2|) / max(sum(arr1), sum(arr2))``, or ``0.0`` when
        both totals are zero (two empty series are treated as identical
        instead of producing NaN from 0 / 0).
    """
    # Local import: the notebook's import cell does not bring in `issparse`.
    from scipy.sparse import issparse

    dense1 = arr1.toarray() if issparse(arr1) else np.asarray(arr1)
    dense2 = arr2.toarray() if issparse(arr2) else np.asarray(arr2)
    largest_total = max(dense1.sum(), dense2.sum())
    if largest_total == 0:
        return 0.0
    return np.sum(np.absolute(dense1 - dense2)) / largest_total

In [None]:
%%timeit
# Cost of one custom-metric evaluation on two arbitrarily picked rows of `x`.
percentage_eucledian(x[32], x[11])

In [None]:
%%timeit
# Same two rows with scikit-learn's built-in Manhattan metric, for comparison.
pairwise_distances(x[32], x[11], metric='manhattan')

In [None]:
%%timeit
# Full pairwise distance matrix over all rows of `x` with the built-in Manhattan metric.
pairwise_distances(x, metric='manhattan')

In [None]:
%%timeit
# Full pairwise distance matrix with the Python-level custom metric passed as a callable.
pairwise_distances(x, metric=percentage_eucledian)

In [None]:
%%timeit
# One custom-metric evaluation on another pair of rows of `x`.
percentage_eucledian(x[312], x[311])

In [None]:
import numpy as np
from sklearn.neighbors import NearestNeighbors
# Radius used for the neighbour search, in units of the chosen distance.
max_dist = 2
#max_dist = 0.2
# p=1 with the default metric — presumably Minkowski, i.e. Manhattan distance; confirm against the sklearn docs.
neigh = NearestNeighbors(radius=max_dist, p=1)
# Alternative: custom relative metric (goes with the smaller max_dist above).
#neigh = NearestNeighbors(radius=max_dist, metric=percentage_eucledian)
neigh.fit(x)

In [None]:
%%time
# No query points are passed — presumably this returns the neighbours of every fitted row; confirm in the sklearn docs.
# `dist` and `ind` hold one array of distances / row indices per row.
dist, ind = neigh.radius_neighbors()

In [None]:
a = group_index.values
# For every property that has at least one neighbour, build a boolean array
# telling, per neighbour, whether it comes from the same infobox
# (first element of the group key) as the property itself.
same_infobox = [
    group_index[row][0] == np.array([key[0] for key in group_index[neighbours]])
    for row, neighbours in enumerate(ind)
    if len(neighbours) > 0
]
n_total = len(ind)
n_matched = len(same_infobox)

In [None]:
# Count of distinct page ids in `data`.
print(f"number of pages considered: {data['page_id'].nunique()}")

In [None]:
num_matches = []
percent_same_matched = []
max_num_matches = 100
# Computed once instead of on every loop iteration: neighbour count and
# share of same-infobox neighbours for every matched property.
match_counts = np.fromiter((len(entry) for entry in same_infobox), dtype=int, count=len(same_infobox))
same_fractions = np.fromiter((np.mean(entry) for entry in same_infobox), dtype=float, count=len(same_infobox))
for i in range(1, max_num_matches):
    has_i_matches = match_counts == i
    # Number of properties with exactly i neighbours.
    num_matches.append(np.sum(has_i_matches))
    # Mean same-infobox share among them; NaN marks "no property has exactly
    # i neighbours" without triggering numpy's empty-mean warning.
    percent_same_matched.append(same_fractions[has_i_matches].mean() if has_i_matches.any() else np.nan)
# Despite its name this is a count: properties per bucket scaled by the
# same-infobox share. Empty buckets contribute 0 instead of NaN bar heights.
percent_same_infobox = np.multiply(num_matches, np.nan_to_num(percent_same_matched))
match_range = range(1, max_num_matches)
plt.bar(match_range, percent_same_infobox, label='Same infobox')
plt.bar(match_range, num_matches - percent_same_infobox, bottom=percent_same_infobox, label='Different Infobox')
plt.xlabel("Number of matches")
plt.ylabel("Number of properties")
plt.legend(loc='upper right')
plt.title(f"Number of properties with #matches == x (#considered: {len(groups)})")
plt.figtext(0.1, 0, "Color represents the % of matches that are not from the same infobox");

In [None]:
# NOTE(review): the figures below look like values noted down from earlier runs — confirm what they measure.
#0.963084495488105
# 95 93 87 87 95
# 93 91 84 93 87
# 94 91 80 90 88
# Per match count, the mean share of neighbours that come from the same infobox.
percent_same_matched

Look at some of the examples:
 - Many examples consist of just a creation / deletion
 - When these are filtered out, some examples are:
     - Batting averages (247568784-0), box2..., statistics in general
     - Temperature readings for cities (infoboxes that are basically plots) - https://de.wikipedia.org/wiki/Ottawa
     - Uniforms for soccer games and similar - https://en.wikipedia.org/wiki/FC_Bayern_Munich_(women)
     - Perfectly synchronous data - https://en.wikipedia.org/wiki/Delta_County_Airport
     - Career listings and other list elements - https://en.wikipedia.org/wiki/Akaki_Khubutia
     - Data that is updated in real time, once (elections - https://en.wikipedia.org/wiki/2019_European_Parliament_election_in_the_United_Kingdom)
     - Railway stations, passenger numbers (https://en.wikipedia.org/wiki/Windsor_station_(Vermont))
     - Vandalism

In [None]:
# Among the properties with more than 2 neighbours, take the one at position 70
# and look up the group keys of its neighbours.
a = group_index[ind[np.array([len(g) for g in ind]) > 2][70]]
print(a)
# Pick the neighbour at position 6 and show its rows in `data`.
a = a[6]
# NOTE(review): this filters on 'name' / 'key', whereas group_index is built from
# 'infobox_key' / 'property_name' — confirm `data` has these columns at this point.
data[(data['name'] == a[1]) & (data['key'] == a[0])]

In [None]:
# All rows of `data` whose 'key' is '131458332-1'.
data[data['key'] == '131458332-1']

In [None]:
# All rows of `data` whose 'key' is '131458332-2'.
data[data['key'] == '131458332-2']

In [None]:
# Rows of the 'ability' property within the '131458332-2' key only.
data[(data['name'] == 'ability') & (data['key'] == '131458332-2')]

In [None]:
# Neighbour index arrays of every property that has more than 20 neighbours.
ind[np.array([len(g) for g in ind]) > 20]

In [None]:
# Group keys of the neighbours of the property at position 123 among those with
# more than 20 neighbours. Not the last expression of the cell, so its value is not displayed.
group_index[ind[np.array([len(g) for g in ind]) > 20][123]]
# Rows of the 'area_rank' property for the hard-coded key '110474295-0'.
data[(data['key'] == '110474295-0') & (data['name'] == 'area_rank')]

## KShape, apparently good fast clustering
Seems to be ill-suited for the job

In [None]:
# Very wide, flat figure so a long series stays readable.
plt.figure(figsize=(100,2))
# NOTE(review): in the visible cells `groups` is the result of `data.groupby(...)`;
# confirm it supports `.loc[...]` here or whether an older per-infobox frame was meant.
for k, row in groups.loc["100593997-0"].items():
    plt.plot(row)

In [None]:
from tslearn.clustering import KShape
# Number of clusters requested from KShape.
n_clusters = 4
classifier = KShape(n_clusters)
# NOTE(review): `x` is rebound several times above (sparse matrix / object array) —
# confirm which representation KShape is meant to receive.
classifier.fit(x)
# Cluster label for every row of `x`.
predictions = classifier.predict(x)

In [None]:
# One wide figure per cluster, overlaying all series assigned to it.
for i in range(n_clusters):
    selected_data = x[predictions==i]
    plt.figure(figsize=(100,2))
    # NOTE(review): len() and row plotting assume a dense array — confirm `x` is not the sparse matrix here.
    for j in range(len(selected_data)):
        plt.plot(selected_data[j])
    plt.show()

## DBScan

In [None]:
from sklearn.cluster import DBSCAN

In [None]:
from dtaidistance import dtw

In [None]:
print(dtw.distance.__doc__)

In [None]:
def change_distance(a, b):
    """Distance between two series as computed by ``dtw.distance``.

    The call is made with a window of 14, pruning switched on and the C
    implementation requested.
    """
    dtw_options = {"window": 14, "use_pruning": True, "use_c": True}
    return dtw.distance(a, b, **dtw_options)

In [None]:
x.shape

In [None]:
%%time
# DBSCAN over the rows of `x`, using the DTW-based callable as the distance.
# NOTE(review): eps=1, min_samples=2 and n_jobs=6 are hard-coded — confirm they suit the data and machine.
clustering = DBSCAN(eps=1, min_samples=2, n_jobs=6, metric=change_distance).fit(x)

In [None]:
clusters = np.unique(clustering.labels_)
# The label -1 is not counted as a cluster (presumably DBSCAN's noise label).
n_clusters = len(clusters) - int(-1 in clusters)

In [None]:
# One wide figure per cluster label 0 .. n_clusters-1, overlaying all series assigned to it.
for i in range(n_clusters):
    selected_data = x[clustering.labels_==i]
    plt.figure(figsize=(100,2))
    # NOTE(review): len() and row plotting assume a dense array — confirm `x` is not the sparse matrix here.
    for j in range(len(selected_data)):
        plt.plot(selected_data[j])
    plt.show()