In [None]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell runs and edits to local .py files take effect
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import itertools
import json
from pathlib import Path
from datetime import timedelta, datetime
from scipy.sparse import csr_matrix, vstack


import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import matplotlib.pyplot as plt

In [None]:
# Root of the mounted project share; plots are written to its "plots" sub-folder.
# Alternative location kept for reference: Path("//FS23/projekte$/MP2021/MPWS2021/MPWS2021FN1")
mp_drive_dir = Path("/media/hpi_share/")
mp_plot_dir = mp_drive_dir / "plots"

# Every *.json file below the data directory is an input file.
input_path = Path("../../../data")
input_data = list(input_path.rglob("*.json"))
# rglob yields entries in an arbitrary, filesystem-dependent order. Sort them so
# that slices such as files[:10] select the same files on every run and machine.
files = sorted(x for x in input_data if x.is_file())
len(files) # total 580

In [None]:
# Flatten the JSON-lines edit files into one tuple per changed property.
num_edits = 0  # number of edit records (non-blank JSON lines) read so far
change_tuples = []
# Only the first 10 files are read (presumably to keep iteration fast).
for file in tqdm(files[:10]):
    with open(file, 'r', encoding='utf-8') as f:
        for jsonObj in f:
            if not jsonObj.strip():
                # A blank line is not valid JSON; skip it instead of crashing.
                continue
            single_edit = json.loads(jsonObj)
            num_edits += 1
            key = single_edit['key']
            pid = single_edit['pageID']
            user = single_edit.get('username')  # optional field, None when absent
            title = single_edit['pageTitle']
            changes = single_edit['changes']
            timestamp = single_edit['validFrom']
            edit_type = single_edit['type']
            for change in changes:
                name = change['property']['name']
                # Either value may be missing from a change record; use None then.
                current_value = change.get('currentValue')
                previous_value = change.get('previousValue')
                change_tuples.append((key, pid, title, user, name, previous_value, current_value, timestamp, edit_type))


In [None]:
# One row per property change. The timestamp column is parsed to datetimes and
# any timezone information is removed, leaving timezone-naive values.
data = (
    pd.DataFrame(
        change_tuples,
        columns=['key', 'pageID', 'title', 'user','name', 'previous_value', 'current_value', 'timestamp', 'edit_type'],
    )
    .assign(timestamp=lambda frame: pd.to_datetime(frame['timestamp']).dt.tz_localize(None))
)

In [None]:
# Daily bin edges, from midnight of the first day to midnight after the last day.
first_day = data['timestamp'].min().date()
last_day = data['timestamp'].max().date()
bin_edges = pd.date_range(first_day, last_day + timedelta(days=1))
# N + 1 edges delimit N days, so valid bin indices run from 0 to total_days - 1.
total_days = len(bin_edges) - 1
# right=False makes every bin [midnight, next midnight), so an edit made exactly
# at midnight belongs to the day that starts then. The default right-closed bins
# would assign it to the previous day, or leave it unbinned (NaN) on the first
# day, which turns the whole column into floats.
bins = pd.cut(data['timestamp'], bin_edges, labels=False, right=False)
data['bin_idx'] = bins

In [None]:
def create_time_series(a, duration):
    """Count how often each bin index occurs and return the counts as a sparse row.

    Parameters
    ----------
    a : array-like of bin indices
        Integer-valued indices into the series. Float input is accepted and NaN
        entries (values that fell outside every bin) are ignored.
    duration : int
        Length of the resulting series.

    Returns
    -------
    scipy.sparse.csr_matrix
        A 1 x duration matrix of float counts; position i holds the number of
        times index i appears in `a`.
    """
    # A single NaN makes a pandas column float, and numpy refuses to index with
    # a float array (even an empty one). Normalise to integer indices first.
    indices = np.asarray(a, dtype=float)
    indices = indices[~np.isnan(indices)].astype(np.intp)

    series = np.zeros(duration)
    uniques, counts = np.unique(indices, return_counts=True)
    series[uniques] = counts
    return csr_matrix(series)

def create_bool_time_series(a, duration):
    """Mark which bin indices occur at least once.

    Parameters
    ----------
    a : array-like of bin indices
        Integer-valued indices into the series. Float input is accepted and NaN
        entries are ignored.
    duration : int
        Length of the resulting series.

    Returns
    -------
    numpy.ndarray
        Boolean array of length `duration`; position i is True when index i
        appears in `a`.
    """
    # Normalise to integer indices: numpy cannot index with a float array, which
    # is what a pandas column becomes as soon as it contains a NaN.
    indices = np.asarray(a, dtype=float)
    indices = indices[~np.isnan(indices)].astype(np.intp)

    series = np.zeros(duration, dtype=bool)
    series[np.unique(indices)] = True
    return series

In [None]:
# Report the number of rows (one per property change) in the frame.
print(f"Num data points: {data.shape[0]}")

In [None]:
# Build one sparse daily change-count series per (key, name) pair.
groups = (
    data
    .groupby(by=['key', 'name'])['bin_idx']
    .apply(lambda bin_indices: create_time_series(bin_indices, duration=total_days))
)

In [None]:
print(f"Number of properties: {len(groups)}")
# Keep only the series whose total change count exceeds num_required_changes,
# then stack the remaining 1 x total_days rows into a single sparse matrix.
num_required_changes = 10
x = vstack(
    groups[groups.apply(lambda series: series.sum()) > num_required_changes].values
)
print(x.shape)

## Nearest Neighbors
Nearest-neighbour search is probably better suited to this task than clustering methods.

In [None]:
import numpy as np
from sklearn.neighbors import NearestNeighbors
# Fit a radius-based neighbour index on the stacked time series.
# p=1 selects the Minkowski distance with exponent 1, i.e. the Manhattan (L1)
# distance; max_dist becomes the default radius for later radius queries.
max_dist = 2
neigh = NearestNeighbors(radius=max_dist, p=1)
neigh.fit(x)

In [None]:
%%time
# No query points are passed, so the neighbours of the fitted series themselves
# are looked up, using the radius set on the estimator.
# NOTE(review): per the scikit-learn docs a point is then not returned as its
# own neighbour — confirm for the installed version.
dist, ind = neigh.radius_neighbors()

In [None]:
# num_matches[i] is the number of time series that have at least i neighbours
# within the search radius, for i = 0 .. max_num_matches - 1.
max_num_matches = 50
# The per-series neighbour counts do not depend on the threshold, so build the
# array once instead of on every loop iteration.
neighbor_counts = np.fromiter((len(a) for a in ind), dtype=int)
num_matches = [np.sum(neighbor_counts >= i) for i in range(max_num_matches)]

In [None]:
# Bar i shows how many time series have at least i neighbours within the radius.
plt.bar(range(max_num_matches), num_matches)
# Label the figure so it can be read without the surrounding code.
plt.xlabel("minimum number of neighbours within radius")
plt.ylabel("number of time series")
plt.title("Time series with at least n neighbours")
plt.show()

## KShape: reportedly a good, fast clustering method
It seems to be ill-suited for this job.

In [None]:
# Overlay all property time series of one example key on a very wide figure.
plt.figure(figsize=(100,2))
for k, row in groups.loc["100593997-0"].items():
    # Each entry is a 1 x total_days sparse matrix, which matplotlib cannot draw
    # directly; convert it to a flat dense array first.
    plt.plot(row.toarray().ravel())

In [None]:
from tslearn.clustering import KShape
n_clusters = 4
# Fixed seed so the cluster assignment is reproducible across runs.
classifier = KShape(n_clusters, random_state=0)
# x is a scipy sparse matrix at this point; hand tslearn a dense array instead.
# NOTE(review): tslearn is assumed to require dense input — confirm.
x_dense = x.toarray() if hasattr(x, "toarray") else np.asarray(x)
classifier.fit(x_dense)
predictions = classifier.predict(x_dense)

In [None]:
# Draw one wide figure per cluster, overlaying all of its member time series.
for i in range(n_clusters):
    selected_data = x[predictions==i]
    if hasattr(selected_data, "toarray"):
        # scipy sparse matrices raise TypeError on len() and cannot be plotted
        # row by row, so densify the rows of this cluster first.
        selected_data = selected_data.toarray()
    plt.figure(figsize=(100,2))
    for series in selected_data:
        plt.plot(series)
    plt.show()

## DBSCAN

In [None]:
from sklearn.cluster import DBSCAN

In [None]:
from dtaidistance import dtw

In [None]:
# Show the docstring of dtw.distance as a reference for its keyword arguments.
print(dtw.distance.__doc__)

In [None]:
def change_distance(a, b):
    """Return the DTW distance between two change time series.

    The distance is computed by dtaidistance with a warping window of 14,
    pruning enabled and the C implementation.

    Parameters
    ----------
    a, b : 1-D array-like or single-row scipy sparse matrix
        The two series to compare.

    Returns
    -------
    The value returned by dtw.distance for the two series.
    """
    def _as_dense_series(series):
        # Flatten the input to a contiguous 1-D array of doubles.
        # NOTE(review): when an estimator is fitted on sparse data, a callable
        # metric appears to receive sparse rows rather than arrays — confirm
        # against the installed scikit-learn version.
        if hasattr(series, "toarray"):
            series = series.toarray()
        return np.ascontiguousarray(series, dtype=np.double).ravel()

    return dtw.distance(
        _as_dense_series(a),
        _as_dense_series(b),
        window=14,
        use_pruning=True,
        use_c=True,
    )

In [None]:
# Display (number of time series, series length) of the matrix to be clustered.
x.shape

In [None]:
%%time
# Density-based clustering with the custom DTW metric: a neighbourhood radius of
# 1 and at least 2 samples per neighbourhood, computed with 6 parallel jobs.
clustering = DBSCAN(
    metric=change_distance,
    eps=1,
    min_samples=2,
    n_jobs=6,
).fit(x)

In [None]:
# Distinct labels assigned by DBSCAN. The label -1 is not counted as a cluster
# (DBSCAN uses it for noise points).
clusters = np.unique(clustering.labels_)
n_clusters = len(clusters) - (1 if -1 in clusters else 0)

In [None]:
# Draw one wide figure per DBSCAN cluster, overlaying all member time series.
for i in range(n_clusters):
    selected_data = x[clustering.labels_==i]
    if hasattr(selected_data, "toarray"):
        # scipy sparse matrices raise TypeError on len() and cannot be plotted
        # row by row, so densify the rows of this cluster first.
        selected_data = selected_data.toarray()
    plt.figure(figsize=(100,2))
    for series in selected_data:
        plt.plot(series)
    plt.show()