In [None]:
import json
from pathlib import Path
from datetime import timedelta

import numpy as np
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt

In [None]:
# Root of the shared project drive.
# Alternative location kept for reference: Path("//FS23/projekte$/MP2021/MPWS2021/MPWS2021FN1")
mp_drive_dir = Path("/media/hpi_share/")
# Every figure saved by this notebook is written below this directory.
mp_plot_dir = mp_drive_dir.joinpath("plots")
# Recursively collect everything below the data directory that matches "*.json".
input_data = [candidate for candidate in Path("../../data").rglob("*.json")]
# Keep regular files only (a directory name could match the pattern as well).
files = list(filter(Path.is_file, input_data))
len(files) # total 580

In [None]:
# Flatten the edit logs into one tuple per property change:
# (page title, property name, previous value, current value, edit timestamp).
MAX_FILES = 50  # only the first MAX_FILES input files are parsed
num_edits = 0
change_tuples = []
for file in tqdm(files[:MAX_FILES]):
    with open(file, 'r', encoding='utf-8') as f:
        # Every non-empty line holds one JSON-encoded edit.
        for jsonObj in f:
            if not jsonObj.strip():
                # json.loads raises on a blank line; skip it instead of aborting the load.
                continue
            single_edit = json.loads(jsonObj)
            num_edits += 1
            title = single_edit['pageTitle']#['key']  (alternative identifier that was tried)
            timestamp = single_edit['validFrom']
            for change in single_edit['changes']:
                change_tuples.append((
                    title,
                    change['property']['name'],
                    # Either value may be missing from a change record -> None.
                    change.get('previousValue'),
                    change.get('currentValue'),
                    timestamp,
                ))
print(num_edits) # 1934309 for 50
len(change_tuples) # 9715201 for 50

In [None]:
# One row per property change.  The raw 'timestamp' values are parsed right
# away so the column works with the .dt accessor and time-based grouping.
change_columns = ['title', 'name', 'previous_value', 'current_value', 'timestamp']
data = pd.DataFrame(data=change_tuples, columns=change_columns)
data = data.assign(timestamp=pd.to_datetime(data['timestamp']))

In [None]:
# Number of rows that repeat the (title, name, timestamp) combination of an
# earlier row, i.e. how many rows drop_duplicates() on these columns removes.
key_columns = ['title', 'name', 'timestamp']
int(data.duplicated(subset=key_columns).sum())
# 209344 with title as key
# 1794 with key as key

In [None]:
# Show the rows involved in a (title, name, timestamp) collision, sorted so
# that colliding rows end up next to each other.
(
    data.loc[data.duplicated(subset=['title', 'name', 'timestamp'], keep=False)]
    .sort_values(by=['title', 'name', 'timestamp'])
    .head(20)
)

# Analyze number of changes

In [None]:
# Distribution of the number of recorded changes per page.
changes_per_page = data.groupby(['title'])['timestamp'].count()
# Draw on an explicit figure/axes pair so the histogram cannot end up on a
# figure that is still open from another cell.
fig, ax = plt.subplots()
ax.hist(changes_per_page.to_numpy(), bins=100)
ax.set_yscale('log')  # bar heights are shown on a logarithmic scale
ax.set_title("Number of changes per page")
ax.set_xlabel("Changes per page")
ax.set_ylabel("#Occurrences, log")
fig.savefig(mp_plot_dir / 'changes_per_page.png')

In [None]:
# Distribution of the number of recorded changes per (title, name) attribute.
changes_per_attribute = data.groupby(['title', 'name'])['timestamp'].count()
# Draw on an explicit figure/axes pair so the histogram cannot end up on a
# figure that is still open from another cell.
fig, ax = plt.subplots()
ax.hist(changes_per_attribute.to_numpy(), bins=100)
ax.set_yscale('log')  # bar heights are shown on a logarithmic scale
ax.set_title("Number of changes per attribute")
ax.set_xlabel("Changes per attribute")
ax.set_ylabel("#Occurrences, log")
fig.savefig(mp_plot_dir / 'changes_per_attribute.png')

In [None]:
# Attach both per-group change counts to every row.  Each count Series is
# given its final column name before joining, so it does not collide with the
# existing 'timestamp' column of the frame.
data_with_agg = (
    data
    .join(changes_per_page.rename('changes_per_page'), on='title')
    .join(changes_per_attribute.rename('changes_per_attribute'), on=['title', 'name'])
)

In [None]:
# Rows of the attributes with the highest change count come first.
sorted_data = data_with_agg.sort_values(by='changes_per_attribute', ascending=False)
sorted_data.head(5)

In [None]:
# Rows of the pages with the highest change count come first.
# Note: this rebinds `sorted_data`, replacing the per-attribute ordering.
sorted_data = data_with_agg.sort_values(by='changes_per_page', ascending=False)
sorted_data.head(5)

## Changes over time

In [None]:
# Count all changes in weekly bins ('W-MON' frequency) and plot the series.
weekly_bins = pd.Grouper(key='timestamp', freq='W-MON')
changes_per_week = data.groupby(weekly_bins)['title'].count()
changes_per_week.plot()
plt.title('Changes per week')
plt.savefig(mp_plot_dir.joinpath('changes_per_week.png'))

In [None]:
# Weekday name (e.g. 'Monday') of every change timestamp; exploratory peek.
data['timestamp'].dt.day_name()

In [None]:
# Count the changes per weekday name and reorder the result from Monday to
# Sunday using an explicit list of the weekday names.
sorted_weekdays = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
changes_on_weekday = (
    data.groupby(data['timestamp'].dt.day_name())['title']
    .count()
    .reindex(sorted_weekdays)
)

In [None]:
# Bar chart of the number of changes per weekday.  The figure is created
# explicitly and labelled so the saved image can be read on its own.
fig, ax = plt.subplots()
changes_on_weekday.plot.bar(ax=ax)
ax.set_title('Changes per weekday')
ax.set_xlabel('Weekday')
ax.set_ylabel('Number of changes')
fig.tight_layout()  # make room for the tick labels in the saved image
fig.savefig(mp_plot_dir / 'changes_on_weekday.png')

# Change frequency

In [None]:
# Number of changes of every (title, name) attribute within each weekly bin.
weekly_grouper = pd.Grouper(key='timestamp', freq='W-MON')
changes_per_week_per_item = (
    data.groupby(['title', 'name', weekly_grouper])['title'].count()
)

In [None]:
# List the distinct weekly change counts that occur for any attribute.
x = changes_per_week_per_item.to_numpy()
print("different changes per week", np.unique(x), sep="\n")

In [None]:
# Group all changes by attribute, i.e. by (page title, property name).
entries = data.groupby(['title', 'name'])
# Time between the first and the last recorded change of each attribute.
total_time_interval = entries['timestamp'].max() - entries['timestamp'].min()
# The same span expressed in (fractional) weeks, as a NumPy array.
# Dividing by a Timedelta does not depend on the storage resolution of the
# timedelta values; dividing the raw integers by 10**9 silently assumed
# nanoseconds and turned missing values into meaningless numbers.
total_time_interval_weeks = (total_time_interval / pd.Timedelta(weeks=1)).to_numpy() # weeks

In [None]:
# Thresholds (in days) for the time between an attribute's first and last
# change: 0, 1, 7, 30, 180, 365 and then 2 to 14 times 365.
# They are defined here because this cell runs before the cell that originally
# introduced `deltas`, which raised a NameError on a fresh kernel.
deltas = [0, 1, 7, 30, 180, 365]
deltas.extend([365 * i for i in range(2, 15)])
deltas

In [None]:
# For every threshold, count the attributes whose first and last change lie
# more than that many days apart (the counts are plotted in the next cell).
deltas = [0, 1, 7, 30, 180, 365] + [365 * factor for factor in range(2, 15)]
total_count = len(total_time_interval)
changes_per_entity = entries.count()['timestamp']
long_standing_count = []
for d in deltas:
    day_delta = timedelta(days=d)
    # Boolean mask: True where the attribute's span exceeds the threshold.
    long_standing_entries = total_time_interval > day_delta
    n_long_standing = long_standing_entries.sum()
    n_not_long_standing = (~long_standing_entries).sum()
    long_standing_count.append(n_long_standing)
    print(f"Number of entries with more than {day_delta} between edits: {n_long_standing}, \t"
          f"Number without: {n_not_long_standing}")

In [None]:
# Grouped bar chart: for every threshold, the number of attributes above it
# next to the number of attributes at or below it.
x = np.arange(len(long_standing_count))  # the label locations
width = 0.35  # the width of the bars
below_threshold_count = [total_count - c for c in long_standing_count]

fig, ax = plt.subplots()
rects1 = ax.bar(x - width/2, long_standing_count, width, label='long standing')
rects2 = ax.bar(x + width/2, below_threshold_count, width, label='below threshold')

# Axis labels, title and one tick per threshold, labelled with its value.
ax.set_ylabel('Counts')
ax.set_xlabel('Time in days')
ax.set_title('Number of entities with difference between first and last update time > certain time')
ax.set_xticks(x)
ax.set_xticklabels(deltas, rotation=45)
ax.legend()

fig.tight_layout()
fig.savefig(mp_plot_dir / 'time_difference_first_last.png')
plt.show()


In [None]:
# Average number of changes per week for every (title, name) attribute:
# its change count divided by the span between its first and last change.
# Only the 'timestamp' column is counted; counting every column of each group
# and then discarding all but one was wasted work.
# NOTE: this rebinds `changes_per_week`, which earlier held the weekly totals
# over all pages.  Attributes whose span is zero divide by zero here.
changes_per_week = entries['timestamp'].count() / total_time_interval_weeks

In [None]:
# Partition the attributes by the time between their first and last change.
# Bin i covers spans in (deltas[i], deltas[i+1]] days, and one final open-ended
# bin collects every span above the largest threshold.  The upper bound is
# inclusive so that a span exactly equal to a threshold is not dropped from
# all bins (with a strict '<' it matched neither neighbouring bin).
interval_masks = [
    (total_time_interval > timedelta(days=lower)) & (total_time_interval <= timedelta(days=upper))
    for lower, upper in zip(deltas, deltas[1:])
]
# Use the last threshold explicitly instead of a variable left over from a loop.
interval_masks.append(total_time_interval > timedelta(days=deltas[-1]))

# Replace every mask by the changes-per-week values of the attributes it selects.
entries_within_deltas = [changes_per_week[mask] for mask in interval_masks]

In [None]:
# Bar chart of the number of attributes in each interval bin; tick i is
# labelled with deltas[i].
bin_positions = range(len(entries_within_deltas))
bin_sizes = list(map(len, entries_within_deltas))
plt.xticks(range(len(deltas)), labels=deltas, rotation=45)
plt.bar(bin_positions, bin_sizes)
plt.title("Number of entries that fall into ranges")
plt.savefig(mp_plot_dir.joinpath('time_difference_first_last_counts.png'))

In [None]:
# Number of interval bins (one per consecutive pair of thresholds plus the open-ended last bin).
len(entries_within_deltas)

In [None]:
# Histogram of changes per week for the bin at index 5, whose lower bound is
# deltas[5] = 365 days and whose upper bound is deltas[6] = 730 days.
entries_within_deltas[5].hist(bins=20)


In [None]:
# One histogram of changes per week for every interval bin from index 4
# onwards, laid out on a 3 x 5 grid of panels.
fig, ax = plt.subplots(nrows=3, ncols=5, figsize=(15, 10))
ax = ax.ravel()  # flat sequence of panels, row by row
for idx, entry_group in enumerate(entries_within_deltas[4:], start=4):
    panel = ax[idx - 4]
    entry_group.hist(ax=panel, bins=20)
    # Lower bound of the bin in years, and the mean weekly rate scaled to a year.
    lower_bound_years = deltas[idx] / 365
    mean_per_year = entry_group.mean() / 7 * 365
    panel.set_title(f"Y:{lower_bound_years:.1f}, Per Year: {mean_per_year:.2f}")
    panel.set_xlim(0, 2)
fig.suptitle("Changes per Week for different long-standing entities")
plt.tight_layout()
plt.savefig(mp_plot_dir.joinpath('changes_per_week_different_times.png'))