In [None]:
import json
from pathlib import Path

import numpy as np
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
import libarchive.public
import pickle
import random
import Levenshtein

In [None]:
# Share of the input archives, in percent, that is handled per batch.
# Later cells derive the number of batches as 100 // PERCENTAGE and also use
# PERCENTAGE / 100 as the fraction of archives to read.
PERCENTAGE = 5

In [None]:
def input_file_size(file_path: Path):
    """Return the summed `size` attribute of all entries in the archive at `file_path`.

    An archive without entries yields 0.
    """
    with libarchive.public.file_reader(str(file_path)) as archive:
        return sum(entry.size for entry in archive)

In [None]:
# Folder that holds the matched infobox archives (relative to the notebook).
input_folder = Path('../../../data/matched-infoboxes-raw')
# rglob yields paths in directory-listing order, which is not guaranteed to be
# stable. Sorting keeps the batch slices taken from this list reproducible, so
# that "part i" always refers to the same archives.
input_files = sorted(input_folder.rglob('*.7z'))
len(input_files)

In [None]:
# SAVE change tuples partially on disk.
# The input archives are split into 100 // PERCENTAGE consecutive batches. Each
# batch is flattened into change tuples
#   (pageID, pageTitle, property name, previous value, current value, validFrom, valueValidTo)
# and pickled into its own file.
num_iterations = 100 // PERCENTAGE
output_folder = Path('../../../data/raw_change_tuples')
# Create the target folder up front so the first dump cannot fail after a long parse.
output_folder.mkdir(parents=True, exist_ok=True)
for i in range(num_iterations):
    # Pure integer arithmetic: float bounds such as int(n / k * (i + 1)) can
    # round to just below an integer and then skip an archive.
    start = len(input_files) * i // num_iterations
    stop = len(input_files) * (i + 1) // num_iterations
    partial_input_files = input_files[start:stop]
    print(f'reading {start} to {stop}')
    change_tuples = []
    for archive_path in tqdm(partial_input_files):
        with libarchive.public.file_reader(str(archive_path)) as archive:
            for entry in archive:
                content = b''.join(entry.get_blocks()).decode(encoding='utf_8')
                # Every non-empty line is one JSON object.
                for jsonObj in filter(None, content.split('\n')):
                    obj = json.loads(jsonObj)
                    title = obj['pageTitle']
                    subject = obj['pageID']
                    changes = obj['changes']
                    valid_from = obj['validFrom']
                    for change in changes:
                        # Optional keys become None when they are absent.
                        current_value = change.get('currentValue')
                        previous_value = change.get('previousValue')
                        name = change['property']['name']
                        valid_to = change.get('valueValidTo')
                        change_tuples.append((subject, title, name, previous_value, current_value, valid_from, valid_to))

    print('writing file')
    with open(output_folder / f'partial_change_tuples_part_{i}.pickle', 'wb') as file:
        pickle.dump(change_tuples, file)
    print('successfully wrote file')

In [None]:
# Read the pickled change tuples of the first batch (part 0) back into memory.
first_part_path = f'../../../data/raw_change_tuples/partial_change_tuples_part_{0}.pickle'
with open(first_part_path, 'rb') as file:
    change_tuples = pickle.load(file)

In [None]:
# Turn the change tuples into a DataFrame and derive, per change, how long the
# value stayed valid (valid_to - valid_from, as a timedelta).
column_names = ['subject', 'title', 'name', 'previous_value', 'current_value', 'valid_from', 'valid_to']
data = pd.DataFrame(change_tuples, columns=column_names)
del change_tuples  # the raw list is dropped once the frame exists
for timestamp_column in ('valid_from', 'valid_to'):
    data[timestamp_column] = pd.to_datetime(data[timestamp_column])
data['valid_time'] = data['valid_to'] - data['valid_from']
data

In [None]:
# Display the `data` frame.
data

In [None]:
# Histogram of how long a value stayed valid. Rows without a valid_time are
# skipped. The timedeltas are cast to int64, i.e. raw timedelta ticks
# (presumably nanoseconds with pandas' default resolution — confirm).
valid_times = data['valid_time'][data['valid_time'].notnull()].to_numpy().astype('int64')
plt.hist(valid_times, bins=100)
plt.yscale('log')
# The plotted quantity is the valid time, so it is named on the title and the
# x axis; the y axis carries the log-scaled bin counts.
plt.title("Distribution of valid time per change")
plt.xlabel("Valid time")
plt.ylabel("#Occurrences, log")

In [None]:
# Count, per page title, how often each property appears in a change:
# counted_changes[page title][property name] -> number of changes.
counted_changes = {}
for archive_path in tqdm(input_files):
    with libarchive.public.file_reader(str(archive_path)) as archive:
        for entry in archive:
            raw = bytearray()
            for block in entry.get_blocks():
                raw.extend(block)
            for line in raw.decode(encoding='utf_8').split('\n'):
                if not line:
                    continue
                obj = json.loads(line)
                subject = obj['pageTitle']
                changes = obj['changes']
                timestamp = obj['validFrom']  # read, but not used by the counting below
                curr_counted_changes = counted_changes.setdefault(subject, {})
                for change in changes:
                    name = change['property']['name']
                    curr_counted_changes[name] = curr_counted_changes.get(name, 0) + 1

In [None]:
# Flatten the archives in `partial_input_files` into
#   (page title, property name, previous value, current value, validFrom)
# tuples. `partial_input_files` is not assigned here; it has to exist already
# from an earlier cell.
change_tuples = []
for archive_path in tqdm(partial_input_files):
    with libarchive.public.file_reader(str(archive_path)) as archive:
        for entry in archive:
            raw = bytearray()
            for block in entry.get_blocks():
                raw.extend(block)
            for line in raw.decode(encoding='utf_8').split('\n'):
                if not line:
                    continue
                obj = json.loads(line)
                subject = obj['pageTitle']
                changes = obj['changes']
                timestamp = obj['validFrom']
                for change in changes:
                    # Absent optional keys are stored as None.
                    current_value = change.get('currentValue')
                    previous_value = change.get('previousValue')
                    name = change['property']['name']
                    change_tuples.append((subject, name, previous_value, current_value, timestamp))

In [None]:
# Read the first 50 line-delimited JSON files and flatten every edit into
#   (page title, property name, previous value, current value, validFrom)
# tuples, counting the edits along the way.
# NOTE(review): `files` is not assigned in any cell visible here — confirm where it comes from.
num_edits = 0
change_tuples = []
for file in tqdm(files[:50]):
    with open(file, 'r', encoding='utf-8') as f:
        for raw_line in f:
            single_edit = json.loads(raw_line)
            num_edits += 1
            title = single_edit['pageTitle']
            changes = single_edit['changes']
            timestamp = single_edit['validFrom']
            for change in changes:
                name = change['property']['name']
                # Absent optional keys are stored as None.
                current_value = change.get('currentValue')
                previous_value = change.get('previousValue')
                change_tuples.append((title, name, previous_value, current_value, timestamp))
print(num_edits) # 1934309 for 50
len(change_tuples) # 9715201 for 50

In [None]:
# Analyse Size of Changes

In [None]:
# SAVE change tuples partially on disk.
# The input archives are split into 100 // PERCENTAGE consecutive batches. Each
# batch is flattened into change tuples
#   (pageID, pageTitle, property name, previous value, current value, validFrom, valueValidTo)
# and pickled into its own file.
num_iterations = 100 // PERCENTAGE
output_folder = Path('../../../data/raw_change_tuples')
# Create the target folder up front so the first dump cannot fail after a long parse.
output_folder.mkdir(parents=True, exist_ok=True)
for i in range(num_iterations):
    # Pure integer arithmetic: float bounds such as int(n / k * (i + 1)) can
    # round to just below an integer and then skip an archive.
    start = len(input_files) * i // num_iterations
    stop = len(input_files) * (i + 1) // num_iterations
    partial_input_files = input_files[start:stop]
    print(f'reading {start} to {stop}')
    change_tuples = []
    for archive_path in tqdm(partial_input_files):
        with libarchive.public.file_reader(str(archive_path)) as archive:
            for entry in archive:
                content = b''.join(entry.get_blocks()).decode(encoding='utf_8')
                # Every non-empty line is one JSON object.
                for jsonObj in filter(None, content.split('\n')):
                    obj = json.loads(jsonObj)
                    title = obj['pageTitle']
                    subject = obj['pageID']
                    changes = obj['changes']
                    valid_from = obj['validFrom']
                    for change in changes:
                        # Optional keys become None when they are absent.
                        current_value = change.get('currentValue')
                        previous_value = change.get('previousValue')
                        name = change['property']['name']
                        valid_to = change.get('valueValidTo')
                        change_tuples.append((subject, title, name, previous_value, current_value, valid_from, valid_to))

    print('writing file')
    with open(output_folder / f'partial_change_tuples_part_{i}.pickle', 'wb') as file:
        pickle.dump(change_tuples, file)
    print('successfully wrote file')

# Read the first batch (part 0) back into memory for the analysis that follows.
with open(f'../../../data/raw_change_tuples/partial_change_tuples_part_{0}.pickle', 'rb') as file:
    change_tuples = pickle.load(file)

In [None]:
# Build the DataFrame from the change tuples and express how long each value
# stayed valid as a number of seconds.
column_names = ['subject', 'title', 'name', 'previous_value', 'current_value', 'valid_from', 'valid_to']
data = pd.DataFrame(change_tuples, columns=column_names)
del change_tuples  # the raw list is dropped once the frame exists
for timestamp_column in ('valid_from', 'valid_to'):
    data[timestamp_column] = pd.to_datetime(data[timestamp_column])
validity_span = data['valid_to'] - data['valid_from']
# Dividing a timedelta by one second gives the duration in seconds.
data['valid_time'] = validity_span / np.timedelta64(1, 's')


In [None]:
# Remove the columns that the change-size analysis does not use.
data.drop(columns=['title', 'name', 'valid_from', 'valid_to'], inplace=True)

In [None]:
# Size of a change = Levenshtein distance between previous and current value.
# When either side is missing or empty, the size is None instead of a distance.
def _change_size(previous, current):
    if not (previous and current):
        return None
    return Levenshtein.distance(previous, current)

data['change_size'] = data['previous_value'].combine(data['current_value'], _change_size, fill_value=None)

In [None]:
# Order the rows in place by how long the value stayed valid.
data.sort_values(by='valid_time', inplace=True)

In [None]:
def seconds_to_day(sec):
    """Convert a duration given in seconds into days (86400 seconds per day)."""
    seconds_per_day = 60 * 60 * 24
    return sec / seconds_per_day

def day_to_seconds(sec):
    """Convert a duration given in days into seconds (86400 seconds per day).

    The argument is a number of days, although the parameter is named `sec`.
    """
    seconds_per_day = 60 * 60 * 24
    return sec * seconds_per_day

In [None]:
# Minimum, maximum, mean and standard deviation of the valid time, in days.
# Rows without a valid time are left out.
valid_time = seconds_to_day(data['valid_time'].dropna().to_numpy())
tuple(statistic() for statistic in (valid_time.min, valid_time.max, valid_time.mean, valid_time.std))

In [None]:
# Distribution of change sizes for values that were replaced within one day.
recent_changes = data[data['valid_time'] <= day_to_seconds(1)]
# change_size is None where one side of the change was missing or empty; those
# rows carry no distance and are dropped before plotting.
plt.hist(recent_changes['change_size'].dropna().to_numpy(dtype=float), bins=list(range(20)))
# The y label announces a log scale, so apply it.
plt.yscale('log')
plt.title("Size of changes (Levenshtein) for changes that held for less than a day")
plt.xlabel("Levenshtein distance")
plt.ylabel("#Occurances, log")

In [None]:
# Show the changes whose computed size is zero or less.
data.loc[data['change_size'] <= 0]

In [None]:
# Distribution of change sizes for values that stayed valid for at least a year.
# A year is 365 days (the previous threshold of 356 was a digit swap).
# The variable keeps the name `recent_changes` although it holds long-lived changes.
recent_changes = data[data['valid_time'] >= day_to_seconds(365)]
# change_size is None where one side of the change was missing or empty; those
# rows carry no distance and are dropped before plotting.
plt.hist(recent_changes['change_size'].dropna().to_numpy(dtype=float), bins=list(range(100)))
# The y label announces a log scale, so apply it.
plt.yscale('log')
plt.title("Size of changes (Levenshtein) for changes that held at least a year")
plt.xlabel("Levenshtein distance")
plt.ylabel("#Occurances, log")

In [None]:
# File Tests

In [None]:
# Does ['property']['type'] ever take a value other than 'attribute'?
# Collect every distinct property type that occurs in any change.
types = set()
for archive_path in tqdm(input_files):
    with libarchive.public.file_reader(str(archive_path)) as archive:
        for entry in archive:
            raw = bytearray()
            for block in entry.get_blocks():
                raw.extend(block)
            for line in raw.decode(encoding='utf_8').split('\n'):
                if not line:
                    continue
                obj = json.loads(line)
                types.update(change['property']['type'] for change in obj['changes'])
types

In [None]:
# How many numeric values
# Classify the current value of every change as numeric (parsable by float())
# or string, and count how often a change switches between the two kinds.
def _is_number(value):
    """Return True when float(value) succeeds, False otherwise."""
    try:
        float(value)
    except (TypeError, ValueError):
        # ValueError: a string that is not a number.
        # TypeError: a non-string value such as None (JSON null), a list or a dict.
        return False
    return True

numeric = 0
string = 0
numeric_to_string = 0
string_to_numeric = 0
for archive_path in tqdm(input_files):
    with libarchive.public.file_reader(str(archive_path)) as archive:
        for entry in archive:
            content_bytes = bytearray('', encoding='utf_8')
            for block in entry.get_blocks():
                content_bytes += block
            content = content_bytes.decode(encoding='utf_8')
            jsonObjs = content.split('\n')
            for jsonObj in filter(lambda x: x, jsonObjs):
                obj = json.loads(jsonObj)
                changes = obj['changes']
                for change in changes:
                    # Only changes that carry a current value are counted.
                    if 'currentValue' not in change:
                        continue
                    curr_val_number = _is_number(change['currentValue'])
                    numeric += int(curr_val_number)
                    string += int(not curr_val_number)
                    # A type switch needs a previous value to compare against.
                    if 'previousValue' in change:
                        prev_val_number = _is_number(change['previousValue'])
                        if curr_val_number and not prev_val_number:
                            string_to_numeric += 1
                        if not curr_val_number and prev_val_number:
                            numeric_to_string += 1

# Guard against an empty input set; all counters are 0 in that case.
total = (numeric + string) or 1
# The figures printed after '%' are fractions in [0, 1], not percentages.
print(f'numeric: {numeric} \t\t % {numeric / total}')
print(f'string: {string} \t\t % {string / total}')
print('\n\nType Changes\n\n')
print(f'numeric to string: {numeric_to_string} \t\t % {numeric_to_string / total}')
print(f'string to numeric: {string_to_numeric} \t\t % {string_to_numeric / total}')

In [None]:
# Analyse bot reverts

In [None]:
# Flatten the first PERCENTAGE percent of the archives into
#   (key, revisionId, property name, previous value, current value, validFrom, valueValidTo)
# tuples and pickle them.
partial_input_files = input_files[: int(len(input_files) * (PERCENTAGE / 100))]
change_tuples = []
for archive_path in tqdm(partial_input_files):
    with libarchive.public.file_reader(str(archive_path)) as archive:
        for entry in archive:
            raw = bytearray()
            for block in entry.get_blocks():
                raw.extend(block)
            for line in raw.decode(encoding='utf_8').split('\n'):
                if not line:
                    continue
                obj = json.loads(line)
                key = obj['key']
                revisionID = obj['revisionId']
                valid_from = obj['validFrom']
                changes = obj['changes']
                for change in changes:
                    # Absent optional keys are stored as None.
                    current_value = change.get('currentValue')
                    previous_value = change.get('previousValue')
                    name = change['property']['name']
                    valid_to = change.get('valueValidTo')
                    change_tuples.append((key, revisionID, name, previous_value, current_value, valid_from, valid_to))

print('writing file')
key_pickle_path = f'../../../data/raw_change_tuples/partial_change_tuples_part_key_{0}.pickle'
with open(key_pickle_path, 'wb') as file:
    pickle.dump(change_tuples, file)

In [None]:
# Read the pickled (key, revisionId, ...) change tuples back into memory.
key_pickle_path = f'../../../data/raw_change_tuples/partial_change_tuples_part_key_{0}.pickle'
with open(key_pickle_path, 'rb') as file:
    change_tuples = pickle.load(file)

In [None]:
# Build the DataFrame from the (key, revisionId, ...) tuples and express how
# long each value stayed valid as a number of seconds.
column_names = ['key', 'revisionId', 'name', 'previous_value', 'current_value', 'valid_from', 'valid_to']
data = pd.DataFrame(change_tuples, columns=column_names)
del change_tuples  # the raw list is dropped once the frame exists
for timestamp_column in ('valid_from', 'valid_to'):
    data[timestamp_column] = pd.to_datetime(data[timestamp_column])
validity_span = data['valid_to'] - data['valid_from']
# Dividing a timedelta by one second gives the duration in seconds.
data['valid_time'] = validity_span / np.timedelta64(1, 's')

In [None]:
# Group the changes by (key, revisionId, name) and show the rows of every
# group that holds more than one change.
grouping_columns = ['key', 'revisionId', 'name']
groups = data.groupby(by=grouping_columns)
groups.filter(lambda group: len(group) > 1)

In [None]:
# Print the rows of the first 20 groups.
# `groups.groups` maps each group key to the index labels of its rows. It is a
# dict, so Python 3 needs .items() (iteritems() is Python 2 only). The rows are
# looked up by label with .loc on `data`, the frame the groups were built from
# (`df` is not defined, and .ix no longer exists in pandas).
for key, values in list(groups.groups.items())[:20]:
    print(data.loc[values], "\n\n")

In [None]:
# Descriptive statistics per group.
groups.describe()

In [None]:
# Analyse Creations and Deletions

In [None]:
# Load the pickled change sizes stored next to the input archives.
change_sizes_path = input_folder / 'change_sizes.pickle'
with open(change_sizes_path, 'rb') as file:
    change_sizes = pickle.load(file)

In [None]:
# Histogram of the change sizes in unit-wide bins from 0 to 99, with a linear y axis.
change_sizes = np.array(change_sizes)
size_bins = list(range(100))
plt.hist(change_sizes, bins=size_bins)
plt.title("Size of changes (Levenstein Distance)")
plt.ylabel("#Occurances")

In [None]:
# Minimum, maximum, mean and standard deviation of the change sizes.
tuple(statistic() for statistic in (change_sizes.min, change_sizes.max, change_sizes.mean, change_sizes.std))

# Analyse changes

In [None]:
# Number of changes per page title, shown as a histogram with a log-scaled y axis.
# NOTE(review): this reads 'title' and 'timestamp' columns, which the `data`
# frames built in the visible cells do not both have — presumably it expects a
# frame made from the (title, name, previous, current, timestamp) tuples; confirm.
changes_per_page = data.groupby(['title'])['timestamp'].count()
fig, ax = plt.subplots()
ax.hist(changes_per_page.to_numpy(), bins=100)
ax.set_yscale('log')
ax.set_title("Number of changes per page")
ax.set_ylabel("#Occurances, log")

In [None]:
# Number of changes per (page title, property name) pair, shown as a histogram
# with a log-scaled y axis.
# NOTE(review): this reads 'title' and 'timestamp' columns, which the `data`
# frames built in the visible cells do not both have — confirm the expected frame.
changes_per_attribute = data.groupby(['title', 'name'])['timestamp'].count()
fig, ax = plt.subplots()
ax.hist(changes_per_attribute.to_numpy(), bins=100)
ax.set_yscale('log')
ax.set_title("Number of changes per attribute")
ax.set_ylabel("#Occurances, log")

In [None]:
# Attach both counts to every change row. Each joined count series is named
# 'timestamp', so it arrives as 'timestamp_r' and is renamed right away.
data_with_agg = (
    data
    .join(changes_per_page, on='title', rsuffix='_r')
    .rename(columns={'timestamp_r': 'changes_per_page'})
    .join(changes_per_attribute, on=['title', 'name'], rsuffix='_r')
    .rename(columns={'timestamp_r': 'changes_per_attribute'})
)

In [None]:
# Display the rows ordered by the number of changes per attribute (ascending).
data_with_agg.sort_values('changes_per_attribute')