In [1]:
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt

In [None]:
from wikipedia_cleanup.data_processing import get_data
from wikipedia_cleanup.data_filter import generate_default_filters, KeepAttributesDataFilter, OnlyUpdatesDataFilter

In [None]:
# Set RAM Limit
import resource
  
def limit_memory(maxsize):
    """Set the soft address-space limit (RLIMIT_AS) of this process.

    Args:
        maxsize: New soft limit, forwarded as-is to ``resource.setrlimit``.

    The current hard limit is read back and passed through unchanged.
    """
    # Only the hard limit is needed; the current soft limit is overwritten.
    _, hard_limit = resource.getrlimit(resource.RLIMIT_AS)
    new_limits = (maxsize, hard_limit)
    resource.setrlimit(resource.RLIMIT_AS, new_limits)
# Apply the cap: 84 * 10**9 is passed as the new soft limit.
limit_memory(maxsize=84_000_000_000)

# Turn off jedi so that tab completion works in the notebook
%config Completer.use_jedi = False

Load data

In [None]:
# Read the dataset from the pre-filtered directory with a single worker.
# Two filters are passed: OnlyUpdatesDataFilter and a KeepAttributesDataFilter
# listing the attributes this notebook uses (filter semantics are defined in
# wikipedia_cleanup.data_filter).
data = get_data(
    '../../custom-format-default-filtered/',
    n_jobs=1,
    filters=[
        OnlyUpdatesDataFilter(),
        KeepAttributesDataFilter([
            'infobox_key',
            'page_title',
            'property_name',
            'template',
            'previous_value',
            'current_value',
            'value_valid_from',
        ]),
    ],
)

In [None]:
# Drop rows whose property name is the empty string, then expose the
# 'value_valid_from' column under the shorter name 'timestamp'.
data = (
    data
    .loc[lambda frame: frame['property_name'] != '']
    .rename(columns={"value_valid_from": "timestamp"})
)

Group by property name and template to compute the mean and median time between consecutive changes

In [None]:
# Register `progress_apply` on pandas objects. Without this call the groupby
# object below has no `progress_apply` attribute.
tqdm.pandas()

# A (property_name, template) pair is flagged static when its typical gap
# between changes exceeds this threshold (56 weeks, roughly 13 months).
STATIC_THRESHOLD = pd.Timedelta(weeks=56)

g = data.groupby(['property_name', 'template'])

# For every pair: sort the change timestamps, take the gaps between
# consecutive changes, and aggregate them with the mean / the median.
# `reset_index()` names the aggregated column 0, hence the rename.
avg_time_diff = (
    g.progress_apply(lambda x: x['timestamp'].sort_values().diff().mean())
    .reset_index()
    .rename(columns={0: 'avgChangeFrequ'})
)
median_time_diff = (
    g.progress_apply(lambda x: x['timestamp'].sort_values().diff().median())
    .reset_index()
    .rename(columns={0: 'medianChangeFrequ'})
)

# Each frame is thresholded on its OWN statistic column.
# NOTE(review): a pair with a single change has no gap (NaT); NaT compares
# False against the threshold, so such pairs end up flagged as not static.
avg_time_diff['static'] = avg_time_diff['avgChangeFrequ'] > STATIC_THRESHOLD
median_time_diff['static'] = median_time_diff['medianChangeFrequ'] > STATIC_THRESHOLD

Plotting

In [None]:
# Split both statistics frames on their boolean 'static' flag and count the
# occurrences of each (property_name, template) combination.
# The flag column is used directly as a mask (and negated with ~).
# NOTE(review): the frames come from a groupby on exactly these two columns,
# so each pair appears once and every count is presumably 1 -- confirm that
# this is the intended quantity to plot.
pair_columns = ['property_name', 'template']

avg_static_infoboxes = avg_time_diff.loc[avg_time_diff['static'], pair_columns].value_counts()
avg_dynamic_infoboxes = avg_time_diff.loc[~avg_time_diff['static'], pair_columns].value_counts()
median_static_infoboxes = median_time_diff.loc[median_time_diff['static'], pair_columns].value_counts()
median_dynamic_infoboxes = median_time_diff.loc[~median_time_diff['static'], pair_columns].value_counts()

In [None]:
# Bar chart of the first ten static (property, template) pairs; value_counts
# already returns them ordered from most to least frequent.
# A bar chart is used because the figure shows one count per pair: a box plot
# of ten scalar values draws a single box and cannot take ten labels.
top_static = avg_static_infoboxes[:10]

# The index entries are (property_name, template) tuples; matplotlib needs
# one string per bar, so join each tuple into a single tick label.
static_labels = [' | '.join(map(str, pair)) for pair in top_static.index]

fig, ax = plt.subplots(figsize=(12, 5))
ax.bar(static_labels, top_static.values)
ax.set_title('Count of static property, template pairs')
ax.set_ylabel('Count of static infobox occurences')
ax.set_xlabel('Property template pair')
# Combined labels are long; rotate them so they do not overlap.
ax.tick_params(axis='x', labelrotation=45)
plt.show()

In [None]:
# Bar chart of the first ten dynamic (property, template) pairs; value_counts
# already returns them ordered from most to least frequent.
top_dynamic = avg_dynamic_infoboxes[:10]

# `bar` takes the x positions/labels first and the bar heights second; it has
# no `labels` keyword. The index entries are (property_name, template)
# tuples, so join each one into a single string tick label.
dynamic_labels = [' | '.join(map(str, pair)) for pair in top_dynamic.index]

fig, ax = plt.subplots(figsize=(12, 5))
ax.bar(dynamic_labels, top_dynamic.values)
ax.set_title('Count of dynamic property, template pairs')
ax.set_ylabel('Count of dynamic infobox occurences')
ax.set_xlabel('Property template pair')
# Combined labels are long; rotate them so they do not overlap.
ax.tick_params(axis='x', labelrotation=45)
plt.show()