In [None]:
# Enable the autoreload extension in mode 2 so imported modules are reloaded
# before code is executed.
%load_ext autoreload
%autoreload 2
# Disable jedi for IPython tab completion.
%config Completer.use_jedi = False

In [None]:
import itertools
import json
from pathlib import Path
from datetime import timedelta, datetime

import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import matplotlib.pyplot as plt

from wikipedia_cleanup.data_processing import get_data
from wikipedia_cleanup.data_filter import KeepAttributesDataFilter, generate_default_filters
from wikipedia_cleanup.baseline import next_change

from sklearn.metrics import precision_recall_fscore_support

In [None]:
# Loader settings forwarded to get_data below.
# NOTE(review): presumably n_files limits how many input files are read and
# n_jobs sets the number of parallel workers - confirm against get_data.
n_files=8
n_jobs=4
input_path = Path("../../data/custom-format-default-filtered/")
# Only these four attributes are requested from the data.
keep_filter = KeepAttributesDataFilter(['infobox_key', 'property_name', 'page_id', 'value_valid_from'])

data = get_data(input_path,n_files=n_files, n_jobs=n_jobs, filters=[keep_filter])

In [None]:
# Drop the timezone information so the timestamps can be compared with the
# naive datetime bounds of the test window.
data['value_valid_from'] = data['value_valid_from'].dt.tz_localize(None)

In [None]:
# Evaluation window: starts on 2018-09-01 and spans `testset_duration` days.
testset_start = datetime(year=2018, month=9, day=1)
testset_duration = 365  # length of the test window in days
total_time_window = timedelta(days=testset_duration)
testset_end = testset_start + total_time_window
time_offset = timedelta(days=1)

In [None]:
class ZeroPredictor:
    """Baseline predictor that always answers 0.

    The week, month and year predictions are all routed through
    ``self.predict_day``, so a subclass that overrides ``predict_day``
    changes every granularity at once.
    """

    def predict_day(self, change_data, current_day):
        """Return 0 regardless of the change history and the current day."""
        return 0

    def _same_as_day(self, change_data, current_day):
        """Forward to ``predict_day`` (resolved on ``self``, so overrides apply)."""
        return self.predict_day(change_data, current_day)

    def predict_week(self, change_data, current_day):
        """Weekly prediction; identical to the daily prediction."""
        return self._same_as_day(change_data, current_day)

    def predict_month(self, change_data, current_day):
        """Monthly prediction; identical to the daily prediction."""
        return self._same_as_day(change_data, current_day)

    def predict_year(self, change_data, current_day):
        """Yearly prediction; identical to the daily prediction."""
        return self._same_as_day(change_data, current_day)

class DummyPredictor(ZeroPredictor):
    """Predictor driven by the ``next_change`` baseline.

    Only ``predict_day`` is overridden; the week, month and year
    predictions inherited from ``ZeroPredictor`` delegate to it.
    """

    def predict_day(self, change_data, current_day):
        """Compare the estimate from ``next_change`` with ``current_day``.

        Returns 0 when ``next_change`` yields ``None``; otherwise returns
        whether the estimate lies more than one day after ``current_day``.
        """
        estimated_change = next_change(change_data)
        if estimated_change is not None:
            # NOTE(review): this is True when the estimated change is MORE than
            # a day away, which looks inverted for a "change expected now"
            # signal - confirm the intended direction against next_change.
            return estimated_change - current_day > timedelta(1)
        return 0

In [None]:
# Number of changes inside the testing period. Both bounds are exclusive;
# testset_start + total_time_window is the same instant as testset_end.
((data['value_valid_from'] > testset_start) & (data['value_valid_from'] < (testset_start+total_time_window))).sum()

In [None]:
# List of change timestamps per group. Despite the variable name, the active
# grouping key is page_id; the per-property grouping is kept here as a
# disabled alternative.
#property_change_history = data.groupby(['infobox_key', 'property_name'])["value_valid_from"].agg(list)
property_change_history = data.groupby(['page_id'])["value_valid_from"].agg(list)

In [None]:
# Walk through the test window one day at a time for every group in
# property_change_history, collecting predictions and ground-truth labels.
all_year_predictions = []
all_month_predictions = []
all_week_predictions = []
all_day_predictions = []
all_day_labels = []
model = DummyPredictor()
# One more entry than there are test days, so the lookup performed after the
# last evaluated day still has a valid index.
current_dates = [testset_start + timedelta(days=x) for x in range(testset_duration + 1)]

# Series.items() replaces Series.iteritems(), which was removed in pandas 2.0.
for _, changes in tqdm(property_change_history.items(), total=len(property_change_history)):
    days_evaluated = 0
    current_date = testset_start
    changes = np.sort(changes)
    # Index of the first change strictly after the test start; everything
    # before that index is the history handed to the model.
    train_data_idx = np.searchsorted(changes, current_date, side="right")
    # Zero-filled rather than np.empty: the daily prediction below is disabled,
    # so an uninitialised buffer would leak arbitrary memory into the results.
    day_predictions = np.zeros(testset_duration)
    week_predictions = []
    month_predictions = []
    year_predictions = []
    day_labels = []
    while current_date < testset_end:
        train_input = changes[:train_data_idx]
        # Daily and weekly predictions are currently disabled:
        #day_predictions[days_evaluated] = model.predict_day(train_input, current_date)
        #if days_evaluated % 7 == 0:
        #     week_predictions.append(model.predict_week(train_input, current_date))
        if days_evaluated % 30 == 0:
            month_predictions.append(model.predict_month(train_input, current_date))
        if days_evaluated % 365 == 0:
            year_predictions.append(model.predict_year(train_input, current_date))
        # The label is True when the next not-yet-seen change falls on the
        # calendar day currently being evaluated.
        if train_data_idx < len(changes):
            day_labels.append(changes[train_data_idx].date() == current_date.date())
        else:
            day_labels.append(False)
        days_evaluated += 1
        current_date = current_dates[days_evaluated]
        # Move every change that happened before the new current day into the
        # history used for the next prediction.
        while train_data_idx < len(changes) and changes[train_data_idx] < current_date:
            train_data_idx += 1
    all_day_predictions.append(day_predictions)
    all_week_predictions.append(week_predictions)
    all_month_predictions.append(month_predictions)
    all_year_predictions.append(year_predictions)
    all_day_labels.append(day_labels)

In [None]:
# Convert the collected per-group lists into boolean arrays.
# The builtin `bool` is used because the `np.bool` alias was deprecated in
# NumPy 1.20 and removed in NumPy 1.24.
all_year_predictions = np.array(all_year_predictions, dtype=bool)
all_month_predictions = np.array(all_month_predictions, dtype=bool)
all_week_predictions = np.array(all_week_predictions, dtype=bool)
all_day_predictions = np.array(all_day_predictions, dtype=bool)

all_day_labels = np.array(all_day_labels, dtype=bool)

In [None]:
def aggregate_labels(data, n):
    """Collapse per-day labels into buckets of ``n`` consecutive days.

    Parameters
    ----------
    data : array-like of shape (n_series, n_days)
        One row of daily labels per series.
    n : int
        Bucket length in days.

    Returns
    -------
    numpy.ndarray of bool, shape (n_series, ceil(n_days / n))
        Entry ``[i, b]`` is True when any of the days
        ``b * n .. b * n + n - 1`` of series ``i`` is truthy. A trailing
        partial bucket is padded with zeros (i.e. False) before reducing.
    """
    labels = np.asarray(data)
    n_series, n_days = labels.shape
    leftover = n_days % n
    if leftover:
        # Pad only the day axis so that its length becomes a multiple of n.
        labels = np.pad(labels, ((0, 0), (0, n - leftover)))
    # Each row is split into consecutive chunks of n days. The series axis has
    # to stay first: reshaping to (-1, n, n_series) would mix days of
    # different series into the same bucket.
    buckets = labels.reshape(n_series, labels.shape[1] // n, n)
    return buckets.any(axis=2)

# Aggregate the daily labels with bucket sizes of 7, 30 and 365 days.
all_week_labels = aggregate_labels(all_day_labels, 7)
all_month_labels = aggregate_labels(all_day_labels, 30)
all_year_labels = aggregate_labels(all_day_labels, 365)

# Random prediction

In [None]:
# Random baseline: overwrite every prediction array with coin flips (0 or 1)
# of the same shape as the corresponding labels. A seeded generator keeps the
# reported metrics reproducible across kernel restarts.
random_baseline_rng = np.random.default_rng(seed=0)
all_day_predictions = random_baseline_rng.integers(0, 2, all_day_labels.shape)
all_week_predictions = random_baseline_rng.integers(0, 2, all_week_labels.shape)
all_month_predictions = random_baseline_rng.integers(0, 2, all_month_labels.shape)
all_year_predictions = random_baseline_rng.integers(0, 2, all_year_labels.shape)

# Metrics

In [None]:
def print_stats(pre_rec_f1_stat, title):
    """Print a small two-column table of classification metrics.

    ``pre_rec_f1_stat`` is indexed as ``[metric][class]`` with the metrics
    in the order precision, recall, F1 score, support. Class 1 is shown in
    the "changes" column and class 0 in the "no changes" column. The last
    line reports the share of class 1 in the support and its absolute count.
    """
    print(f"{title} \t\t changes \t no changes")
    metric_rows = (("Precision:\t\t", 0), ("Recall:\t\t\t", 1), ("F1score:\t\t", 2))
    for row_label, row_idx in metric_rows:
        per_class = pre_rec_f1_stat[row_idx]
        print(f"{row_label} {per_class[1]:.4} \t\t {per_class[0]:.4}")
    support = pre_rec_f1_stat[3]
    change_count = support[1]
    change_share = change_count / (support[0] + change_count)
    print(f"Percent of Data:\t {change_share:.4}, \tTotal: {change_count}")
    print()

In [None]:
# Precision, recall, F1 and support for each granularity. Labels and
# predictions are flattened first, so all rows and time steps are scored
# together as one long sequence.
day_data = precision_recall_fscore_support(all_day_labels.flatten(), all_day_predictions.flatten())
week_data = precision_recall_fscore_support(all_week_labels.flatten(), all_week_predictions.flatten())
month_data = precision_recall_fscore_support(all_month_labels.flatten(), all_month_predictions.flatten())
year_data = precision_recall_fscore_support(all_year_labels.flatten(), all_year_predictions.flatten())

In [None]:
# Report the metrics for all four granularities.
print_stats(day_data, "Per day data")
print_stats(week_data, "Per week data")
print_stats(month_data, "Per month data")
print_stats(year_data, "Per year data")

In [None]:
# Reduced report: only the month and year granularities are printed here;
# the day and week reports are left out of this cell.
print_stats(month_data, "Per month data")
print_stats(year_data, "Per year data")