In [None]:
# Enable IPython's autoreload extension; mode 2 re-imports modules before each
# cell execution, so edits to the imported project code are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import itertools
import json
from pathlib import Path
from datetime import timedelta, datetime

import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import matplotlib.pyplot as plt

from wikipedia_cleanup.data_processing import read_file, get_data
from wikipedia_cleanup.baseline import next_change

In [None]:
# Root of the shared project drive; plots are stored in its "plots" sub-folder.
# Alternative location previously used here:
#   Path("//FS23/projekte$/MP2021/MPWS2021/MPWS2021FN1")
mp_drive_dir = Path("/media/hpi_share/")
mp_plot_dir = mp_drive_dir.joinpath("plots")

# Collect every *.json entry below the data directory (recursively) and keep
# only the ones that are regular files.
input_path = Path("../../data")
input_data = [*input_path.rglob("*.json")]
files = [candidate for candidate in input_data if candidate.is_file()]
len(files)  # total 580

In [None]:
# Load the change records through the project helper `get_data`.
# NOTE(review): `n_files=3` presumably restricts loading to a small subset of
# the JSON files and `n_jobs=6` presumably sets the number of parallel workers
# -- confirm against wikipedia_cleanup.data_processing.get_data.
data = get_data(input_path, n_files=3, n_jobs=6)

In [None]:
# Make both timestamp columns timezone-naive so they can be compared with the
# plain `datetime` objects used further down.
# `validFrom` is additionally truncated to day resolution; 'D' is the
# documented pandas offset alias for a calendar day.
# NOTE(review): `validTo` keeps its full time-of-day resolution -- confirm
# that this asymmetry between the two columns is intended.
data['validFrom'] = data['validFrom'].dt.floor('D').dt.tz_localize(None)
data['validTo'] = data['validTo'].dt.tz_localize(None)

In [None]:
# Keep only the rows that have a `validTo` timestamp.
has_valid_to = data['validTo'].notna()
train_data = data[has_valid_to]

# Predict the last timestamp

In [None]:
# For every (pageID, property.name) group, take the latest `validFrom` value as
# the label and collect the baseline's prediction next to it.
timestamps_per_page_property = train_data.groupby(['pageID', 'property.name'])['validFrom']
labels, predicted_values = [], []
for key, values in tqdm(timestamps_per_page_property):
    # Groups with at most two recorded timestamps are skipped.
    if len(values) <= 2:
        continue
    # Label: the most recent timestamp of this group.
    labels.append(sorted(values)[-1])
    # NOTE(review): the complete series -- including the label above -- is
    # handed to `next_change`; confirm that the helper holds out the last
    # value itself, otherwise the target leaks into the prediction input.
    predicted_values.append(next_change(values))

In [None]:
# Turn the collected Python lists into NumPy arrays for vectorised arithmetic.
labels = np.array(labels)
predicted_values = np.array(predicted_values)
# Elementwise mask of the entries that hold a prediction. `!= None` is used on
# purpose: `is not None` would test the array object itself, not its elements.
# NOTE(review): this presumes `next_change` signals "no prediction" with None
# -- confirm against the helper.
no_pred_filter = predicted_values!= None

In [None]:
# Absolute difference between prediction and label, restricted to the entries
# for which a prediction exists.
r = np.abs(predicted_values[no_pred_filter] - labels.astype(np.datetime64)[no_pred_filter])
# Express the error in whole days. The builtin `int` replaces `np.int`, which
# was only an alias for it and no longer exists in NumPy >= 1.24.
converted_error = r.astype('timedelta64[D]').astype(int)
mean_absolute_error = np.mean(converted_error)
root_squared_mean_error = np.sqrt(np.mean(np.square(converted_error)))
print(f"Mean days absolute error: {mean_absolute_error}")
print(f"Root squared mean error: {root_squared_mean_error}")

# Results recorded from earlier runs, kept for reference.
#
# With unique
#   Mean
#     Mean days absolute error: 481.6636837864201
#     Root squared mean error: 704.0731080458512
#   Median
#     Mean days absolute error: 414.5928156410365
#     Root squared mean error: 669.8950754439018
#
# With sort
#   Median
#     Mean days absolute error: 224.4591567177933
#     Root squared mean error: 421.5340919090345
#   Mean
#     Mean days absolute error: 325.01978170499456
#     Root squared mean error: 482.0884751074065
#
# Median unique: 374.2281070831378

In [None]:
# Boolean mask of the rows whose `validFrom` lies after 2018-09-01.
# Uses the `datetime` class imported at the top of the notebook; the
# `pd.datetime` alias was deprecated in pandas 1.0 and removed in pandas 2.0.
# NOTE(review): despite its name this is a mask, not a frame; it is
# overwritten by the train/test split cell further down.
test_data = data["validFrom"] > datetime(2018, 9, 1)

In [None]:
# The original cell operated on `x`, a name that is not defined anywhere in
# this notebook, so it raised NameError after "Restart & Run All".
# NOTE(review): `data` is assumed to be the intended frame -- confirm, or
# delete this cell if it is a leftover experiment. On a column that is
# already timezone-naive this call leaves the values unchanged.
data["validFrom"] = data["validFrom"].dt.tz_localize(None)

# Is the value changing next year?

In [None]:
# Chronological split on `validFrom`:
#   train: everything up to and including 2018-09-01
#   test:  strictly between 2018-09-01 and 2019-09-01
is_train_data = data["validFrom"] <= datetime(2018, 9, 1)
is_test_data = data["validFrom"].gt(datetime(2018, 9, 1)) & data["validFrom"].lt(datetime(2019, 9, 1))
train_data = data.loc[is_train_data]
# The test rows are indexed by (pageID, property.name).
test_data = data.loc[is_test_data].set_index(['pageID', 'property.name'])
# Disabled alternative: restrict the test rows to (pageID, property.name)
# pairs that also occur in the training data.
#test_data[test_data.set_index(['pageID', 'property.name']).index.isin(
#    train_data.set_index(['pageID', 'property.name']).index)]

In [None]:
# Flag the training rows whose `validTo` falls strictly between 2018-09-01 and
# 2019-09-01.
# Both bounds are evaluated on `train_data`; the original took the upper bound
# from `data` and relied on index alignment to line the two frames up.
# `assign` builds a new frame, so no column is written onto a filtered frame
# (which is what triggers pandas' SettingWithCopyWarning).
train_data = train_data.assign(
    next_year=(train_data["validTo"] > datetime(2018, 9, 1))
    & (train_data["validTo"] < datetime(2019, 9, 1))
)

In [None]:
# Render the training frame as the cell output for visual inspection.
train_data