In [None]:
%load_ext autoreload
%autoreload 2
%config Completer.use_jedi = False

In [None]:
from datetime import datetime, timedelta
from pathlib import Path

from wikipedia_cleanup.data_filter import KeepAttributesDataFilter, generate_default_filters
from wikipedia_cleanup.predict import TrainAndPredictFramework
from wikipedia_cleanup.predictor import ZeroPredictor, OnePredictor, MeanPredictor, RandomPredictor, LastChangePredictor
from wikipedia_cleanup.property_correlation import PropertyCorrelationPredictor
from wikipedia_cleanup.random_forest import RandomForestPredictor
from wikipedia_cleanup.ensemble import OrEnsemble, AndEnsemble, AverageEnsemble
from wikipedia_cleanup.ar import AssociationRulesTemplatePredictor, AssociationRulesPredictor, AssociationRulesInfoboxPredictor

In [None]:
import numpy as np
from tqdm.auto import tqdm
from sklearn.linear_model import LogisticRegression
import pickle
import itertools
import matplotlib.pyplot as plt
from wikipedia_cleanup.utils import result_directory

In [None]:
# `n_files` and `n_jobs` are forwarded to `TrainAndPredictFramework.load_data`
# together with `input_path` wherever data is loaded in this notebook.
n_files = 10
n_jobs = 4
# Data location relative to the notebook. The machine-specific absolute path
# that used to be assigned first was dead code (it was overwritten immediately).
input_path = Path("../../data/new_costum_filtered_format_with_features/")

# Base predictors whose outputs are combined by the ensemble.
predictors = [AssociationRulesTemplatePredictor(), AssociationRulesInfoboxPredictor(), AssociationRulesPredictor()]

# Ensemble handed to the framework. `OrEnsemble` and `AverageEnsemble`
# (both imported above) can be swapped in here; previously an `OrEnsemble`
# was built and then immediately replaced by this `AndEnsemble`.
model = AndEnsemble(predictors)


In [None]:
# Test start date used for the final ("Ensemble_full_train") run below.
test_start_date = datetime(year=2018, month=9, day=1)
# Test start date used while collecting the ensemble's training data:
# 365 days before `test_start_date`.
predictor_train_date = test_start_date + timedelta(days=-365)

In [None]:
def fit_predictors(predictor_train_date, run_id):
    predictions = []
    for model in predictors:
        framework = TrainAndPredictFramework(model, group_key=['infobox_key', 'property_name'], run_id=run_id, test_start_date = predictor_train_date)
        framework.load_data(input_path, n_files, n_jobs)
        framework.fit_model()
        framework.test_model(predict_subset=1, save_results=False)
        predictions.append(framework.run_results['predictions'])
        keys = framework.run_results['keys']
        labels = framework.run_results['labels']
        print("------------------------------------------------")
    return predictions, labels, keys

def generate_features():
    """Count, for every entry of ``keys``, its rows before the test start date.

    Reads the notebook-level globals ``framework`` (its ``data`` frame and
    ``test_start_date``) and ``keys``.

    Returns:
        Integer numpy array with one element per entry of ``keys``: the number
        of rows in ``framework.data`` whose ``key`` equals that entry and whose
        ``value_valid_from`` is earlier than ``framework.test_start_date``
        (0 for keys without such rows).
    """
    data = framework.data
    is_before_test = data["value_valid_from"] < framework.test_start_date
    feature_map = dict()
    # Accumulate per key instead of using itertools.groupby: groupby only merges
    # *consecutive* equal keys, so with unsorted data a later run of the same
    # key would silently overwrite the earlier count.
    for key in data.loc[is_before_test, "key"].to_numpy():
        feature_map[key] = feature_map.get(key, 0) + 1
    # Explicit otypes lets np.vectorize handle an empty `keys` (it cannot infer
    # the output dtype from zero inputs) and fixes the result dtype to int.
    return np.vectorize(feature_map.get, otypes=[int])(keys, 0)

In [None]:
# Run all base predictors with `predictor_train_date` as the framework's test
# start date; their outputs become the training input of the ensembles.
predictions, labels, keys = fit_predictors(
    predictor_train_date=predictor_train_date,
    run_id="Ensemble_subset_train",
)

In [None]:
# Directory for the cached base-predictor outputs of the training run.
results_root = result_directory()
out_path = results_root / 'Ensemble_training_predictions'
out_path.mkdir(parents=True, exist_ok=True)

In [None]:
# save cache
# Persist the base predictors' outputs so the fitting cell above can be skipped.
training_cache = {'keys' : keys, 'labels': labels, 'predictions': predictions}
with open(out_path / 'predictions.pickle', "wb") as cache_file:
    pickle.dump(training_cache, cache_file)

In [None]:
# load cache
# NOTE: unpickling executes code embedded in the file; only load caches that
# were written by this notebook.
with open(out_path / 'predictions.pickle', "rb") as cache_file:
    cache = pickle.load(cache_file)
predictions, keys, labels = cache['predictions'], cache['keys'], cache['labels']

In [None]:
# Framework around the ensemble `model`, using the same test start date as the
# base-predictor run above; `generate_features` reads it as a global.
framework = TrainAndPredictFramework(
    model,
    group_key=['infobox_key', 'property_name'],
    run_id="finished_ensemble",
    test_start_date=predictor_train_date,
)
framework.load_data(input_path, n_files, n_jobs)

In [None]:
# Additional ensemble feature: per-key number of rows before the framework's
# test start date (computed by `generate_features` from the globals
# `framework` and `keys`).
n_training_samples = generate_features()

In [None]:
# Fit one logistic-regression ensemble per testing timeframe.
ensembles = []
n_timeframes = len(framework.testing_timeframes)
for timeframe_idx in tqdm(range(n_timeframes)):
    # Stack the base predictors' boolean outputs for this timeframe along a new
    # leading axis (the result is 3-D: the code below reads `shape[2]`).
    base_outputs = np.array(
        [predictions[predictor_idx][timeframe_idx] for predictor_idx in range(len(predictors))],
        dtype=bool,
    )
    # Broadcast the per-key sample count along the last axis and append it as
    # one more "predictor" layer.
    count_layer = n_training_samples[:, None].repeat(base_outputs.shape[2], axis=1)[None, ...]
    stacked = np.vstack((base_outputs, count_layer))
    # Flatten everything but the leading axis: one row per sample, one column
    # per base predictor plus the count column.
    feature_matrix = stacked.reshape(stacked.shape[0], -1).T
    targets = np.array(labels[timeframe_idx], dtype=bool).reshape(-1)

    ensemble = LogisticRegression()
    ensemble.fit(feature_matrix, targets)
    ensembles.append(ensemble)

## Evaluation code
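Later cells only work for 2 classes. Note that the cells in this section also assume exactly two base predictors: they build feature vectors of the form `[prediction_1, prediction_2, n_training_samples]`.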

In [None]:
# Show the learned coefficients of the first four ensembles. Slicing replaces
# the four hard-coded indices, which raised an IndexError whenever fewer than
# four ensembles had been trained.
for fitted_ensemble in ensembles[:4]:
    print(fitted_ensemble.coef_)

In [None]:
# Probe one ensemble: for each 0/1 combination of two base-predictor outputs,
# sweep the training-sample count and record the predicted probability of the
# positive class.
timeframe = 0
step = 5
# NOTE(review): this rebinds `predictions` (previously the base predictors'
# outputs) to the list of swept counts; the plotting cell below relies on it.
predictions = list(range(0,700,step))

probed_ensemble = ensembles[timeframe]
preds = [
    [
        probed_ensemble.predict_proba(np.array([k, v, sample_count])[None, :])[0, 1]
        for sample_count in predictions
    ]
    for k, v in [(0,0), (0,1), (1,0), (1,1)]
]

In [None]:
# One curve per 0/1 input combination; the x axis is the position in
# `predictions`, relabelled with the corresponding sample counts.
tick_counts = predictions[::10]
curves = np.array(preds).T
plt.plot(curves)
plt.legend(['no pred', 'regression', 'correlation', 'both'])
plt.xticks(ticks=np.array(tick_counts)/step, labels=tick_counts)
plt.ylabel("% change")
plt.xlabel("num changes in train set")

# Test the ensembles

In [None]:
# Re-run all base predictors, now with the real `test_start_date`; the trained
# ensembles are applied to these outputs below.
predictions, labels, keys = fit_predictors(
    predictor_train_date=test_start_date,
    run_id="Ensemble_full_train",
)

In [None]:
# Directory for the cached base-predictor outputs of the test run.
results_root = result_directory()
out_path = results_root / 'Ensemble_testing_predictions'
out_path.mkdir(parents=True, exist_ok=True)

In [None]:
# save cache
# Besides the base-predictor outputs, the fitted ensembles are stored too.
testing_cache = {'keys' : keys, 'labels': labels, 'predictions': predictions, 'ensembles': ensembles}
with open(out_path / 'predictions.pickle', "wb") as cache_file:
    pickle.dump(testing_cache, cache_file)

In [None]:
# load cache
# NOTE: unpickling executes code embedded in the file; only load caches that
# were written by this notebook.
with open(out_path / 'predictions.pickle', "rb") as cache_file:
    cache = pickle.load(cache_file)
predictions, keys = cache['predictions'], cache['keys']
labels, ensembles = cache['labels'], cache['ensembles']

In [None]:
# Framework around the ensemble `model` for the real test period.
framework = TrainAndPredictFramework(
    model,
    group_key=['infobox_key', 'property_name'],
    run_id="finished_ensemble",
    test_start_date=test_start_date,
)
framework.load_data(input_path, n_files, n_jobs)

# Recompute the per-key count feature against the new `framework` and `keys`.
n_training_samples = generate_features()

In [None]:
# Apply the ensemble fitted for each timeframe to the test-run outputs of the
# base predictors and keep the predicted probability of the positive class.
ensemble_predictions = []
for i in tqdm(range(len(framework.testing_timeframes))):
    # Same feature construction as during ensemble training: the stacked
    # base-predictor outputs plus one layer holding the per-key sample count.
    current_predictions = np.array([predictions[x][i] for x in range(len(predictors))], dtype=bool)
    current_predictions = np.vstack((current_predictions, n_training_samples[:, None].repeat(current_predictions.shape[2], axis=1)[None, ...]))
    # One row per sample, one column per feature.
    current_predictions = current_predictions.reshape(current_predictions.shape[0], -1).T

    # The labels are not needed for prediction; the unused `current_labels`
    # computation was removed.
    ensemble = ensembles[i]
    ensemble_predictions.append(ensemble.predict_proba(current_predictions)[:, 1])

In [None]:
# Turn the flat probability vectors back into the shape of the base
# predictors' outputs and binarise them at `threshold`.
threshold = 0.5
thresholded_ensemble_predictions = [
    ensemble_predictions[i].reshape(predictions[0][i].shape) > threshold
    for i in range(len(framework.testing_timeframes))
]

In [None]:
# NOTE(review): self-assignment, i.e. a no-op apart from raising NameError when
# the thresholding cell above has not been run. Looks like a leftover from an
# earlier experiment; consider removing this cell.
thresholded_ensemble_predictions = thresholded_ensemble_predictions

In [None]:
# Put the thresholded ensemble output in place of the framework's own run
# results so the framework's evaluation and plotting can be reused.
new_run_results = dict(keys=keys, labels=labels, predictions=thresholded_ensemble_predictions)
framework.run_results = new_run_results
try:
    framework.data["value_valid_from"] = framework.data["value_valid_from"].dt.date
except AttributeError:
    # Deliberately ignored: presumably the column was already converted by an
    # earlier run of this cell, in which case `.dt` is not available.
    pass
# NOTE(review): only `labels[0]` is handed to the evaluation while the
# predictions cover every timeframe; confirm this is what
# `_evaluate_predictions` expects.
evaluation_summary = framework._evaluate_predictions(thresholded_ensemble_predictions, labels[0])
print(evaluation_summary)
framework.generate_plots()