# Loading and Splitting

The loading mechanism retrieves the dataset class that the user wants to use
for evaluating their algorithm.

The splitting step takes a split type (a setting) and creates the corresponding
split of the dataset.

In [None]:
from streamsight.splits import SingleTimePointSetting
from streamsight.datasets import AmazonMusicDataset

# Instantiate the dataset class and load its data.
dataset = AmazonMusicDataset()
# Design note: use Yelp or Amazon as the base dataset instead;
# cutting MovieLens by timestamp might be problematic.
data = dataset.load()

# Design note: the user can create their own custom dataset class if needed.
# The two positional arguments below are integer literals; read as Unix epoch
# seconds they are 2014-08-01 and 2014-04-27 (UTC).
# NOTE(review): which constructor parameter each value maps to is not visible
# here -- confirm against SingleTimePointSetting's signature.
setting = SingleTimePointSetting(
    1406851200,
    1398556800
)
# Once a setting is defined, it can be used to split data.
# The resulting data is stored in attributes of the setting object, which the
# cells below read (e.g. `setting.background_data`).
setting.split(data)

# Training the Algorithm

Training the RecSys algorithm is just as straightforward. The algorithm is
chosen by instantiating the corresponding algorithm class, which is then
trained on the data from the setting. The setting class provides multiple
public attributes that the programmer can use.

We will demo a simple example below.

In [None]:
############# Single global timeline split #############
# The class of the algorithm to be tested can be instantiated directly.
from streamsight.algorithms.itemknn import ItemKNN


algo = ItemKNN()
# Each algorithm has a fit method that takes the training data and fits the
# model. Here the training data is the `background_data` attribute of the
# `setting` object that was split in the earlier cell.
algo.fit(setting.background_data)

# Evaluation

In [None]:
from streamsight.metrics.precision import PrecisionK


# Predict with the fitted `algo` on the setting's unlabeled data.
X_pred = algo.predict(setting.unlabeled_data_series)
# NOTE(review): the argument 10 is presumably the cut-off K -- confirm.
metric = PrecisionK(10)
# Compare the predictions against the `binary_values` of the ground truth.
metric.calculate(setting.ground_truth_data_series.binary_values, X_pred)
# Bare expression: as the last line of a notebook cell, the value is displayed.
metric.value

# Evaluation for sliding window setting

In [None]:
############# Sliding window split #############
from streamsight.algorithms import ItemKNNIncremental
algo = ItemKNNIncremental()

# Possible ItemKNN training strategies across windows:
# (1) use the initial batch of data and never update (no new learning)
# (2) start from the initial batch and add each new batch to it
# (3) use only the newest batch of data
#
# NOTE(review): `setting` is whatever object was built and split in an earlier
# cell; a sliding-window setting is presumably intended here -- confirm.
# `PrecisionK` is expected to have been imported by an earlier cell.

# Initial fit on the background data.
background_data = setting.background_data
algo.fit(background_data)

for _ in range(setting.num_split_set):
    # Fetch the data for the current window.
    unlabeled_data = setting.next_unlabeled_data()
    ground_truth_data = setting.next_ground_truth_data()

    # Evaluate the model on the window that was just fetched (the original
    # code read `setting.*_series` here, which ignored the per-window data).
    X_pred = algo.predict(unlabeled_data)
    metric = PrecisionK(10)
    metric.calculate(ground_truth_data.binary_values, X_pred)
    print(metric.value)

    # Release the ground truth of this window to the model before moving on.
    current_training_set = ground_truth_data
    algo.fit(current_training_set)


# Pipeline to streamline API usage

In [None]:
# Sketch of a builder-style API: configure dataset, splitter, algorithms and
# metrics on a builder, then build a pipeline from it.
# NOTE(review): `PipelineBuilder` is not imported in any cell visible here --
# add the appropriate import before running this cell.
pipeline_builder = PipelineBuilder()
pipeline_builder.set_dataset(AmazonMusicDataset)
pipeline_builder.set_splitter(SingleTimePointSetting)
pipeline_builder.add_algorithm(["KNN","UserKNN"])

# Design note: incremental issue on the algorithm.
# NOTE(review): `algo` below is the object left over from an earlier cell, not
# something created by the builder; the return values are discarded.
algo.get_train_data()
algo.get_test_data()


pipeline_builder.add_metric("Recall")
pipeline_builder.add_metric("Precision")

############# Running pipeline as a whole #############
# Build the pipeline and run it in one call.
pipeline = pipeline_builder.build()
pipeline.run()

############# Running pipeline by stepping #############
# Provide the ability to step through each window: a fresh pipeline is built
# and advanced with step(n=1, verbose=True).
pipeline = pipeline_builder.build()
pipeline.step(n=1,verbose=True)
pipeline.display_metrics() # show metrics for the current window
