# Loading and Splitting

The loading mechanism gets the class of dataset that the user wants to use for
evaluation on his algorithmn.

The splitting will take in a split type and create the necessary split on the
dataset.

In [1]:
from streamsight.setting import SingleTimePointSetting
from streamsight.datasets import AmazonMusicDataset

dataset = AmazonMusicDataset()
# yelp or amazon as a base instead
# movielens timestamp curtting might be problematic
data = dataset.load()

# user creates his own custom dataset class if needed
setting = SingleTimePointSetting(
    1406851200,
    1398556800
)
# once a setting is defined, it can be used to split data
# the data will be stored in the attribute of the setting object
setting.split(data)

[37mDEBUG   [0m - streamsight - [34mLogging is configured.[0m
[32mINFO    [0m - streamsight - [34mLogging started[0m
[37mDEBUG   [0m - streamsight.datasets.base - [34mAmazonMusicDataset being initialized with 'data' as the base path.[0m
[37mDEBUG   [0m - streamsight.datasets.base - [34mAmazonMusicDataset is initialized.[0m
[32mINFO    [0m - streamsight.datasets.base - [34mAmazonMusicDataset is loading dataset...[0m
[37mDEBUG   [0m - streamsight.datasets.base - [34mData file is in memory and in dir specified.[0m


  from .autonotebook import tqdm as notebook_tqdm


[37mDEBUG   [0m - streamsight.datasets.base - [34mAmazonMusicDataset applying filters set.[0m
[37mDEBUG   [0m - streamsight.preprocessing.preprocessor - [34m	interactions before preprocess: 1584082[0m
[37mDEBUG   [0m - streamsight.preprocessing.preprocessor - [34m	items before preprocess: 456992[0m
[37mDEBUG   [0m - streamsight.preprocessing.preprocessor - [34m	users before preprocess: 840372[0m
[32mINFO    [0m - streamsight.datasets.base - [34mAmazonMusicDataset dataset loaded.[0m
[32mINFO    [0m - streamsight.setting.single_time_point_setting - [34mSplitting data at time 1406851200 with delta_after_t interval 2147483647 and delta_before_t interval 1398556800[0m
[37mDEBUG   [0m - streamsight.matrix.interation_matrix - [34mPerforming lt(t, 1406851200)[0m
[37mDEBUG   [0m - streamsight.matrix.interation_matrix - [34mPerforming ge(t, 8294400)[0m
[37mDEBUG   [0m - streamsight.matrix.interation_matrix - [34mPerforming ge(t, 1406851200)[0m
[37mDEBUG   [0

In [4]:
data._df

Unnamed: 0,interactionid,uid,iid,ts
0,0,0,9714721180,877305600
1,1,1,B0013D89TW,880675200
2,2,1,B0013D89TW,880675200
3,3,2,B00122Z4Y2,893203200
4,4,2,B00122Z4Y2,893203200
...,...,...,...,...
1584077,1584077,840368,B01GPDFR26,1538265600
1584078,1584078,840369,B01HFJ54YK,1538265600
1584079,1584079,840370,B01G4DH4AU,1538352000
1584080,1584080,840371,B01H7XADLS,1538438400


# Training the Algorithm

Training the RecSys algorithm is as straight forward. The choice of the algorithm
is selected by instantiating the class of algorithm choice then training the
model with the dataset from the setting. The setting class provides multiple
public attribute calls that can be used by the programmer.

We will demo a simple example below.

In [2]:
############# Single global timeline split #############
# The class of the algorithm tho be tested can be instantiated
from streamsight.algorithms.itemknn import ItemKNN


algo = ItemKNN()
# each algorithm has a fit method that takes the training data and fits the model
algo.fit(setting.background_data)

ValueError: invalid literal for int() with base 10: 'B0013D89TW'

# Evaluation

In [3]:
from streamsight.metrics.precision import PrecisionK


X_pred = algo.predict(setting.unlabeled_data)
metric = PrecisionK(10)
metric.calculate(setting.ground_truth_data.binary_values, X_pred)
metric.value

NotFittedError: This ItemKNN instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

# Evaluation for sliding window setting

In [None]:
############# Sliding window split #############
from streamsight.algorithms import ItemKNNIncremental
algo = ItemKNNIncremental()

# itemknn
# (1) use the inital batch of data and no change aka no new learning
# (2) inital batch and update with new batch add more data essentially
# (3) just use the new batch of data

background_data = setting.background_data
algo.fit(background_data)

for _ in range(setting._num_split_set):
    unlabeled_data = setting.next_unlabeled_data()
    ground_truth_data = setting.next_ground_truth_data()

    # Eval model
    X_pred = algo.predict(unlabeled_data)
    metric = PrecisionK(10)
    metric.calculate(ground_truth_data.binary_values, X_pred)
    print(metric.value)

    # Release ground truth to model
    current_training_set = ground_truth_data
    algo.fit(current_training_set)


# Pipeline to streamline API usage

In [None]:
pipeline_builder = PipelineBuilder()
pipeline_builder.set_dataset(AmazonMusicDataset)
pipeline_builder.set_splitter(SingleTimePointSetting)
pipeline_builder.add_algorithm(["KNN","UserKNN"])

# incremental issue: on the algo 
algo.get_train_data()
algo.get_test_data()


pipeline_builder.add_metric("Recall")
pipeline_builder.add_metric("Precision")

############# Running pipeline as a whole #############
# provide ability to step through each window 
pipeline = pipeline_builder.build()
pipeline.run()

############# Running pipeline by stepping #############
# provide ability to step through each window 
pipeline = pipeline_builder.build()
pipeline.step(n=1,verbose=True)
pipeline.display_metrics() # show metrics for the current window
