# 1. Setup and Configuration
This notebook loads cache access data from a CSV, identifies the most popular objects, filters the data to include only the top-n popular objects, and saves the processed data.

In [1]:
import pandas as pd
from collections import Counter
import pickle
import os

In [2]:
# --- Input / output locations ---
# Headerless CSV of (timestamp, obj_id) rows read by the loading cell.
CSV_FILE_PATH = "data.csv"
# Destination pickle written by the final cell.
PROCESSED_DATA_PICKLE_PATH = "processed_cache_data.pkl"

# --- Tunable sizes ---
# How many of the most frequently accessed objects count as "popular".
NUM_POPULAR_OBJECTS = 15_000_000
# Upper bound on the number of accesses kept in the final filtered sequence.
MAX_FILTERED_SEQUENCE_LENGTH = 15_000

# 2. Load Data from CSV
- Reads the CSV file which has no header and two columns: `timestamp`, `obj_id`
- Sorts the data by timestamp to maintain the correct sequence order

In [3]:
# Load the raw access log and turn it into a time-ordered list of object IDs.
print(f"Loading data from {CSV_FILE_PATH}...")
df = pd.read_csv(CSV_FILE_PATH, header=None, names=['timestamp', 'obj_id'])

# Use a stable sort: pandas' default ('quicksort') does not guarantee that rows
# sharing the same timestamp keep their original file order, which would
# silently scramble the access sequence wherever timestamps tie.
df = df.sort_values(by='timestamp', kind='mergesort')

all_obj_ids_sequence = df['obj_id'].tolist()
print(f"Loaded {len(all_obj_ids_sequence)} object accesses.")

# Fail fast: every later cell assumes a non-empty access sequence.
if not all_obj_ids_sequence:
    print("CSV file loaded, but no objects IDs found. Please check the CSV content.")
    raise ValueError("No object IDs found in the CSV.")

Loading data from data.csv...
Loaded 45623306 object accesses.


In [4]:
# Preview the first five rows of the loaded frame as a quick sanity check.
df.head(n=5)

Unnamed: 0,timestamp,obj_id
0,0,9923303287488963378
1,58,9939978422402668152
2,65,4704144935105825719
3,66,4526530743497726240
5,67,1196122231523595215


# 3. Identify Most Popular Objects
- Counts the frequency of each object ID
- Selects the top `NUM_POPULAR_OBJECTS`

In [5]:
# Bind the result names up front so later cells can test them for emptiness
# instead of hitting a NameError when the guard below is skipped.
initial_top_popular_object_ids_set = set()
sorted_popular_object_ids_list = []

if all_obj_ids_sequence:
    print(f"Identifying the top {NUM_POPULAR_OBJECTS} most popular objects...")
    # Access frequency per object ID.
    obj_id_counts = Counter(all_obj_ids_sequence)

    # most_common(n) returns at most n (obj_id, count) pairs, most frequent first.
    top_popular_objects_with_counts = obj_id_counts.most_common(NUM_POPULAR_OBJECTS)
    initial_top_popular_object_ids_set = {obj_id for obj_id, _ in top_popular_objects_with_counts}
    sorted_popular_object_ids_list = sorted(initial_top_popular_object_ids_set)
    print(f"Found {len(sorted_popular_object_ids_list)} popular objects (target was {NUM_POPULAR_OBJECTS}).")
    print("First 10 popular objects:", sorted_popular_object_ids_list[:10])

    # When the target is at least the number of distinct objects, every object
    # is "popular" and the popularity filter downstream removes nothing.
    if len(obj_id_counts) <= NUM_POPULAR_OBJECTS:
        print("Note: target is not smaller than the number of distinct objects; "
              "the popularity filter will keep every access.")

Identifying the top 15000000 most popular objects...
Found 12262537 popular objects (target was 15000000).
First 10 popular objects: [1792336380206, 2980637468929, 3979782410864, 4040846203747, 8889298622010, 9019821228723, 9413421785408, 11933909851519, 12933419578731, 14438286000342]


# 4. Filter Original Sequence
- Creates a new sequence containing only the accesses to the identified popular objects
- The temporal order of these accesses is preserved
- Further limits this filtered sequence to the first `MAX_FILTERED_SEQUENCE_LENGTH` elements

In [6]:
# Outputs of this cell; they stay empty on any path that skips filtering.
final_filtered_sequence = []
final_list_of_objects_for_vocab = []

if not initial_top_popular_object_ids_set:
    # No popular set to filter against.
    print("Skipping sequence filtering as no initial popular objects were identified.")
elif not all_obj_ids_sequence:
    # Nothing was loaded, so there is nothing to filter.
    print("Skipping sequence filtering as the initial sequence was empty.")
else:
    print("Filtering the original sequence for initially popular objects...")

    # 1) Keep only accesses to popular objects; iteration order preserves time order.
    sequence_of_popular_only = [
        accessed_id
        for accessed_id in all_obj_ids_sequence
        if accessed_id in initial_top_popular_object_ids_set
    ]

    print(f"Original sequence length: {len(all_obj_ids_sequence)}")
    print(f"Length after filtering for initially popular objects: {len(sequence_of_popular_only)}")

    # 2) Cap the sequence at MAX_FILTERED_SEQUENCE_LENGTH accesses.
    if len(sequence_of_popular_only) <= MAX_FILTERED_SEQUENCE_LENGTH:
        final_filtered_sequence = sequence_of_popular_only
        print(f"Popular-only sequence length ({len(final_filtered_sequence)}) is within or equal to the limit of {MAX_FILTERED_SEQUENCE_LENGTH}.")
    else:
        print(f"Limiting the popular-only sequence from {len(sequence_of_popular_only)} to the first {MAX_FILTERED_SEQUENCE_LENGTH} elements.")
        final_filtered_sequence = sequence_of_popular_only[:MAX_FILTERED_SEQUENCE_LENGTH]

    print(f"Final sequence length to be used for training: {len(final_filtered_sequence)}")

    # 3) The vocabulary is whatever actually appears in the capped sequence.
    if final_filtered_sequence:
        actual_objects_in_final_sequence = set(final_filtered_sequence)
        final_list_of_objects_for_vocab = sorted(actual_objects_in_final_sequence)
        print(f"Actual number of unique objects in the final {len(final_filtered_sequence)}-element sequence: {len(final_list_of_objects_for_vocab)}")
        print(f"First 10 objects in the final vocabulary (if available): {final_list_of_objects_for_vocab[:10]}")
    else:
        print("Warning: The final filtered sequence is empty.")

Filtering the original sequence for initially popular objects...
Original sequence length: 45623306
Length after filtering for initially popular objects: 45623306
Limiting the popular-only sequence from 45623306 to the first 15000 elements.
Final sequence length to be used for training: 15000
Actual number of unique objects in the final 15000-element sequence: 7588
First 10 objects in the final vocabulary (if available): [2890813961931749, 8895774005720897, 15011980255538162, 19339850754295784, 22706151954170961, 24207063545345919, 25074599234442272, 25663089186828960, 28203062247696373, 32216709220688372]


# 5. Save Processed Data
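- Saves `final_filtered_sequence` (list of raw obj_ids, in order, containing only popular ones) under the key `filtered_sequence_popular_obj_ids`
- Saves `final_list_of_objects_for_vocab` (the sorted list of unique obj_ids that appear in the final sequence; defines the vocabulary scope) under the key `list_of_popular_obj_ids`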
- These are saved into a single pickle file

In [7]:
# Persist the filtered sequence and its vocabulary as a single pickle file.
if final_filtered_sequence and final_list_of_objects_for_vocab:
    processed_data = {
        'filtered_sequence_popular_obj_ids': final_filtered_sequence,
        'list_of_popular_obj_ids': final_list_of_objects_for_vocab
    }

    print(f"Saving processed data to {PROCESSED_DATA_PICKLE_PATH}...")
    # Write to a sibling temp file and move it into place afterwards, so a
    # failure part-way through never leaves a truncated pickle at the final path.
    temp_pickle_path = f"{PROCESSED_DATA_PICKLE_PATH}.tmp"
    try:
        with open(temp_pickle_path, 'wb') as f:
            pickle.dump(processed_data, f)
        os.replace(temp_pickle_path, PROCESSED_DATA_PICKLE_PATH)
    except Exception as e:
        print(f"Error saving processed data: {e}")
        # Remove the partial temp file, then re-raise so the cell visibly fails
        # instead of letting a "Run All" finish as if the data had been saved.
        if os.path.exists(temp_pickle_path):
            os.remove(temp_pickle_path)
        raise
    print("Processed data saved successfully.")
    print(f"  - Saved filtered sequence length: {len(processed_data['filtered_sequence_popular_obj_ids'])}")
    print(f"  - Number of unique objects in final vocab: {len(processed_data['list_of_popular_obj_ids'])}")
else:
    print("No processed data to save: either the final filtered sequence or the final vocabulary is empty.")

Saving processed data to processed_cache_data.pkl...
Processed data saved successfully.
  - Saved filtered sequence length: 15000
  - Number of unique objects in final vocab: 7588
