In [None]:
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split

def save_pickle(file_path, obj):
    """Serialize `obj` with pickle into the file at `file_path`.

    The file is opened in binary write mode, so an existing file at that
    path is overwritten. Returns None.
    """
    with open(file_path, mode="wb") as sink:
        pickle.Pickler(sink).dump(obj)

In [2]:
# Load the tab-separated ratings file; it has no header row, so column names are supplied here
file_path = './ml-100k.rating.data'
columns = ['user', 'item', 'rating', 'timestamp']
ratings = pd.read_csv(
    file_path,
    names=columns,
    sep='\t',
    engine='python',
)

# Step 1: keep only the interactions whose rating is strictly positive
positive_ratings = ratings.loc[ratings['rating'].gt(0)]

# Step 2: Apply k-core filtering
def filter_k_core(df, k=10):
    """Prune `df` repeatedly until no more rows are removed.

    In each pass, rows are dropped when their 'user' value or their 'item'
    value occurs fewer than `k` times in the frame as it stood at the start
    of that pass. Passes repeat until the row count stops changing, so in
    the returned frame every remaining user and item occurs at least `k`
    times.

    Args:
        df: DataFrame with 'user' and 'item' columns.
        k: minimum number of rows required per user and per item.

    Returns:
        A filtered DataFrame; the input frame is not modified.
    """
    while True:
        rows_before = len(df)

        # Both tallies are taken before any row of this pass is dropped.
        per_user = df['user'].value_counts()
        per_item = df['item'].value_counts()
        frequent_users = per_user[per_user >= k].index
        frequent_items = per_item[per_item >= k].index

        keep = df['user'].isin(frequent_users) & df['item'].isin(frequent_items)
        df = df[keep]

        # Fixed point reached: this pass removed nothing.
        if len(df) == rows_before:
            return df

# K: 10
filtered_ratings = filter_k_core(positive_ratings, k=10)

# Step 3: put the interactions in ascending timestamp order
filtered_ratings = filtered_ratings.sort_values(by='timestamp')

# Step 4: renumber users and items with contiguous ids starting at 0,
# assigned in order of first appearance in the time-sorted frame
unique_user_ids = filtered_ratings['user'].unique()
unique_item_ids = filtered_ratings['item'].unique()

user_id_map = dict(zip(unique_user_ids, range(len(unique_user_ids))))
item_id_map = dict(zip(unique_item_ids, range(len(unique_item_ids))))

filtered_ratings = filtered_ratings.assign(
    user=filtered_ratings['user'].map(user_id_map),
    item=filtered_ratings['item'].map(item_id_map),
    # one id per interaction, starting from 0 and following the sorted row order
    interactionId=range(len(filtered_ratings)),
)

# Step 5: block sizes -- 60% base, two increments of (40% / 3) each,
# and a third increment that takes whatever rows remain after truncation
total_interactions = len(filtered_ratings)
base_count = int(total_interactions * 0.6)
inc1_count = inc2_count = int(total_interactions * 0.4/3)
inc3_count = total_interactions - (base_count + inc1_count + inc2_count)

# Step 6: cut the sorted frame into four consecutive, non-overlapping blocks
base_data = filtered_ratings.iloc[:base_count]
inc1_data = filtered_ratings.iloc[base_count:].iloc[:inc1_count]
inc2_data = filtered_ratings.iloc[base_count + inc1_count:].iloc[:inc2_count]
inc3_data = filtered_ratings.iloc[total_interactions - inc3_count:]

# Step 7: Split train-valid-test in each user's interaction for base data
def split_base_data(df, train_frac=8, val_frac=1, test_frac=1, random_state=None):
    """Randomly split each user's rows of `df` into train / validation / test.

    Rows are grouped by the 'user' column. A user with fewer than 3 rows is
    placed entirely in the training set. For every other user the rows are
    shuffled and split so that the test share is
    test_frac / (train_frac + val_frac + test_frac) and the validation share
    of the remainder is val_frac / (train_frac + val_frac).

    Args:
        df: DataFrame with a 'user' column.
        train_frac, val_frac, test_frac: relative weights of the three parts
            (the defaults 8/1/1 give an 80/10/10 split).
        random_state: None, an int seed, or a numpy RandomState. None keeps
            the previous behaviour (numpy's global random state); pass an int
            to make the split reproducible across runs.

    Returns:
        (train_set, val_set, test_set) as DataFrames, each ordered by user.
        A part that receives no rows is an empty DataFrame.
    """
    # Local import: scikit-learn is already a dependency of this notebook.
    from sklearn.utils import check_random_state

    def _stack(parts):
        # Concatenate once at the end; pd.concat rejects an empty list.
        return pd.concat(parts) if parts else pd.DataFrame()

    # The split sizes are the same for every user, so compute them once.
    test_size = test_frac / (train_frac + val_frac + test_frac)
    val_size = val_frac / (train_frac + val_frac)

    # A single generator is shared by all users so that one seed determines
    # the whole split while different users still get different shuffles.
    rng = check_random_state(random_state)

    train_parts, val_parts, test_parts = [], [], []
    for _, group in df.groupby('user'):
        if len(group) < 3:
            # Too few rows to populate all three parts.
            train_parts.append(group)
            continue

        train_val_data, test_data = train_test_split(
            group, test_size=test_size, shuffle=True, random_state=rng
        )
        train_data, val_data = train_test_split(
            train_val_data, test_size=val_size, shuffle=True, random_state=rng
        )

        train_parts.append(train_data)
        val_parts.append(val_data)
        test_parts.append(test_data)

    return _stack(train_parts), _stack(val_parts), _stack(test_parts)

# Step 8: Split each incremental block into train, validation and test sets per user

def split_inc_block_userwise(df, train_frac=8, val_frac=1, test_frac=1):
    """Split each user's rows of `df` into train / validation / test without shuffling.

    Rows are grouped by the 'user' column. A user with fewer than 3 rows is
    placed entirely in the training set. For every other user the rows keep
    their order in `df`: the trailing rows form the test part, the rows just
    before them the validation part, and the leading rows the training part.
    When `df` is sorted by time, this is a chronological split.

    Args:
        df: DataFrame with a 'user' column.
        train_frac, val_frac, test_frac: relative weights of the three parts.
            The defaults 8/1/1 give a test share of 1/10 and a validation
            share of 1/9 of the remaining rows.

    Returns:
        (train_set, val_set, test_set) as DataFrames, each ordered by user.
        A part that receives no rows is an empty DataFrame.
    """
    def _stack(parts):
        # Concatenate once at the end; pd.concat rejects an empty list.
        return pd.concat(parts) if parts else pd.DataFrame()

    # The split sizes are the same for every user, so compute them once.
    test_size = test_frac / (train_frac + val_frac + test_frac)
    valid_size = val_frac / (train_frac + val_frac)

    train_parts, val_parts, test_parts = [], [], []
    for _, group in df.groupby('user'):
        if len(group) < 3:
            # Too few rows to populate all three parts.
            train_parts.append(group)
            continue

        train_val_data, test_data = train_test_split(group, test_size=test_size, shuffle=False)
        train_data, val_data = train_test_split(train_val_data, test_size=valid_size, shuffle=False)

        train_parts.append(train_data)
        val_parts.append(val_data)
        test_parts.append(test_data)

    return _stack(train_parts), _stack(val_parts), _stack(test_parts)

# The four unsplit blocks, base block first
total_block = [base_data, inc1_data, inc2_data, inc3_data]

# Base block: each user's rows are shuffled before being split
train_base, val_base, test_base = split_base_data(base_data)

# Incremental blocks: each user's rows are split in their existing order
(
    (train_inc1, val_inc1, test_inc1),
    (train_inc2, val_inc2, test_inc2),
    (train_inc3, val_inc3, test_inc3),
) = map(split_inc_block_userwise, (inc1_data, inc2_data, inc3_data))

In [None]:
# Row counts of the base block and the three incremental blocks, one per line
print(len(base_data), len(inc1_data), len(inc2_data), len(inc3_data), sep='\n')

In [None]:
# Largest remapped user id, then largest remapped item id
print(filtered_ratings['user'].max(), filtered_ratings['item'].max(), sep='\n')

In [5]:
# One dict of train/valid/test frames per block: TASK_0 holds the base block,
# TASK_1..TASK_3 the three incremental blocks
TASK_0 = dict(train=train_base, valid=val_base, test=test_base)
TASK_1 = dict(train=train_inc1, valid=val_inc1, test=test_inc1)
TASK_2 = dict(train=train_inc2, valid=val_inc2, test=test_inc2)
TASK_3 = dict(train=train_inc3, valid=val_inc3, test=test_inc3)

# Tasks in block order, so the list index matches the task number
TASK = [TASK_0, TASK_1, TASK_2, TASK_3]

In [6]:
# save_pickle('./total_blocks_timestamp.pickle', total_block)

In [7]:
# NOTE(review): everything below is disabled (fully commented out) and does not run.
# As written, for each entry of TASK it would group the train/valid/test frames by user,
# store each user's list of item ids in train_dict/valid_dict/test_dict, gather all of
# those item ids into item_list, and write the result to ./TASK_{idx}.pickle via save_pickle.
# pickle_list = []

# for idx, task in enumerate(TASK):
#     train_dict = {}
#     valid_dict = {}
#     test_dict = {}
    
#     res_item_list = []
    
#     # train
#     train = task['train']
#     user_groups = train.groupby('user')
    
#     for user, group in user_groups:
#         item_list = group['item'].values.tolist()
#         train_dict[user] = item_list
#         res_item_list.extend(item_list)
    
#     # valid
#     valid = task['valid']
#     user_groups = valid.groupby('user')
    
#     for user, group in user_groups:
#         item_list = group['item'].values.tolist()
#         valid_dict[user] = item_list
#         res_item_list.extend(item_list)
    
#     # test
#     test = task['test']
#     user_groups = test.groupby('user')
    
#     for user, group in user_groups:
#         item_list = group['item'].values.tolist()
#         test_dict[user] = item_list
#         res_item_list.extend(item_list)
    
    
    
#     TASK_pickle = {'train_dict': train_dict, 'valid_dict': valid_dict, 'test_dict': test_dict, 'item_list': res_item_list}
    
#     pickle_list.append(TASK_pickle)
    
#     save_pickle(f'./TASK_{idx}.pickle', TASK_pickle)