# MovieLens-100k Data Preprocessing for P5

This notebook preprocesses the MovieLens-100k dataset for P5 training.

## Download the dataset:
```bash
mkdir -p raw_data && cd raw_data
wget https://files.grouplens.org/datasets/movielens/ml-100k.zip
unzip ml-100k.zip
```

In [22]:
import gzip
import json
import os
import pickle
import random
from collections import defaultdict

import numpy as np
import pandas as pd
import torch
from tqdm import tqdm

# Working directory for every relative path used below (`./raw_data/...`, `./ml100k/...`).
# Set the P5_ROOT environment variable to run the notebook from another checkout;
# the original author's location is kept as the fallback so existing runs are unchanged.
os.chdir(os.environ.get("P5_ROOT", r"C:\Users\lehoa\OneDrive\Documents\College\Lab\Code\P5"))

def load_pickle(filename):
    """Read the pickle file at `filename` and return the deserialized object."""
    # NOTE: unpickling can execute arbitrary code; only load files this project produced.
    handle = open(filename, "rb")
    try:
        restored = pickle.load(handle)
    finally:
        handle.close()
    return restored

def save_pickle(data, filename):
    """Write `data` to `filename` as a pickle, using the highest available protocol."""
    with open(filename, "wb") as sink:
        writer = pickle.Pickler(sink, protocol=pickle.HIGHEST_PROTOCOL)
        writer.dump(data)

def load_json(file_path):
    """
    Parse the JSON document stored at `file_path` and return the resulting object.

    The file is decoded as UTF-8 explicitly: JSON is UTF-8 by specification, while
    the platform default encoding (e.g. cp1252 on Windows) would corrupt or reject
    non-ASCII text.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        return json.load(f)

# Seed every random number generator the notebook uses (stdlib, torch, numpy)
# with one shared value so shuffles and sampling are repeatable.
seed = 2020
for _seed_fn in (torch.manual_seed, random.seed, np.random.seed):
    _seed_fn(seed)

In [23]:
# Create output directory
# `short_data_name` doubles as the folder name (relative to the current working
# directory) into which every later cell writes its output files.
short_data_name = 'ml100k'
# exist_ok=True makes this cell safe to re-run when the folder already exists.
os.makedirs(short_data_name, exist_ok=True)

## Load MovieLens-100k Data

In [24]:
def load_movielens_ratings(rating_score=3.0, data_file='./raw_data/ml-100k/u.data'):
    """
    Load MovieLens-100k ratings as a list of (user, item, timestamp) tuples.

    Each non-empty line of the file holds four tab-separated fields:
    user_id, item_id, rating, timestamp.

    Args:
        rating_score: Threshold; records whose rating is less than or equal to
            this value are dropped (pass 0.0 to keep every 1-5 star rating).
        data_file: Path of the ratings file. Defaults to the location used by
            the download instructions at the top of the notebook.

    Returns:
        List of (user_id: str, item_id: str, timestamp: int) in file order.
    """
    datas = []

    with open(data_file, 'r') as f:
        for line in f:
            line = line.strip()
            if not line:
                # Tolerate blank or trailing empty lines instead of raising IndexError.
                continue

            parts = line.split('\t')
            user = parts[0]
            item = parts[1]
            rating = float(parts[2])
            timestamp = int(parts[3])

            if rating <= rating_score:  # Filter low ratings
                continue

            datas.append((user, item, timestamp))

    return datas

def load_movielens_meta(meta_file='./raw_data/ml-100k/u.item'):
    """
    Load MovieLens-100k item metadata.

    Each non-empty line is pipe-separated:
    movie_id | movie_title | release_date | video_release_date | IMDb_URL | genre flags
    where the genre flags are '0'/'1' columns in the order of `genre_names` below.

    Args:
        meta_file: Path of the item file. Defaults to the location used by the
            download instructions at the top of the notebook.

    Returns:
        Dict mapping the raw movie id (str) to
        {'title': str, 'categories': [[genre, ...]]}; an item with no flagged
        genre gets [['unknown']].
    """
    genre_names = ['unknown', 'Action', 'Adventure', 'Animation', 'Children', 'Comedy',
                   'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
                   'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']

    meta_data = {}

    # The file is read as latin-1 (titles contain accented characters).
    with open(meta_file, 'r', encoding='latin-1') as f:
        for line in f:
            line = line.strip()
            if not line:
                # Tolerate blank or trailing empty lines instead of raising IndexError.
                continue

            parts = line.split('|')
            item_id = parts[0]
            title = parts[1]

            # Columns from index 5 onward are the binary genre indicators.
            # zip() pairs them with their names and ignores any surplus column,
            # so an unexpected extra field cannot index past `genre_names`.
            genres = [name for name, flag in zip(genre_names, parts[5:]) if flag == '1']

            meta_data[item_id] = {
                'title': title,
                'categories': [genres] if genres else [['unknown']]
            }

    return meta_data

## Core Processing Functions

In [25]:
def get_interaction(datas):
    """
    Group (user, item, timestamp) records into per-user item sequences.

    Returns a dict keyed by user (in order of first appearance) whose value is
    that user's items ordered by ascending timestamp; ties keep input order.
    """
    events_by_user = defaultdict(list)
    for user, item, time in datas:
        events_by_user[user].append((item, time))

    user_seq = {}
    for user, events in events_by_user.items():
        # sorted() is stable, so equal timestamps preserve their original order.
        chronological = sorted(events, key=lambda pair: pair[1])
        user_seq[user] = [item for item, _ in chronological]

    return user_seq

def check_Kcore(user_items, user_core, item_core):
    """
    Count interactions and report whether the data satisfies the K-core property.

    Args:
        user_items: Dict mapping each user to a list of items.
        user_core: Minimum number of interactions required per user.
        item_core: Minimum number of interactions required per item.

    Returns:
        (user_count, item_count, is_kcore) where the counts are defaultdict(int)
        and is_kcore is True when every counted user and item meets its minimum.

    Note:
        A user whose item list is empty never receives an entry in `user_count`,
        so such users are not examined by this check.
    """
    user_count = defaultdict(int)
    item_count = defaultdict(int)
    for user, items in user_items.items():
        for item in items:
            user_count[user] += 1
            item_count[item] += 1

    users_ok = all(num >= user_core for num in user_count.values())
    items_ok = all(num >= item_core for num in item_count.values())
    return user_count, item_count, users_ok and items_ok

def filter_Kcore(user_items, user_core, item_core):
    """
    Iteratively prune `user_items` until it satisfies the K-core property.

    On every pass, users with fewer than `user_core` interactions are removed and
    the remaining users lose every item that has fewer than `item_core`
    interactions; passes repeat until nothing violates either bound.

    Users are judged by the actual length of their item list rather than by
    `user_count`, so a user whose items were all pruned (leaving an empty list
    that `check_Kcore` cannot see) is also removed instead of being returned
    with an empty sequence.

    The input dict is modified in place and also returned.
    """
    while True:
        user_count, item_count, isKcore = check_Kcore(user_items, user_core, item_core)
        has_short_user = any(len(items) < user_core for items in user_items.values())
        if isKcore and not has_short_user:
            return user_items

        # Snapshot the keys: users are popped while iterating.
        for user in list(user_items):
            if len(user_items[user]) < user_core:
                user_items.pop(user, None)
            else:
                user_items[user] = [item for item in user_items[user] if item_count[item] >= item_core]

def id_map(user_items):
    """
    Re-index users and items with consecutive string ids starting at '1'.

    Users are visited in a shuffled order (driven by the global `random` state);
    user ids follow that order and item ids follow first appearance during it.

    Returns:
        (final_data, user_num, item_num, data_maps) where final_data maps new
        user id -> list of new item ids, and data_maps holds the four lookup
        tables 'user2id', 'item2id', 'id2user', 'id2item'.
    """
    user2id, id2user = {}, {}
    item2id, id2item = {}, {}
    final_data = {}

    shuffled_users = list(user_items.keys())
    random.shuffle(shuffled_users)

    for raw_user in shuffled_users:
        if raw_user not in user2id:
            # The next free id is always one more than the number already assigned.
            new_uid = str(len(user2id) + 1)
            user2id[raw_user] = new_uid
            id2user[new_uid] = raw_user

        remapped = []
        for raw_item in user_items[raw_user]:
            if raw_item not in item2id:
                new_iid = str(len(item2id) + 1)
                item2id[raw_item] = new_iid
                id2item[new_iid] = raw_item
            remapped.append(item2id[raw_item])

        final_data[user2id[raw_user]] = remapped

    data_maps = {
        'user2id': user2id,
        'item2id': item2id,
        'id2user': id2user,
        'id2item': id2item
    }
    return final_data, len(user2id), len(item2id), data_maps

## Process MovieLens Data

In [26]:
# Preprocessing settings
rating_score = 0.0  # Filter threshold
user_core = 5
item_core = 5

# Step 1: read the raw ratings
datas = load_movielens_ratings(rating_score=rating_score)
print(f'MovieLens-100k raw data loaded! Total interactions: {len(datas)}')

# Step 2: group them into one time-ordered item sequence per user
user_items = get_interaction(datas)
print(f'Converted to user sequences. Users: {len(user_items)}')

# Step 3: prune until every user and item has enough interactions
user_items = filter_Kcore(user_items, user_core=user_core, item_core=item_core)
print(f'User {user_core}-core and Item {item_core}-core complete!')

# Step 4: re-index users/items, then recount interactions on the re-indexed data
user_items, user_num, item_num, data_maps = id_map(user_items)
user_count, item_count, _ = check_Kcore(user_items, user_core=user_core, item_core=item_core)


def _mean_min_max(values):
    """Return (mean, min, max) of a list of counts."""
    return np.mean(values), np.min(values), np.max(values)


# Step 5: summary statistics
user_count_list = list(user_count.values())
item_count_list = list(item_count.values())
user_avg, user_min, user_max = _mean_min_max(user_count_list)
item_avg, item_min, item_max = _mean_min_max(item_count_list)
interact_num = np.sum(user_count_list)
# Share of the user x item matrix with no interaction, as a percentage.
sparsity = (1 - interact_num / (user_num * item_num)) * 100

print(f'Total User: {user_num}, Avg User: {user_avg:.4f}, Min Len: {user_min}, Max Len: {user_max}')
print(f'Total Item: {item_num}, Avg Item: {item_avg:.4f}, Min Inter: {item_min}, Max Inter: {item_max}')
print(f'Interaction Num: {interact_num}, Sparsity: {sparsity:.2f}%')

MovieLens-100k raw data loaded! Total interactions: 100000
Converted to user sequences. Users: 943
User 5-core and Item 5-core complete!
Total User: 943, Avg User: 105.2884, Min Len: 19, Max Len: 648
Total Item: 1349, Avg Item: 73.6004, Min Inter: 5, Max Inter: 583
Interaction Num: 99287, Sparsity: 92.20%


## Process Metadata

In [27]:
# Load metadata
raw_meta = load_movielens_meta()
print(f'Loaded metadata for {len(raw_meta)} items')

# Keep only the items that survived filtering, re-keyed by their new item id
item2id = data_maps['item2id']
meta_data = {
    item2id[raw_item_id]: info
    for raw_item_id, info in raw_meta.items()
    if raw_item_id in item2id
}

print(f'Filtered metadata to {len(meta_data)} items in dataset')

# Surface (without aborting) any dataset item for which the metadata file has no entry
missing_meta = set(item2id.values()) - set(meta_data)
if missing_meta:
    print(f'Warning: {len(missing_meta)} items in the dataset have no metadata')

# Save as gzipped JSON (following P5 format); `gzip` is imported in the first cell
meta_file = f'./{short_data_name}/meta.json.gz'
with gzip.open(meta_file, 'wt', encoding='utf-8') as f:
    json.dump(meta_data, f)

Loaded metadata for 1682 items
Filtered metadata to 1349 items in dataset


## Save Sequential Data

In [28]:
# Write one line per user: "<user_id> <item_id> <item_id> ..."
data_file = f'./{short_data_name}/sequential_data.txt'
sequence_lines = (
    user + ' ' + ' '.join(items) + '\n'
    for user, items in user_items.items()
)
with open(data_file, 'w') as seq_out:
    seq_out.writelines(sequence_lines)

# Persist the raw-id <-> new-id lookup tables next to it
datamaps_file = f'./{short_data_name}/datamaps.json'
with open(datamaps_file, 'w') as maps_out:
    json.dump(data_maps, maps_out)

print(f'Saved sequential_data.txt and datamaps.json to {short_data_name}/')

Saved sequential_data.txt and datamaps.json to ml100k/


## Generate Negative Samples for Testing

In [29]:
def sample_test_data(user_items, test_num=99, sample_type='random'):
    """
    Sample `test_num` negative items (items the user never interacted with) per user.

    Args:
        user_items: Dict mapping user id -> list of item ids (strings of integers).
        test_num: Number of distinct negatives to draw for every user.
        sample_type: 'random' draws uniformly over all items; any other value
            ('pop') draws proportionally to item interaction counts.

    Returns:
        Dict mapping user id -> list of `test_num` distinct item ids (as strings),
        none of which appear in that user's sequence.

    Raises:
        ValueError: If some user has fewer than `test_num` un-interacted items.
            Without this check the rejection loop below could never finish.
    """
    item_count = defaultdict(int)
    for items in user_items.values():
        for item in items:
            item_count[int(item)] += 1

    all_item = list(item_count.keys())
    count = list(item_count.values())
    sum_value = np.sum(count)
    probability = [value / sum_value for value in count]

    user_neg_items = {}

    for user, user_seq in user_items.items():
        # Set membership keeps the rejection test O(1) per candidate.
        seen = {int(i) for i in user_seq}

        available = len(all_item) - len(seen)
        if available < test_num:
            raise ValueError(
                f'Cannot draw {test_num} negative samples for user {user}: '
                f'only {available} un-interacted items exist'
            )

        test_samples = []
        chosen = set()
        while len(test_samples) < test_num:
            # Draw a full batch, then reject positives and items already picked;
            # repeat until enough negatives have accumulated.
            if sample_type == 'random':
                sample_ids = np.random.choice(all_item, test_num, replace=False)
            else:
                sample_ids = np.random.choice(all_item, test_num, replace=False, p=probability)
            for item in sample_ids:
                item = int(item)
                if item in seen or item in chosen:
                    continue
                chosen.add(item)
                test_samples.append(str(item))
        user_neg_items[user] = test_samples[:test_num]

    return user_neg_items

# Draw the evaluation negatives for every user (default: 99 uniform samples each)
user_neg_items = sample_test_data(user_items)

# Write one line per user: "<user_id> <negative_item_id> <negative_item_id> ..."
test_file = f'./{short_data_name}/negative_samples.txt'
negative_lines = (
    user + ' ' + ' '.join(samples) + '\n'
    for user, samples in user_neg_items.items()
)
with open(test_file, 'w') as neg_out:
    neg_out.writelines(negative_lines)

print(f'Saved negative_samples.txt with {len(user_neg_items)} users')

Saved negative_samples.txt with 943 users


## Create Rating Splits

For the rating prediction task, we need to create train/val/test splits from the ratings data.

In [30]:
def load_all_ratings(data_file='./raw_data/ml-100k/u.data', maps=None):
    """
    Load all ratings (not filtered by score) for users and items kept in the dataset.

    Each non-empty line of the file holds four tab-separated fields:
    user_id, item_id, rating, timestamp.

    Args:
        data_file: Path of the ratings file. Defaults to the location used by
            the download instructions at the top of the notebook.
        maps: Dict with 'user2id' and 'item2id' lookup tables (raw id -> new id).
            Defaults to the notebook-level `data_maps`.

    Returns:
        List of dicts with keys 'user' and 'item' (re-indexed ids), 'rating'
        (float) and 'timestamp' (int), in file order. Records whose user or item
        is missing from the lookup tables are skipped.
    """
    if maps is None:
        maps = data_maps
    user2id = maps['user2id']
    item2id = maps['item2id']

    rating_data = []

    with open(data_file, 'r') as f:
        for line in f:
            line = line.strip()
            if not line:
                # Tolerate blank or trailing empty lines instead of raising IndexError.
                continue

            parts = line.split('\t')
            user = parts[0]
            item = parts[1]
            rating = float(parts[2])
            timestamp = int(parts[3])

            # Only include users and items that are in our filtered dataset
            if user in user2id and item in item2id:
                rating_data.append({
                    'user': user2id[user],
                    'item': item2id[item],
                    'rating': rating,
                    'timestamp': timestamp
                })

    return rating_data

# Build the rating table used for the train/val/test split: one dict per rating,
# restricted to the users and items present in `data_maps`.
rating_data = load_all_ratings()
print(f'Loaded {len(rating_data)} rating records')

Loaded 99287 rating records


In [31]:
# Create train/val/test splits (80/10/10)
population = len(rating_data)
indices = list(range(population))
random.shuffle(indices)

# Ensure each user and item appears at least once in training
user_mention_dict = defaultdict(list)
item_mention_dict = defaultdict(list)

for i in indices:
    user = rating_data[i]['user']
    item = rating_data[i]['item']
    user_mention_dict[user].append(i)
    item_mention_dict[item].append(i)

# Add at least one sample per user and item to training
train_indices = set()
for user, idx_list in tqdm(user_mention_dict.items()):
    train_indices.add(random.choice(idx_list))
for item, idx_list in tqdm(item_mention_dict.items()):
    train_indices.add(random.choice(idx_list))

print(f'Initial train indices from coverage: {len(train_indices)}')

# Fill remaining to reach 80%
remaining_indices = list(set(indices) - train_indices)
random.shuffle(remaining_indices)

train_target = int(population * 0.8)
need_more = train_target - len(train_indices)
train_indices.update(remaining_indices[:need_more])
train_indices = list(train_indices)

print(f'Final train indices: {len(train_indices)}')

# Split remaining into val/test
val_test_indices = list(set(indices) - set(train_indices))
random.shuffle(val_test_indices)

val_size = len(val_test_indices) // 2
val_indices = val_test_indices[:val_size]
test_indices = val_test_indices[val_size:]

print(f'Train: {len(train_indices)}, Val: {len(val_indices)}, Test: {len(test_indices)}')

100%|██████████| 943/943 [00:00<?, ?it/s]
100%|██████████| 1349/1349 [00:00<?, ?it/s]

Initial train indices from coverage: 2279
Final train indices: 79429
Train: 79429, Val: 9929, Test: 9929





In [32]:
# Materialize the rating records belonging to each split
train_rating_data, val_rating_data, test_rating_data = (
    [rating_data[i] for i in split_indices]
    for split_indices in (train_indices, val_indices, test_indices)
)

# Bundle the records together with the index lists that produced them
rating_splits = {
    'train': train_rating_data,
    'val': val_rating_data,
    'test': test_rating_data,
    'train_indices': train_indices,
    'val_indices': val_indices,
    'test_indices': test_indices
}

# Persist the bundle as a pickle in the output folder
splits_path = f'./{short_data_name}/rating_splits.pkl'
save_pickle(rating_splits, splits_path)
print(f'Saved rating_splits.pkl')

Saved rating_splits.pkl


## Summary

The following files have been created in the `ml100k/` directory:
- `sequential_data.txt`: User interaction sequences for sequential recommendation
- `datamaps.json`: User and item ID mappings
- `meta.json.gz`: Movie metadata (titles, genres)
- `negative_samples.txt`: Negative samples for evaluation
- `rating_splits.pkl`: Train/val/test splits for rating prediction

You can now train P5 on this dataset!