### Negative Sampling: Augmenting Implicit Feedback Data with Negative Interactions

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tqdm
import matplotlib.pyplot as plt
from google.colab import files
from google.colab import drive

from IPython.core.interactiveshell import InteractiveShell
# Display the result of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

In [None]:
# Mount Google Drive at /content/drive so the '/content/drive/MyDrive/...' CSV paths used below resolve (Colab only).
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load the raw ratings and parse the timestamp column into datetimes.
ratings = pd.read_csv('/content/drive/MyDrive/dataset/rating.csv')
ratings['timestamp'] = pd.to_datetime(ratings['timestamp'])

# Keep only interactions from 2013 onwards (roughly 1.4M rows), given the compute constraint.
# .copy() makes the filtered frame independent of the full one, so adding the
# 'rank_latest' column below does not raise pandas' SettingWithCopyWarning.
ratings = ratings[ratings['timestamp'].dt.year > 2012].copy()
ratings.to_csv('/content/drive/MyDrive/small_dataset/rating.csv', index=False)

# Leave-one-out split: rank each user's interactions from newest (rank 1) to oldest.
# method='first' breaks timestamp ties by row order, so exactly one row per user gets rank 1.
ratings['rank_latest'] = ratings.groupby(['userId'])['timestamp'].rank(method='first', ascending=False)
cleaned_ratings = ratings.drop(columns=['timestamp'])

# Build the mask from the same frame that is being filtered.
is_latest = cleaned_ratings['rank_latest'] == 1
train_ratings = cleaned_ratings[~is_latest]
test_ratings = cleaned_ratings[is_latest]

# Sanity checks: one held-out interaction per user, and the two splits partition the data.
assert test_ratings['userId'].is_unique, 'expected exactly one test interaction per user'
assert len(train_ratings) + len(test_ratings) == len(cleaned_ratings)

test_ratings.head()
print(f'Shape: {test_ratings.shape}')

Unnamed: 0,userId,movieId,rating,rank_latest
3548,31,2096,2.0,1.0
4482,42,1250,3.0,1.0
4965,49,2858,3.5,1.0
7012,59,1230,4.0,1.0
8037,71,104337,3.5,1.0


Shape: (14315, 4)


In [None]:
# Create train and test datasets
# Persist both splits to Drive; the row index is not written.
split_outputs = {
    '/content/drive/MyDrive/small_dataset/train_dataset.csv': train_ratings,
    '/content/drive/MyDrive/small_dataset/test_dataset.csv': test_ratings,
}
for csv_path, split_frame in split_outputs.items():
    split_frame.to_csv(csv_path, index=False)

In [None]:
# Positive examples: every observed (user, movie) pair in the training split gets label 1.
users, items, labels = train_ratings['userId'].to_list(), train_ratings['movieId'].to_list(), list(np.ones(train_ratings.shape[0]))

# Candidate pool for negatives: every movie that appears in the training split.
movie_ids = train_ratings['movieId'].unique()
num_movies = len(movie_ids)

# Set of observed (user, movie) pairs, used only for fast membership checks.
# It is also read by the test-set sampling cell further down.
user_movie_set = set(zip(train_ratings['userId'], train_ratings['movieId']))

# Number of negatives drawn per positive interaction.
NUM_TRAIN_NEGATIVES = 4

# Dedicated seeded generator so the augmented dataset is reproducible across runs.
train_rng = np.random.default_rng(42)

# The rejection loop below would never terminate for a user who has interacted
# with every candidate movie, so fail loudly instead of hanging.
assert train_ratings.groupby('userId')['movieId'].nunique().max() < num_movies, \
    'a user has interacted with every movie; no negative can be sampled'

# For each positive interaction, draw NUM_TRAIN_NEGATIVES movies the user has not
# interacted with (rejection sampling) and add them with label 0.
for (u, _positive_item) in tqdm(user_movie_set):
    for _ in range(NUM_TRAIN_NEGATIVES):
        negative_item = movie_ids[train_rng.integers(num_movies)]
        while (u, negative_item) in user_movie_set:
            negative_item = movie_ids[train_rng.integers(num_movies)]
        users.append(u)
        items.append(negative_item)
        labels.append(0)

100%|██████████| 1431786/1431786 [01:42<00:00, 13923.73it/s]


In [None]:
# Assemble the positives and sampled negatives into one labelled frame and write it to Drive.
augmented_columns = {'userId': users, 'movieId': items, 'label': labels}
dataset = pd.DataFrame(data=augmented_columns)
dataset.to_csv(
    '/content/drive/MyDrive/small_dataset/augmented_dataset.csv',
    index=False,
)

In [None]:
# Class-balance check: 4 negatives were appended per positive row, so label 0 should outnumber label 1 by 4:1.
dataset['label'].value_counts()

0.0    5727144
1.0    1431786
Name: label, dtype: int64

In [None]:
# Start from the held-out (most recent) interaction of every user.
users, items = test_ratings['userId'].to_list(), test_ratings['movieId'].to_list()

# Candidate pool for negatives: every movie in the filtered ratings (train and test).
movie_ids = ratings['movieId'].unique()
num_movies = len(movie_ids)

# Set of held-out (user, movie) pairs, used only for fast membership checks.
user_test_movie_set = set(zip(test_ratings['userId'], test_ratings['movieId']))

# Number of negatives drawn per held-out interaction (99 negatives + 1 positive = 100 candidates).
NUM_TEST_NEGATIVES = 99

# Dedicated seeded generator so the evaluation candidates are reproducible across runs.
test_rng = np.random.default_rng(43)

# The rejection loop below would never terminate for a user who has interacted
# with every candidate movie, so fail loudly instead of hanging.
assert ratings.groupby('userId')['movieId'].nunique().max() < num_movies, \
    'a user has interacted with every movie; no negative can be sampled'

# For each held-out interaction, draw NUM_TEST_NEGATIVES movies the user has not
# interacted with in either split. `user_movie_set` holds the training pairs and
# comes from the training-set sampling cell above.
for (u, _held_out_item) in tqdm(user_test_movie_set):
    for _ in range(NUM_TEST_NEGATIVES):
        negative_item = movie_ids[test_rng.integers(num_movies)]
        while ((u, negative_item) in user_test_movie_set) or ((u, negative_item) in user_movie_set):
            negative_item = movie_ids[test_rng.integers(num_movies)]
        users.append(u)
        items.append(negative_item)

100%|██████████| 14315/14315 [00:23<00:00, 600.14it/s]


In [None]:
# Assemble the held-out positives and their sampled negatives into one frame and write it to Drive.
augmented_test_columns = {'userId': users, 'movieId': items}
dataset = pd.DataFrame(data=augmented_test_columns)
dataset.to_csv(
    '/content/drive/MyDrive/small_dataset/augmented_test_dataset.csv',
    index=False,
)