## Imports

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf

## Data Set-up

In [2]:
np.random.seed(0)
ratings_dir = './Data/ml-latest-small/ratings.csv'

# Read in ratings data
ratings = pd.read_csv(ratings_dir)
print(ratings)

        userId  movieId  rating   timestamp
0            1        1     4.0   964982703
1            1        3     4.0   964981247
2            1        6     4.0   964982224
3            1       47     5.0   964983815
4            1       50     5.0   964982931
...        ...      ...     ...         ...
100831     610   166534     4.0  1493848402
100832     610   168248     5.0  1493850091
100833     610   168250     5.0  1494273047
100834     610   168252     5.0  1493846352
100835     610   170875     3.0  1493846415

[100836 rows x 4 columns]


In [3]:
# Find the latest rating each user has made
ratings['latest'] = ratings.groupby(['userId'])['timestamp'].rank(method='first', ascending=False)

# Separate the latest rating into the test dataset
# Keep all other ratings in the train dataset
train_ratings = ratings[ratings['latest'] != 1]
test_ratings = ratings[ratings['latest'] == 1]

# Remove timestamp field
train_ratings = train_ratings[['userId', 'movieId', 'rating']]
test_ratings = test_ratings[['userId', 'movieId', 'rating']]

print('Columns:', train_ratings.columns.values)
print('Interactions in Training Set:', train_ratings.shape[0])
print('Interactions in Testing Set:', test_ratings.shape[0])

Columns: ['userId' 'movieId' 'rating']
Interactions in Training Set: 100226
Interactions in Testing Set: 610


### Convert to Implicit Feedback

In [4]:
# Convert rating to 1 for everything to mark that the user has watched this item
train_ratings.loc[:, 'rating'] = 1
print(train_ratings)

        userId  movieId  rating
0            1        1       1
1            1        3       1
2            1        6       1
3            1       47       1
4            1       50       1
...        ...      ...     ...
100831     610   166534       1
100832     610   168248       1
100833     610   168250       1
100834     610   168252       1
100835     610   170875       1

[100226 rows x 3 columns]


In [5]:
# Add negative samples with rating = 0
all_movies = ratings['movieId'].unique()

users, items, labels = [], [], []
user_item_set = set(zip(train_ratings['userId'], train_ratings['userId']))
num_negatives = 4

for (u, i) in user_item_set:
    users.append(u)
    items.append(i)
    labels.append(1)
    for _ in range(num_negatives):
        negative_item = np.random.choice(all_movies)
        while (u, negative_item) in user_item_set:
            negative_item = np.random.choice(all_movies)
        users.append(u)
        items.append(negative_item)
        labels.append(0)

### Create TF Dataset

In [6]:
train_ds = tf.data.Dataset.from_tensor_slices((users, items, labels))

In [7]:
train_ds = train_ds.shuffle(1000)
for elem in train_ds.take(10):
    tf.print('User ID: ', elem[0], 'Movie ID: ', elem[1], 'User Viewed Movie: ', elem[2])

User ID:  561 Movie ID:  2210 User Viewed Movie:  0
User ID:  567 Movie ID:  190209 User Viewed Movie:  0
User ID:  80 Movie ID:  6790 User Viewed Movie:  0
User ID:  383 Movie ID:  383 User Viewed Movie:  1
User ID:  574 Movie ID:  1573 User Viewed Movie:  0
User ID:  191 Movie ID:  4734 User Viewed Movie:  0
User ID:  508 Movie ID:  1123 User Viewed Movie:  0
User ID:  546 Movie ID:  4714 User Viewed Movie:  0
User ID:  403 Movie ID:  403 User Viewed Movie:  1
User ID:  279 Movie ID:  1552 User Viewed Movie:  0
