In [1]:
import numpy as np
import pandas as pd
from interactions_data import create_index, reset_ids
from sklearn.model_selection import train_test_split

In [2]:
# Directory containing the dataset CSVs. Keep the trailing slash: file names
# are appended to this string with plain `+` concatenation.
path = 'data/ml-100k/'

In [3]:
# Load the ratings table; the timestamp column is discarded right away.
df = pd.read_csv(path + 'ratings.csv').drop(columns=['timestamp'])

In [4]:
# Load the three metadata tables stored alongside the ratings.
links, movies, tags = [
    pd.read_csv(path + filename)
    for filename in ('links.csv', 'movies.csv', 'tags.csv')
]

In [5]:
# Preview the first rows of the ratings frame (timestamp already dropped).
df.head()

Unnamed: 0,userId,movieId,rating
0,1,31,2.5
1,1,1029,3.0
2,1,1061,3.0
3,1,1129,2.0
4,1,1172,4.0


In [6]:
# (rows, columns) of the ratings frame.
df.shape

(100004, 3)

In [7]:
# Preview the first rows of the movie metadata.
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [8]:
# (rows, columns) of the movie metadata, before any filtering.
movies.shape

(9125, 3)

In [9]:
# Preview the first rows of the tags table.
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,15,339,sandra 'boring' bullock,1138537770
1,15,1955,dentist,1193435061
2,15,7478,Cambodia,1170560997
3,15,32892,Russian,1170626366
4,15,34162,forgettable,1141391765


In [10]:
# (rows, columns) of the tags table, before any filtering.
tags.shape

(1296, 4)

In [11]:
# Preview the first rows of the links table.
links.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [12]:
# (rows, columns) of the links table, before any filtering.
links.shape

(9125, 3)

In [13]:
# Distinct raw user and movie IDs that occur in the ratings.
user_ids, movie_ids = [df[column].unique() for column in ('userId', 'movieId')]

In [14]:
# length before
print(len(movies), len(tags), len(links))

# Keep metadata only for movies that have received ratings (and, for tags,
# only rows whose user also appears in the ratings).
# `.copy()` makes each filtered result an independent frame, so the in-place
# column assignments performed later during ID re-encoding
# (`movies['movieId'] = ...`) do not write into a selection of the original
# frame and trigger pandas' SettingWithCopyWarning.
movies = movies[movies['movieId'].isin(movie_ids)].copy()
links = links[links['movieId'].isin(movie_ids)].copy()
tags = tags[tags['movieId'].isin(movie_ids) & tags['userId'].isin(user_ids)].copy()

# length after
print(len(movies), len(tags), len(links))

9125 1296 9125
9066 1222 9066


In [15]:
# create ID encoders
# `create_index` is imported from the project module `interactions_data`; its
# implementation is not shown in this notebook. Only the first of its two
# return values is kept: it is used below as a lookup (`encoder[raw_id]`) that
# yields the new ID for a raw ID — presumably a dict; TODO confirm.
# The second return value is discarded.
user_encoder, _ = create_index(user_ids)
movie_encoder, _ = create_index(movie_ids)

### Encode IDs as sequential integers from 0 to n-1

In [16]:
# Swap every raw ID for its sequential replacement, overwriting the ID
# columns of each frame in place.
def _encode(column, encoder):
    """Return `column` with each value replaced by `encoder[value]`.

    A value that `encoder` cannot look up raises, exactly as a direct
    `encoder[value]` subscript would.
    """
    return column.apply(encoder.__getitem__)


# user IDs
df['userId'] = _encode(df['userId'], user_encoder)
tags['userId'] = _encode(tags['userId'], user_encoder)

# movie IDs
df['movieId'] = _encode(df['movieId'], movie_encoder)
links['movieId'] = _encode(links['movieId'], movie_encoder)
movies['movieId'] = _encode(movies['movieId'], movie_encoder)
tags['movieId'] = _encode(tags['movieId'], movie_encoder)

In [17]:
# Headline numbers for the ratings: distinct users / items and the rating range.
n_users = df['userId'].nunique()
n_items = df['movieId'].nunique()
min_score = df['rating'].min()
max_score = df['rating'].max()

In [18]:
# Sanity check on the re-encoding: with IDs running from 0, the largest ID
# printed on each line should be one less than the distinct count beside it.
print(n_users, df['userId'].max())
print(n_items, df['movieId'].max())
# Observed rating range.
print(min_score, max_score)

671 670
9066 9065
0.5 5.0


### Train / test split

In [19]:
# Hold out 20% of the ratings as a test set. The split is stratified on
# userId and uses a fixed seed so it is repeatable.
# Only `df` is passed: the rating column travels with each row, so there is
# no need to split it separately and throw the extra pieces away (doing so
# also rebinds `_`, which IPython uses for the last cell output).
train, test = train_test_split(
    df,
    test_size=.2,
    random_state=42,
    stratify=df.userId,
)

In [20]:
# (rows, columns) of the training split.
train.shape

(80003, 3)

In [21]:
# (rows, columns) of the held-out test split.
test.shape

(20001, 3)

In [22]:
# Persist both splits next to the source data, without the pandas row index.
train.to_csv(f'{path}train.csv', index=False)
test.to_csv(f'{path}test.csv', index=False)

In [23]:
# Persist the filtered, re-encoded metadata tables without the pandas row index.
# NOTE(review): these are the same paths the notebook reads its raw metadata
# from (`path+'links.csv'`, `path+'movies.csv'`, `path+'tags.csv'`), so this
# cell overwrites the inputs. `ratings.csv` is not rewritten, which means a
# second run would filter already-encoded movie/user IDs against raw IDs from
# the ratings and silently produce wrong tables. Keep a pristine copy of the
# raw files, or write to different file names if downstream readers allow it.
links.to_csv(path+'links.csv', index=False)
movies.to_csv(path+'movies.csv', index=False)
tags.to_csv(path+'tags.csv', index=False)