In [1]:
import numpy as np
import pandas as pd
from interactions_data import create_index, reset_ids
from sklearn.model_selection import train_test_split

In [2]:
# Dataset directory. The trailing slash matters: every read and write in this
# notebook builds its file path by plain string concatenation (path + name).
path = 'data/ml-10M100K/'

In [3]:
# The ratings file is '::'-delimited and column names are supplied here.
# A multi-character separator requires pandas' python parsing engine.
df = pd.read_csv(
    path + 'ratings.dat',
    sep='::',
    engine='python',
    names=['userId', 'movieId', 'rating', 'timestamp'],
)
# Only userId, movieId and rating are kept.
df = df.drop(columns='timestamp')

In [4]:
# Both metadata files are parsed with the same options as the ratings file:
# '::' as the separator, handled by the python parsing engine.
dat_format = dict(sep='::', engine='python')

movies = pd.read_csv(path + 'movies.dat',
                     names=['movieId', 'title', 'genres'],
                     **dat_format)

tags = pd.read_csv(path + 'tags.dat',
                   names=['userId', 'movieId', 'tag', 'timestamp'],
                   **dat_format)

In [5]:
# Preview the first rows of the ratings frame (timestamp already dropped at load).
df.head()

Unnamed: 0,userId,movieId,rating
0,1,122,5.0
1,1,185,5.0
2,1,231,5.0
3,1,292,5.0
4,1,316,5.0


In [6]:
# (rows, columns) of the ratings frame.
df.shape

(10000054, 3)

In [7]:
# Preview the first rows of the movie metadata (movieId, title, genres).
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [8]:
# (rows, columns) of the movie metadata before any filtering.
movies.shape

(10681, 3)

In [9]:
# Preview the first rows of the tags frame (userId, movieId, tag, timestamp).
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,15,4973,excellent!,1215184630
1,20,1747,politics,1188263867
2,20,1747,satire,1188263867
3,20,2424,chick flick 212,1188263835
4,20,2424,hanks,1188263835


In [10]:
# (rows, columns) of the tags frame before any filtering.
tags.shape

(95580, 4)

In [11]:
# Distinct raw IDs that appear in the ratings. They drive both the metadata
# filtering and the ID encoders built in the following cells.
user_ids = df.userId.unique()
movie_ids = df.movieId.unique()

In [12]:
# length before
print(len(movies))

# Keep metadata only for movies that have received ratings, and tags only
# from users who appear in the ratings. `.copy()` makes each result an
# independent frame, so the column assignments in the ID-encoding cell do not
# raise pandas' SettingWithCopyWarning.
movies = movies[movies['movieId'].isin(movie_ids)].copy()
tags = tags[tags['movieId'].isin(movie_ids)
            & tags['userId'].isin(user_ids)].copy()

# length after
print(len(movies))

10681
10677


In [13]:
# create ID encoders
user_encoder, _ = create_index(user_ids)
movie_encoder, _ = create_index(movie_ids)

### Encode IDs as consecutive integers from 0 to n-1

In [14]:
def _encode_ids(id_column, encoder):
    """Return `id_column` with every raw ID replaced by `encoder[raw_id]`."""
    return id_column.apply(lambda raw_id: encoder[raw_id])


# Ratings carry both kinds of ID.
df['userId'] = _encode_ids(df['userId'], user_encoder)
df['movieId'] = _encode_ids(df['movieId'], movie_encoder)

# So do the tags.
tags['userId'] = _encode_ids(tags['userId'], user_encoder)
tags['movieId'] = _encode_ids(tags['movieId'], movie_encoder)

# Movie metadata is keyed by movie ID only.
movies['movieId'] = _encode_ids(movies['movieId'], movie_encoder)

In [15]:
# Dataset dimensions and the observed rating range.
n_users, n_items = df['userId'].nunique(), df['movieId'].nunique()
min_score, max_score = df['rating'].min(), df['rating'].max()

In [16]:
# Each of the first two lines pairs a distinct-ID count with the largest
# encoded ID, so a contiguous 0-based encoding shows up as "n" next to "n-1".
print(n_users, df['userId'].max())
print(n_items, df['movieId'].max())
# Observed rating range.
print(min_score, max_score)

69878 69877
10677 10676
0.5 5.0


### Train / test split

In [17]:
# Hold out 20% of the ratings for testing, stratified on userId so the split
# is made per user rather than over the pooled rows.
# Only `df` is passed: the rows selected depend on the sample count, `stratify`
# and `random_state`, not on how many arrays are given, so this yields the same
# train/test frames as passing a dummy target and discarding its two outputs.
train, test = train_test_split(df,
                               test_size=.2,
                               random_state=42,
                               stratify=df['userId'])

In [20]:
# Split train into 2 roughly equal files to allow github storage.
# Only `train` is passed; a dummy target would just produce two extra outputs
# to throw away without changing which rows land in each half.
train1, train2 = train_test_split(train,
                                  test_size=.5,
                                  random_state=42)

In [21]:
# Shapes of the two training halves; they should differ by at most one row.
train1.shape, train2.shape

((4000021, 3), (4000022, 3))

In [22]:
# Shape of the held-out test split.
test.shape

(2000011, 3)

In [23]:
# Write the ratings splits into the dataset directory. The row index is left
# out, so each CSV holds only the frame's columns.
for split, filename in ((train1, 'train1.csv'),
                        (train2, 'train2.csv'),
                        (test, 'test.csv')):
    split.to_csv(path + filename, index=False)

In [24]:
# Save the filtered, re-encoded metadata into the same directory as the
# ratings splits, again without the row index.
movies.to_csv(path_or_buf=path + 'movies.csv', index=False)
tags.to_csv(path_or_buf=path + 'tags.csv', index=False)