In [1]:
import numpy as np
import pandas as pd
from interactions_data import create_index, reset_ids
from sklearn.model_selection import train_test_split

In [2]:
path = 'data/ml-1m/'

In [3]:
df = pd.read_csv(path+'ratings.dat',
                 sep='::',
                 names=['userId', 'movieId', 'rating', 'timestamp'],
                 engine='python').drop('timestamp', axis=1)

In [4]:
movies = pd.read_csv(path+'movies.dat',
                     sep='::',
                     names=['movieId', 'title', 'genres'],
                     engine='python')

users = pd.read_csv(path+'users.dat',
                    sep='::',
                    names=['userId', 'gender', 'age', 'occupation', 'zipCode'],
                    engine='python')

In [5]:
df.head()

Unnamed: 0,userId,movieId,rating
0,1,1193,5
1,1,661,3
2,1,914,3
3,1,3408,4
4,1,2355,5


In [6]:
df.shape

(1000209, 3)

In [7]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [8]:
movies.shape

(3883, 3)

In [9]:
users.head()

Unnamed: 0,userId,gender,age,occupation,zipCode
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


In [10]:
users.shape

(6040, 5)

In [11]:
user_ids = df['userId'].unique()
movie_ids = df['movieId'].unique()

In [12]:
# length before
print(len(movies), len(users))

# keep metadata only for movies that have receieved ratings
movies = movies[movies['movieId'].isin(movie_ids)]
users = users[users['userId'].isin(user_ids)]

# length after
print(len(movies), len(users))

3883 6040
3706 6040


In [13]:
# create ID encoders
user_encoder, _ = create_index(user_ids)
movie_encoder, _ = create_index(movie_ids)

### Encode IDs from 0-n

In [14]:
# replace all IDs with sequential 0-N IDs
df['userId'] = df['userId'].apply(lambda x: user_encoder[x])
df['movieId'] = df['movieId'].apply(lambda x: movie_encoder[x])

users['userId'] = users['userId'].apply(lambda x: user_encoder[x])
movies['movieId'] = movies['movieId'].apply(lambda x: movie_encoder[x])

In [15]:
n_users = df.userId.nunique()
n_items = df.movieId.nunique()
max_score = df.rating.max()
min_score = df.rating.min()

In [16]:
print(n_users, df.userId.max())
print(n_items, df.movieId.max())
print(min_score, max_score)

6040 6039
3706 3705
1 5


### Train / test split

In [17]:
train, test, _, _ = train_test_split(df, df.rating,
                                     test_size = .2,
                                     random_state=42,
                                     stratify=df.userId)

In [18]:
train.shape

(800167, 3)

In [19]:
test.shape

(200042, 3)

In [20]:
train.to_csv(path+'train.csv', index=False)
test.to_csv(path+'test.csv', index=False)

In [21]:
movies.to_csv(path+'movies.csv', index=False)
users.to_csv(path+'users.csv', index=False)