#### First look at the data

In [1]:
import pandas as pd

In [2]:
# Read the ml-1m ratings file. The file has no header row, so the column
# names are supplied explicitly. The separator is the two-character '::',
# and the Python parsing engine is requested for it.
df = pd.read_csv(
    '../data/ml-1m/ratings.dat',
    sep='::',
    header=None,
    names=['user_id', 'item_id', 'rating', 'timestamp'],
    engine='python',
)
print(df.shape)
df.head()

(1000209, 4)


Unnamed: 0,user_id,item_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [3]:
# How many times each rating value occurs, most frequent first.
rating_counts = df['rating'].value_counts()
rating_counts

4    348971
3    261197
5    226310
2    107557
1     56174
Name: rating, dtype: int64

In [4]:
# Smallest number of ratings given by any single user.
ratings_per_user = df['user_id'].value_counts()
ratings_per_user.min()

20

In [5]:
# Number of distinct users, shown as a 1-tuple (same form as a Series shape).
(df['user_id'].nunique(),)

(6040,)

In [6]:
# Smallest number of ratings received by any single item.
ratings_per_item = df['item_id'].value_counts()
ratings_per_item.min()

1

In [7]:
# Fraction of items that have at least 10 ratings
# (the mean of a boolean Series is the share of True values).
item_rating_counts = df['item_id'].value_counts()
(item_rating_counts >= 10).mean()

0.8796546141392336

#### Users with few ratings - there are none

By design, the dataset includes only users with at least 20 ratings (this matches the minimum per-user count computed above).

We do not filter this dataset, so that results remain fully comparable to the original paper.

#### Moving to 0-based indexing

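It would be easiest if both user ids and item ids were 0-indexed and contiguous, i.e. ran from 0 to (number of unique ids - 1), e.g. so that they can be used directly as indices into embedding layers later on.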

In [10]:
# Largest user id next to the number of distinct user ids.
# For 0-based contiguous ids, max would equal nunique - 1.
raw_user_ids = df['user_id']
raw_user_ids.max(), raw_user_ids.nunique()

(6040, 6040)

In [11]:
# Largest item id next to the number of distinct item ids.
# For 0-based contiguous ids, max would equal nunique - 1.
raw_item_ids = df['item_id']
raw_item_ids.max(), raw_item_ids.nunique()

(3952, 3706)

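We see that this is currently not the case: the largest user id (6040) equals the number of unique users, so user ids are not 0-based, and the largest item id (3952) exceeds the number of unique items (3706), so item ids have gaps. Let's remap both to contiguous 0-based indices.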

In [12]:
# Map every raw user id to a 0-based index, numbered in the order in which
# unique() returns the ids.
# A single unique() pass with enumerate() keeps keys and indices in lockstep.
# Pairing unique() with range(nunique()) needs a second uniqueness pass and
# would silently drop a key if the column ever held a NaN, because nunique()
# skips NaN by default while unique() keeps it.
user2idx = {raw_id: idx for idx, raw_id in enumerate(df['user_id'].unique().tolist())}

In [13]:
# Map every raw item id to a 0-based index, numbered in the order in which
# unique() returns the ids.
# A single unique() pass with enumerate() keeps keys and indices in lockstep.
# Pairing unique() with range(nunique()) needs a second uniqueness pass and
# would silently drop a key if the column ever held a NaN, because nunique()
# skips NaN by default while unique() keeps it.
item2idx = {raw_id: idx for idx, raw_id in enumerate(df['item_id'].unique().tolist())}

In [14]:
# Replace the raw ids with their 0-based indices.
# Series.map with a dict performs the lookup without a per-element Python
# call; any id that is not a key of the dict becomes a missing value.
user_idx = df['user_id'].map(user2idx)
item_idx = df['item_id'].map(item2idx)

# Validate BEFORE overwriting: this cell mutates `df` in place, so running it
# again would look up already-remapped ids in the raw-id dictionaries, and any
# id not found there would turn into a missing value.
if user_idx.isnull().any() or item_idx.isnull().any():
    raise ValueError(
        'Some ids have no entry in user2idx/item2idx. If this cell was already run, '
        'reload df and rebuild the mappings before running it again.'
    )

df['user_id'] = user_idx
df['item_id'] = item_idx

In [15]:
# Preview the first 20 rows after the id remapping.
df.head(n=20)

Unnamed: 0,user_id,item_id,rating,timestamp
0,0,0,5,978300760
1,0,1,3,978302109
2,0,2,3,978301968
3,0,3,4,978300275
4,0,4,5,978824291
5,0,5,3,978302268
6,0,6,5,978302039
7,0,7,5,978300719
8,0,8,4,978302268
9,0,9,4,978301368


In [16]:
# An id that failed to map would show up as a missing value, so both columns
# are expected to report False here.
# .any() is the boolean reduction for "is at least one value missing"; unlike
# .max() it also stays boolean when the frame has no rows.
df[['user_id', 'item_id']].isnull().any()

user_id    False
item_id    False
dtype: bool

In [17]:
# After remapping, the largest user index should be one less than the number
# of distinct users (0-based, no gaps).
user_indices = df['user_id']
user_indices.max(), user_indices.nunique()

(6039, 6040)

In [18]:
# After remapping, the largest item index should be one less than the number
# of distinct items (0-based, no gaps).
item_indices = df['item_id']
item_indices.max(), item_indices.nunique()

(3705, 3706)

#### Saving preprocessed dataset

In [19]:
# Write only the three listed columns (the `timestamp` column is not saved),
# and leave the DataFrame index out of the file.
output_columns = ['user_id', 'item_id', 'rating']
df[output_columns].to_csv('../data/ml_1m_preprocessed.csv', index=False)