# Script to create train/test dataset.
### Please download the MovieLens-1M dataset and place `users.dat`, `movies.dat` and `ratings.dat` in the same folder as this notebook.

https://grouplens.org/datasets/movielens/1m/

In [1]:
%matplotlib inline

import numpy as np
import pandas as pd

In [2]:
# Load the user table. '::' is a multi-character separator, which only the
# python parser engine supports; requesting it explicitly avoids the
# ParserWarning that the implicit fallback from the C engine emits.
# zipCode is read as text so values with a leading zero are never turned into
# integers by type inference.
users_df = pd.read_csv(
    './users.dat',
    sep='::',
    engine='python',
    names=['userId', 'gender', 'age', 'occupation', 'zipCode'],
    dtype={'zipCode': str})

  """Entry point for launching an IPython kernel.


In [3]:
# Normalise column dtypes. The builtin int is used because np.int was only an
# alias for it and has been removed from NumPy (1.24+), where it raises
# AttributeError.
users_df['userId'] = users_df['userId'].astype(int)
users_df['age'] = users_df['age'].astype(int)
users_df['occupation'] = users_df['occupation'].astype(int)
users_df['zipCode'] = users_df['zipCode'].astype(str)

In [4]:
# Load the movie table. '::' is a multi-character separator, so the python
# parser engine is requested explicitly (avoids the ParserWarning fallback).
# The MovieLens-1M files are distributed as ISO-8859-1 text and some titles
# contain accented characters that do not decode as UTF-8.
# NOTE(review): drop `encoding` if you work from a copy re-encoded to UTF-8.
movies_df = pd.read_csv(
    './movies.dat',
    sep='::',
    engine='python',
    encoding='latin-1',
    names=['movieId', 'title', 'genres'])

  """Entry point for launching an IPython kernel.


In [5]:
# Cast with the builtin int: np.int was only an alias for it and has been
# removed from NumPy (1.24+).
movies_df['movieId'] = movies_df['movieId'].astype(int)

In [6]:
# Load the ratings table. '::' is a multi-character separator, so the python
# parser engine is requested explicitly (avoids the ParserWarning fallback).
# The rating column is named 'ratting' throughout this notebook and in the
# exported CSV files, so the spelling is kept as is.
ratings_df = pd.read_csv(
    './ratings.dat',
    sep='::',
    engine='python',
    names=['userId', 'movieId', 'ratting', 'timestamp'])

  """Entry point for launching an IPython kernel.


In [7]:
# Cast the id columns with the builtin int: np.int was only an alias for it
# and has been removed from NumPy (1.24+).
ratings_df['userId'] = ratings_df['userId'].astype(int)
ratings_df['movieId'] = ratings_df['movieId'].astype(int)

In [8]:
# Preview the first rows of the user table.
users_df.head()

Unnamed: 0,userId,gender,age,occupation,zipCode
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


In [9]:
# Preview the first rows of the movie table.
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [10]:
# Preview the first rows of the ratings table.
ratings_df.head()

Unnamed: 0,userId,movieId,ratting,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [11]:
# Number of distinct users (6040 in MovieLens-1M).

len(users_df['userId'].unique())

6040

In [12]:
# Number of distinct movies (3883 in MovieLens-1M).

len(movies_df['movieId'].unique())

3883

In [13]:
# Smallest number of ratings given by any single user; every user has rated
# at least 20 movies.

ratings_df.groupby('userId')['movieId'].count().min()

20

# Separate train/test by userIds

In [14]:
from sklearn.model_selection import train_test_split

In [15]:
# Split at the user level so that no user appears in both sets: 20% of the
# user ids are held out for testing. The fixed seed keeps the split
# reproducible.
trainUserIds, testUserIds = train_test_split(
    users_df['userId'].unique(),
    test_size=0.2,
    random_state=42,
)

In [16]:
# Route each rating to the split its user belongs to, then order every split
# chronologically within each user (both sort keys ascending).
train_ratings_df = (
    ratings_df.loc[ratings_df['userId'].isin(trainUserIds)]
    .sort_values(by=['userId', 'timestamp'])
)
test_ratings_df = (
    ratings_df.loc[ratings_df['userId'].isin(testUserIds)]
    .sort_values(by=['userId', 'timestamp'])
)

# prepare history movieIds

In [17]:
def mark_last_timestamp(df):
    """Flag each user's final row with ``last == 1`` and all other rows with 0.

    "Final" means the last row of each ``userId`` group in the frame's current
    row order, so the caller is expected to have sorted by timestamp first.

    The flag is derived from row position rather than from a merge on
    ``(userId, movieId)``: such a merge flags every row that shares the final
    row's movieId for that user, which yields more than one "last" row when a
    user has several rows for the same movie.

    Args:
        df: ratings frame containing a ``userId`` column.

    Returns:
        A new frame with a fresh 0..n-1 index and an added float ``last``
        column. The input frame is not modified.
    """
    df = df.reset_index(drop=True)
    # keep='last' reports every occurrence of a userId except the final one as
    # a duplicate, so the negation selects exactly one row per user.
    is_last = ~df.duplicated(subset='userId', keep='last')
    df['last'] = is_last.astype(float)

    return df

In [18]:
# Add the 'last' flag column to both splits (1 on each user's final row).
train_ratings_df = mark_last_timestamp(train_ratings_df)
test_ratings_df = mark_last_timestamp(test_ratings_df)

In [19]:
def get_hist_movie_ids(df, max_len=10, min_rating=4):
    """Build, per user, a '|'-joined string of recently liked movie ids.

    For every user the rows with ``ratting >= min_rating`` and ``last == 0``
    are kept, their order is reversed (so the latest row in the frame comes
    first) and the first ``max_len`` movie ids are joined with '|'.

    Args:
        df: ratings frame with ``userId``, ``movieId``, ``ratting`` and
            ``last`` columns; rows are taken in their current order.
        max_len: maximum number of movie ids kept per user.
        min_rating: lowest rating that counts as "highly rated".

    Returns:
        A list with one string per user, ordered by ascending ``userId``
        (the group order of ``groupby``). A user without qualifying rows
        contributes an empty string.
    """
    hist_movie_ids = []
    for _, user_rows in df.groupby(by='userId'):
        # Select the qualifying rows with a boolean mask instead of visiting
        # every row through iterrows().
        keep = (user_rows['ratting'] >= min_rating) & (user_rows['last'] == 0)
        liked = user_rows.loc[keep, 'movieId'].tolist()
        recent_first = liked[::-1][:max_len]
        hist_movie_ids.append('|'.join(str(int(m)) for m in recent_first))
    return hist_movie_ids

In [20]:
# Build one history string per user for each split (default max_len of 10).
train_hist_movie_ids = get_hist_movie_ids(train_ratings_df)
test_hist_movie_ids = get_hist_movie_ids(test_ratings_df)

In [21]:
# Keep only each user's final rating and attach that user's history string.
# The histories are listed by ascending userId and the frames are sorted by
# userId, so the i-th remaining row matches the i-th history.
# .assign builds a new frame, which avoids the SettingWithCopyWarning raised
# when a column is written to a filtered slice.
train_ratings_df = train_ratings_df[train_ratings_df['last'] == 1].assign(
    histHighRatedMovieIds=train_hist_movie_ids)
test_ratings_df = test_ratings_df[test_ratings_df['last'] == 1].assign(
    histHighRatedMovieIds=test_hist_movie_ids)

In [22]:
# Preview the training rows with their attached history column.
train_ratings_df.head()

Unnamed: 0,userId,movieId,ratting,timestamp,last,histHighRatedMovieIds
52,1,48,5,978824351,1.0,1907|1566|783|2294|2355|1|588|595|527|1545
181,2,1917,3,978300174,1.0,1544|2002|736|368|648|1370|2353|1527|349|3418
232,3,2081,4,978298504,1.0,104|3552|2355|1259|1079|1304|1394|3421|1136|2735
253,4,1954,5,978294282,1.0,1036|2951|1240|3702|3418|1214|2947|2692|1201|2366
451,5,288,2,978246585,1.0,2029|551|2427|1897|1921|1580|377|506|1715|515


In [23]:
# Preview the test rows with their attached history column.
test_ratings_df.head()

Unnamed: 0,userId,movieId,ratting,timestamp,last,histHighRatedMovieIds
105,9,2294,4,978226678,1.0,2278|1653|16|524|349|529|2268|2302|480|590
306,15,3510,5,978361393,1.0,2115|3461|1210|260|1196|3489|1597|1422|648|1892
341,16,2701,2,978174795,1.0,2724|2394|2581|2761|3175|2369|2975|2355|2699|2683
646,18,1683,5,978157434,1.0,26|838|3448|261|3723|249|2573|475|1721|1643
782,24,2657,4,986157459,1.0,1959|2757|425|1635|1193|858|592|2640|3699|1073


# merge with other features

In [24]:
# Enrich both splits with user attributes first and movie attributes second.
# Inner joins are used, so a row survives only if its userId and movieId are
# present in the respective lookup table.
train_ratings_df = (
    train_ratings_df
    .merge(users_df, how='inner', on='userId')
    .merge(movies_df, how='inner', on='movieId')
)
test_ratings_df = (
    test_ratings_df
    .merge(users_df, how='inner', on='userId')
    .merge(movies_df, how='inner', on='movieId')
)

# create label

In [25]:
# Binary target: 1 when the rating is at least 4, otherwise 0.
train_ratings_df['label'] = (train_ratings_df['ratting'] >= 4).astype('int64')
test_ratings_df['label'] = (test_ratings_df['ratting'] >= 4).astype('int64')

In [26]:
# Write both splits as CSV files next to the notebook; index=False leaves the
# DataFrame row index out of the files.
train_ratings_df.to_csv('./train.csv', index=False)
test_ratings_df.to_csv('./test.csv', index=False)

In [27]:
# Preview the final training table as exported.
train_ratings_df.head()

Unnamed: 0,userId,movieId,ratting,timestamp,last,histHighRatedMovieIds,gender,age,occupation,zipCode,title,genres,label
0,1,48,5,978824351,1.0,1907|1566|783|2294|2355|1|588|595|527|1545,F,1,10,48067,Pocahontas (1995),Animation|Children's|Musical|Romance,1
1,2030,48,3,977809337,1.0,2138|3751|3429|3524|1101|1013|3345|2099|1934|3928,M,25,4,77345,Pocahontas (1995),Animation|Children's|Musical|Romance,0
2,4877,48,2,962767153,1.0,3615|3745|2394|3034|2033|364|616|2081|1032|2078,M,25,4,94703,Pocahontas (1995),Animation|Children's|Musical|Romance,0
3,2,1917,3,978300174,1.0,1544|2002|736|368|648|1370|2353|1527|349|3418,M,56,16,70072,Armageddon (1998),Action|Adventure|Sci-Fi|Thriller,0
4,5122,1917,5,962213528,1.0,1580|2428|1573|788|32|2571|1584|2907|2858|3078,M,25,0,20009,Armageddon (1998),Action|Adventure|Sci-Fi|Thriller,1


In [28]:
# Preview the final test table as exported.
test_ratings_df.head()

Unnamed: 0,userId,movieId,ratting,timestamp,last,histHighRatedMovieIds,gender,age,occupation,zipCode,title,genres,label
0,9,2294,4,978226678,1.0,2278|1653|16|524|349|529|2268|2302|480|590,M,25,17,61614,Antz (1998),Animation|Children's,1
1,15,3510,5,978361393,1.0,2115|3461|1210|260|1196|3489|1597|1422|648|1892,M,25,7,22903,Frequency (2000),Drama|Thriller,1
2,2165,3510,5,974982139,1.0,3148|3147|104|3210|1266|16|3552|2416|2959|1095,M,25,7,32836,Frequency (2000),Drama|Thriller,1
3,2319,3510,4,974476547,1.0,3148|3481|3408|1096|1961|2716|1302|2243|1240|2987,M,35,7,48104,Frequency (2000),Drama|Thriller,1
4,16,2701,2,978174795,1.0,2724|2394|2581|2761|3175|2369|2975|2355|2699|2683,F,35,0,20670,Wild Wild West (1999),Action|Sci-Fi|Western,0
