# Script to create train/test dataset.
### Please download movielens-1M data to script folder.

https://grouplens.org/datasets/movielens/1m/

In [1]:
%matplotlib inline

import numpy as np
import pandas as pd

In [2]:
users_df = pd.read_csv('./users.dat', sep='::', names=['userId', 'gender', 'age', 'occupation', 'zipCode'])

  """Entry point for launching an IPython kernel.


In [3]:
users_df['userId'] = users_df['userId'].astype(np.int)
users_df['age'] = users_df['age'].astype(np.int)
users_df['occupation'] = users_df['occupation'].astype(np.int)
users_df['zipCode'] = users_df['zipCode'].astype(str)

In [4]:
movies_df = pd.read_csv('./movies.dat', sep='::', names=['movieId', 'title', 'genres'])

  """Entry point for launching an IPython kernel.


In [5]:
movies_df['movieId'] = movies_df['movieId'].astype(np.int)

In [6]:
ratings_df = pd.read_csv('./ratings.dat', sep='::', names=['userId', 'movieId', 'ratting', 'timestamp'])

  """Entry point for launching an IPython kernel.


In [7]:
ratings_df['userId'] = ratings_df['userId'].astype(np.int)
ratings_df['movieId'] = ratings_df['movieId'].astype(np.int)

In [8]:
users_df.head()

Unnamed: 0,userId,gender,age,occupation,zipCode
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


In [9]:
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [10]:
ratings_df.head()

Unnamed: 0,userId,movieId,ratting,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [11]:
# there are 6040 users

users_df.userId.unique().shape[0]

6040

In [12]:
# there are 3883 movies

movies_df.movieId.unique().shape[0]

3883

In [13]:
# Users have rated at least 20 movies.

ratings_df.groupby(by='userId', as_index=False).movieId.count().movieId.min()

20

# seperate train/test by userIds

In [14]:
from sklearn.model_selection import train_test_split

In [15]:
trainUserIds, testUserIds = train_test_split(
    users_df.userId.unique(), test_size=0.2, random_state=42)

In [16]:
train_ratings_df = ratings_df[ratings_df.userId.isin(trainUserIds)]
test_ratings_df = ratings_df[ratings_df.userId.isin(testUserIds)]

train_ratings_df = train_ratings_df.sort_values(
    by=['userId', 'timestamp'], ascending=[True, True])
test_ratings_df = test_ratings_df.sort_values(
    by=['userId', 'timestamp'], ascending=[True, True])

# prepare history movieIds

In [17]:
def mark_last_timestamp(df):
    last = df[['userId', 'movieId']].groupby(
        by='userId', as_index=False).tail(1).copy()
    last['last'] = 1
    df = pd.merge(
        df, last, how='left',
        on=['userId', 'movieId'])
    df.loc[~df['last'].isnull(), 'last'] = 1
    df.loc[df['last'].isnull(), 'last'] = 0
    
    return df

In [18]:
train_ratings_df = mark_last_timestamp(train_ratings_df)
test_ratings_df = mark_last_timestamp(test_ratings_df)

In [19]:
candidate_movie_ids = movies_df.movieId.values

In [20]:
# slow implement
def neg_sampling(candidates, filters, length):
    max_len = len(candidates)
    
    res = []
    for i in range(length):
        while(1):
            c = candidates[np.random.randint(0, max_len)]
            if c not in filters:
                res.append(str(c))
                filters.add(c)
                break
    return res

In [21]:
def get_hist_movie_ids(df, max_len=10):
    hist_movie_ids = list()
    neg_hist_movie_ids = list()
    for _, group in df.groupby(by='userId'):
        tmp_hist_movie_ids = list()
        for _, row in group.iterrows():
            # keep high rated movies
            if row['ratting'] >= 4 and row['last'] == 0:
                tmp_hist_movie_ids.append(str(int(row['movieId'])))
        # keep latest high rated movies
        tmp_hist_movie_ids.reverse()
        tmp_hist_movie_ids = tmp_hist_movie_ids[:max_len]
        # revert to timestamp order
        tmp_hist_movie_ids.reverse()
        tmp_neg_hist_movie_ids = neg_sampling(
            candidate_movie_ids, set(hist_movie_ids), len(tmp_hist_movie_ids))
        hist_movie_ids.append('|'.join(tmp_hist_movie_ids))
        neg_hist_movie_ids.append('|'.join(tmp_neg_hist_movie_ids))
    return hist_movie_ids, neg_hist_movie_ids

In [22]:
train_hist_movie_ids, train_neg_hist_movie_ids = get_hist_movie_ids(train_ratings_df)
test_hist_movie_ids, test_neg_hist_movie_ids = get_hist_movie_ids(test_ratings_df)

In [23]:
train_ratings_df = train_ratings_df[train_ratings_df['last'] == 1]
train_ratings_df['histHighRatedMovieIds'] = train_hist_movie_ids
train_ratings_df['negHistMovieIds'] = train_neg_hist_movie_ids

test_ratings_df = test_ratings_df[test_ratings_df['last'] == 1]
test_ratings_df['histHighRatedMovieIds'] = test_hist_movie_ids
test_ratings_df['negHistMovieIds'] = test_neg_hist_movie_ids

In [24]:
train_ratings_df.head()

Unnamed: 0,userId,movieId,ratting,timestamp,last,histHighRatedMovieIds,negHistMovieIds
52,1,48,5,978824351,1.0,1545|527|595|588|1|2355|2294|783|1566|1907,393|2562|3289|3442|2808|2259|295|1190|3321|3470
181,2,1917,3,978300174,1.0,3418|349|1527|2353|1370|648|368|736|2002|1544,2117|2560|3480|3678|2798|1222|3872|3503|3913|3494
232,3,2081,4,978298504,1.0,2735|1136|3421|1394|1304|1079|1259|2355|3552|104,2481|3945|3629|3056|1756|3725|1519|1425|307|37
253,4,1954,5,978294282,1.0,2366|1201|2692|2947|1214|3418|3702|1240|2951|1036,1719|2853|3132|1164|1097|931|1558|1363|492|3406
451,5,288,2,978246585,1.0,515|1715|506|377|1580|1921|1897|2427|551|2029,1927|940|2153|865|957|3219|1834|2602|3770|3843


In [25]:
test_ratings_df.head()

Unnamed: 0,userId,movieId,ratting,timestamp,last,histHighRatedMovieIds,negHistMovieIds
105,9,2294,4,978226678,1.0,590|480|2302|2268|529|349|524|16|1653|2278,876|1953|2644|3540|2794|90|3291|406|2282|2406
306,15,3510,5,978361393,1.0,1892|648|1422|1597|3489|1196|260|1210|3461|2115,322|135|1324|3928|2808|3220|2992|3937|652|3671
341,16,2701,2,978174795,1.0,2683|2699|2355|2975|2369|3175|2761|2581|2394|2724,3128|1830|3281|163|1334|585|3580|1776|1392|774
646,18,1683,5,978157434,1.0,1643|1721|475|2573|249|3723|261|3448|838|26,1986|972|2586|2846|1855|2369|568|2144|3329|2000
782,24,2657,4,986157459,1.0,1073|3699|2640|592|858|1193|1635|425|2757|1959,3917|1764|990|2042|1861|3097|3746|2624|3829|3421


# merge with other features

In [26]:
train_ratings_df = pd.merge(train_ratings_df, users_df, how='inner', on='userId')
test_ratings_df = pd.merge(test_ratings_df, users_df, how='inner', on='userId')
train_ratings_df = pd.merge(train_ratings_df, movies_df, how='inner', on='movieId')
test_ratings_df = pd.merge(test_ratings_df, movies_df, how='inner', on='movieId')

# create label

In [27]:
train_ratings_df['label'] = 0
train_ratings_df.loc[train_ratings_df['ratting'] >= 4, 'label'] = 1
test_ratings_df['label'] = 0
test_ratings_df.loc[test_ratings_df['ratting'] >= 4, 'label'] = 1

In [28]:
train_ratings_df.to_csv('./train.csv', index=False)
test_ratings_df.to_csv('./test.csv', index=False)

In [29]:
train_ratings_df.head()

Unnamed: 0,userId,movieId,ratting,timestamp,last,histHighRatedMovieIds,negHistMovieIds,gender,age,occupation,zipCode,title,genres,label
0,1,48,5,978824351,1.0,1545|527|595|588|1|2355|2294|783|1566|1907,393|2562|3289|3442|2808|2259|295|1190|3321|3470,F,1,10,48067,Pocahontas (1995),Animation|Children's|Musical|Romance,1
1,2030,48,3,977809337,1.0,3928|1934|2099|3345|1013|1101|3524|3429|3751|2138,3142|1743|525|1746|3323|935|2183|3556|3343|1542,M,25,4,77345,Pocahontas (1995),Animation|Children's|Musical|Romance,0
2,4877,48,2,962767153,1.0,2078|1032|2081|616|364|2033|3034|2394|3745|3615,2354|747|3478|2638|311|1204|3370|807|29|1380,M,25,4,94703,Pocahontas (1995),Animation|Children's|Musical|Romance,0
3,2,1917,3,978300174,1.0,3418|349|1527|2353|1370|648|368|736|2002|1544,2117|2560|3480|3678|2798|1222|3872|3503|3913|3494,M,56,16,70072,Armageddon (1998),Action|Adventure|Sci-Fi|Thriller,0
4,5122,1917,5,962213528,1.0,3078|2858|2907|1584|2571|32|788|1573|2428|1580,1860|194|742|2379|1966|3397|3195|678|2240|2196,M,25,0,20009,Armageddon (1998),Action|Adventure|Sci-Fi|Thriller,1


In [30]:
test_ratings_df.head()

Unnamed: 0,userId,movieId,ratting,timestamp,last,histHighRatedMovieIds,negHistMovieIds,gender,age,occupation,zipCode,title,genres,label
0,9,2294,4,978226678,1.0,590|480|2302|2268|529|349|524|16|1653|2278,876|1953|2644|3540|2794|90|3291|406|2282|2406,M,25,17,61614,Antz (1998),Animation|Children's,1
1,15,3510,5,978361393,1.0,1892|648|1422|1597|3489|1196|260|1210|3461|2115,322|135|1324|3928|2808|3220|2992|3937|652|3671,M,25,7,22903,Frequency (2000),Drama|Thriller,1
2,2165,3510,5,974982139,1.0,1095|2959|2416|3552|16|1266|3210|104|3147|3148,1936|1248|2241|985|3097|2282|3550|2690|3731|718,M,25,7,32836,Frequency (2000),Drama|Thriller,1
3,2319,3510,4,974476547,1.0,2987|1240|2243|1302|2716|1961|1096|3408|3481|3148,392|2782|1615|3720|781|1511|3225|2273|2655|3798,M,35,7,48104,Frequency (2000),Drama|Thriller,1
4,16,2701,2,978174795,1.0,2683|2699|2355|2975|2369|3175|2761|2581|2394|2724,3128|1830|3281|163|1334|585|3580|1776|1392|774,F,35,0,20670,Wild Wild West (1999),Action|Sci-Fi|Western,0
