In [None]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.datasets import dump_svmlight_file

import boto3
import sagemaker.amazon.common as smac

In [None]:
# Load movies and parse genres

In [None]:
!wget http://files.grouplens.org/datasets/movielens/ml-latest-small.zip

In [None]:
ls

In [None]:
!unzip ml-latest-small.zip

In [None]:
# Read the movie catalogue from the extracted archive into a DataFrame.
# Later cells rely on its 'genres' and 'movieId' columns.
df_movies = pd.read_csv(r'ml-latest-small/movies.csv')

In [None]:
df_movies.shape

In [None]:
df_movies.head()

In [None]:
# Split each movie's pipe-delimited genres string into a list of genre names.
genre_list = df_movies['genres'].map(lambda genres: genres.split('|'))

In [None]:
genre_list[:10]

In [None]:
def get_unique_genres(genre_list):
    """Return every distinct genre found in ``genre_list`` as a sorted list.

    Parameters
    ----------
    genre_list : iterable of iterables
        One inner iterable of genre names per movie.

    Returns
    -------
    list
        The distinct genre names in ascending sort order.
    """
    # Flatten and de-duplicate in a single pass, then sort for a stable column order.
    return sorted({name for movie_genres in genre_list for name in movie_genres})

In [None]:
genre = get_unique_genres(genre_list)

In [None]:
genre, len(genre)

In [None]:
# Table of genre flags for each movie: one row per movie, one column per genre.
# Initialise with integer zeros directly instead of an all-NaN object frame, so the
# flag columns have an integer dtype without relying on fillna() downcasting
# (deprecated in recent pandas).
df_genre = pd.DataFrame(0, index=range(df_movies.shape[0]), columns=genre)

In [None]:
# Replace any missing entries with 0 and pin an integer dtype explicitly, so the
# flag table does not depend on how the installed pandas downcasts object columns.
df_genre = df_genre.fillna(0).astype('int64')

In [None]:
df_genre.shape

In [None]:
df_genre.head()

In [None]:
genre_list [:11]

In [None]:
# Fill genre for each movie: set the flag to 1 in every genre column listed for the row.
# Use a single .loc[row, columns] assignment. The chained form df.loc[row][cols] = 1
# writes to a temporary row object and is a silent no-op under pandas Copy-on-Write.
for row, movie_genre in enumerate(genre_list):
    df_genre.loc[row, movie_genre] = 1

In [None]:
df_genre.head()

In [None]:
# Some movies don't have genre listed
df_genre[df_genre['(no genres listed)'] > 0].head()

In [None]:
# Merge with movie description
# Drop genre columns left over from a previous run of this cell first; otherwise a
# re-run fails because join() refuses overlapping column names.
df_movies = df_movies.drop(columns=genre, errors='ignore').join(df_genre)

In [None]:
df_movies.head()

In [None]:
df_movies.to_csv(r'ml-latest-small/movies_genre.csv', index=False)

In [None]:
# Ratings given by each user for a movie

In [None]:
df_ratings = pd.read_csv(r'ml-latest-small/ratings.csv')

In [None]:
df_ratings.head()

In [None]:
df_ratings.userId.unique().shape

In [None]:
df_ratings.movieId.unique().shape

In [None]:
# The timestamp column is not used downstream. Reassign instead of mutating in place,
# and ignore a missing column so re-running this cell does not raise KeyError.
df_ratings = df_ratings.drop(columns=['timestamp'], errors='ignore')

In [None]:
# Attach the movie description (title, genres, genre flags) to every rating row,
# matching on movieId.
df_movie_ratings = df_ratings.merge(df_movies, on='movieId')

In [None]:
df_movie_ratings.head(2)

In [None]:
df_movie_ratings.tail(2)

In [None]:
# Training   = 70% of the data
# Validation = 30% of the data
# Randomize the dataset with a fixed seed so the split is reproducible.
np.random.seed(5)
l = df_movie_ratings.index.tolist()
np.random.shuffle(l)
# The shuffled index labels are used as positions; this relies on the default
# 0..n-1 index of the merged frame.
df = df_movie_ratings.iloc[l]

In [None]:
# Row counts for the split: the first `train` rows are used for training,
# the remaining `test` rows for testing.
rows = len(df)
train = int(rows * .7)
test = rows - train

In [None]:
rows,train,test

In [None]:
df.shape

In [None]:
df.head(2)

In [None]:
# SageMaker Factorization Machine expects all columns to be of float32

# Target vector: the ratings as a 1-D float32 NumPy array.
# to_numpy() replaces Series.ravel(), which is deprecated in recent pandas.
y = df['rating'].to_numpy(dtype=np.float32)

In [None]:
len(y)

In [None]:
y.dtype

In [None]:
# Column sets for two different training datasets.
# Training 1: rating, user id, movie id
# Training 2: rating, user id, movie id, and movie genre attributes
columns_user_movie = ['userId', 'movieId']
columns_all = [*columns_user_movie, *genre]

In [None]:
columns_user_movie

In [None]:
columns_all

In [None]:
# Keep a plain CSV copy of rating, user id and movie id for both partitions:
# the first `train` rows go to the train file, the remainder to the test file.
df.iloc[:train][['rating', 'userId', 'movieId']].to_csv(r'ml-latest-small/user_movie_train.csv', index=False)
df.iloc[train:][['rating', 'userId', 'movieId']].to_csv(r'ml-latest-small/user_movie_test.csv', index=False)

In [None]:
# One Hot Encode
# Training 1: user id, movie id
# Training 2: user id, movie id, and movie genre attributes
# dtype=np.float32 makes the encoder emit float32 values.
encoder = preprocessing.OneHotEncoder(dtype=np.float32)

In [None]:
# Fit the encoder on the userId/movieId columns of the whole shuffled frame
# (training and test rows together) and encode them in the same call.
X = encoder.fit_transform(df[columns_user_movie])

In [None]:
df.userId.unique().shape, df.movieId.unique().shape

In [None]:
# Total number of distinct users plus distinct movies, saved as text to a file.
dim_movie = len(df['userId'].unique()) + len(df['movieId'].unique())
with open(r'ml-latest-small/movie_dimension.txt', 'w') as dimension_file:
    dimension_file.write(f'{dim_movie}')

In [None]:
X

In [None]:
X.shape[1]

In [None]:
# Create a sparse matrix recordio file
def write_sparse_recordio_file(filename, x, y=None):
    """Write the sparse matrix ``x`` (and optional labels ``y``) to ``filename``.

    The file is opened in binary write mode and handed, together with ``x`` and
    ``y``, to ``smac.write_spmatrix_to_sparse_tensor``, which does the serialization.

    filename: path of the output file (created or truncated).
    x: sparse feature matrix to serialize.
    y: optional labels passed through unchanged; defaults to None.
    """
    with open(filename, 'wb') as recordio_file:
        smac.write_spmatrix_to_sparse_tensor(recordio_file, x, y)

In [None]:
# Training recordIO file
write_sparse_recordio_file(r'ml-latest-small/user_movie_train.recordio',X[:train],y[:train])

In [None]:
# Test recordIO file
write_sparse_recordio_file(r'ml-latest-small/user_movie_test.recordio',X[train:],y[train:])

In [None]:
# Create libSVM formatted file. 
# Output stored as rating, user_index:value, movie_index:value
# Store in libSVM format for directly testing with libFM
dump_svmlight_file(X[:train],y[:train],r'ml-latest-small/user_movie_train.svm')
dump_svmlight_file(X[train:],y[train:],r'ml-latest-small/user_movie_test.svm')

In [None]:
# Create two lookup files
# File 1: Categorical Movie ID and corresponding Movie Index in One Hot Encoded Table
# File 2: Categorical User ID and corresponding User Index in One Hot Encoded Table
list_of_movies = df['movieId'].unique()
# Pair every movie with the fixed userId 1.
# NOTE(review): assumes userId 1 occurs in the data the encoder was fitted on - confirm.
df_user_movie = pd.DataFrame({
    'userId': np.ones(len(list_of_movies), dtype=int),
    'movieId': list_of_movies,
})

In [None]:
df_user_movie[columns_user_movie].head()

In [None]:
list_of_movies

In [None]:
# Transform to one hot encoding (with existing encoder)
# NOTE: this rebinds X, which previously held the encoded training/test matrix;
# re-running the earlier recordIO/libSVM cells after this one would use the lookup matrix.
X = encoder.transform(df_user_movie[columns_user_movie])

In [None]:
# Store movieId and corresponding one hot encoded entries
dump_svmlight_file(X,list_of_movies,r'ml-latest-small/one_hot_enc_movies.svm')

In [None]:
# File 2: Categorical User ID and corresponding User Index in One Hot Encoded Table
list_of_users = df.userId.unique()

In [None]:
list_of_users.shape

In [None]:
list_of_users[:10]

In [None]:
# Pair every user with the fixed movieId 1.
# NOTE(review): assumes movieId 1 occurs in the data the encoder was fitted on - confirm.
df_user_movie = pd.DataFrame({
    'userId': list_of_users,
    'movieId': np.ones(len(list_of_users), dtype=int),
})

In [None]:
df_user_movie.head()

In [None]:
# Transform to one hot encoding (with existing encoder)
# NOTE: this rebinds X again, replacing the movie lookup matrix with the user lookup matrix.
X = encoder.transform(df_user_movie[columns_user_movie])

In [None]:
# Store userId (written as the label) and corresponding one hot encoded entries
dump_svmlight_file(X,list_of_users,r'ml-latest-small/one_hot_enc_users.svm')