In [4]:
!wget 'https://files.grouplens.org/datasets/movielens/ml-100k.zip'
!unzip ml-100k.zip

--2023-03-31 18:12:23--  https://files.grouplens.org/datasets/movielens/ml-100k.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4924029 (4.7M) [application/zip]
Saving to: ‘ml-100k.zip’


2023-03-31 18:12:23 (27.4 MB/s) - ‘ml-100k.zip’ saved [4924029/4924029]



In [17]:
import pandas as pd
import os

In [31]:
# u.data is a headerless, tab-separated file: one (user, movie, rating, timestamp) row per rating.
ratings_path = os.path.join('ml-100k', 'u.data')
rating_columns = ['user', 'movie', 'rating', 'timestamp']
df = pd.read_csv(ratings_path, sep="\t", names=rating_columns)
print(df.head())

# The timestamp column is dropped here; only user, movie and rating are kept.
df = df.drop(columns=['timestamp'])
print("\n*** After dropping timestamp ***\n")
print(df.head())

   user  movie  rating  timestamp
0   196    242       3  881250949
1   186    302       3  891717742
2    22    377       1  878887116
3   244     51       2  880606923
4   166    346       1  886397596

*** After dropping timestamp ***

   user  movie  rating
0   196    242       3
1   186    302       3
2    22    377       1
3   244     51       2
4   166    346       1


In [48]:
conda install -c conda-forge/label/cf202003 scikit-surprise

Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.

Retrieving notices: ...working... done

Note: you may need to restart the kernel to use updated packages.


In [64]:
import surprise
from surprise import SVD, NMF, NormalPredictor, KNNBasic
import boto3

In [65]:
# Wrap the ratings DataFrame in a Surprise Dataset (ratings are on a 1-5 scale).
reader = surprise.Reader(rating_scale=(1, 5.0))
data = surprise.Dataset.load_from_df(df[['user', 'movie', 'rating']], reader)

# Hold out 20% of the ratings for testing; a fixed seed keeps the split reproducible.
train_data, test_data = surprise.model_selection.train_test_split(
    data, test_size=0.2, random_state=42
)

# Trainset.all_ratings() yields (inner_uid, inner_iid, rating) using Surprise's
# internal re-indexed ids. Map them back to the raw MovieLens ids so the exported
# training file uses the same id space as the test file (whose tuples hold raw ids).
train_df = pd.DataFrame(
    [
        (train_data.to_raw_uid(inner_uid), train_data.to_raw_iid(inner_iid), rating)
        for inner_uid, inner_iid, rating in train_data.all_ratings()
    ],
    columns=["USER_ID", "ITEM_ID", "RATING"],
)
# The test set is already a list of (raw_uid, raw_iid, rating) tuples.
test_df = pd.DataFrame(test_data, columns=["USER_ID", "ITEM_ID", "RATING"])

In [67]:
# Write the train/test splits to local CSV files and upload both to S3
# under the key prefix "<data_folder>/".
data_folder = 'data'
train_file = 'movie_train'
test_file = 'movie_test'
bucket = "ENTER-YOUR-S3-BUCKET"  # ENTER YOUR BUCKET FROM S3
if bucket == "ENTER-YOUR-S3-BUCKET":
    # Fail early with a clear message instead of an opaque boto3 error.
    raise ValueError("Set `bucket` to the name of your S3 bucket before running this cell.")

# exist_ok=True keeps the cell re-runnable (a plain `mkdir` fails when the folder exists).
os.makedirs(data_folder, exist_ok=True)

# The training file is written with a header row, the test file without one.
train_df.to_csv(os.path.join(data_folder, train_file), index=False)
test_df.to_csv(os.path.join(data_folder, test_file), index=False, header=False)

# Upload each local file to s3://<bucket>/<data_folder>/<file_name>.
s3_bucket = boto3.Session().resource("s3").Bucket(bucket)
for file_name in (train_file, test_file):
    s3_bucket.Object(
        "{}/{}".format(data_folder, file_name)
    ).upload_file(os.path.join(data_folder, file_name))

In [69]:
from surprise.model_selection import cross_validate
from surprise import NormalPredictor
from surprise import KNNBasic
from surprise import SVD
from surprise import NMF
# Compare several Surprise algorithms with 3-fold cross-validation on RMSE.
# `benchmark` collects one pandas Series per algorithm holding the fold-averaged
# test_rmse, fit_time and test_time, plus the algorithm's class name.
benchmark = []
for algorithm in [SVD(), NMF(), NormalPredictor(), KNNBasic()]:
    # Perform cross validation
    results = cross_validate(algorithm, data, measures=['RMSE'], cv=3, verbose=False)

    # Average every metric over the folds
    fold_means = pd.DataFrame.from_dict(results).mean(axis=0)

    # Series.append was removed in pandas 2.0; pd.concat is the supported equivalent.
    algorithm_label = pd.Series([type(algorithm).__name__], index=['Algorithm'])
    benchmark.append(pd.concat([fold_means, algorithm_label]))


Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.


In [70]:
# Display the list of per-algorithm summary Series built by the benchmark loop.
benchmark

[test_rmse    0.945386
 fit_time     3.640697
 test_time    0.287278
 Algorithm         SVD
 dtype: object,
 test_rmse    0.974654
 fit_time     4.072534
 test_time    0.314665
 Algorithm         NMF
 dtype: object,
 test_rmse           1.520267
 fit_time            0.114461
 test_time           0.305803
 Algorithm    NormalPredictor
 dtype: object,
 test_rmse    0.989476
 fit_time     0.349521
 test_time    4.486542
 Algorithm    KNNBasic
 dtype: object]

# Model Training and Deployment
Note: This section is heavily inspired by https://github.com/apac-ml-tfc/recommender-workshop

Check it out for more details, though some of the implementation there is slightly out of date.

In [None]:
from sagemaker.sklearn.estimator import SKLearn as SMSKLearnEstimator
import sagemaker
# Train the Surprise model as a SageMaker scikit-learn training job.
script_path = "surprise_trainer.py"
source_dir = "./"
role = sagemaker.get_execution_role()
sagemaker_session = sagemaker.Session()
# NOTE: `bucket` is deliberately not reassigned here. It must be the bucket set in the
# upload cell, so training reads the data from where it was uploaded and no
# account-specific bucket name is hard-coded in the notebook.
estimator = SMSKLearnEstimator(
    framework_version='0.20.0',
    py_version='py3',
    entry_point=script_path,
    source_dir=source_dir,
    instance_type="ml.c4.xlarge",
    role=role,
    sagemaker_session=sagemaker_session,
    output_path=f"s3://{bucket}/output",

    # training on spot instances is an easy way to save cost:
    use_spot_instances=True,
    max_run=60*5, # 5 mins max actual run time
    max_wait=60*10 # 10 mins max wait for spot interruptions
)

# Instead of just specifying the training channel as an S3 path string, we can use TrainingInput to get more control:
train_channel = sagemaker.inputs.TrainingInput(
    f"s3://{bucket}/data/movie_train",
    distribution="FullyReplicated",
    content_type="text/csv",
    s3_data_type="S3Prefix"
)

# This will block until training is complete, showing console output below:
estimator.fit({ "train": train_channel })

In [None]:
# Deploy the trained estimator to an endpoint backed by a single instance.
endpoint_instance_count = 1
endpoint_instance_type = "ml.m4.xlarge"
deployed_model = estimator.deploy(
    initial_instance_count=endpoint_instance_count,
    instance_type=endpoint_instance_type,
)

In [None]:
# Query the deployed endpoint for one (user, item) pair and print the returned rating.
test = (50, 100)
# NOTE(review): index 3 of the first result is presumably the estimated rating;
# confirm against the output format of surprise_trainer.py.
predicted_rating = deployed_model.predict(test)[0][3]
print(f"Predicting for User {test[0]} Item {test[1]}, Rating: {predicted_rating}")