In [None]:
# Uncomment and run if scikit-surprise is not installed in this kernel's environment
#%pip install scikit-surprise

In [8]:
# Import libraries

# Standard library (gc: garbage collector, pickle: model serialization)
import gc
import pickle

# Third-party
import numpy as np
import pandas as pd

from surprise import SVD
from surprise import Reader, Dataset

In [9]:
# File names of the two input datasets (resolved relative to the working directory)
TRAIN_CSV = "train.csv"
TEST_CSV = "test.csv"

# Read the labelled ratings and the user/movie pairs that need a prediction
train = pd.read_csv(TRAIN_CSV)
test = pd.read_csv(TEST_CSV)

In [10]:
# Preview the training data: display its first five rows
train.head(n=5)

Unnamed: 0,userId,movieId,rating,timestamp
0,5163,57669,4.0,1518349992
1,106343,5,4.5,1206238739
2,146790,5459,5.0,1076215539
3,106362,32296,2.0,1423042565
4,9041,366,3.0,833375837


In [11]:
# Preview the test data: display its first five rows
test.head(n=5)

Unnamed: 0,userId,movieId
0,1,2011
1,1,4144
2,1,5767
3,1,6711
4,1,7318


In [12]:
# Order both frames by userId, in place, so that each user's rows sit together
for frame in (train, test):
    frame.sort_values(by='userId', inplace=True)

In [13]:
# Keep only the columns surprise needs, in the order it requires:
# user id, item id, rating. This leaves out the unused 'timestamp' column.
ratings = train[['userId', 'movieId', 'rating']]

# Take the rating bounds from the data instead of relying on Reader's default
# scale of (1, 5). Predictions are clipped to this scale, and the ratings here
# use half-star steps (e.g. 4.5), so any rating below 1 in the data could never
# be predicted with the default.
rating_scale = (float(ratings['rating'].min()), float(ratings['rating'].max()))

# Instantiate a Reader object from the surprise package with the observed scale
reader = Reader(rating_scale=rating_scale)

# Build the surprise dataset object from the ratings table and the reader
data = Dataset.load_from_df(ratings, reader)

In [14]:
# Turn the whole dataset into a surprise trainset (every rating is used for training)
trainset = data.build_full_trainset()

# Configure the SVD model: 1800 latent factors, 40 training epochs,
# factor initialisation std-dev of 0.005 and a fixed seed for repeatable runs
svd = SVD(
    n_factors=1800,
    n_epochs=40,
    init_std_dev=0.005,
    random_state=25,
    verbose=1,
)

# Fit the model; with verbose on, one progress line is printed per epoch
svd.fit(trainset)

Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 10
Processing epoch 11
Processing epoch 12
Processing epoch 13
Processing epoch 14
Processing epoch 15
Processing epoch 16
Processing epoch 17
Processing epoch 18
Processing epoch 19
Processing epoch 20
Processing epoch 21
Processing epoch 22
Processing epoch 23
Processing epoch 24
Processing epoch 25
Processing epoch 26
Processing epoch 27
Processing epoch 28
Processing epoch 29
Processing epoch 30
Processing epoch 31
Processing epoch 32
Processing epoch 33
Processing epoch 34
Processing epoch 35
Processing epoch 36
Processing epoch 37
Processing epoch 38
Processing epoch 39


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x138b925d0>

In [15]:
# Persist the fitted model so it can be reused without retraining.
# `pickle` is imported in the notebook's import cell with the other libraries.

# Define the file path where the model will be saved
model_save_path = "svd_model.pkl"

# Open the file in write-binary mode; the context manager closes it even if
# serialization fails part-way through
with open(model_save_path, 'wb') as file:

    # Use pickle to serialize and save the 'svd' model object into the file
    pickle.dump(svd, file)

In [16]:
# Location on disk of the pickled model
model_load_path = "svd_model.pkl"

# Read the model back in binary mode and rebuild the Python object.
# pickle.load can execute arbitrary code, so only load files this notebook wrote itself.
# NOTE(review): the prediction cell further down calls `svd`, not `unpickled_model`.
with open(model_load_path, mode='rb') as model_file:
    unpickled_model = pickle.load(model_file)

In [17]:
# Pull the user ids and movie ids out of the test frame as plain Python lists
userId = test["userId"].tolist()
movieId = test["movieId"].tolist()

# Total number of user-movie pairs to score
count = len(userId)

# Predicted rating for every pair, in test order. `est` is the estimated
# rating, i.e. the element at index 3 of the Prediction returned by `svd.predict`.
rating = [svd.predict(uid, iid).est for uid, iid in zip(userId, movieId)]

# Matching identifiers for every pair, formatted as 'userId_movieId'
ids = [f"{uid}_{iid}" for uid, iid in zip(userId, movieId)]


In [18]:
# Assemble the submission table: one row per test pair with its predicted rating
submission_columns = {'Id': ids, 'rating': rating}
test_sub_df = pd.DataFrame(submission_columns)

# Preview the first 20 rows
test_sub_df.head(n=20)

Unnamed: 0,Id,rating
0,1_2011,3.19584
1,1_4144,4.229688
2,1_5767,3.719302
3,1_6711,4.025117
4,1_7318,2.890713
5,1_8405,3.907422
6,1_8786,3.957115
7,2_3994,3.849667
8,2_4103,3.342066
9,2_4963,4.117717


In [19]:
# Show (rows, columns) of the submission frame: one row per predicted
# user-movie pair and two columns ('Id' and 'rating')
test_sub_df.shape

(5000019, 2)

In [20]:
# Write the submission to CSV, leaving out the DataFrame index column
submission_path = 'Kaggle_submission_updated_final.csv'
test_sub_df.to_csv(submission_path, index=False)