In [1]:
# установка пакета surprise
# pip install surprise

In [2]:
# Загрузка библиотек необходимых для работы
from surprise import KNNWithMeans, KNNBasic, KNNWithZScore,SVD
from surprise import Dataset
from surprise import accuracy
from surprise import Reader
from surprise.model_selection import train_test_split
from surprise.model_selection import cross_validate

import pandas as pd

In [3]:
# Загрузка данных
movies = pd.read_csv('/Users/irinanikulina/Documents/ML/RecommendationSystems/movies.csv')
ratings = pd.read_csv('/Users/irinanikulina/Documents/ML/RecommendationSystems/ratings.csv')

In [4]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [5]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [6]:
# Создание нового датафрейма для работы
movies_with_ratings = movies.join(ratings.set_index('movieId'), on='movieId').reset_index(drop=True)
movies_with_ratings.dropna(inplace=True)

In [7]:
movies_with_ratings.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1.0,4.0,964982700.0
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5.0,4.0,847435000.0
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7.0,4.5,1106636000.0
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15.0,2.5,1510578000.0
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,17.0,4.5,1305696000.0


In [8]:
movies_with_ratings[movies_with_ratings.userId == 20.0].title.unique()

array(['Jumanji (1995)', 'Tom and Huck (1995)', 'Balto (1995)',
       'Babe (1995)', 'Pocahontas (1995)',
       'Muppet Treasure Island (1996)', 'Casper (1995)',
       'Goofy Movie, A (1995)', 'Little Princess, A (1995)',
       'Santa Clause, The (1994)', 'Jungle Book, The (1994)',
       'Lion King, The (1994)', 'Secret Garden, The (1993)',
       'Nightmare Before Christmas, The (1993)', 'Home Alone (1990)',
       'Aladdin (1992)', 'Snow White and the Seven Dwarfs (1937)',
       'Beauty and the Beast (1991)', 'Pinocchio (1940)',
       'James and the Giant Peach (1996)', 'Oliver & Company (1988)',
       'Wallace & Gromit: The Best of Aardman Animation (1996)',
       'Hunchback of Notre Dame, The (1996)', 'Harriet the Spy (1996)',
       'Matilda (1996)', "Singin' in the Rain (1952)",
       'My Fair Lady (1964)', 'Wizard of Oz, The (1939)',
       'Fly Away Home (1996)', 'Love Bug, The (1969)',
       'Old Yeller (1957)', 'Parent Trap, The (1961)',
       'Homeward Bound: The

In [9]:
dataset = pd.DataFrame({
    'uid': movies_with_ratings.userId,
    'iid': movies_with_ratings.title,
    'rating': movies_with_ratings.rating
})

In [10]:
dataset.head()

Unnamed: 0,uid,iid,rating
0,1.0,Toy Story (1995),4.0
1,5.0,Toy Story (1995),4.0
2,7.0,Toy Story (1995),4.5
3,15.0,Toy Story (1995),2.5
4,17.0,Toy Story (1995),4.5


In [11]:
# Определение граничных значений рейтинга
dataset.rating.min()

0.5

In [12]:
ratings.rating.max()

5.0

In [13]:
reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(dataset, reader)

In [14]:
# Разбиение данных на тренировочную и тестовую выборки
trainset, testset = train_test_split(data, test_size=.1)

In [15]:
# Использование алгоритма KNNWithMeans
# algo = KNNWithMeans(k=50, sim_options={'name': 'pearson_baseline', 'user_based': True})
algo = SVD(n_factors=46, n_epochs = 20)
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7fa5d83945b0>

In [16]:
test_pred = algo.test(testset)

In [17]:
# Кросс-валидация на 5 фолдах с определением RMSE, MAE
scores = cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8691  0.8700  0.8731  0.8688  0.8681  0.8698  0.0018  
MAE (testset)     0.6675  0.6714  0.6672  0.6677  0.6670  0.6681  0.0016  
Fit time          2.56    2.43    2.46    2.46    2.45    2.47    0.05    
Test time         0.07    0.07    0.07    0.07    0.11    0.08    0.02    


In [18]:
# Определение точности работы алгоритма только по тестовой выборке
accuracy.rmse(test_pred, verbose=True)

RMSE: 0.8691


0.8690815834209011

In [19]:
algo.predict(uid=20, iid='Shawshank Redemption, The (1994)')

Prediction(uid=20, iid='Shawshank Redemption, The (1994)', r_ui=None, est=4.741059898273398, details={'was_impossible': False})

In [20]:
# algo = KNNWithMeans(k=50, sim_options={'name': 'pearson_baseline', 'user_based': False})
algo = SVD(n_factors=46, n_epochs = 20)
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7fa6093c13a0>

In [21]:
test_pred = algo.test(testset)

In [22]:
scores = cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8796  0.8688  0.8729  0.8627  0.8758  0.8720  0.0058  
MAE (testset)     0.6743  0.6667  0.6678  0.6663  0.6706  0.6692  0.0030  
Fit time          2.53    2.54    2.62    2.58    2.54    2.56    0.04    
Test time         0.07    0.13    0.13    0.07    0.07    0.09    0.03    


In [23]:
accuracy.rmse(test_pred, verbose=True)

RMSE: 0.8695


0.8694781942746264

In [24]:
new_pred = algo.predict(uid=20, iid='Shawshank Redemption, The (1994)')
new_pred

Prediction(uid=20, iid='Shawshank Redemption, The (1994)', r_ui=None, est=4.705022213028888, details={'was_impossible': False})