<a href="https://colab.research.google.com/github/KsBrume/recomend-system/blob/main/movie_recommendation_system.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
!pip install scikit-surprise

Collecting scikit-surprise
  Downloading scikit_surprise-1.1.4.tar.gz (154 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/154.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.4/154.4 kB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (pyproject.toml) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.4-cp311-cp311-linux_x86_64.whl size=2505176 sha256=78cc8772e8811355290e1797469180e88f1221679ea108049e25a8267ce32da2
  Stored in directory: /root/.cache/pip/wheels/2a/8f/6e/7e2899163e2d85d8266daab4aa1cdabec7a6c56f83c015b5af
Successfully built scikit-surprise
Installing collected packages: scikit-surprise
Succe

In [4]:
import pandas as pd
import numpy as np

from surprise import Dataset, Reader, KNNBaseline
from surprise.model_selection import cross_validate
from sklearn.neighbors import NearestNeighbors

In [None]:
# 1. Let's find all unknown ratings for the movie (id = 181) using kNN(k=20) and user-based approach
# After calculate the average of the scores received.


# loading the data
data = Dataset.load_builtin('ml-100k')
df = pd.DataFrame(data.raw_ratings)
df.columns = ['user', 'item', 'rating', 'timestamp']

# getting the train and the test parts
reader = Reader()
data_train = Dataset.load_from_df(df[['user', 'item', 'rating']], reader)
trainset = data_train.build_full_trainset()
testset = trainset.build_anti_testset()

In [None]:
# building KNN algorithm
algo = KNNBaseline(k=20, sim_options={'user_based':True}).fit(trainset)
predictions = algo.test(testset)

# building final dataframe which includes algorithm response
data_pred = pd.DataFrame()
for pred in predictions:
    row_to_append = [{'user': pred.uid, 'item': pred.iid, 'rating': pred.est}]
    data_pred = pd.concat([data_pred, pd.DataFrame(row_to_append)], ignore_index=True)
data_pred.head()

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.


Unnamed: 0,user,item,rating
0,196,302,3.968544
1,196,377,2.272678
2,196,51,3.608098
3,196,346,3.083782
4,196,474,4.434754


In [None]:
# saving algorithm response to csv
data_pred.to_csv('data.csv')

In [5]:
df = pd.read_csv('/content/drive/MyDrive/data.csv', index_col='Unnamed: 0')

In [7]:
print('Average of the scores received for the movie with id 181: ', df[df.item == 181].rating.agg('mean'))

Average of the scores received for the movie with id 181:  3.952208342677188


In [None]:
# 2. For the parameter k = [10, 20, 30, 40] let's find the best item-based algorithm
# (using RMSE as a metric and 3 folds cross-validation). Response: define k

# again let's get the data
data = Dataset.load_builtin('ml-100k')
df = pd.DataFrame(data.raw_ratings)
df.columns = ['user', 'item', 'rating', 'timestamp']
df.head()

# now cross validation process for the different values of k
for k in [10, 20, 30, 40]:
    algo = KNNBaseline(k=k, verbose=False)
    cv = cross_validate(algo, data, measures=['RMSE'], cv=3, verbose=False)
    print(str(k)+'NN:', np.mean(cv['test_rmse']))

10NN: 0.9571523079927253
20NN: 0.9421197276512475
30NN: 0.9367803947726127
40NN: 0.9361646814590018


In [8]:
print('Best k value - 10NN')

Best k value - 10NN


In [None]:
# 3. For the best algorithm from point 2, find all unknown ratings for the movie with id 181.
# In the answer: calculate the average of the received ratings.

# collecting data
data = Dataset.load_builtin('ml-100k')
df = pd.DataFrame(data.raw_ratings)
df.columns = ['user', 'item', 'rating', 'timestamp']

reader = Reader()
data_train = Dataset.load_from_df(df[['user', 'item', 'rating']], reader)
trainset = data_train.build_full_trainset()
testset = trainset.build_anti_testset()

algo = KNNBaseline(k=10).fit(trainset)
predictions = algo.test(testset)
data_pred = pd.DataFrame()
for pred in predictions:
    row_to_append = [{'user': pred.uid, 'item': pred.iid, 'rating': pred.est}]
    data_pred = pd.concat([data_pred, pd.DataFrame(row_to_append)], ignore_index=True)
data_pred.head()

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.


Unnamed: 0,user,item,rating
0,196,302,4.191833
1,196,377,2.256576
2,196,51,3.906107
3,196,346,2.662569
4,196,474,4.255029


In [None]:
# saving resulting dataset to csv
data_pred.to_csv('data_task_3.csv')

In [None]:
df = pd.read_csv('/content/drive/MyDrive/data_task_3.csv', index_col='Unnamed: 0')
df.sample(5)

Unnamed: 0,user,item,rating
247831,184,619,3.32498
1279010,814,476,2.768432
973038,604,77,3.02592
1444707,922,343,2.551284
55989,276,489,3.925517


In [None]:
# аll unknown ratings for the movie № 181

display(df[df.item == 181])
print(f'Mean rating for the movie № 181: {df[df.item == 181]["rating"].mean()}')

Unnamed: 0,user,item,rating
50,196,181,3.861375
1684,186,181,3.834595
6281,166,181,4.449596
11082,253,181,4.287172
14117,6,181,3.755530
...,...,...,...
1473248,937,181,4.043136
1474892,926,181,3.905030
1478065,939,181,4.571768
1481237,930,181,2.774225


Mean rating for the movie № 181: 3.944609284384933


As we can see user-based approach gives slightly higher rating for the movie with the id 181 rather than item-based (approximately 3.95 versus 3.94 respectively)