## Model-Based Collaborative Filtering: Matrix Factorization

In [2]:
!pip install surprise

Collecting surprise
  Downloading surprise-0.1-py2.py3-none-any.whl (1.8 kB)
Collecting scikit-surprise
  Downloading scikit-surprise-1.1.3.tar.gz (771 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m772.0/772.0 kB[0m [31m806.4 kB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25ldone
[?25h  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.3-cp39-cp39-macosx_10_9_x86_64.whl size=1148084 sha256=ce49f0e8c4a53bcd6ab27bd1e2fe30194f0a37cf8530ab6aef965e3bbde96f95
  Stored in directory: /Users/huseyinefkanalp/Library/Caches/pip/wheels/10/bc/66/1e588b3f2aedb617b1bedc0f24981c917c95246a020c7c1975
Successfully built scikit-surprise
Installing collected packages: scikit-surprise, surprise
Successfully installed scikit-surprise-1.1.3 surprise-0.1


In [3]:
import pandas as pd
# Display every column when a DataFrame is rendered (no column truncation).
pd.set_option("display.max_columns", None)
from surprise import Reader, SVD, Dataset, accuracy
from surprise.model_selection import GridSearchCV, train_test_split, cross_validate

## Veri Setinin Hazırlanması

In [5]:
# Read the movie metadata and the user ratings from the local dataset folder.
movie = pd.read_csv("movie_lens_dataset/movie.csv")
rating = pd.read_csv("movie_lens_dataset/rating.csv")

# Left join on movieId: every movie row is kept and repeated once per rating it received.
df = pd.merge(movie, rating, on="movieId", how="left")
df.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3.0,4.0,1999-12-11 13:36:47
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,6.0,5.0,1997-03-13 17:50:52
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,8.0,4.0,1996-06-05 13:37:51
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,10.0,4.0,1999-11-25 02:44:47
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,11.0,4.5,2009-01-02 01:13:41


In [7]:
# Movies used for the small modelling sample.
# The two lists are parallel: movies[i] is the title of movie_ids[i]
# (e.g. 356 -> "Forrest Gump (1994)").
movie_ids = [130219, 356, 4422, 541]
movies = ["The Dark Knight (2011)",
          "Forrest Gump (1994)",
          "Cries and Whispers (Viskningar och rop) (1972)",
          "Blade Runner (1982)"]

In [10]:
# Keep only the rows whose movieId is one of the selected movies.
is_selected = df["movieId"].isin(movie_ids)
sample_df = df[is_selected]
sample_df.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
2457839,356,Forrest Gump (1994),Comedy|Drama|Romance|War,4.0,4.0,1996-08-24 09:28:42
2457840,356,Forrest Gump (1994),Comedy|Drama|Romance|War,7.0,4.0,2002-01-16 19:02:55
2457841,356,Forrest Gump (1994),Comedy|Drama|Romance|War,8.0,5.0,1996-06-05 13:44:19
2457842,356,Forrest Gump (1994),Comedy|Drama|Romance|War,9.0,4.0,2001-07-01 20:26:38
2457843,356,Forrest Gump (1994),Comedy|Drama|Romance|War,10.0,3.0,1999-11-25 02:32:02


In [11]:
# (rows, columns) of the filtered sample.
sample_df.shape

(97343, 6)

In [12]:
# User x title matrix of ratings; NaN where a user has not rated a title.
user_movie_df = pd.pivot_table(
    sample_df,
    values="rating",
    index=["userId"],
    columns=["title"],
)

In [13]:
# Preview the first users of the user x title rating matrix.
user_movie_df.head()

title,Blade Runner (1982),Cries and Whispers (Viskningar och rop) (1972),Forrest Gump (1994),The Dark Knight (2011)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1.0,4.0,,,
2.0,5.0,,,
3.0,5.0,,,
4.0,,,4.0,
7.0,,,4.0,


In [14]:
# Declare the rating scale for Surprise. The ratings use half-star steps
# (a 4.5 appears in the merged data), and MovieLens ratings start at 0.5,
# so the scale is 0.5-5. A (1, 5) scale would clip every prediction to >= 1.
reader = Reader(rating_scale=(0.5, 5))

In [15]:
# Convert the (user, item, rating) columns into a Surprise Dataset.
rating_columns = ["userId", "movieId", "rating"]
data = Dataset.load_from_df(sample_df[rating_columns], reader)

## Modelleme

In [16]:
# Hold out 25% of the ratings for evaluation. A fixed random_state makes the
# split (and therefore the reported RMSE) reproducible across runs.
trainset, testset = train_test_split(data, test_size=.25, random_state=42)

In [17]:
# SVD matrix-factorization model with default hyperparameters. The factors are
# initialised randomly, so fix the seed to get repeatable results.
svd_model = SVD(random_state=42)

In [18]:
# Train the model on the training split.
svd_model.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7fc052acff10>

In [19]:
# Run the fitted model on the held-out test split.
predictions = svd_model.test(testset)

In [21]:
# RMSE: take the differences between the true and the predicted ratings, square them,
# average the squared errors, and take the square root of that mean.
accuracy.rmse(predictions)

RMSE: 0.9328


0.9328101945570266

In [22]:
# Predict the rating for one specific user/movie pair (user 1.0, movie 541).
svd_model.predict(uid = 1.0, iid=541, verbose= True)

user: 1.0        item: 541        r_ui = None   est = 4.44   {'was_impossible': False}


Prediction(uid=1.0, iid=541, r_ui=None, est=4.43655703490431, details={'was_impossible': False})

In [23]:
# Same user, movie 356 (Forrest Gump (1994)).
svd_model.predict(uid = 1.0, iid=356, verbose= True)

user: 1.0        item: 356        r_ui = None   est = 3.94   {'was_impossible': False}


Prediction(uid=1.0, iid=356, r_ui=None, est=3.943407039705862, details={'was_impossible': False})

## Model Tuning

In [28]:
# Search space for tuning: number of training epochs and the learning rate
# applied to all model parameters.
param_grid = dict(
    n_epochs=[5, 10, 20],
    lr_all=[0.002, 0.005, 0.007],
)

In [29]:
# Try every combination of the two hyperparameters in param_grid
# (3 x 3 = 9 candidates), scoring each with 3-fold cross-validation on RMSE and MAE.
# n_jobs=-1 runs the fits in parallel; joblib_verbose=True prints joblib progress.
gs = GridSearchCV(SVD,
                 param_grid,
                 measures = ["rmse", "mae"],
                 cv=3, n_jobs=-1, joblib_verbose=True)

In [30]:
# Run the grid search on the whole dataset (the cross-validation folds are made internally).
gs.fit(data)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  27 out of  27 | elapsed:    4.4s finished


In [31]:
# Best cross-validated RMSE found by the search.
gs.best_score["rmse"]

0.9311143539766141

In [33]:
# Hyperparameter combination that achieved the best RMSE.
gs.best_params["rmse"]

{'n_epochs': 5, 'lr_all': 0.002}

## Final Model ve Tahmin

In [35]:
# Re-create the model with the hyperparameters that gave the best RMSE in the grid search.
best_rmse_params = gs.best_params["rmse"]
svd_model = SVD(**best_rmse_params)

In [36]:
data = data.build_full_trainset()

In [37]:
svd_model.fit(data)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7fc052af4b50>

In [38]:
# Prediction from the final, tuned model for user 1.0 and movie 356.
svd_model.predict(uid = 1.0, iid=356, verbose= True)

user: 1.0        item: 356        r_ui = None   est = 4.08   {'was_impossible': False}


Prediction(uid=1.0, iid=356, r_ui=None, est=4.081830680003905, details={'was_impossible': False})