In [1]:
import scipy.io as scio
import scipy.optimize as op
import numpy as np
import matplotlib.pyplot as plt
from surprise import BaselineOnly, Dataset, Reader, Trainset, KNNBaseline, SVD, accuracy
from surprise.model_selection import cross_validate, GridSearchCV

#### 1. Загрузите данные ex9_movies.mat из файла.

In [3]:
movies = scio.loadmat('data/lab 9/ex9_movies.mat')
y, r = movies['Y'], movies['R']

#### 2. Выберите число признаков фильмов (n) для реализации алгоритма коллаборативной фильтрации.

In [4]:
num_users = 4
num_movies = 5
num_features = 3

#### 3. Реализуйте функцию стоимости для алгоритма.
#### 5. При реализации используйте векторизацию для ускорения процесса обучения.
#### 6. Добавьте L2-регуляризацию в модель.

In [5]:
def cofiCostFunc(X, Theta, y, r, num_users, num_movies, num_features, l):
    J = 1 / 2 * np.sum(r * (X.dot(Theta.T) - y)**2) + l / \
        2 * (np.sum(Theta**2) + np.sum(X**2))
    return J

#### 4. Реализуйте функцию вычисления градиентов.

In [6]:
def cofiGradientFunc(X, Theta, y, r, num_users, num_movies, num_features, l):
    X_grad = (r * (X.dot(Theta.T) - y)).dot(Theta) + l * X
    Theta_grad = (r * (X.dot(Theta.T) - y)).T.dot(X) + l * Theta
    return X_grad, Theta_grad

#### 7. Добавьте несколько оценок фильмов от себя. Файл movie_ids.txt содержит индексы каждого из фильмов.

In [7]:
def loadMovieList():
    f = open('data/Lab 9/movie_ids.txt').readlines()
    n = 1682
    list = [0]*1682
    i = 0
    for line in f:
        if (i+1)/10 < 1:
            list[i] = line[2:]
        elif (i+1)/100 < 1:
            list[i] = line[3:]
        elif (i+1)/1000 < 1:
            list[i] = line[4:]
        else:
            list[i] = line[5:]
        i += 1
    # dict = {"list":list}
    # movieList = pd.DataFrame(dict)
    # return movieList
    return list

In [8]:
movieList = loadMovieList()
my_ratings = np.zeros([1682,1])

my_ratings[123] = 4
my_ratings[34] = 2
my_ratings[54] = 3
my_ratings[6]= 5
my_ratings[56] = 4
my_ratings[68] = 5
my_ratings[63] = 3
my_ratings[73] = 5
my_ratings[189] = 4
my_ratings[231] = 5
my_ratings[350] = 5

for i in range(0,len(my_ratings)):
    if my_ratings[i] > 0:
        print('Rated %d for %s'%(my_ratings[i],movieList[i]),end="")
        

Rated 5 for Twelve Monkeys (1995)
Rated 2 for Free Willy 2: The Adventure Home (1995)
Rated 3 for Professional, The (1994)
Rated 4 for Priest (1994)
Rated 3 for Shawshank Redemption, The (1994)
Rated 5 for Forrest Gump (1994)
Rated 5 for Faster Pussycat! Kill! Kill! (1965)
Rated 4 for Lone Star (1996)
Rated 4 for Henry V (1989)
Rated 5 for Young Guns (1988)
Rated 5 for Prophecy II, The (1998)


#### 8. Обучите модель с помощью градиентного спуска или других методов оптимизации.

In [9]:
def normalizeRatings(Y, R):
    m, n = np.shape(Y)
    Ymean = np.zeros([m, 1])
    Ynorm = np.zeros(np.shape(Y))
    for i in range(0,m):
        idx = np.where(R[i,:] == 1)
        Ymean[i] = np.mean(Y[i,idx])
        Ynorm[i,idx] = Y[i,idx] - Ymean[i]

    return Ynorm,Ymean

In [10]:
def cofiCostFunc(params, Y, R, num_users, num_movies,num_features, xlambda):
    X = params[:num_movies*num_features].reshape(num_movies, num_features)
    Theta = params[num_movies*num_features:].reshape(num_users, num_features)

    # zero initial
    J = 0
    X_grad = np.zeros(np.shape(X))
    Theta_grad = np.zeros(np.shape(Theta))

    # start
    J = np.sum(R*((np.dot(X,Theta.T)-Y)**2))/2
    X_grad = np.dot((R*(np.dot(X,Theta.T)-Y)),Theta)
    Theta_grad = np.dot((R*(np.dot(X,Theta.T)-Y)).T,X)

    grad = np.r_[(X_grad.ravel().reshape(num_movies*num_features,1),Theta_grad.ravel().reshape(num_users*num_features,1))]
    return J,grad

In [11]:
ex8_movies = scio.loadmat('data/Lab 9/ex9_movies.mat')

#  Y is a 1682x943 matrix, containing ratings (1-5) of 1682 movies by 943 users
#  R is a 1682x943 matrix, where R(i,j) = 1 if and only if user j gave a rating to movie i
R,Y = ex8_movies["R"], ex8_movies["Y"]

#  Add our own ratings to the data matrix
Y = np.c_[(my_ratings, Y)]
R = np.c_[(np.ceil(my_ratings / 5), R)]

#  Normalize Ratings
Ynorm, Ymean = normalizeRatings(Y, R)

#  Useful Values
num_users = Y.shape[1]
num_movies = Y.shape[0]
num_features = 10

# Set Initial Parameters (Theta, X)
X = np.random.randn(num_movies, num_features)
Theta = np.random.randn(num_users, num_features)

initial_parameters=np.r_[(X.ravel().reshape(num_movies * num_features, 1),Theta.ravel().reshape(num_users * num_features,1))]
xlambda = 10

theta = op.minimize(fun=cofiCostFunc, x0=initial_parameters,
                     args=(Ynorm, R, num_users, num_movies, num_features, xlambda),
                     method='TNC', jac=True, tol=1e-6, options={'maxiter':100, "disp":True}).x

# Unfold the returned theta back into U and W
X = theta[:num_movies * num_features].reshape(num_movies, num_features)
Theta = theta[num_movies * num_features:].reshape(num_users, num_features)

### 9. Сделайте рекомендации для себя. Совпали ли они с реальностью?

In [12]:
p = np.dot(X,Theta.T)
my_predictions = p[:,0].reshape(num_movies,1) + Ymean.reshape(num_movies,1)
my_predictions = my_predictions[:, 0]

movieList = loadMovieList()

my_pred_sort = np.sort(-my_predictions)
my_ix = np.argsort(-my_predictions)

print('\nTop recommendations for you:\n')
for i in range(0,20):
    j = int(my_ix[i])
    print('Predicting rating %.1f for movie %s'%(my_predictions[j], movieList[j]),end="")
    


Top recommendations for you:

Predicting rating 33.3 for movie Eighth Day, The (1996)
Predicting rating 14.6 for movie Nobody Loves Me (Keiner liebt mich) (1994)
Predicting rating 14.3 for movie Etz Hadomim Tafus (Under the Domin Tree) (1994)
Predicting rating 14.1 for movie Quiet Room, The (1996)
Predicting rating 14.0 for movie Forget Paris (1995)
Predicting rating 13.2 for movie Land and Freedom (Tierra y libertad) (1995)
Predicting rating 13.0 for movie Passion Fish (1992)
Predicting rating 12.6 for movie Fast, Cheap & Out of Control (1997)
Predicting rating 12.5 for movie Horseman on the Roof, The (Hussard sur le toit, Le) (1995)
Predicting rating 11.5 for movie Houseguest (1994)
Predicting rating 11.4 for movie Bananas (1971)
Predicting rating 11.2 for movie Witness (1985)
Predicting rating 11.1 for movie Purple Noon (1960)
Predicting rating 11.1 for movie Hush (1998)
Predicting rating 11.0 for movie Cement Garden, The (1993)
Predicting rating 10.9 for movie Alphaville (1965)
Pr

В целом, рекомендации похожи на фильмы, которые я положительно оценил ранее.

#### 10. Также обучите модель с помощью сингулярного разложения матриц. Отличаются ли полученные результаты?
 

In [13]:
# save *.csv file

Y_inlined = np.dstack([*np.indices(Y.T.shape), Y.T]).reshape(-1, 3)
np.savetxt('data/Lab 9/ex9_movies.csv', Y_inlined[Y_inlined[:,2] != 0], fmt="%i %i %i")

# create dataset from *.csv file
reader = Reader(line_format='user item rating', sep=" ")
data = Dataset.load_from_file("data/Lab 9/ex9_movies.csv", reader)

In [14]:
algo = SVD(n_factors=100, n_epochs=100, lr_all=0.1)
trainset = data.build_full_trainset()
testset = trainset.build_testset()

algo.fit(trainset)
p = algo.predict(uid=trainset.to_raw_uid(0), iid=trainset.to_raw_iid(0), r_ui=10)
predictions = algo.test(testset)

print(p)
print('Error = ', accuracy.rmse(predictions))

user: 0          item: 6          r_ui = 10.00   est = 4.97   {'was_impossible': False}
RMSE: 0.2481
Error =  0.24806449586520463


In [15]:
my_predictions = np.ndarray(1681)
for movie_index in np.arange(0, 1681):
    my_predictions[movie_index] = algo.predict(uid=trainset.to_raw_uid(0), iid=trainset.to_raw_iid(movie_index)).est

In [17]:
movieList = loadMovieList()

my_pred_sort = np.sort(-my_predictions)
my_ix = np.argsort(-my_predictions)

print('\nTop recommendations for you:\n')
for i in range(0,20):
    j = int(my_ix[i])
    print('Predicting rating %.1f for movie %s'%(my_predictions[j], movieList[j]),end="")
    


Top recommendations for you:

Predicting rating 5.0 for movie Contact (1997)
Predicting rating 5.0 for movie Raise the Red Lantern (1991)
Predicting rating 5.0 for movie Crumb (1994)
Predicting rating 5.0 for movie Shanghai Triad (Yao a yao yao dao waipo qiao) (1995)
Predicting rating 5.0 for movie Vanya on 42nd Street (1994)
Predicting rating 5.0 for movie Empire Strikes Back, The (1980)
Predicting rating 5.0 for movie Twelve Monkeys (1995)
Predicting rating 5.0 for movie Toy Story (1995)
Predicting rating 5.0 for movie Richard III (1995)
Predicting rating 5.0 for movie Seven (Se7en) (1995)
Predicting rating 5.0 for movie Raging Bull (1980)
Predicting rating 4.9 for movie World of Apu, The (Apur Sansar) (1959)
Predicting rating 4.9 for movie Much Ado About Nothing (1993)
Predicting rating 4.9 for movie Kansas City (1996)
Predicting rating 4.8 for movie Pather Panchali (1955)
Predicting rating 4.8 for movie Welcome to the Dollhouse (1995)
Predicting rating 4.8 for movie Face/Off (1997

Получившиеся результаты совершенно не похожи на предыдущие.
Если рассматривать детально, то отзывы, которые в первом случае были предсказаны на 5, в svd оценены на 3-4.
Но в целом предсказание выглядит более правдоподобным. 
К примеру мне порекомендовало посмотреть фильм Twelve Monkeys, т.к. я его оценил на 5. 
