In [None]:
import sys
import pandas as pd
import numpy as np
import scipy.sparse as sparse
from scipy.sparse.linalg import spsolve
import random

from sklearn.preprocessing import MinMaxScaler

import implicit

# Load the data like we did before
raw_data = pd.read_table('data/usersha1-artmbid-artname-plays.tsv')
raw_data = raw_data.drop(raw_data.columns[1], axis=1)
raw_data.columns = ['user', 'artist', 'plays']

# Drop NaN columns
data = raw_data.dropna()
data = data.copy()

# Create a numeric user_id and artist_id column
data['user'] = data['user'].astype("category")
data['artist'] = data['artist'].astype("category")
data['user_id'] = data['user'].cat.codes
data['artist_id'] = data['artist'].cat.codes

# The implicit library expects data as a item-user matrix so we
# create two matricies, one for fitting the model (item-user)
# and one for recommendations (user-item)
sparse_item_user = sparse.csr_matrix((data['plays'].astype(float), (data['artist_id'], data['user_id'])))
sparse_user_item = sparse.csr_matrix((data['plays'].astype(float), (data['user_id'], data['artist_id'])))

# Initialize the als model and fit it using the sparse item-user matrix
model = implicit.als.AlternatingLeastSquares(factors=20, regularization=0.1, iterations=20)

# Calculate the confidence by multiplying it by our alpha value.
alpha_val = 15
data_conf = (sparse_item_user * alpha_val).astype('double')

#Fit the model
model.fit(data_conf)


#---------------------
# FIND SIMILAR ITEMS
#---------------------

# Find the 10 most similar to Jay-Z
item_id = 147068 #Jay-Z
n_similar = 10

# Use implicit to get similar items.
similar = model.similar_items(item_id, n_similar)

# Print the names of our most similar artists
for item in similar:
    idx, score = item
    print data.artist.loc[data.artist_id == idx].iloc[0]


#------------------------------
# CREATE USER RECOMMENDATIONS
#------------------------------

# Create recommendations for user with id 2025
user_id = 2025

# Use the implicit recommender.
recommended = model.recommend(user_id, sparse_user_item)

artists = []
scores = []

# Get artist names from ids
for item in recommended:
    idx, score = item
    artists.append(data.artist.loc[data.artist_id == idx].iloc[0])
    scores.append(score)

# Create a dataframe of artist names and scores
recommendations = pd.DataFrame({'artist': artists, 'score': scores})

print recommendations

In [1]:
import sys
import pandas as pd
import numpy as np
import scipy.sparse as sparse
from scipy.sparse.linalg import spsolve
import random

from sklearn.preprocessing import MinMaxScaler

import implicit

In [2]:
# загружаем датасет
df = pd.read_csv('data/ratings.csv')

In [3]:
# выводим количество пользователей и фильмов
n_users = df['userId'].unique().shape[0]
n_items = df['movieId'].unique().shape[0]

In [4]:
print(n_users, n_items)

610 9724


In [6]:
# чтобы можно было удобно работать дальше, необходимо отмасштабировать
# значения в колонке movieId (новые значения будут в диапазоне от 1 до
# количества фильмов)
input_list = df['movieId'].unique()

def scale_movie_id(input_id):
    return np.where(input_list == input_id)[0][0] + 1

df['movieId'] = df['movieId'].apply(scale_movie_id)

In [7]:
from sklearn.model_selection import train_test_split

# делим данные на тренировочный и тестовый наборы
train_data, test_data = train_test_split(df, test_size=0.20)

In [8]:
# создаём две user-item матрицы – для обучения и для теста
train_data_matrix = np.zeros((n_users, n_items))

for line in train_data.itertuples():
    train_data_matrix[line[1] - 1, line[2] - 1] = line[3]

test_data_matrix = np.zeros((n_users, n_items))

for line in test_data.itertuples():
    test_data_matrix[line[1] - 1, line[2] - 1] = line[3]

In [34]:
df

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,2,4.0,964981247
2,1,3,4.0,964982224
3,1,4,5.0,964983815
4,1,5,5.0,964982931
...,...,...,...,...
100831,610,3121,4.0,1493848402
100832,610,2036,5.0,1493850091
100833,610,3122,5.0,1494273047
100834,610,1393,5.0,1493846352


In [10]:
train_data_matrix

array([[4. , 4. , 4. , ..., 0. , 0. , 0. ],
       [0. , 0. , 0. , ..., 0. , 0. , 0. ],
       [0. , 0. , 0. , ..., 0. , 0. , 0. ],
       ...,
       [2.5, 2. , 0. , ..., 0. , 0. , 0. ],
       [3. , 0. , 0. , ..., 0. , 0. , 0. ],
       [5. , 0. , 0. , ..., 3. , 3.5, 3.5]])

In [11]:
model = implicit.als.AlternatingLeastSquares(factors=50)



In [54]:
from scipy import sparse
train_data_matrix = sparse.csr_matrix(train_data_matrix)
test_data_matrix = sparse.csr_matrix(test_data_matrix)
model.fit(train_data_matrix)

  0%|          | 0/15 [00:00<?, ?it/s]

In [55]:
user_items = train_data_matrix.T.tocsr()
user_items_test = test_data_matrix.T.tocsr()

In [None]:
user_items

In [48]:
for user_id in range(100):
    recommendations = model.recommend(user_id, user_items[user_id], N=3)
    print(f"rec for {user_id}: ", recommendations)

rec for 0:  (array([  7, 121,  73]), array([1.1730493, 1.1619581, 1.1553907], dtype=float32))
rec for 1:  (array([244, 238, 232]), array([0.72680235, 0.6774666 , 0.66783744], dtype=float32))
rec for 2:  (array([ 86, 224, 272]), array([0.19111466, 0.17105395, 0.16759826], dtype=float32))
rec for 3:  (array([491, 428,   4]), array([1.0915241, 1.0602024, 1.0419829], dtype=float32))
rec for 4:  (array([ 25, 463,  20]), array([0.89451295, 0.8907545 , 0.87786883], dtype=float32))
rec for 5:  (array([484, 608, 504]), array([1.3142223, 1.2803171, 1.2770566], dtype=float32))
rec for 6:  (array([ 85, 758, 463]), array([1.0895649 , 0.95497346, 0.9394176 ], dtype=float32))
rec for 7:  (array([ 20, 232,  25]), array([0.9668588 , 0.91664624, 0.8789672 ], dtype=float32))
rec for 8:  (array([764, 758,  15]), array([0.57201314, 0.5240848 , 0.52194977], dtype=float32))
rec for 9:  (array([ 20, 934, 877]), array([0.910225 , 0.8968362, 0.8756878], dtype=float32))
rec for 10:  (array([ 42, 531, 478]), arra

In [50]:
user_items.toarray()

array([[4. , 0. , 0. , ..., 2.5, 3. , 5. ],
       [4. , 0. , 0. , ..., 2. , 0. , 0. ],
       [4. , 0. , 0. , ..., 0. , 0. , 0. ],
       ...,
       [0. , 0. , 0. , ..., 0. , 0. , 3. ],
       [0. , 0. , 0. , ..., 0. , 0. , 3.5],
       [0. , 0. , 0. , ..., 0. , 0. , 3.5]])

In [56]:
for user_id in range(100):
    recommendations = model.recommend(user_id, user_items_test[user_id], N=3)
    print(f"rec for {user_id}: ", recommendations)

rec for 0:  (array([ 42,   7, 121]), array([1.3655211, 1.1777693, 1.1543326], dtype=float32))
rec for 1:  (array([244, 238, 232]), array([0.7287135 , 0.67896247, 0.6645335 ], dtype=float32))
rec for 2:  (array([ 86, 224, 272]), array([0.1933581 , 0.1713285 , 0.16446169], dtype=float32))
rec for 3:  (array([  0, 491, 428]), array([1.3618611, 1.0746433, 1.0610173], dtype=float32))
rec for 4:  (array([16, 32,  7]), array([1.0098007, 0.9644757, 0.9568286], dtype=float32))
rec for 5:  (array([484, 608, 504]), array([1.3112813, 1.2799227, 1.2795099], dtype=float32))
rec for 6:  (array([ 85, 744, 758]), array([1.0784189 , 0.94658196, 0.94109327], dtype=float32))
rec for 7:  (array([20, 16, 32]), array([0.96353817, 0.9590734 , 0.93663293], dtype=float32))
rec for 8:  (array([764,  15, 758]), array([0.5725221, 0.5259911, 0.5199328], dtype=float32))
rec for 9:  (array([ 20, 934, 877]), array([0.9173486, 0.8947843, 0.8834142], dtype=float32))
rec for 10:  (array([42, 41, 37]), array([0.99275196, 

In [80]:
test_data_matrix = test_data_matrix.toarray()
test_data_matrix[2][263] = 0

In [86]:
np.where(test_data_matrix[5] == 5)

(array([  7,  17,  25,  41,  58, 471, 532, 565, 573, 603, 684], dtype=int64),)

In [88]:
test_data_matrix[5][7] = 0
test_data_matrix[5][17] = 0
test_data_matrix[5][25] = 0

In [89]:
model.recommend(5, sparse.csr_matrix(test_data_matrix[5]), N=20)

(array([ 484,  608,  504,  580,  314,  502,  460,  313,  486,  481,  515,
        1430,  531,  621,  610,  483,   22,  612,  588,   31]),
 array([1.3112813, 1.2799227, 1.2795099, 1.2285532, 1.218313 , 1.1686647,
        1.1674622, 1.1575077, 1.1272751, 1.1202202, 1.1179943, 1.1080779,
        1.1063772, 1.1007195, 1.0802832, 1.0797931, 1.0716823, 1.0705589,
        1.0695897, 1.0574884], dtype=float32))