In [34]:
import pandas as pd
import numpy as np
from scipy.sparse import coo_matrix, csr_matrix
import implicit as imp
import time
from sklearn import metrics
from os import path, environ

###### Загружаем датасет

In [2]:
dataset = pd.read_csv(
        '/lastfm_small/lastfm_small.tsv',
        sep='\t', header=None,
        names=['user', 'artist', 'artist_name', 'plays']
        )

In [3]:
dataset.head()

Unnamed: 0,user,artist,artist_name,plays
0,00000c289a1829a808ac09c00daf10bc3c4e223b,3bd73256-3905-4f3a-97e2-8b341527f805,betty blowtorch,2137
1,00000c289a1829a808ac09c00daf10bc3c4e223b,f2fb0ff0-5679-42ec-a55c-15109ce6e320,die Ärzte,1099
2,00000c289a1829a808ac09c00daf10bc3c4e223b,b3ae82c2-e60b-4551-a76d-6620f1b456aa,melissa etheridge,897
3,00000c289a1829a808ac09c00daf10bc3c4e223b,3d6bbeb7-f90e-4d10-b440-e153c0d10b53,elvenking,717
4,00000c289a1829a808ac09c00daf10bc3c4e223b,bbd2ffd7-17f4-4506-8572-c1ea58c3f9a8,juliette & the licks,706


In [4]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 4 columns):
user           1000000 non-null object
artist         987168 non-null object
artist_name    1000000 non-null object
plays          1000000 non-null int64
dtypes: int64(1), object(3)
memory usage: 30.5+ MB


###### Уберем из датасета "нулевых" пользователей

In [5]:
dataset_clean = dataset.loc[pd.isnull(dataset.artist) == False]

###### Обрабатываем данные и строим разреженные матрицы

In [6]:
dataset_clean.head()

Unnamed: 0,user,artist,artist_name,plays
0,00000c289a1829a808ac09c00daf10bc3c4e223b,3bd73256-3905-4f3a-97e2-8b341527f805,betty blowtorch,2137
1,00000c289a1829a808ac09c00daf10bc3c4e223b,f2fb0ff0-5679-42ec-a55c-15109ce6e320,die Ärzte,1099
2,00000c289a1829a808ac09c00daf10bc3c4e223b,b3ae82c2-e60b-4551-a76d-6620f1b456aa,melissa etheridge,897
3,00000c289a1829a808ac09c00daf10bc3c4e223b,3d6bbeb7-f90e-4d10-b440-e153c0d10b53,elvenking,717
4,00000c289a1829a808ac09c00daf10bc3c4e223b,bbd2ffd7-17f4-4506-8572-c1ea58c3f9a8,juliette & the licks,706


In [7]:
dataset_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 987168 entries, 0 to 999999
Data columns (total 4 columns):
user           987168 non-null object
artist         987168 non-null object
artist_name    987168 non-null object
plays          987168 non-null int64
dtypes: int64(1), object(3)
memory usage: 37.7+ MB


In [8]:
dataset_clean["user"] = dataset_clean["user"].astype("category").cat.codes.copy() + 1
dataset_clean["artist"] = dataset_clean["artist"].astype("category").cat.codes.copy() + 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [9]:
dataset_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 987168 entries, 0 to 999999
Data columns (total 4 columns):
user           987168 non-null int16
artist         987168 non-null int32
artist_name    987168 non-null object
plays          987168 non-null int64
dtypes: int16(1), int32(1), int64(1), object(1)
memory usage: 28.2+ MB


In [10]:
dataset_clean.head()

Unnamed: 0,user,artist,artist_name,plays
0,1,15531,betty blowtorch,2137
1,1,63468,die Ärzte,1099
2,1,46857,melissa etheridge,897
3,1,15968,elvenking,717
4,1,48968,juliette & the licks,706


In [11]:
dataset_clean.drop(['artist_name'], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [12]:
dataset_clean.describe()

Unnamed: 0,user,artist,plays
count,987168.0,987168.0,987168.0
mean,10232.484131,33572.788481,217.479664
std,5911.779514,19332.163691,606.759652
min,1.0,1.0,1.0
25%,5117.0,17049.0,34.0
50%,10237.0,34352.0,95.0
75%,15347.0,49720.0,225.0
max,20462.0,66798.0,135392.0


In [13]:
dataset_clean.head()

Unnamed: 0,user,artist,plays
0,1,15531,2137
1,1,63468,1099
2,1,46857,897
3,1,15968,717
4,1,48968,706


In [14]:
dataset_clean.shape

(987168, 3)

In [15]:
test_indices = np.random.choice(
    dataset_clean.index.values,
    replace=False,
    size=int(len(dataset_clean.index.values) * 0.2)
)


In [16]:
test_indices

array([559421, 966508, 713552, ..., 604113, 302366, 651472], dtype=int64)

In [17]:
test_data = dataset_clean.loc[test_indices]
train_data = dataset_clean.drop(test_indices)

In [18]:
train_data.shape, test_data.shape

((789735, 3), (197433, 3))

In [19]:
test_user_set = set(test_data['user'].unique())
train_user_set = set(train_data['user'].unique())
print('нет в обучающей выборке, но есть в тестовой: {}'.format(
    len(test_user_set - train_user_set)))
print('нет в тестовой выборке, но есть в обучающей: {}'.format(
    len(train_user_set - test_user_set)))
print('всего пользователей: {}'.format(len(dataset_clean['user'].unique())))

нет в обучающей выборке, но есть в тестовой: 2
нет в тестовой выборке, но есть в обучающей: 18
всего пользователей: 20462


In [20]:
user_ids_to_exclude = (test_user_set - train_user_set).union(train_user_set - test_user_set)
bad_indices = test_data[test_data['user'].isin(user_ids_to_exclude).values].index
test_data.drop(bad_indices, inplace=True)
bad_indices = train_data[train_data['user'].isin(user_ids_to_exclude).values]
train_data.drop(bad_indices.index, inplace=True)

In [21]:
train_data.shape, test_data.shape

((789614, 3), (197428, 3))

In [22]:
train_data.columns

Index(['user', 'artist', 'plays'], dtype='object')

In [23]:
def sparse_info(sparse_matrix: csr_matrix) -> None:
    print("Размерности матрицы: {}".format(sparse_matrix.shape))
    print("Ненулевых элементов в матрице: {}".format(sparse_matrix.nnz))
    print("Доля ненулевых элементов: {}"
          .format(sparse_matrix.nnz / sparse_matrix.shape[0] / sparse_matrix.shape[1])
    )
    print("Среднее значение ненулевых элементов: {}".format(sparse_matrix.data.mean()))
    print("Максимальное значение ненулевых элементов: {}".format(sparse_matrix.data.max()))
    print("Минимальное значение ненулевых элементов: {}".format(sparse_matrix.data.min()))

In [24]:
item_users_coo = coo_matrix((
    train_data['plays'].astype(np.float32),
    (
        train_data['artist'],
        train_data['user']
    )
))

sparse_info(item_users_coo.tocsr())

Размерности матрицы: (66799, 20463)
Ненулевых элементов в матрице: 789602
Доля ненулевых элементов: 0.0005776555820818239
Среднее значение ненулевых элементов: 217.33758544921875
Максимальное значение ненулевых элементов: 135392.0
Минимальное значение ненулевых элементов: 1.0


In [25]:
item_users_csr = csr_matrix((
    train_data['plays'].astype(np.double),
    (
        train_data['artist'],
        train_data['user']
    )
))

sparse_info(item_users_csr)

Размерности матрицы: (66799, 20463)
Ненулевых элементов в матрице: 789602
Доля ненулевых элементов: 0.0005776555820818239
Среднее значение ненулевых элементов: 217.33753460604203
Максимальное значение ненулевых элементов: 135392.0
Минимальное значение ненулевых элементов: 1.0


In [26]:
item_users_coo, item_users_csr

(<66799x20463 sparse matrix of type '<class 'numpy.float32'>'
 	with 789602 stored elements in COOrdinate format>,
 <66799x20463 sparse matrix of type '<class 'numpy.float64'>'
 	with 789602 stored elements in Compressed Sparse Row format>)

###### Обучаем ALS модель и делаем по ней рекомендации, пишем в файл

In [27]:
model_als = imp.als.AlternatingLeastSquares()

In [28]:
model_als.fit(item_users_csr)

In [29]:
data_dir = '/lastfm_small/'

In [30]:
print("получаем рекомендации для всех пользователей")
start = time.time()
user_items_csr = item_users_csr.T.tocsr()
with open(path.join(data_dir, 'recs/', 'recs.tsv'), "w") as output_file:
    for user_id in test_data['user'].unique():
        for artist_id, score in model_als.recommend(user_id, user_items_csr):
                output_file.write('%s\t%s\t%s\n' % (user_id, artist_id, score))
print("получили рекомендации для всех пользователей за {} секнуд".format(
        time.time() - start))

получаем рекомендации для всех пользователей
получили рекомендации для всех пользователей за 106.98707938194275 секнуд


In [38]:
recomends_als = pd.read_csv(
                '/lastfm_small/recs/recs.tsv',
                sep='\t', header=None,names=["user_id","artist_id","plays"])
recomends_knn.head()

Unnamed: 0,user_id,artist_id,plays
0,11454,51136,238.716593
1,11454,40065,238.716593
2,11454,36216,238.716593
3,11454,31132,238.716593
4,11454,16384,236.354267


###### Используем метрики из sklearn

In [49]:
test_data_metrics = test_data.copy()

In [52]:
test_data_metrics.rename(columns={'user':'user_id', 'artist': 'artist_id'}, inplace=True)

In [54]:
test_data_metrics.columns

Index(['user_id', 'artist_id', 'plays'], dtype='object')

In [57]:
intersect = pd.merge(test_data_metrics, recomends_als, on=['user_id', 'artist_id'])

In [58]:
intersect.head()

Unnamed: 0,user_id,artist_id,plays_x,plays_y
0,9035,30159,7,0.247652
1,10565,56193,662,0.952686
2,887,52941,519,1.203133
3,11845,31507,42,0.907074
4,10787,28658,296,1.361559


In [59]:
intersect.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 22062 entries, 0 to 22061
Data columns (total 4 columns):
user_id      22062 non-null int16
artist_id    22062 non-null int32
plays_x      22062 non-null int64
plays_y      22062 non-null float64
dtypes: float64(1), int16(1), int32(1), int64(1)
memory usage: 646.3 KB


In [61]:
mean_recomended = intersect["plays_y"].mean()
mean_real = intersect["plays_x"].mean()

recomended = intersect["plays_y"].apply(lambda x: 1 if x > mean_recomended else 0 )
real = intersect["plays_x"].apply(lambda x: 1 if x > mean_real else 0 )



In [64]:
# Average precision
print( 'Average precision: ' + str(metrics.average_precision_score(recomended, real)))

''

# Precision
print( 'Precision: ' + str(metrics.precision_score(recomended, real)))

''

# ROC AUC

print( 'ROC AUC: ' + str(metrics.roc_auc_score(recomended, real)))


Average precision: 0.806585953364
Precision: 0.874614407549
ROC AUC: 0.650641256322


###### Обучаем косинусную модель и делаем по ней рекомендации, пишем в файл

In [31]:
model_knn_cosine = imp.nearest_neighbours.CosineRecommender() 

In [32]:
model_knn_cosine.fit(item_users_coo)

In [33]:
print("получаем рекомендации для всех пользователей")
start = time.time()
user_items_csr = item_users_coo.T.tocsr()
with open(path.join(data_dir, 'recs/', 'recs_knn.tsv'), "w") as output_file:
    for user_id in test_data['user'].unique():
        for artist_id, score in model_knn_cosine.recommend(user_id, user_items_csr):
                output_file.write('%s\t%s\t%s\n' % (user_id, artist_id, score))
print("получили рекомендации для всех пользователей за {} секнуд".format(
        time.time() - start))

получаем рекомендации для всех пользователей
получили рекомендации для всех пользователей за 16.859893798828125 секнуд


In [37]:
recomends_knn = pd.read_csv(
                '/lastfm_small/recs/recs_knn.tsv',
                sep='\t', header=None,names=["user_id","artist_id","plays"])
recomends_knn.head()

Unnamed: 0,user_id,artist_id,plays
0,11454,51136,238.716593
1,11454,40065,238.716593
2,11454,36216,238.716593
3,11454,31132,238.716593
4,11454,16384,236.354267


###### Используем метрики из sklearn

In [65]:
intersect = pd.merge(test_data_metrics, recomends_knn, on=['user_id', 'artist_id'])

In [66]:
intersect.head()

Unnamed: 0,user_id,artist_id,plays_x,plays_y
0,15518,29473,165,544.811617
1,16227,44578,18,11.546304
2,14035,17722,264,4561.971388
3,5555,28744,220,598.84362
4,20033,14348,512,335.822915


In [67]:
intersect.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2985 entries, 0 to 2984
Data columns (total 4 columns):
user_id      2985 non-null int16
artist_id    2985 non-null int32
plays_x      2985 non-null int64
plays_y      2985 non-null float64
dtypes: float64(1), int16(1), int32(1), int64(1)
memory usage: 87.5 KB


In [68]:
mean_recomended = intersect["plays_y"].mean()
mean_real = intersect["plays_x"].mean()

recomended = intersect["plays_y"].apply(lambda x: 1 if x > mean_recomended else 0 )
real = intersect["plays_x"].apply(lambda x: 1 if x > mean_real else 0 )

In [69]:
# Average precision
print( 'Average precision: ' + str(metrics.average_precision_score(recomended, real)))

''

# Precision
print( 'Precision: ' + str(metrics.precision_score(recomended, real)))

''

# ROC AUC

print( 'ROC AUC: ' + str(metrics.roc_auc_score(recomended, real)))

Average precision: 0.703781184329
Precision: 0.708108108108
ROC AUC: 0.729165685582
