In [1]:
import pandas as pd
from pandas import DataFrame
import numpy as np

import implicit
from implicit.evaluation import ndcg_at_k
from scipy.sparse import coo_matrix


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import sys
sys.path.append('../src')

import config as cfg

# Считываем данные

In [3]:
# %%time

df_train = pd.read_csv(f'{cfg.DATASET_PATH}\\train.csv')
df_test = pd.read_csv(f'{cfg.DATASET_PATH}\\test.csv')

df_songs = pd.read_csv(f'{cfg.DATASET_PATH}\\songs.csv')
df_members = pd.read_csv(f'{cfg.DATASET_PATH}\\members.csv')

In [4]:
ALL_USERS = df_members['msno'].unique().tolist()
ALL_ITEMS = df_songs['song_id'].unique().tolist()

missing_songs_train = set(df_train['song_id'].unique()) - set(ALL_ITEMS)
ALL_ITEMS.extend(missing_songs_train)

missing_songs_test = set(df_test['song_id'].unique()) - set(ALL_ITEMS)
ALL_ITEMS.extend(missing_songs_test)

user_ids = dict(list(enumerate(ALL_USERS)))
item_ids = dict(list(enumerate(ALL_ITEMS)))

user_map = {u: uidx for uidx, u in user_ids.items()}
item_map = {i: iidx for iidx, i in item_ids.items()}

df_train['user_id'] = df_train['msno'].map(user_map)
df_train['item_id'] = df_train['song_id'].map(item_map)

In [5]:
del df_songs, df_members

# Создадим coo_matrix (user x item) и csr matrix (user x item)

In [6]:
row = df_train['user_id'].values
col = df_train['item_id'].values
data = np.ones(df_train.shape[0])
coo_train = coo_matrix((data, (row, col)), shape=(len(ALL_USERS), len(ALL_ITEMS)))
coo_train

<34403x2296378 sparse matrix of type '<class 'numpy.float64'>'
	with 7377418 stored elements in COOrdinate format>

## Проверим работает ли ALS на этих значениях

In [7]:
model = implicit.als.AlternatingLeastSquares(factors=10, iterations=2)
model.fit(coo_train)

  check_blas_config()
100%|██████████| 2/2 [00:01<00:00,  1.46it/s]


# Валидация

In [8]:
def to_user_item_coo(df: DataFrame):
    """ Turn a dataframe with transactions into a COO sparse items x users matrix"""
    df['user_id'] = df['msno'].map(user_map)
    df['item_id'] = df['song_id'].map(item_map)

    row = df['user_id'].values
    col = df['item_id'].values
    data = np.ones(df.shape[0])
    coo = coo_matrix((data, (row, col)), shape=(len(ALL_USERS), len(ALL_ITEMS)))
    return coo

def slice_data(df_train, df_test, slice=10**6):
    df_train = df_train[:slice]
    df_test = df_test[:slice]
    return df_train, df_test

def get_val_matricies(df_train: DataFrame, df_test: DataFrame, slice=10**6):
    """
    Returns a dictionary with the following keys:
            csr_train: training data in CSR sparse format and as (users x items)
            csr_val:  validation data in CSR sparse format and as (users x items)
    """
    df_train, df_test = slice_data(df_train, df_test, slice=slice)

    coo_train = to_user_item_coo(df_train)
    coo_test = to_user_item_coo(df_test)

    csr_train = coo_train.tocsr()
    csr_test = coo_test.tocsr()

    return {'csr_train': csr_train,
            'csr_test': csr_test
          }

def validate(matrices: dict, factors=200, iterations=20, regularization=0.01, show_progress=True):
    """ Train an ALS model with <<factors>> (embeddings dimension)
    for <<iterations>> over matrices and validate with NDCG@20
    """
    csr_train, csr_test = matrices['csr_train'], matrices['csr_test']

    model = implicit.als.AlternatingLeastSquares(factors=factors,
                                                 iterations=iterations,
                                                 regularization=regularization)
    model.fit(csr_train, show_progress=show_progress)

    # The MAPK by implicit doesn't allow to calculate allowing repeated items, which is the case.
    ndcg20 = ndcg_at_k(model, csr_train, csr_test, K=20, show_progress=show_progress, num_threads=0)
    print(f"Factors: {factors:>3} - Iterations: {iterations:>2} - Regularization: {regularization:4.3f} ==> NDCG@20: {ndcg20:6.5f}")
    return ndcg20

In [9]:
matrices = get_val_matricies(df_train, df_test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['user_id'] = df['msno'].map(user_map)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['item_id'] = df['song_id'].map(item_map)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['user_id'] = df['msno'].map(user_map)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .l

In [10]:
# Grid search

# best_ndcg20 = 0
# for factors in [40, 50, 60, 100, 200, 500, 1000]:
#     for iterations in [3, 12, 14, 15, 20]:
#         for regularization in [0.0, 0.1, 0.01, 0.001]:
#             ndcg20 = validate(matrices, factors, iterations, regularization, show_progress=True)
#             if ndcg20 > best_ndcg20:
#                 best_ndcg20 = ndcg20
#                 best_params = {'factors': factors, 'iterations': iterations, 'regularization': regularization}
#                 print(f"Best NDCG@20 found. Updating: {best_params}")

best_params = {'factors': 40, 'iterations': 3, 'regularization': 0.01}

In [11]:
del matrices

# Обучение на всём датасете

In [12]:
coo_train = to_user_item_coo(df_train)
csr_train = coo_train.tocsr()

In [13]:
def train(coo_train, factors=200, iterations=15, regularization=0.01, show_progress=True):
    model = implicit.als.AlternatingLeastSquares(factors=factors,
                                                 iterations=iterations,
                                                 regularization=regularization)
    model.fit(coo_train, show_progress=show_progress)
    return model

In [14]:
best_params

{'factors': 40, 'iterations': 3, 'regularization': 0.01}

In [15]:
model = train(csr_train, **best_params)

100%|██████████| 3/3 [00:02<00:00,  1.03it/s]


# Оценка обучения

In [16]:
coo_test = to_user_item_coo(df_test)
csr_test = coo_test.tocsr()

In [17]:
ndcg20 = ndcg_at_k(model, csr_train, csr_test, K=20, show_progress=True, num_threads=0)
ndcg20
# 0.11044308262909103

100%|██████████| 25131/25131 [01:30<00:00, 277.01it/s]


0.11086081503371653

In [18]:
import os
if not os.path.exists('weights'):
    os.makedirs('weights')

model.save(r'weights/als.npz')

При использовании алгоритма ALS (AlternatingLeastSquares | Collaborative Filtering) NDCG@20 оказался ниже чем простая рекомендация на основе ТОП20 музыкальных треков