In [1]:
from pathlib import Path
from typing import Any, Protocol, List, TypeVar, Generic, Optional

import polars as pl
import numpy as np
from scipy.sparse import coo_matrix, csr_matrix
from implicit.als import AlternatingLeastSquares

from my_recsys_metrics import compute_metrics
from my_utils import make_submission

In [11]:
# Load the three input tables from parquet files in the working directory.
train_events = pl.read_parquet(Path("train_events.parquet"))
users_for_submission = pl.read_parquet(Path("users_for_submission.parquet"))
ground_truth = pl.read_parquet(Path("ground_truth.parquet"))

In [3]:
_T = TypeVar("_T")
_U = TypeVar("_U")


class TransformerLike(Protocol):
    """Structural type for a pipeline stage: any object with ``fit_transform``."""

    def fit_transform(self, input: Any) -> Any:
        ...


class Pipeline(Generic[_T, _U]):
    """Ordered chain of stages; each stage consumes the previous stage's output.

    ``_T`` is the type fed to the first stage and ``_U`` the type produced by
    the last one. The stages are kept in the public ``transformers`` list.
    """

    def __init__(self, transformers: List[TransformerLike]) -> None:
        self.transformers = transformers

    def fit_transform(self, x: _T) -> _U:
        """Run ``x`` through every stage in order and return the final output.

        A progress line naming the stage's class is printed before each stage
        runs. With no stages, ``x`` is returned unchanged.
        """
        result: Any = x
        for stage in self.transformers:
            stage_name = stage.__class__.__name__
            print(f"Fit-transform with {stage_name}")
            result = stage.fit_transform(result)
        return result

class OrdinalEncoder:
    """Replace the values of one column with dense Int32 indices, and back.

    The mapping is learned in ``fit`` from the sorted distinct values of the
    column and stored in ``self._mapper`` as a two-column frame
    (original value, ``__index__``).
    """

    def __init__(self, column: str) -> None:
        self.column = column

    def fit(self, df: pl.DataFrame) -> "OrdinalEncoder":
        """Learn the value -> index mapping from ``df`` and return ``self``."""
        distinct_sorted = df[[self.column]].unique().sort(self.column)
        # The row number in sorted order becomes the ordinal index.
        numbered = distinct_sorted.with_row_count("__index__")
        self._mapper = numbered.with_columns(pl.col("__index__").cast(pl.Int32))
        return self

    def transform(self, df: pl.DataFrame) -> pl.DataFrame:
        """Return ``df`` with ``self.column`` holding indices instead of values.

        Uses a left join against the fitted mapping, so every input row is
        kept; a value absent from the mapping yields a null index.
        """
        with_index = df.join(self._mapper, on=self.column, how="left")
        without_original = with_index.drop(self.column)
        return without_original.rename({"__index__": self.column})

    def inverse_transform(self, df: pl.DataFrame) -> pl.DataFrame:
        """Return ``df`` with indices in ``self.column`` mapped back to values."""
        # Move the indices aside under the mapper's key name, then join the
        # original values back in under ``self.column``.
        keyed = df.rename({self.column: "__index__"})
        restored = keyed.join(self._mapper, on="__index__", how="left")
        return restored.drop("__index__")

    def fit_transform(self, df: pl.DataFrame) -> pl.DataFrame:
        """Fit on ``df`` and immediately encode it."""
        fitted = self.fit(df)
        return fitted.transform(df)


class FilterByPlayRatio:
    """Pipeline stage that keeps only events whose play ratio exceeds a threshold.

    Args:
        min_ratio: Exclusive lower bound; rows with a value strictly greater
            than this are kept.
        column: Name of the column holding the ratio. Defaults to
            ``"play_ratio"``, the name this stage previously hard-coded, so
            existing ``FilterByPlayRatio(min_ratio)`` calls behave as before.
    """

    def __init__(self, min_ratio: float, column: str = "play_ratio") -> None:
        self.min_ratio = min_ratio
        self.column = column

    def fit_transform(self, events: pl.DataFrame) -> pl.DataFrame:
        """Return the rows of ``events`` where ``column > min_ratio``."""
        return events.filter(pl.col(self.column) > self.min_ratio)


class FrequencyEncoder:
    """Turn raw events into per-user relative interaction frequencies.

    For every (user, item) pair the output value is the pair's share of that
    user's interactions: count(user, item) / count(user).
    """

    def __init__(self, user_column: str, item_column: str, value_column: str) -> None:
        self.user_column = user_column
        self.item_column = item_column
        self.value_column = value_column

    def fit_transform(self, events: pl.DataFrame) -> pl.DataFrame:
        """Return one row per (user, item) with the frequency in ``value_column``."""
        # Number of events for each (user, item) pair.
        pair_counts = events.group_by(self.user_column, self.item_column).agg(
            pl.col(self.item_column).count().alias("n_interactions_per_user")
        )

        # Per-user total, broadcast back onto every row of that user.
        user_total = (
            pl.col("n_interactions_per_user")
            .sum()
            .over(self.user_column)
            .alias("n_interactions_total")
        )
        with_totals = pair_counts.with_columns(user_total)

        share = pl.col("n_interactions_per_user") / pl.col("n_interactions_total")
        scored = with_totals.with_columns(share.alias(self.value_column))

        # Only user, item and the final score are exposed to later stages.
        return scored.drop("n_interactions_per_user", "n_interactions_total")


class TFIDFEncoder:
    """Turn raw events into TF-IDF style (user, item) weights.

    * tf(user, item) = count(user, item) / count(user)
    * idf(item) = 1 + ln(1 + 0.001 * n_users / n_rows(item))

    The value written to ``value_column`` is ``tf * idf``.

    Args:
        user_column: Name of the user id column.
        item_column: Name of the item id column.
        value_column: Name of the output score column.
    """

    def __init__(self, user_column: str, item_column: str, value_column: str) -> None:
        self.user_column = user_column
        self.item_column = item_column
        self.value_column = value_column

    def fit_transform(self, events: pl.DataFrame) -> pl.DataFrame:
        """Return one row per (user, item) with the TF-IDF score in ``value_column``."""
        # Use the configured user column; this was previously hard-coded to
        # "user_id", which broke the encoder for any other column name.
        n_users = events[self.user_column].n_unique()

        def idf_fn(track_occurrences):
            # Smoothed inverse frequency; the leading ``1 +`` keeps the weight
            # at least 1, so very popular items are damped but never zeroed.
            return 1 + np.log(1 + 0.001 * (n_users / track_occurrences))

        # NOTE(review): ``count()`` counts event rows per item, not distinct
        # users, so repeated plays by one user raise the item's frequency.
        # Confirm whether ``n_unique()`` was intended here.
        idf_scores = (
            events
            .group_by(self.item_column)
            .agg(pl.col(self.user_column).count().alias("n_user_per_item"))
            .with_columns(idf_fn(pl.col("n_user_per_item")).alias("idf"))
            .drop("n_user_per_item")
        )

        # Share of each item within a user's own events.
        tf_scores = (
            events
            .group_by(self.user_column, self.item_column)
            .agg(pl.count().alias("n_user_item"))
            .with_columns(
                pl.col("n_user_item").sum().over(self.user_column).alias("n_total"),
            )
            .with_columns(
                (pl.col("n_user_item") / pl.col("n_total")).alias("tf"),
            )
            .drop("n_user_item", "n_total")
        )

        # Attach each item's idf to its (user, item) rows and combine.
        scores = (
            tf_scores
            .join(
                idf_scores,
                on=self.item_column,
                how="left",
            )
            .with_columns((pl.col("tf") * pl.col("idf")).alias(self.value_column))
            .drop("tf", "idf")
        )

        return scores


class CSRConverter:
    """Final pipeline stage: (user, item, value) rows -> sparse user-item matrix."""

    def __init__(self, user_column: str, item_column: str, value_column: str) -> None:
        self.user_column = user_column
        self.item_column = item_column
        self.value_column = value_column

    def fit_transform(self, coo: pl.DataFrame) -> csr_matrix:
        """Build a float32 CSR matrix with users as rows and items as columns.

        The shape is derived from the largest index seen in each column, so
        the user and item columns are expected to already hold 0-based integer
        indices. Entries that share the same (user, item) are summed.
        """
        rows = coo[self.user_column].to_numpy()
        cols = coo[self.item_column].to_numpy()
        data = coo[self.value_column].to_numpy().astype(np.float32)

        # Largest index + 1 along each axis.
        matrix_shape = (rows.max() + 1, cols.max() + 1)

        sparse = coo_matrix((data, (rows, cols)), shape=matrix_shape, dtype=np.float32)
        sparse.sum_duplicates()
        return sparse.tocsr()
# Preprocessing chain: ordinal-encode user and track ids, turn events into
# per-user frequencies, then pack the result into a sparse user-item matrix.
events_preprocessing_pipeline: Pipeline[pl.DataFrame, csr_matrix] = Pipeline(
    [
        OrdinalEncoder("user_id"),
        OrdinalEncoder("track_id"),
        FrequencyEncoder("user_id", "track_id", "freq"),
        CSRConverter("user_id", "track_id", "freq"),
    ]
)

user_item_csr = events_preprocessing_pipeline.fit_transform(train_events)
# Last expression of the cell: the notebook displays the matrix summary.
user_item_csr

Fit-transform with OrdinalEncoder
Fit-transform with OrdinalEncoder
Fit-transform with FrequencyEncoder
Fit-transform with CSRConverter


<24196x118430 sparse matrix of type '<class 'numpy.float32'>'
	with 6432526 stored elements in Compressed Sparse Row format>

In [4]:
class ALS:
    """Train an ``implicit`` ALS model and return top-k recommendations as a frame.

    Args:
        user_column: Name of the user index column in the output frame.
        item_column: Name of the recommended-item index column in the output.
        score_column: Name of the recommendation score column in the output.
        n_factors: Number of latent factors.
        n_iterations: Number of ALS iterations.
        top_k: Number of items recommended per user.
        alpha: Confidence weight passed to ``AlternatingLeastSquares``.
            Defaults to 40.0, the value that was previously hard-coded.
        regularization: Regularization passed to ``AlternatingLeastSquares``.
            Defaults to 0.001, the value that was previously hard-coded.
    """

    def __init__(
        self,
        user_column: str,
        item_column: str,
        score_column: str,
        n_factors: int,
        n_iterations: int,
        top_k: int,
        alpha: float = 40.0,
        regularization: float = 0.001,
    ) -> None:
        self.user_column = user_column
        self.item_column = item_column
        self.score_column = score_column
        self.n_factors = n_factors
        self.n_iterations = n_iterations
        self.top_k = top_k
        self.alpha = alpha
        self.regularization = regularization

    def fit_predict(
        self,
        user_item: csr_matrix,
        user_item_filter: Optional[csr_matrix] = None,
    ) -> pl.DataFrame:
        """Fit on ``user_item`` and recommend ``top_k`` items for every user row.

        Args:
            user_item: Sparse user-item matrix used for training.
            user_item_filter: Optional matrix handed to ``recommend`` in place
                of ``user_item`` for filtering already-seen items.

        Returns:
            A long frame with one row per (user, recommended item) and the
            columns ``user_column``, ``item_column`` and ``score_column``.
        """
        als = AlternatingLeastSquares(
            factors=self.n_factors,
            iterations=self.n_iterations,
            alpha=self.alpha,
            regularization=self.regularization,
            calculate_training_loss=True,
        )
        als.fit(user_item)

        # One id per row of the matrix we were given. This used to read the
        # notebook-global ``user_item_csr``, silently ignoring the argument.
        user_ids = np.arange(user_item.shape[0])
        seen_items = user_item_filter if user_item_filter is not None else user_item
        recommended_item_indices, recommended_scores = als.recommend(
            user_ids,
            seen_items,
            N=self.top_k,
            filter_already_liked_items=True,
        )

        # Each row holds one user's list of items and the matching scores.
        scores_df = pl.DataFrame({
            self.user_column: pl.Series(user_ids, dtype=pl.Int32),
            self.item_column: pl.Series(recommended_item_indices, dtype=pl.List(pl.Int32)),
            self.score_column: pl.Series(recommended_scores, dtype=pl.List(pl.Float32)),
        })

        # Flatten the per-user lists into one row per (user, item, score).
        return scores_df.explode(self.item_column, self.score_column)


# Baseline run: 128 factors, 10 iterations, top-10 recommendations per user.
als = ALS("user_id", "track_id", "score", n_factors=128, n_iterations=10, top_k=10)
als_recommendations = als.fit_predict(user_item=user_item_csr)

  check_blas_config()
  check_blas_config()


  0%|          | 0/10 [00:00<?, ?it/s]

In [5]:
# The first two pipeline stages are the fitted user-id and track-id encoders.
user_encoder: OrdinalEncoder
item_encoder: OrdinalEncoder
user_encoder, item_encoder = events_preprocessing_pipeline.transformers[:2]

# Map the encoded indices back to original ids: users first, then tracks.
als_recommendations_decoded = item_encoder.inverse_transform(
    user_encoder.inverse_transform(als_recommendations)
)

In [7]:
# Build the submission from the decoded recommendations and score it against
# ground_truth.parquet (read from disk here). As the last expression of the
# cell, the returned metrics are what the notebook displays.
als_submission = make_submission(als_recommendations_decoded)
compute_metrics(als_submission, pl.read_parquet("ground_truth.parquet"))

{'ndcg@10': 0.0072625363441447734, 'recall@10': 0.010132499079867501}

# Подбор гиперпараметров для ALS

In [12]:
# Hyperparameter values to try.
factors_options = [64, 128, 256]
iterations_options = [10, 20, 30]
alpha_options = [1, 10, 40]
regularization_options = [0.01, 0.1]

best_score = 0
best_params = None

# Visit every combination in the same order as four nested loops
# (factors outermost, regularization innermost).
# NOTE(review): alpha_val and reg are only printed and stored in best_params;
# they are not passed to ALS(...) below, so they do not affect the fitted model.
for factors, iterations, alpha_val, reg in [
    (f, i, a, r)
    for f in factors_options
    for i in iterations_options
    for a in alpha_options
    for r in regularization_options
]:
    # Build and fit an ALS model for the current combination.
    als_model = ALS(
        user_column="user_id",
        item_column="track_id",
        score_column="score",
        n_factors=factors,
        n_iterations=iterations,
        top_k=10,
    )
    als_recommendations = als_model.fit_predict(user_item_csr)

    # Map encoded indices back to the original user and track ids.
    als_recommendations_decoded = item_encoder.inverse_transform(
        user_encoder.inverse_transform(als_recommendations)
    )

    # Build the submission and compute the metrics.
    als_submission = make_submission(als_recommendations_decoded)
    metrics = compute_metrics(als_submission, ground_truth)

    # Model quality; NDCG@10 is the selection criterion (another metric or a
    # combination of metrics could be used instead).
    ndcg_score = metrics['ndcg@10']
    recall_score = metrics['recall@10']
    current_score = ndcg_score

    print(f"Factors: {factors}, Iterations: {iterations}, Alpha: {alpha_val}, Regularization: {reg}, NDCG@10: {ndcg_score}, Recall@10: {recall_score}")

    # Remember the best combination seen so far.
    if current_score > best_score:
        best_score, best_params = current_score, (factors, iterations, alpha_val, reg)

print("Лучшие параметры для ALS:", best_params)

  0%|          | 0/10 [00:00<?, ?it/s]

Factors: 64, Iterations: 10, Alpha: 1, Regularization: 0.01, NDCG@10: 0.006133042995123495, Recall@10: 0.00888847994111152


  0%|          | 0/10 [00:00<?, ?it/s]

Factors: 64, Iterations: 10, Alpha: 1, Regularization: 0.1, NDCG@10: 0.005963884732156992, Recall@10: 0.008828364617838303


  0%|          | 0/10 [00:00<?, ?it/s]

Factors: 64, Iterations: 10, Alpha: 10, Regularization: 0.01, NDCG@10: 0.0061428536836633145, Recall@10: 0.009061464850938535


  0%|          | 0/10 [00:00<?, ?it/s]

Factors: 64, Iterations: 10, Alpha: 10, Regularization: 0.1, NDCG@10: 0.006064978716076885, Recall@10: 0.008893387314439945


  0%|          | 0/10 [00:00<?, ?it/s]

Factors: 64, Iterations: 10, Alpha: 40, Regularization: 0.01, NDCG@10: 0.005782728082006948, Recall@10: 0.008665194454668138


  0%|          | 0/10 [00:00<?, ?it/s]

Factors: 64, Iterations: 10, Alpha: 40, Regularization: 0.1, NDCG@10: 0.005990948313129777, Recall@10: 0.008731443994601889


  0%|          | 0/20 [00:00<?, ?it/s]

Factors: 64, Iterations: 20, Alpha: 1, Regularization: 0.01, NDCG@10: 0.006199777118688295, Recall@10: 0.008850447797816218


  0%|          | 0/20 [00:00<?, ?it/s]

Factors: 64, Iterations: 20, Alpha: 1, Regularization: 0.1, NDCG@10: 0.006255350673273133, Recall@10: 0.00886394307446939


  0%|          | 0/20 [00:00<?, ?it/s]

Factors: 64, Iterations: 20, Alpha: 10, Regularization: 0.01, NDCG@10: 0.006254128552113572, Recall@10: 0.008928965771071032


  0%|          | 0/20 [00:00<?, ?it/s]

Factors: 64, Iterations: 20, Alpha: 10, Regularization: 0.1, NDCG@10: 0.006274122519925612, Recall@10: 0.008844313581155687


  0%|          | 0/20 [00:00<?, ?it/s]

Factors: 64, Iterations: 20, Alpha: 40, Regularization: 0.01, NDCG@10: 0.006308878946684863, Recall@10: 0.009182922340817077


  0%|          | 0/20 [00:00<?, ?it/s]

Factors: 64, Iterations: 20, Alpha: 40, Regularization: 0.1, NDCG@10: 0.006419418660926421, Recall@10: 0.009281069807385595


  0%|          | 0/30 [00:00<?, ?it/s]

Factors: 64, Iterations: 30, Alpha: 1, Regularization: 0.01, NDCG@10: 0.0063785211757917595, Recall@10: 0.008993988467672678


  0%|          | 0/30 [00:00<?, ?it/s]

Factors: 64, Iterations: 30, Alpha: 1, Regularization: 0.1, NDCG@10: 0.006131454386734204, Recall@10: 0.008719175561280825


  0%|          | 0/30 [00:00<?, ?it/s]

Factors: 64, Iterations: 30, Alpha: 10, Regularization: 0.01, NDCG@10: 0.006276783503891613, Recall@10: 0.008964544227702122


  0%|          | 0/30 [00:00<?, ?it/s]

Factors: 64, Iterations: 30, Alpha: 10, Regularization: 0.1, NDCG@10: 0.006204229134556531, Recall@10: 0.008911789964421544


  0%|          | 0/30 [00:00<?, ?it/s]

Factors: 64, Iterations: 30, Alpha: 40, Regularization: 0.01, NDCG@10: 0.006204022754421042, Recall@10: 0.008890933627775733


  0%|          | 0/30 [00:00<?, ?it/s]

Factors: 64, Iterations: 30, Alpha: 40, Regularization: 0.1, NDCG@10: 0.006339214801030656, Recall@10: 0.00892037786774629


  0%|          | 0/10 [00:00<?, ?it/s]

Factors: 128, Iterations: 10, Alpha: 1, Regularization: 0.01, NDCG@10: 0.007317327951618565, Recall@10: 0.010197521776469145


  0%|          | 0/10 [00:00<?, ?it/s]

Factors: 128, Iterations: 10, Alpha: 1, Regularization: 0.1, NDCG@10: 0.007516046590696677, Recall@10: 0.010314071893019262


  0%|          | 0/10 [00:00<?, ?it/s]

Factors: 128, Iterations: 10, Alpha: 10, Regularization: 0.01, NDCG@10: 0.007215156332197186, Recall@10: 0.0098675009201325


  0%|          | 0/10 [00:00<?, ?it/s]

Factors: 128, Iterations: 10, Alpha: 10, Regularization: 0.1, NDCG@10: 0.007403006971833426, Recall@10: 0.010155809103177524


  0%|          | 0/10 [00:00<?, ?it/s]

Factors: 128, Iterations: 10, Alpha: 40, Regularization: 0.01, NDCG@10: 0.007377541055010991, Recall@10: 0.010526315789473684


  0%|          | 0/10 [00:00<?, ?it/s]

Factors: 128, Iterations: 10, Alpha: 40, Regularization: 0.1, NDCG@10: 0.00744567457081526, Recall@10: 0.01039872408293461


  0%|          | 0/20 [00:00<?, ?it/s]

Factors: 128, Iterations: 20, Alpha: 1, Regularization: 0.01, NDCG@10: 0.007224674020229262, Recall@10: 0.01013617960986382


  0%|          | 0/20 [00:00<?, ?it/s]

Factors: 128, Iterations: 20, Alpha: 1, Regularization: 0.1, NDCG@10: 0.007383619947530668, Recall@10: 0.0103778677462888


  0%|          | 0/20 [00:00<?, ?it/s]

Factors: 128, Iterations: 20, Alpha: 10, Regularization: 0.01, NDCG@10: 0.007254574676893634, Recall@10: 0.010008587903324744


  0%|          | 0/20 [00:00<?, ?it/s]

Factors: 128, Iterations: 20, Alpha: 10, Regularization: 0.1, NDCG@10: 0.0073677489214321445, Recall@10: 0.010336155072997177


  0%|          | 0/20 [00:00<?, ?it/s]

Factors: 128, Iterations: 20, Alpha: 40, Regularization: 0.01, NDCG@10: 0.007505723214739103, Recall@10: 0.010559440559440561


  0%|          | 0/20 [00:00<?, ?it/s]

Factors: 128, Iterations: 20, Alpha: 40, Regularization: 0.1, NDCG@10: 0.007435263782583675, Recall@10: 0.010505459452827874


  0%|          | 0/30 [00:00<?, ?it/s]

Factors: 128, Iterations: 30, Alpha: 1, Regularization: 0.01, NDCG@10: 0.007265478801592718, Recall@10: 0.010272359219727642


  0%|          | 0/30 [00:00<?, ?it/s]

Factors: 128, Iterations: 30, Alpha: 1, Regularization: 0.1, NDCG@10: 0.007627828530317832, Recall@10: 0.010706661759293338


  0%|          | 0/30 [00:00<?, ?it/s]

Factors: 128, Iterations: 30, Alpha: 10, Regularization: 0.01, NDCG@10: 0.0070468987446332, Recall@10: 0.009849098270150902


  0%|          | 0/30 [00:00<?, ?it/s]

Factors: 128, Iterations: 30, Alpha: 10, Regularization: 0.1, NDCG@10: 0.007313654041796777, Recall@10: 0.010317752423015582


  0%|          | 0/30 [00:00<?, ?it/s]

Factors: 128, Iterations: 30, Alpha: 40, Regularization: 0.01, NDCG@10: 0.007577872769905436, Recall@10: 0.010668629615998038


  0%|          | 0/30 [00:00<?, ?it/s]

Factors: 128, Iterations: 30, Alpha: 40, Regularization: 0.1, NDCG@10: 0.007356932319358643, Recall@10: 0.010354557722978774


  0%|          | 0/10 [00:00<?, ?it/s]

Factors: 256, Iterations: 10, Alpha: 1, Regularization: 0.01, NDCG@10: 0.008269206699175646, Recall@10: 0.011643970065022697


  0%|          | 0/10 [00:00<?, ?it/s]

Factors: 256, Iterations: 10, Alpha: 1, Regularization: 0.1, NDCG@10: 0.008379312474181609, Recall@10: 0.012107716844558952


  0%|          | 0/10 [00:00<?, ?it/s]

Factors: 256, Iterations: 10, Alpha: 10, Regularization: 0.01, NDCG@10: 0.008402126811267853, Recall@10: 0.01190774138142559


  0%|          | 0/10 [00:00<?, ?it/s]

Factors: 256, Iterations: 10, Alpha: 10, Regularization: 0.1, NDCG@10: 0.008544937109009896, Recall@10: 0.01210035578456631


  0%|          | 0/10 [00:00<?, ?it/s]

Factors: 256, Iterations: 10, Alpha: 40, Regularization: 0.01, NDCG@10: 0.008413151876488239, Recall@10: 0.012135934241197398


  0%|          | 0/10 [00:00<?, ?it/s]

Factors: 256, Iterations: 10, Alpha: 40, Regularization: 0.1, NDCG@10: 0.008426360488020668, Recall@10: 0.01198871304134462


  0%|          | 0/20 [00:00<?, ?it/s]

Factors: 256, Iterations: 20, Alpha: 1, Regularization: 0.01, NDCG@10: 0.008686190553863915, Recall@10: 0.012250030671083302


  0%|          | 0/20 [00:00<?, ?it/s]

Factors: 256, Iterations: 20, Alpha: 1, Regularization: 0.1, NDCG@10: 0.008767893334234318, Recall@10: 0.012183781131149552


  0%|          | 0/20 [00:00<?, ?it/s]

Factors: 256, Iterations: 20, Alpha: 10, Regularization: 0.01, NDCG@10: 0.00859743311259763, Recall@10: 0.011880750828119247


  0%|          | 0/20 [00:00<?, ?it/s]

Factors: 256, Iterations: 20, Alpha: 10, Regularization: 0.1, NDCG@10: 0.00863526481318357, Recall@10: 0.012162924794503741


  0%|          | 0/20 [00:00<?, ?it/s]

Factors: 256, Iterations: 20, Alpha: 40, Regularization: 0.01, NDCG@10: 0.008642146846446817, Recall@10: 0.012105263157894735


  0%|          | 0/20 [00:00<?, ?it/s]

Factors: 256, Iterations: 20, Alpha: 40, Regularization: 0.1, NDCG@10: 0.008613083353651126, Recall@10: 0.012235308551098027


  0%|          | 0/30 [00:00<?, ?it/s]

Factors: 256, Iterations: 30, Alpha: 1, Regularization: 0.01, NDCG@10: 0.008819925065375557, Recall@10: 0.012267206477732794


  0%|          | 0/30 [00:00<?, ?it/s]

Factors: 256, Iterations: 30, Alpha: 1, Regularization: 0.1, NDCG@10: 0.008702430302371954, Recall@10: 0.012138387927861613


  0%|          | 0/30 [00:00<?, ?it/s]

Factors: 256, Iterations: 30, Alpha: 10, Regularization: 0.01, NDCG@10: 0.008678498756747763, Recall@10: 0.012064777327935223


  0%|          | 0/30 [00:00<?, ?it/s]

Factors: 256, Iterations: 30, Alpha: 10, Regularization: 0.1, NDCG@10: 0.008780218007348548, Recall@10: 0.01213225371120108


  0%|          | 0/30 [00:00<?, ?it/s]

Factors: 256, Iterations: 30, Alpha: 40, Regularization: 0.01, NDCG@10: 0.008800508511233679, Recall@10: 0.012180100601153232


  0%|          | 0/30 [00:00<?, ?it/s]

Factors: 256, Iterations: 30, Alpha: 40, Regularization: 0.1, NDCG@10: 0.008648432669520806, Recall@10: 0.012145748987854251
Лучшие параметры для ALS: (256, 30, 1, 0.01)


In [13]:
# Narrowed hyperparameter values for a follow-up search.
factors_options = [256]
iterations_options = [30, 40]
alpha_options = [40]
regularization_options = [0.5]

best_score = 0
best_params = None

# Visit every combination in the same order as four nested loops
# (factors outermost, regularization innermost).
# NOTE(review): alpha_val and reg are only printed and stored in best_params;
# they are not passed to ALS(...) below, so they do not affect the fitted model.
for factors, iterations, alpha_val, reg in [
    (f, i, a, r)
    for f in factors_options
    for i in iterations_options
    for a in alpha_options
    for r in regularization_options
]:
    # Build and fit an ALS model for the current combination.
    als_model = ALS(
        user_column="user_id",
        item_column="track_id",
        score_column="score",
        n_factors=factors,
        n_iterations=iterations,
        top_k=10,
    )
    als_recommendations = als_model.fit_predict(user_item_csr)

    # Map encoded indices back to the original user and track ids.
    als_recommendations_decoded = item_encoder.inverse_transform(
        user_encoder.inverse_transform(als_recommendations)
    )

    # Build the submission and compute the metrics.
    als_submission = make_submission(als_recommendations_decoded)
    metrics = compute_metrics(als_submission, ground_truth)

    # Model quality; NDCG@10 is the selection criterion (another metric or a
    # combination of metrics could be used instead).
    ndcg_score = metrics['ndcg@10']
    recall_score = metrics['recall@10']
    current_score = ndcg_score

    print(f"Factors: {factors}, Iterations: {iterations}, Alpha: {alpha_val}, Regularization: {reg}, NDCG@10: {ndcg_score}, Recall@10: {recall_score}")

    # Remember the best combination seen so far.
    if current_score > best_score:
        best_score, best_params = current_score, (factors, iterations, alpha_val, reg)

print("Лучшие параметры для ALS:", best_params)

  0%|          | 0/30 [00:00<?, ?it/s]

KeyboardInterrupt: 