In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
from IPython.display import display

import os
from typing import List, Tuple

# Seed passed to the recommenders and to tune_recommender in the cells below.
seed = 6789

# Fix the dying kernel problem (only a problem in some installations - you can remove it, if it works without it)
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'

# Load the dataset for recommenders

In [2]:
data_path = os.path.join("data", "hotel_data")

input_interactions_df = pd.read_csv(os.path.join(data_path, "hotel_data_interactions_df.csv"), index_col=0)

base_item_features = ['term', 'length_of_stay_bucket', 'rate_plan', 'room_segment', 'n_people_bucket', 'weekend_stay']

# Full list of allowed values per feature. Declaring them up front makes
# pd.get_dummies produce the same columns on any subset of the interactions.
column_values_dict = {
    'term': ['WinterVacation', 'Easter', 'OffSeason', 'HighSeason', 'LowSeason', 'MayLongWeekend', 'NewYear',
             'Christmas'],
    'length_of_stay_bucket': ['[0-1]', '[2-3]', '[4-7]', '[8-inf]'],
    'rate_plan': ['Standard', 'Nonref'],
    'room_segment': ['[0-160]', '[160-260]', '[260-360]', '[360-500]', '[500-900]'],
    'n_people_bucket': ['[1-1]', '[2-2]', '[3-4]', '[5-inf]'],
    'weekend_stay': ['True', 'False']
}

# Turn every feature column into a categorical with the fixed category list.
for feature_name, allowed_values in column_values_dict.items():
    feature_values = input_interactions_df[feature_name]
    if feature_name == 'weekend_stay':
        # The categories for this column are the strings 'True'/'False',
        # so the column is converted to str before being made categorical.
        feature_values = feature_values.astype('str')
        input_interactions_df[feature_name] = feature_values
    input_interactions_df[feature_name] = pd.Categorical(feature_values, categories=allowed_values)

display(input_interactions_df.head(15))

Unnamed: 0,user_id,item_id,term,length_of_stay_bucket,rate_plan,room_segment,n_people_bucket,weekend_stay
0,1,0,WinterVacation,[2-3],Standard,[260-360],[5-inf],True
1,2,1,WinterVacation,[2-3],Standard,[160-260],[3-4],True
2,3,2,WinterVacation,[2-3],Standard,[160-260],[2-2],False
3,4,3,WinterVacation,[4-7],Standard,[160-260],[3-4],True
4,5,4,WinterVacation,[4-7],Standard,[0-160],[2-2],True
5,6,5,Easter,[4-7],Standard,[260-360],[5-inf],True
6,7,6,OffSeason,[2-3],Standard,[260-360],[5-inf],True
7,8,7,HighSeason,[2-3],Standard,[160-260],[1-1],True
8,9,8,HighSeason,[2-3],Standard,[0-160],[1-1],True
9,8,7,HighSeason,[2-3],Standard,[160-260],[1-1],True



# Define user features based on reservations
The content-based recommenders will be forecasting the probability of interaction between user and item based on user features vector and item features vector:

<center>
$$
    r_{u, i} = f(user\_features, item\_features)
$$
</center>

<span style="color:red"><font size="4">**Task:**</font></span><br>
Design numerical user features based on user reservations. Code the following method which for a given interactions DataFrame (it will be used in the fit method of the recommender) returns a DataFrame with user_id and user features as well as a list with names of user features (this will be important to select the right columns for an ML algorithm). Remember to name the columns differently than item features which you will create in the next task. Validate your features on users with several interactions (sample user ids are already given below).

Ideas for user features:
- Find the vector of most popular feature values from all user reservations and encode every feature with one-hot encoding.
- For every reservation feature calculate the probability distribution of its values among all user's reservations.
- For numerical buckets (length_of_stay, room_segment, n_people) you can calculate the average value for every user from their reservations (you will have to map the buckets back to numerical values before averaging them).

Remember that you will have to select the best features (those with the highest explanatory power). Using all of the above features at once would make the number of variables too large for this dataset and would also introduce too much correlation between features.

You can also prepare several versions of the prepare_users_df method and test which one works best in your recommender.

# Prepare numerical item features

<span style="color:red"><font size="4">**Task:**</font></span><br>
Code the prepare_items_df method which will be used in the recommender fit and recommend methods to map items to numerical features. This method should take the interactions_df DataFrame as input and return a DataFrame containing one record per item_id with item_id column and numerical item feature columns.

You can try turning all item features into one-hot representations. You can use the get_dummies method from pandas. It will return the same columns on any dataset of interactions because the categorical variables with all possible values have been defined in the second cell in this notebook.

You are welcome to design your own numerical item features, for instance based on numerical min and max values in buckets used as features.

In [3]:
class FitDataToolkit:
    """Static helpers that turn an interactions frame into numerical features.

    Every helper one-hot encodes the categorical reservation columns with
    ``pd.get_dummies``, sums the indicator columns per user or per item and
    divides each row by its own total, so the selected features of a row sum
    to 1 (or are all 0 when the row total is 0).
    """

    @staticmethod
    def most_popular_features(interactions_df: pd.DataFrame, n_most_popular_items) -> List[str]:
        """Return one-hot column names of the most frequent feature values.

        For each of ``term``, ``length_of_stay_bucket``, ``room_segment`` and
        ``n_people_bucket`` the ``n_most_popular_items`` most frequent values
        are taken and turned into ``"<feature>_<value>"`` names, i.e. the names
        ``pd.get_dummies`` generates for those values.

        :param interactions_df: Interactions with the four feature columns.
        :param n_most_popular_items: Number of values kept per feature. It is
            coerced to ``int`` because hyperparameter tuners such as hyperopt's
            ``quniform`` hand over floats (e.g. ``3.0``), which cannot be used
            as a positional slice bound.
        :return: List of one-hot column names.
        """
        top_n = int(n_most_popular_items)

        columns = []
        for feature in ('term', 'length_of_stay_bucket', 'room_segment', 'n_people_bucket'):
            top_values = interactions_df[feature].value_counts().head(top_n).index
            columns.extend(f"{feature}_{value}" for value in top_values)

        return columns

    @staticmethod
    def _aggregate_and_normalize(one_hot: pd.DataFrame, key: str, feature_columns: List[str],
                                 prefix: str) -> Tuple[pd.DataFrame, List[str]]:
        """Sum ``feature_columns`` per ``key``, normalise each row to sum to 1 and prefix the names."""
        summed = one_hot.groupby([key])[feature_columns].sum()
        # Row-wise normalisation; rows whose total is 0 produce NaN, replaced by 0 below.
        shares = summed.div(summed.sum(axis=1), axis=0)
        shares = shares.rename_axis(None, axis=1).fillna(0).add_prefix(prefix)
        feature_names = list(shares.columns)

        return shares.reset_index(), feature_names

    @staticmethod
    def prepare_users_df(interactions_df: pd.DataFrame, n_most_popular_items: int = 4) -> Tuple[pd.DataFrame, List[str]]:
        """Build user features from the users' reservations.

        Only the one-hot columns of the globally most popular feature values
        (see ``most_popular_features``) are kept. They are summed per user and
        each user's row is divided by its total.

        :param interactions_df: Interactions with ``user_id`` and the
            categorical feature columns.
        :param n_most_popular_items: Number of most popular values kept per feature.
        :return: Tuple of (DataFrame with ``user_id`` and ``user_``-prefixed
            feature columns, list of the feature column names).
        """
        popular_columns = set(FitDataToolkit.most_popular_features(interactions_df, n_most_popular_items))

        one_hot = pd.get_dummies(
            interactions_df,
            columns=['term', 'rate_plan', 'room_segment', 'n_people_bucket', 'length_of_stay_bucket', 'weekend_stay'])
        # Keep the get_dummies column order so the feature order is stable.
        feature_columns = [column for column in one_hot.columns if column in popular_columns]

        return FitDataToolkit._aggregate_and_normalize(one_hot, 'user_id', feature_columns, 'user_')

    @staticmethod
    def prepare_items_df(interactions_df: pd.DataFrame) -> Tuple[pd.DataFrame, List[str]]:
        """Build item features from the interactions.

        All columns except ``user_id`` and ``item_id`` are used after one-hot
        encoding. They are summed per item and each item's row is divided by
        its total.

        :param interactions_df: Interactions (or items) with ``item_id`` and
            the categorical feature columns.
        :return: Tuple of (DataFrame with ``item_id`` and ``item_``-prefixed
            feature columns, list of the feature column names).
        """
        one_hot = pd.get_dummies(
            interactions_df,
            columns=['term', 'rate_plan', 'room_segment', 'length_of_stay_bucket', 'n_people_bucket', 'weekend_stay'])
        feature_columns = [column for column in one_hot.columns if column not in ('user_id', 'item_id')]

        return FitDataToolkit._aggregate_and_normalize(one_hot, 'item_id', feature_columns, 'item_')


In [4]:
user_df, user_features = FitDataToolkit.prepare_users_df(input_interactions_df)
items_df, item_features = FitDataToolkit.prepare_items_df(input_interactions_df)

display(user_df)
display(items_df)

Unnamed: 0,user_id,user_term_WinterVacation,user_term_OffSeason,user_term_HighSeason,user_term_LowSeason,user_room_segment_[0-160],user_room_segment_[160-260],user_room_segment_[260-360],user_room_segment_[360-500],user_n_people_bucket_[1-1],user_n_people_bucket_[2-2],user_n_people_bucket_[3-4],user_n_people_bucket_[5-inf],user_length_of_stay_bucket_[0-1],user_length_of_stay_bucket_[2-3],user_length_of_stay_bucket_[4-7],user_length_of_stay_bucket_[8-inf]
0,1,0.022727,0.170455,0.022727,0.034091,0.00,0.215909,0.034091,0.0,0.00,0.181818,0.045455,0.022727,0.00,0.147727,0.102273,0.00
1,2,0.250000,0.000000,0.000000,0.000000,0.00,0.250000,0.000000,0.0,0.00,0.000000,0.250000,0.000000,0.00,0.250000,0.000000,0.00
2,3,0.250000,0.000000,0.000000,0.000000,0.00,0.250000,0.000000,0.0,0.00,0.250000,0.000000,0.000000,0.00,0.250000,0.000000,0.00
3,4,0.250000,0.000000,0.000000,0.000000,0.00,0.250000,0.000000,0.0,0.00,0.000000,0.250000,0.000000,0.00,0.000000,0.250000,0.00
4,5,0.250000,0.000000,0.000000,0.000000,0.25,0.000000,0.000000,0.0,0.00,0.250000,0.000000,0.000000,0.00,0.000000,0.250000,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13793,14498,0.000000,0.250000,0.000000,0.000000,0.00,0.250000,0.000000,0.0,0.25,0.000000,0.000000,0.000000,0.00,0.250000,0.000000,0.00
13794,14499,0.000000,0.250000,0.000000,0.000000,0.00,0.250000,0.000000,0.0,0.00,0.250000,0.000000,0.000000,0.00,0.000000,0.000000,0.25
13795,14500,0.000000,0.000000,0.000000,0.000000,0.00,0.000000,0.333333,0.0,0.00,0.000000,0.333333,0.000000,0.00,0.333333,0.000000,0.00
13796,14501,0.000000,0.250000,0.000000,0.000000,0.00,0.250000,0.000000,0.0,0.00,0.000000,0.250000,0.000000,0.00,0.250000,0.000000,0.00


Unnamed: 0,item_id,item_term_WinterVacation,item_term_Easter,item_term_OffSeason,item_term_HighSeason,item_term_LowSeason,item_term_MayLongWeekend,item_term_NewYear,item_term_Christmas,item_rate_plan_Standard,...,item_length_of_stay_bucket_[0-1],item_length_of_stay_bucket_[2-3],item_length_of_stay_bucket_[4-7],item_length_of_stay_bucket_[8-inf],item_n_people_bucket_[1-1],item_n_people_bucket_[2-2],item_n_people_bucket_[3-4],item_n_people_bucket_[5-inf],item_weekend_stay_True,item_weekend_stay_False
0,0,0.166667,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.166667,...,0.0,0.166667,0.000000,0.000000,0.000000,0.000000,0.000000,0.166667,0.166667,0.000000
1,1,0.166667,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.166667,...,0.0,0.166667,0.000000,0.000000,0.000000,0.000000,0.166667,0.000000,0.166667,0.000000
2,2,0.166667,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.166667,...,0.0,0.166667,0.000000,0.000000,0.000000,0.166667,0.000000,0.000000,0.000000,0.166667
3,3,0.166667,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.166667,...,0.0,0.000000,0.166667,0.000000,0.000000,0.000000,0.166667,0.000000,0.166667,0.000000
4,4,0.166667,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.166667,...,0.0,0.000000,0.166667,0.000000,0.000000,0.166667,0.000000,0.000000,0.166667,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
758,758,0.000000,0.0,0.166667,0.0,0.0,0.0,0.000000,0.000000,0.000000,...,0.0,0.166667,0.000000,0.000000,0.000000,0.000000,0.166667,0.000000,0.166667,0.000000
759,759,0.000000,0.0,0.000000,0.0,0.0,0.0,0.000000,0.166667,0.166667,...,0.0,0.000000,0.000000,0.166667,0.000000,0.166667,0.000000,0.000000,0.166667,0.000000
760,760,0.000000,0.0,0.000000,0.0,0.0,0.0,0.166667,0.000000,0.166667,...,0.0,0.000000,0.166667,0.000000,0.166667,0.000000,0.000000,0.000000,0.000000,0.166667
761,761,0.000000,0.0,0.000000,0.0,0.0,0.0,0.166667,0.000000,0.000000,...,0.0,0.000000,0.166667,0.000000,0.000000,0.000000,0.166667,0.000000,0.166667,0.000000


# Content-based recommender

<span style="color:red"><font size="4">**Task:**</font></span><br>
Code the content-based recommender. User features should be calculated within the fit method based on available training data and should be saved in the object as self.users_df for later use in the recommend method. Item features should be calculated both in the fit method (from interactions_df) and in the recommend method (from items_df - the items to be evaluated).

In the fit method you have to randomly generate non-existing interactions and add them to the training data for the regressor. You should add the target variable to interactions - equal to 1 for real ("positive") interactions and equal to 0 for those newly added "negative" interactions. Generate several negative interactions per every positive interaction (n_neg_per_pos). Treat the proportion as a tunable parameter of the model.

Remember to keep control over randomness - in the init method add seed as a parameter and initialize the random number generator with that seed:

```python
self.seed = seed
self.rng = np.random.RandomState(seed=seed)
```

Below the base content-based recommender class there are several classes which inherit from the base class and use different ML models:
  - LinearRegressionCBUIRecommender - based on linear regression,
  - SVRCBUIRecommender - based on Support Vector Regressor (if you want to test it, sample the data in the fit method, as the training can take many hours on the entire dataset of interactions),
  - RandomForestCBUIRecommender - based on Random Forest,
  - XGBoostCBUIRecommender - based on gradient-boosted trees (implemented below with scikit-learn's GradientBoostingRegressor).

There is no need to change anything in those inheriting classes, although you can experiment with other tunable parameters of the underlying models.

You are encouraged to experiment with:
  - Other numerical user and item features (but always train and evaluate the model on buckets defined in the first notebook).
  - Other ML models, e.g. Huber regression, Lasso regression, Ridge regression, LARS regression, Linear SVR, Decision Tree, Naive Bayes, LightGBM, Neural Networks or any model of your choice.

In [5]:
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor

from recommenders.recommender import Recommender


class ContentBasedUserItemRecommender(Recommender):
    """Content-based recommender scoring (user, item) pairs with a regressor.

    ``fit`` builds user and item feature frames with ``FitDataToolkit``, adds
    randomly sampled negative interactions and trains ``self.model`` to predict
    1 for real interactions and 0 for sampled ones. ``recommend`` scores every
    item for every requested user and returns the top-scored items.
    """

    def __init__(self, seed: int = 6789, n_neg_per_pos: int = 5, n_most_popular_items: int = 5):
        """
        :param seed: Seed of the random number generator used for negative sampling.
        :param n_neg_per_pos: Number of negative interactions sampled per positive one.
        :param n_most_popular_items: Number of most popular values kept per
            feature when building user features (coerced to int).
        """
        super().__init__()

        self.fitDataToolkit = FitDataToolkit()
        self.model = LinearRegression()
        self.n_neg_per_pos = n_neg_per_pos

        self.n_most_popular_items = int(n_most_popular_items)

        self.recommender_df = pd.DataFrame(columns=['user_id', 'item_id', 'score'])
        self.users_df: pd.DataFrame | None = None
        self.user_features: List[str] | None = None
        self.items_df: pd.DataFrame | None = None
        self.item_features: List[str] | None = None

        self.seed = seed
        self.rng = np.random.RandomState(seed=seed)

    def fit(self, interactions_df: pd.DataFrame, users_df: pd.DataFrame, items_df: pd.DataFrame):
        """Train the model on real interactions plus sampled negatives.

        :param interactions_df: Interactions with ``user_id``, ``item_id`` and
            the categorical feature columns.
        :param users_df: Not used; user features are derived from ``interactions_df``.
        :param items_df: Not used; item features are derived from ``interactions_df``.
        :raises ValueError: If negatives are requested but every (user, item)
            pair in the sampled id range is already a positive interaction.
        """
        self.users_df, self.user_features = self.fitDataToolkit.prepare_users_df(
            interactions_df, self.n_most_popular_items)
        self.items_df, self.item_features = self.fitDataToolkit.prepare_items_df(interactions_df)
        self.items_df = self.items_df.loc[:, ['item_id'] + self.item_features]

        # .assign returns a new frame, so the caller's frame is never written to
        # and no SettingWithCopyWarning is triggered.
        positives = interactions_df.loc[:, ['user_id', 'item_id']].assign(interacted=1)

        n_users = np.max(positives['user_id']) + 1
        n_items = np.max(positives['item_id']) + 1
        known_pairs = set(zip(positives['user_id'].tolist(), positives['item_id'].tolist()))

        n_negatives = self.n_neg_per_pos * len(positives)
        if n_negatives > 0 and len(known_pairs) >= n_users * n_items:
            # Without this guard the rejection-sampling loop below would never end.
            raise ValueError("Cannot sample negative interactions: every (user, item) pair is positive.")

        # NOTE: ids are drawn from the full ranges [0, max_id]; pairs whose
        # user_id or item_id has no row in the feature frames are dropped by
        # the inner merges below, so slightly fewer negatives may be used.
        candidate_users = np.arange(n_users)
        candidate_items = np.arange(n_items)
        batch_size = 1000

        negatives = []
        while len(negatives) < n_negatives:
            user_batch = self.rng.choice(candidate_users, size=batch_size)
            item_batch = self.rng.choice(candidate_items, size=batch_size)

            for user_id, item_id in zip(user_batch.tolist(), item_batch.tolist()):
                if len(negatives) >= n_negatives:
                    break
                if (user_id, item_id) not in known_pairs:
                    negatives.append([user_id, item_id, 0])

        training_df = pd.concat(
            [positives, pd.DataFrame(negatives, columns=['user_id', 'item_id', 'interacted'])])

        training_df = pd.merge(training_df, self.users_df, on=['user_id'])
        training_df = pd.merge(training_df, self.items_df, on=['item_id'])
        x = training_df.loc[:, self.user_features + self.item_features].values
        y = training_df['interacted'].values

        self.model.fit(x, y)

    def recommend(self, users_df: pd.DataFrame, interactions_df: pd.DataFrame, n_recommendations: int = 1):
        """Return the top ``n_recommendations`` items for every requested user.

        :param users_df: Frame with a ``user_id`` column. Users without stored
            features get an all-zero feature vector.
        :param interactions_df: Frame from which the candidate items and their
            features are derived via ``FitDataToolkit.prepare_items_df``.
        :param n_recommendations: Number of items returned per user.
        :return: DataFrame with columns ``user_id``, ``item_id`` and ``score``.
            The ids keep the dtype of the input columns.
        """
        self.recommender_df = self.recommender_df[:0]

        users_df = pd.merge(users_df, self.users_df, on='user_id', how='left')
        users_df = users_df.fillna(0)
        items_df, _ = self.fitDataToolkit.prepare_items_df(interactions_df)

        # Select features by name, in the order used in fit, so extra columns
        # in the inputs cannot shift the feature vector given to the model.
        item_matrix = items_df.reindex(columns=self.item_features, fill_value=0).to_numpy(dtype=float)
        user_matrix = users_df.loc[:, self.user_features].to_numpy(dtype=float)
        item_ids = items_df['item_id'].values
        n_items = len(item_ids)

        if n_items == 0:
            return self.recommender_df

        per_user_recommendations = []
        for user_id, user_vector in zip(users_df['user_id'].values, user_matrix):
            model_input = np.hstack([np.tile(user_vector, (n_items, 1)), item_matrix])

            scores = self.model.predict(model_input)
            chosen = np.argsort(-scores)[:n_recommendations]

            per_user_recommendations.append(pd.DataFrame({
                'user_id': np.full(len(chosen), user_id),
                'item_id': item_ids[chosen],
                'score': scores[chosen]
            }))

        if per_user_recommendations:
            # Single concatenation instead of growing the frame inside the loop.
            self.recommender_df = pd.concat(per_user_recommendations)

        return self.recommender_df


class LinearRegressionCBUIRecommender(ContentBasedUserItemRecommender):
    """Content-based recommender whose regressor is a LinearRegression."""

    def __init__(self, seed=6789, n_neg_per_pos=5, **model_params):
        """
        :param seed: Seed forwarded to the base recommender.
        :param n_neg_per_pos: Negative interactions sampled per positive one.
        :param model_params: Optional ``n_most_popular_items`` (cast to int, default 5).
        """
        super().__init__(
            seed=seed,
            n_neg_per_pos=n_neg_per_pos,
            n_most_popular_items=int(model_params.get('n_most_popular_items', 5)),
        )
        self.model = LinearRegression()


class SVRCBUIRecommender(ContentBasedUserItemRecommender):
    """Content-based recommender whose regressor is a Support Vector Regressor."""

    def __init__(self, seed=6789, n_neg_per_pos=5, **model_params):
        """
        :param seed: Seed forwarded to the base recommender.
        :param n_neg_per_pos: Negative interactions sampled per positive one.
        :param model_params: Optional ``n_most_popular_items`` (cast to int,
            default 5), ``kernel`` (default ``'rbf'``), ``C`` (default 1.0)
            and ``epsilon`` (default 0.1).
        """
        super().__init__(
            seed=seed,
            n_neg_per_pos=n_neg_per_pos,
            n_most_popular_items=int(model_params.get('n_most_popular_items', 5)),
        )
        self.kernel = model_params.get('kernel', 'rbf')
        self.C = model_params.get('C', 1.0)
        self.epsilon = model_params.get('epsilon', 0.1)
        self.model = SVR(kernel=self.kernel, C=self.C, epsilon=self.epsilon)


class RandomForestCBUIRecommender(ContentBasedUserItemRecommender):
    """Content-based recommender whose regressor is a RandomForestRegressor."""

    def __init__(self, seed=6789, n_neg_per_pos=5, **model_params):
        """
        :param seed: Seed forwarded to the base recommender and used as the
            forest's ``random_state`` so that repeated runs give the same model.
        :param n_neg_per_pos: Negative interactions sampled per positive one.
        :param model_params: Optional ``n_most_popular_items`` (default 5),
            ``n_estimators`` (default 100), ``max_depth`` (default 30) and
            ``min_samples_split`` (default 30); all are cast to int.
        """
        super().__init__(
            seed=seed,
            n_neg_per_pos=n_neg_per_pos,
            n_most_popular_items=int(model_params.get('n_most_popular_items', 5)),
        )
        self.n_estimators = int(model_params.get('n_estimators', 100))
        self.max_depth = int(model_params.get('max_depth', 30))
        self.min_samples_split = int(model_params.get('min_samples_split', 30))
        # random_state makes the bootstrap sampling and feature choices reproducible.
        self.model = RandomForestRegressor(
            n_estimators=self.n_estimators, max_depth=self.max_depth, min_samples_split=self.min_samples_split,
            random_state=seed)


class XGBoostCBUIRecommender(ContentBasedUserItemRecommender):
    """Content-based recommender using gradient-boosted trees.

    Despite its name, the model is scikit-learn's GradientBoostingRegressor.
    """

    def __init__(self, seed=6789, n_neg_per_pos=5, **model_params):
        """
        :param seed: Seed forwarded to the base recommender and used as the
            regressor's ``random_state``.
        :param n_neg_per_pos: Negative interactions sampled per positive one.
        :param model_params: Optional ``n_most_popular_items`` (default 5),
            ``n_estimators`` (default 100), ``max_depth`` (default 30) and
            ``min_samples_split`` (default 30), all cast to int, plus
            ``learning_rate`` (default 0.1).
        """
        super().__init__(
            seed=seed,
            n_neg_per_pos=n_neg_per_pos,
            n_most_popular_items=int(model_params.get('n_most_popular_items', 5)),
        )
        self.n_estimators = int(model_params.get('n_estimators', 100))
        self.max_depth = int(model_params.get('max_depth', 30))
        self.min_samples_split = int(model_params.get('min_samples_split', 30))
        # The previous fallback of 30 was far outside the usable range of a
        # shrinkage factor; 0.1 is scikit-learn's own default.
        self.learning_rate = model_params.get('learning_rate', 0.1)
        self.model = GradientBoostingRegressor(
            n_estimators=self.n_estimators, max_depth=self.max_depth, min_samples_split=self.min_samples_split,
            learning_rate=self.learning_rate, random_state=seed)

# Quick test of the recommender

In [6]:
# One row per distinct (item_id, raw categorical features) combination.
# NOTE: this rebinds `items_df`, which an earlier cell set to the numerical
# item features returned by FitDataToolkit.prepare_items_df.
items_df = input_interactions_df.loc[:, ['item_id'] + base_item_features].drop_duplicates()

In [7]:
import time
start_time = time.time()

# Default hyperparameters; fit does not read its users_df/items_df arguments,
# so None is passed for both.
cb_user_item_recommender = RandomForestCBUIRecommender()
cb_user_item_recommender.fit(input_interactions_df, None, None)

# Wall-clock duration of construction plus fit.
print("Fit function performance: %s seconds" % (time.time() - start_time))

Fit function performance: 11.654460191726685 seconds


In [9]:
# Ask for 10 recommendations for each of five sample users; the candidate
# items are derived from the full interactions frame.
recommendations = cb_user_item_recommender.recommend(pd.DataFrame([[1], [2], [3], [4], [5]], columns=['user_id']),
                                                     input_interactions_df, 10)

# Attach the raw item features so the recommendations are readable.
recommendations = pd.merge(recommendations, items_df, on='item_id', how='left')
display(recommendations)

Unnamed: 0,user_id,item_id,score,term,length_of_stay_bucket,rate_plan,room_segment,n_people_bucket,weekend_stay
0,1.0,55.0,1.0,OffSeason,[2-3],Nonref,[160-260],[2-2],True
1,1.0,22.0,1.0,OffSeason,[2-3],Standard,[160-260],[3-4],True
2,1.0,61.0,0.998266,OffSeason,[4-7],Nonref,[160-260],[2-2],True
3,1.0,32.0,0.998205,OffSeason,[2-3],Standard,[160-260],[2-2],True
4,1.0,51.0,0.99814,OffSeason,[2-3],Nonref,[160-260],[3-4],True
5,1.0,53.0,0.99775,OffSeason,[2-3],Nonref,[160-260],[2-2],False
6,1.0,21.0,0.99775,OffSeason,[2-3],Standard,[160-260],[2-2],False
7,1.0,23.0,0.994248,OffSeason,[4-7],Standard,[160-260],[2-2],True
8,1.0,14.0,0.969455,OffSeason,[4-7],Standard,[160-260],[3-4],True
9,1.0,76.0,0.948566,OffSeason,[4-7],Nonref,[160-260],[3-4],True


# Tuning method

In [10]:
from evaluation_and_testing.testing import evaluate_train_test_split_implicit
from hyperopt import hp, fmin, tpe, Trials
import traceback

def tune_recommender(recommender_class, interactions_df, items_df,
                     param_space, max_evals=1, show_progressbar=True, seed=6789):
    """Tune a recommender with hyperopt and report its metrics on a held-out split.

    The interactions are shuffled with ``seed`` and split 80/20. Hyperopt
    minimises ``-HR@10`` as returned by ``evaluate_train_test_split_implicit``
    on the 80% part. The best parameter set is then evaluated with the 80%
    part as train and the 20% part as test, and the metrics are displayed.

    :param recommender_class: Recommender class; it is instantiated as
        ``recommender_class(seed=seed, **params)``.
    :param interactions_df: All available interactions.
    :param items_df: Items frame forwarded to the evaluation helper.
    :param param_space: hyperopt search space keyed by constructor parameter name.
    :param max_evals: Number of hyperopt trials.
    :param show_progressbar: Whether hyperopt shows its progress bar.
    :param seed: Seed for the split, the recommenders and the evaluation helper.
    :return: Best parameter dict found by hyperopt, or ``None`` if tuning failed.
    """
    # Split into train_validation and test sets

    shuffle = np.arange(len(interactions_df))
    rng = np.random.RandomState(seed=seed)
    rng.shuffle(shuffle)
    shuffle = list(shuffle)

    train_test_split = 0.8
    split_index = int(len(interactions_df) * train_test_split)

    train_validation = interactions_df.iloc[shuffle[:split_index]]
    test = interactions_df.iloc[shuffle[split_index:]]

    # Tune

    def loss(tuned_params):
        # hyperopt minimises, so the hit rate is negated.
        recommender = recommender_class(seed=seed, **tuned_params)
        hr1, hr3, hr5, hr10, ndcg1, ndcg3, ndcg5, ndcg10 = evaluate_train_test_split_implicit(
            recommender, train_validation, items_df, seed=seed)
        return -hr10

    n_tries = 1
    succeeded = False
    try_id = 0
    while not succeeded and try_id < n_tries:
        try:
            trials = Trials()
            best_param_set = fmin(loss, space=param_space, algo=tpe.suggest,
                                  max_evals=max_evals, show_progressbar=show_progressbar, trials=trials, verbose=True)
            succeeded = True
        except Exception:
            # Only ordinary errors are retried; KeyboardInterrupt and SystemExit
            # propagate so a long tuning run can still be interrupted.
            traceback.print_exc()
            try_id += 1

    if not succeeded:
        return None

    # Validate

    recommender = recommender_class(seed=seed, **best_param_set)

    results = [[recommender_class.__name__] + list(evaluate_train_test_split_implicit(
        recommender, {'train': train_validation, 'test': test}, items_df, seed=seed))]

    results = pd.DataFrame(results,
                           columns=['Recommender', 'HR@1', 'HR@3', 'HR@5', 'HR@10', 'NDCG@1', 'NDCG@3', 'NDCG@5',
                                    'NDCG@10'])

    display(results)

    return best_param_set

## Tuning of the recommender

<span style="color:red"><font size="4">**Task:**</font></span><br> 
Tune your models using the code below. You only need to put the class name of your recommender and choose an appropriate parameter space.

In [11]:
# Search space for the linear model: integer-stepped values, which hyperopt
# returns as floats (the recorded best parameters are 3.0 and 1.0).
param_space = {
    'n_neg_per_pos': hp.quniform('n_neg_per_pos', 1, 10, 1),
    'n_most_popular_items': hp.quniform('n_most_popular_items', 1, 8, 1)
}

# The recorded run of 80 trials took about 1 h 25 min.
linear_best_param_set = tune_recommender(LinearRegressionCBUIRecommender, input_interactions_df, items_df,
                                  param_space, max_evals=80, show_progressbar=True, seed=seed)

print("Best parameters:")
print(linear_best_param_set)

100%|██████████| 80/80 [1:25:10<00:00, 63.88s/trial, best loss: -0.2293694456199746]


Unnamed: 0,Recommender,HR@1,HR@3,HR@5,HR@10,NDCG@1,NDCG@3,NDCG@5,NDCG@10
0,LinearRegressionCBUIRecommender,0.049219,0.130007,0.174134,0.235234,0.049219,0.094502,0.113016,0.132582


Best parameters:
{'n_most_popular_items': 3.0, 'n_neg_per_pos': 1.0}


In [None]:
# Search space for the SVR model; C is sampled log-uniformly in [0.01, 100].
# NOTE: the task description warns that SVR training on all interactions can
# take many hours; the recorded output of this cell shows no finished trial.
param_space = {
    'n_neg_per_pos': hp.quniform('n_neg_per_pos', 1, 10, 1),
    'C': hp.loguniform('C', np.log(0.01), np.log(100.0))
}

svr_best_param_set = tune_recommender(SVRCBUIRecommender, input_interactions_df, items_df,
                                  param_space, max_evals=100, show_progressbar=True, seed=seed)

print("Best parameters:")
print(svr_best_param_set)

  0%|          | 0/100 [00:00<?, ?trial/s, best loss=?]

In [150]:
# Search space for the random forest; the tree parameters are cast to int in
# the recommender constructor.
param_space = {
    'n_neg_per_pos': hp.quniform('n_neg_per_pos', 1, 10, 1),
    'n_estimators': hp.quniform('n_estimators', 30, 300, 1),
    'max_depth': hp.quniform('max_depth', 2, 10, 1),
    'min_samples_split': hp.quniform('min_samples_split', 2, 30, 1)
}

# The recorded run of 100 trials took about 2 h 41 min.
random_forest_best_param_set = tune_recommender(RandomForestCBUIRecommender, input_interactions_df, items_df,
                                  param_space, max_evals=100, show_progressbar=True, seed=seed)

print("Best parameters:")
print(random_forest_best_param_set)

100%|██████████| 100/100 [2:41:05<00:00, 96.65s/trial, best loss: -0.047820567075751164]  


Unnamed: 0,Recommender,HR@1,HR@3,HR@5,HR@10,NDCG@1,NDCG@3,NDCG@5,NDCG@10
0,RandomForestCBUIRecommender,0.002037,0.010523,0.015614,0.026477,0.002037,0.007257,0.009346,0.012833


Best parameters:
{'max_depth': 3.0, 'min_samples_split': 9.0, 'n_estimators': 141.0, 'n_neg_per_pos': 1.0}
Stored 'random_forest_best_param_set' (dict)


In [151]:
# This tuning may take around 12 hours
# (the recorded run of 300 trials took about 6.5 hours).

# Search space for gradient boosting; learning_rate is sampled log-uniformly
# in [0.001, 0.1].
param_space = {
    'n_neg_per_pos': hp.quniform('n_neg_per_pos', 1, 10, 1),
    'n_estimators': hp.quniform('n_estimators', 10, 300, 1),
    'max_depth': hp.quniform('max_depth', 2, 10, 1),
    'min_samples_split': hp.quniform('min_samples_split', 2, 30, 1),
    'learning_rate': hp.loguniform('learning_rate', np.log(0.001), np.log(0.1))
}

xg_boost_best_param_set = tune_recommender(XGBoostCBUIRecommender, input_interactions_df, items_df,
                                  param_space, max_evals=300, show_progressbar=True, seed=seed)

print("Best parameters:")
print(xg_boost_best_param_set)

100%|██████████| 300/300 [6:27:38<00:00, 77.53s/trial, best loss: -0.058823529411764705]  


Unnamed: 0,Recommender,HR@1,HR@3,HR@5,HR@10,NDCG@1,NDCG@3,NDCG@5,NDCG@10
0,XGBoostCBUIRecommender,0.014257,0.03666,0.04888,0.073999,0.014257,0.027547,0.032721,0.040834


Best parameters:
{'learning_rate': 0.09880843325335834, 'max_depth': 3.0, 'min_samples_split': 14.0, 'n_estimators': 298.0, 'n_neg_per_pos': 10.0}
Stored 'xg_boost_best_param_set' (dict)


# Final evaluation

<span style="color:red"><font size="4">**Task:**</font></span><br> 
Run the final evaluation of your recommender and present its results against the Amazon recommender's results. You can present results for several of your recommenders. You just need to give the class name of your recommender and its tuned parameters below. If you present results for several recommenders, you should add a separate cell for each recommender and change the names of the DataFrames containing results.

In [12]:
# Best parameters found by the tuning cell, hard-coded so that this cell does
# not require re-running the tuning.
linear_best_param_set = {'n_most_popular_items': 3.0, 'n_neg_per_pos': 1.0}
cb_user_item_recommender = LinearRegressionCBUIRecommender(seed=seed, **linear_best_param_set)

# One result row: the recommender name followed by the evaluation metrics.
linear_cbui_tts_results = pd.DataFrame(
    [[
        'LinearRegressionCBUIRecommender',
        *evaluate_train_test_split_implicit(cb_user_item_recommender, input_interactions_df, items_df),
    ]],
    columns=['Recommender', 'HR@1', 'HR@3', 'HR@5', 'HR@10', 'NDCG@1', 'NDCG@3', 'NDCG@5', 'NDCG@10'])

display(linear_cbui_tts_results)

Unnamed: 0,Recommender,HR@1,HR@3,HR@5,HR@10,NDCG@1,NDCG@3,NDCG@5,NDCG@10
0,LinearRegressionCBUIRecommender,0.049219,0.130007,0.174134,0.235234,0.049219,0.094502,0.113016,0.132582


In [13]:
from recommenders.amazon_recommender import AmazonRecommender

# Baseline to compare the content-based recommenders against.
amazon_recommender = AmazonRecommender()

# One result row: the recommender name followed by the evaluation metrics.
amazon_tts_results = pd.DataFrame(
    [[
        'AmazonRecommender',
        *evaluate_train_test_split_implicit(amazon_recommender, input_interactions_df, items_df),
    ]],
    columns=['Recommender', 'HR@1', 'HR@3', 'HR@5', 'HR@10', 'NDCG@1', 'NDCG@3', 'NDCG@5', 'NDCG@10'])

display(amazon_tts_results)

Unnamed: 0,Recommender,HR@1,HR@3,HR@5,HR@10,NDCG@1,NDCG@3,NDCG@5,NDCG@10
0,AmazonRecommender,0.044128,0.118805,0.160557,0.223693,0.044128,0.086755,0.104216,0.124468


In [14]:
# Best parameters found by the tuning cell, hard-coded so that this cell does
# not require re-running the tuning.
random_forest_best_param_set = {
    'max_depth': 3.0,
    'min_samples_split': 9.0,
    'n_estimators': 141.0,
    'n_neg_per_pos': 1.0,
}
cb_user_item_recommender = RandomForestCBUIRecommender(seed=seed, **random_forest_best_param_set)

# One result row: the recommender name followed by the evaluation metrics.
random_forest_cbui_tts_results = pd.DataFrame(
    [[
        'RandomForestCBUIRecommender',
        *evaluate_train_test_split_implicit(cb_user_item_recommender, input_interactions_df, items_df),
    ]],
    columns=['Recommender', 'HR@1', 'HR@3', 'HR@5', 'HR@10', 'NDCG@1', 'NDCG@3', 'NDCG@5', 'NDCG@10'])

display(random_forest_cbui_tts_results)

Unnamed: 0,Recommender,HR@1,HR@3,HR@5,HR@10,NDCG@1,NDCG@3,NDCG@5,NDCG@10
0,RandomForestCBUIRecommender,0.001697,0.013917,0.025798,0.039036,0.001697,0.008741,0.013485,0.017891


In [15]:
# Best parameters found by the tuning cell, hard-coded so that this cell does
# not require re-running the tuning.
xg_boost_best_param_set = {
    'learning_rate': 0.09880843325335834,
    'max_depth': 3.0,
    'min_samples_split': 14.0,
    'n_estimators': 298.0,
    'n_neg_per_pos': 10.0,
}
cb_user_item_recommender = XGBoostCBUIRecommender(seed=seed, **xg_boost_best_param_set)

# One result row: the recommender name followed by the evaluation metrics.
xg_boost_cbui_tts_results = pd.DataFrame(
    [[
        'XGBoostCBUIRecommender',
        *evaluate_train_test_split_implicit(cb_user_item_recommender, input_interactions_df, items_df),
    ]],
    columns=['Recommender', 'HR@1', 'HR@3', 'HR@5', 'HR@10', 'NDCG@1', 'NDCG@3', 'NDCG@5', 'NDCG@10'])

display(xg_boost_cbui_tts_results)

Unnamed: 0,Recommender,HR@1,HR@3,HR@5,HR@10,NDCG@1,NDCG@3,NDCG@5,NDCG@10
0,XGBoostCBUIRecommender,0.013917,0.03632,0.046504,0.075017,0.013917,0.027297,0.031519,0.040636


In [16]:
# Stack the per-recommender result rows into one comparison table with a fresh 0..n-1 index.
tts_results = pd.concat(
    [
        linear_cbui_tts_results,
        amazon_tts_results,
        random_forest_cbui_tts_results,
        xg_boost_cbui_tts_results,
    ],
    ignore_index=True,
)
display(tts_results)

Unnamed: 0,Recommender,HR@1,HR@3,HR@5,HR@10,NDCG@1,NDCG@3,NDCG@5,NDCG@10
0,LinearRegressionCBUIRecommender,0.049219,0.130007,0.174134,0.235234,0.049219,0.094502,0.113016,0.132582
1,AmazonRecommender,0.044128,0.118805,0.160557,0.223693,0.044128,0.086755,0.104216,0.124468
2,RandomForestCBUIRecommender,0.001697,0.013917,0.025798,0.039036,0.001697,0.008741,0.013485,0.017891
3,XGBoostCBUIRecommender,0.013917,0.03632,0.046504,0.075017,0.013917,0.027297,0.031519,0.040636
