In [2]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import Markdown, display, HTML
from collections import defaultdict

# Fix the dying kernel problem (only a problem in some installations - you can remove it, if it works without it)
import os
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'

# Load the dataset for recommenders

In [3]:
data_path = os.path.join("data", "hotel_data")

interactions_df = pd.read_csv(os.path.join(data_path, "hotel_data_interactions_df.csv"), index_col=0)

base_item_features = ['term', 'length_of_stay_bucket', 'rate_plan', 'room_segment', 'n_people_bucket', 'weekend_stay']

column_values_dict = {
    'term': ['WinterVacation', 'Easter', 'OffSeason', 'HighSeason', 'LowSeason', 'MayLongWeekend', 'NewYear', 'Christmas'],
    'length_of_stay_bucket': ['[0-1]', '[2-3]', '[4-7]', '[8-inf]'],
    'rate_plan': ['Standard', 'Nonref'],
    'room_segment': ['[0-160]', '[160-260]', '[260-360]', '[360-500]', '[500-900]'],
    'n_people_bucket': ['[1-1]', '[2-2]', '[3-4]', '[5-inf]'],
    'weekend_stay': ['True', 'False']
}

interactions_df['term'] = pd.Categorical(
    interactions_df['term'], categories=column_values_dict['term'])
interactions_df['length_of_stay_bucket'] = pd.Categorical(
    interactions_df['length_of_stay_bucket'], categories=column_values_dict['length_of_stay_bucket'])
interactions_df['rate_plan'] = pd.Categorical(
    interactions_df['rate_plan'], categories=column_values_dict['rate_plan'])
interactions_df['room_segment'] = pd.Categorical(
    interactions_df['room_segment'], categories=column_values_dict['room_segment'])
interactions_df['n_people_bucket'] = pd.Categorical(
    interactions_df['n_people_bucket'], categories=column_values_dict['n_people_bucket'])
interactions_df['weekend_stay'] = interactions_df['weekend_stay'].astype('str')
interactions_df['weekend_stay'] = pd.Categorical(
    interactions_df['weekend_stay'], categories=column_values_dict['weekend_stay'])

display(interactions_df.head(15))

Unnamed: 0,user_id,item_id,term,length_of_stay_bucket,rate_plan,room_segment,n_people_bucket,weekend_stay
0,1,0,WinterVacation,[2-3],Standard,[260-360],[5-inf],True
1,2,1,WinterVacation,[2-3],Standard,[160-260],[3-4],True
2,3,2,WinterVacation,[2-3],Standard,[160-260],[2-2],False
3,4,3,WinterVacation,[4-7],Standard,[160-260],[3-4],True
4,5,4,WinterVacation,[4-7],Standard,[0-160],[2-2],True
5,6,5,Easter,[4-7],Standard,[260-360],[5-inf],True
6,7,6,OffSeason,[2-3],Standard,[260-360],[5-inf],True
7,8,7,HighSeason,[2-3],Standard,[160-260],[1-1],True
8,9,8,HighSeason,[2-3],Standard,[0-160],[1-1],True
9,8,7,HighSeason,[2-3],Standard,[160-260],[1-1],True


# Define user features based on reservations

The content-based recommenders will be forecasting the probability of interaction between user and item based on user features vector and item features vector:

<center>
$$
    r_{u, i} = f(user\_features, item\_features)
$$
</center>

<span style="color:red"><font size="4">**Task:**</font></span><br> 
Design numerical user features based on user reservations. Code the following method which for a given interactions DataFrame (it will be used in the fit method of the recommender) returns a DataFrame with user_id and user features as well as a list with names of user features (this will be important to select the right columns for an ML algorithm). Remember to name the columns differently than item features which you will create in the next task. Validate your features on users with several interactions (sample user ids are already given below).

Ideas for user features:
- Find the vector of most popular feature values from all user reservations and encode every feature with one-hot encoding.
- For every reservation feature calculate the probability distribution of its values among all user's reservations.
- For numerical buckets (length_of_stay, room_segment, n_people) you can calculate the average value for every user from their reservations (you will have to map the buckets back to numerical values before averaging them).

Remember that you will have to select the best features (with the highest explanatory power). Using all above features at once would make the number of variables too large for this dataset and would also introduce too much correlations between features.

You can also prepare several versions of the prepare_users_df method and test which works best in your recommender.

In [19]:
def map_bucket_to_value(bucket, feature):
        if feature == 'length_of_stay':
            if bucket == '[0-1]':
                return 1
            elif bucket == '[2-3]':
                return 2.5
            elif bucket == '[4-7]':
                return 5.5
            elif bucket == '[8-inf]':
                return 10
        elif feature == 'n_people':
            if bucket == '[1-1]':
                return 1
            elif bucket == '[2-2]':
                return 2
            elif bucket == '[3-4]':
                return 3.5
            elif bucket == '[5-inf]':
                return 5.5
        elif feature == 'room_segment':
            if bucket == '[0-160]':
                return 80
            elif bucket == '[160-260]':
                return 210
            elif bucket == '[260-360]':
                return 310
            elif bucket == '[360-500]':
                return 430
            elif bucket == '[500-900]':
                return 700
            elif bucket == '[900-inf]':
                return 1000
        return np.nan
def prepare_users_df(interactions_df):
    #display(interactions_df)
    # Aggregate user interactions by user and item
    user_item_counts = interactions_df.groupby(['user_id', 'item_id']).size().reset_index(name='count')
    item_features_df = interactions_df.copy().drop(columns = ["user_id"])
    # Merge item features onto user interactions
    user_item_features = pd.merge(user_item_counts, item_features_df, on='item_id')
    # Get list of categorical and numerical feature names
    categorical_feature_names = ["term", "rate_plan", "weekend_stay"]
    numerical_feature_names = ["length_of_stay", "room_segment", "n_people"]

    # One-hot encode categorical item features
    # item_feature_cols = []
    # for column in categorical_feature_names:
    #     encoded_feature = pd.get_dummies(item_features_df[column], prefix=column)
    #     item_feature_cols += encoded_feature.columns.tolist()
    #     user_item_features = pd.concat([user_item_features, encoded_feature], axis=1)




    # Encode categorical user features based on most popular feature values
    #user_categorical_features = user_item_features.groupby('user_id')[categorical_feature_names].apply(lambda x: x.mode().iloc[0]).reset_index()
    #for column in categorical_feature_names:
    #    encoded_feature = pd.get_dummies(user_categorical_features[column], prefix=column)
    #    user_categorical_features = pd.concat([user_categorical_features, encoded_feature], axis=1)
    # Calculate probability distribution of values for each reservation feature
    user_prob_features = pd.DataFrame()
    for column in categorical_feature_names:
        feature_probs = interactions_df.groupby(['user_id', column]).size().reset_index(name='count')
        feature_probs['prob'] = feature_probs['count'] / feature_probs.groupby('user_id')['count'].transform('sum')
        feature_probs_pivot = feature_probs.pivot(index='user_id', columns=column, values='prob')
        feature_probs_pivot.columns = [f'{column}_prob_{col}' for col in feature_probs_pivot.columns]
        user_prob_features = pd.concat([user_prob_features, feature_probs_pivot], axis=1)
    # Calculate average numerical user features from reservations
    user_numerical_features = interactions_df.groupby('user_id').agg({
            'length_of_stay_bucket': lambda x: map_bucket_to_value(x.mode()[0], 'length_of_stay'),
            'room_segment': lambda x: map_bucket_to_value(x.mode()[0], 'room_segment'),
            'n_people_bucket': lambda x: map_bucket_to_value(x.mode()[0], 'n_people')
        }).reset_index()
    user_numerical_features.columns = ['user_id'] + [f'{col}_avg' for col in numerical_feature_names]
    
    # Merge user-level features
    #user_features = pd.merge(user_categorical_features, user_prob_features, on='user_id')
    user_features = pd.merge(user_prob_features, user_numerical_features, on='user_id')
    # Get list of feature names
    feature_names = user_features.columns.tolist()
    feature_names.remove("user_id")
    return user_features, feature_names
    
users_df, feature_names = prepare_users_df(interactions_df)
print(feature_names)

display(users_df.loc[users_df['user_id'].isin([706, 1736, 7779, 96, 1, 50, 115])].head(15))

['term_prob_WinterVacation', 'term_prob_Easter', 'term_prob_OffSeason', 'term_prob_HighSeason', 'term_prob_LowSeason', 'term_prob_MayLongWeekend', 'term_prob_NewYear', 'term_prob_Christmas', 'rate_plan_prob_Standard', 'rate_plan_prob_Nonref', 'weekend_stay_prob_True', 'weekend_stay_prob_False', 'length_of_stay_avg', 'room_segment_avg', 'n_people_avg']


Unnamed: 0,user_id,term_prob_WinterVacation,term_prob_Easter,term_prob_OffSeason,term_prob_HighSeason,term_prob_LowSeason,term_prob_MayLongWeekend,term_prob_NewYear,term_prob_Christmas,rate_plan_prob_Standard,rate_plan_prob_Nonref,weekend_stay_prob_True,weekend_stay_prob_False,length_of_stay_avg,room_segment_avg,n_people_avg
0,1,0.090909,0.0,0.681818,0.090909,0.136364,0.0,0.0,0.0,0.5,0.5,0.772727,0.227273,2.5,210,2.0
40,50,0.043478,0.0,0.434783,0.304348,0.217391,0.0,0.0,0.0,0.26087,0.73913,0.782609,0.217391,2.5,210,3.5
84,96,0.090909,0.0,0.681818,0.136364,0.045455,0.045455,0.0,0.0,0.272727,0.727273,0.727273,0.272727,2.5,210,3.5
102,115,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,2.5,210,1.0
371,706,0.095238,0.0,0.511905,0.190476,0.142857,0.047619,0.011905,0.0,0.97619,0.02381,0.583333,0.416667,2.5,210,3.5
1383,1736,0.034483,0.0,0.482759,0.206897,0.275862,0.0,0.0,0.0,0.172414,0.827586,0.448276,0.551724,2.5,210,2.0
7301,7779,0.0,0.0,0.5,0.0,0.5,0.0,0.0,0.0,1.0,0.0,0.75,0.25,5.5,210,3.5


# Prepare numerical item features

<span style="color:red"><font size="4">**Task:**</font></span><br> 
Code the prepare_items_df method which will be used in the recommender fit and recommend methods to map items to numerical features. This method should take the interactions_df DataFrame as input and return a DataFrame containing one record per item_id with item_id column and numerical item feature columns.

You can try turning all item features into one-hot representations. You can use the get_dummies method from pandas. It will return the same columns on any dataset of interactions because the categorical variables with all possible values have been defined in the second cell in this notebook.

You are welcome to design your own numerical item features, for instance based on numerical min and max values in buckets used as features.

In [14]:
def prepare_items_df(interactions_df):
    item_features_df = interactions_df.copy().drop(columns = ["item_id"])
    if "user_id" in interactions_df:
        item_features_df = item_features_df.drop(columns = ["user_id"])
    item_features = []
    items_df = interactions_df["item_id"]
    for column in item_features_df.columns:
        encoded_feature = pd.get_dummies(item_features_df[column], prefix=column)
        item_features += encoded_feature.columns.tolist()
        items_df = pd.concat([items_df, encoded_feature], axis=1)
    return items_df, item_features


items_df, item_features = prepare_items_df(interactions_df)

print(item_features)

display(items_df.loc[items_df['item_id'].isin([0, 1, 2, 3, 4, 5, 6])].head(15))

['term_WinterVacation', 'term_Easter', 'term_OffSeason', 'term_HighSeason', 'term_LowSeason', 'term_MayLongWeekend', 'term_NewYear', 'term_Christmas', 'length_of_stay_bucket_[0-1]', 'length_of_stay_bucket_[2-3]', 'length_of_stay_bucket_[4-7]', 'length_of_stay_bucket_[8-inf]', 'rate_plan_Standard', 'rate_plan_Nonref', 'room_segment_[0-160]', 'room_segment_[160-260]', 'room_segment_[260-360]', 'room_segment_[360-500]', 'room_segment_[500-900]', 'n_people_bucket_[1-1]', 'n_people_bucket_[2-2]', 'n_people_bucket_[3-4]', 'n_people_bucket_[5-inf]', 'weekend_stay_True', 'weekend_stay_False']


Unnamed: 0,item_id,term_WinterVacation,term_Easter,term_OffSeason,term_HighSeason,term_LowSeason,term_MayLongWeekend,term_NewYear,term_Christmas,length_of_stay_bucket_[0-1],...,room_segment_[160-260],room_segment_[260-360],room_segment_[360-500],room_segment_[500-900],n_people_bucket_[1-1],n_people_bucket_[2-2],n_people_bucket_[3-4],n_people_bucket_[5-inf],weekend_stay_True,weekend_stay_False
0,0,True,False,False,False,False,False,False,False,False,...,False,True,False,False,False,False,False,True,True,False
1,1,True,False,False,False,False,False,False,False,False,...,True,False,False,False,False,False,True,False,True,False
2,2,True,False,False,False,False,False,False,False,False,...,True,False,False,False,False,True,False,False,False,True
3,3,True,False,False,False,False,False,False,False,False,...,True,False,False,False,False,False,True,False,True,False
4,4,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,True,False,False,True,False
5,5,False,True,False,False,False,False,False,False,False,...,False,True,False,False,False,False,False,True,True,False
6,6,False,False,True,False,False,False,False,False,False,...,False,True,False,False,False,False,False,True,True,False
44,6,False,False,True,False,False,False,False,False,False,...,False,True,False,False,False,False,False,True,True,False
47,6,False,False,True,False,False,False,False,False,False,...,False,True,False,False,False,False,False,True,True,False
51,6,False,False,True,False,False,False,False,False,False,...,False,True,False,False,False,False,False,True,True,False


# Content-based recommender

<span style="color:red"><font size="4">**Task:**</font></span><br> 
Code the content-based recommender. User features should be calculated within the fit method based on available training data and should be saved in the object as self.users_df for later use in the recommend method. Item features should be calculated both in the fit method (from interactions_df) and in the recommend method (from items_df - the items to be evaluated).

In the fit method you have to randomly generate non-existing interactions and add them to the training data for the regressor. You should add the target variable to interactions - equal to 1 for real ("positive") interactions and equal to 0 for those newly added "negative" interactions. Generate several negative interactions per every positive interaction (n_neg_per_pos). Treat the proportion as a tunable parameter of the model.

Remember to keep control over randomness - in the init method add seed as a parameter and initialize the random seed generator with that seed:

```python
self.seed = seed
self.rng = np.random.RandomState(seed=seed)
```

Below the base content-based recommender class there are several classes which inherit from the base class and use different ML models:
  - LinearRegressionCBUIRecommender - based on linear regression,
  - SVRCBUIRecommender - based on Support Vector Regressor (if you want to test it, sample the data in the fit method, as the training can take many hours on the entire dataset of interactions),
  - RandomForestCBUIRecommender - based on Random Forest,
  - XGBoostCBUIRecommender - based on XGBoost.
  
There is no need to change anything in those inheriting classes, although you can experiment with other tunable parameters of the underlying models.

You are encouraged to experiment with:
  - Other numerical user and item features (but always train and evaluate the model on buckets defined in the first notebook).
  - Other ML models, e.g. Huber regression, Lasso regression, Ridge regression, LARS regression, Linear SVR, Decision Tree, Naive Bayes, LightGBM, Neural Networks or any model of your choice.

In [15]:
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor

from recommenders.recommender import Recommender


class ContentBasedUserItemRecommender(Recommender):
    """
    Linear recommender class based on user and item features.
    """
    
    def __init__(self, seed=6789, n_neg_per_pos=5):
        """
        Initialize base recommender params and variables.
        """
        self.model = LinearRegression()
        self.n_neg_per_pos = n_neg_per_pos
        
        self.recommender_df = pd.DataFrame(columns=['user_id', 'item_id', 'score'])
        self.users_df = None
        self.user_features = None
        
        self.seed = seed
        self.rng = np.random.RandomState(seed=seed)
    
    def fit(self, interactions_df, users_df, items_df):
        """
        Training of the recommender.
        
        :param pd.DataFrame interactions_df: DataFrame with recorded interactions between users and items 
            defined by user_id, item_id and features of the interaction.
        :param pd.DataFrame users_df: DataFrame with users and their features defined by user_id and the user feature columns.
        :param pd.DataFrame items_df: DataFrame with items and their features defined by item_id and the item feature columns.
        """
        
        interactions_df = interactions_df.copy()
        
        # Prepare users_df and items_df
        
        users_df, user_features = prepare_users_df(interactions_df)
        
        self.users_df = users_df
        self.user_features = user_features
        items_df, item_features = prepare_items_df(interactions_df)
        items_df = items_df.loc[:, ['item_id'] + item_features]
       
        # Generate negative interactions
        
        interactions_df = interactions_df.loc[:, ['user_id', 'item_id']]
        
        interactions_df.loc[:, 'interacted'] = 1
        #display(interactions_df)
        negative_interactions = []
        
        ########################
        # Write your code here #
        # Generate tuples (user_id, item_id, 0) for pairs (user_id, item_id) which do not
        # appear in the interactions_df and add those tuples to the list negative_interactions.
        # Generate self.n_neg_per_pos * len(interactions_df) negative interactions 
        # (self.n_neg_per_pos per one positive).
        # Make sure the code is efficient and runs fast, otherwise you will not be able to properly tune your model.

        #start
        all_users = interactions_df["user_id"].unique()
        all_items = interactions_df["item_id"]
        for user_id in all_users:
            user_items = interactions_df.loc[interactions_df["user_id"] == user_id]["item_id"]
            count_user_interactions = len(interactions_df.loc[interactions_df["user_id"] == user_id])
            neg_items = list(set(all_items.copy()) - set(user_items))
            for _ in range(count_user_interactions * self.n_neg_per_pos):
                item_id = np.random.choice(neg_items)
                negative_interactions.append((user_id, item_id, 0))
                
        #end
        interactions_df = pd.concat(
            [interactions_df, pd.DataFrame(negative_interactions, columns=['user_id', 'item_id', 'interacted'])])
        
        # Get the input data for the model
        
        interactions_df = pd.merge(interactions_df, users_df, on=['user_id'])
        interactions_df = pd.merge(interactions_df, items_df, on=['item_id'])
        
        x = interactions_df.loc[:, user_features + item_features].values
        y = interactions_df['interacted'].values
    
        self.model.fit(x, y)
    
    def recommend(self, users_df, items_df, n_recommendations=1):
        """
        Serving of recommendations. Scores items in items_df for each user in users_df and returns 
        top n_recommendations for each user.
        
        :param pd.DataFrame users_df: DataFrame with users and their features for which recommendations should be generated.
        :param pd.DataFrame items_df: DataFrame with items and their features which should be scored.
        :param int n_recommendations: Number of recommendations to be returned for each user.
        :return: DataFrame with user_id, item_id and score as columns returning n_recommendations top recommendations 
            for each user.
        :rtype: pd.DataFrame
        """
        
        # Clean previous recommendations (iloc could be used alternatively)
        self.recommender_df = self.recommender_df[:0]
        
        ########################
        # Write your code here #
        # Prepare users_df and items_df
        # For users_df you need to merge user features from self.users_df to users_df 
        # (the users for which you generate recommendations).
        # Note that for users who were not in the hotel before (which is true for most users)
        # there will be no features in self.users_df. For such users you can initialize their features
        # with all zeros (for instance with fillna(0)), but you can also try to use average feature
        # values from self.users_df (this way you would trear a new user as an average user).
        # For items you have to apply the prepare_items_df method to items_df.

        #start
        
        #only for tests/ delete later
        self.users_df, self.user_features = prepare_users_df(interactions_df)
        
        # end of code for test
        users_df = users_df.merge(self.users_df, on='user_id', how='left')
        users_df = users_df.fillna(0)
        items_df, item_features = prepare_items_df(interactions_df)
        #items_df = items_df.loc[:, ['item_id'] + item_features]
        #end
        
        # Score the items
    
        recommendations = pd.DataFrame(columns=['user_id', 'item_id', 'score'])
        
        for ix, user in users_df.iterrows():
            
            ########################
            # Write your code here #
            # Create a Carthesian product of users from users_df and items from items_df

            #start

            #cartesian_df = users_df.merge(items_df)
            cartesian_df = pd.merge(users_df, items_df, how='cross')
            #end

            ########################
            # Write your code here #
            # Use self.model.predict method to calculate scores for all records in the just created DataFrame
            # of users and items
            
            #start
           
            scores = self.model.predict(cartesian_df.loc[:, self.user_features + item_features].values)
            #end

            ########################
            # Write your code here #
            # Obtain item ids with the highest score and save those ids under the chosen_ids variable
            # Do not exclude already booked items.
            
            #start
            chosen_ids = np.argsort(-scores)[:n_recommendations]
            #print(chosen_ids)
            #end
            
            
            
            recommendations = []
            for item_id in chosen_ids:
                recommendations.append(
                    {
                        'user_id': user['user_id'],
                        'item_id': cartesian_df.iloc[item_id]['item_id'],
                        'score': scores[item_id]
                    }
                )
            
            user_recommendations = pd.DataFrame(recommendations)

            self.recommender_df = pd.concat([self.recommender_df, user_recommendations])

        return self.recommender_df
    
    
class LinearRegressionCBUIRecommender(ContentBasedUserItemRecommender):
    """
    Linear regression recommender class based on user and item features.
    """
    
    def __init__(self, seed=6789, n_neg_per_pos=5, **model_params):
        """
        Initialize base recommender params and variables.
        """
        super().__init__(seed=seed, n_neg_per_pos=n_neg_per_pos)
        self.model = LinearRegression()
        
        
class SVRCBUIRecommender(ContentBasedUserItemRecommender):
    """
    SVR recommender class based on user and item features.
    """
    
    def __init__(self, seed=6789, n_neg_per_pos=5, **model_params):
        """
        Initialize base recommender params and variables.
        """
        super().__init__(seed=seed, n_neg_per_pos=n_neg_per_pos)
        if 'kernel' in model_params:
            self.kernel = model_params['kernel']
        else:
            self.kernel = 'rbf'
        if 'C' in model_params:
            self.C = model_params['C']
        else:
            self.C = 1.0
        if 'epsilon' in model_params:
            self.epsilon = model_params['epsilon']
        else:
            self.epsilon = 0.1
        self.model = SVR(kernel=self.kernel, C=self.C, epsilon=self.epsilon)
        
    
class RandomForestCBUIRecommender(ContentBasedUserItemRecommender):
    """
    Random forest recommender class based on user and item features.
    """
    
    def __init__(self, seed=6789, n_neg_per_pos=5, **model_params):
        """
        Initialize base recommender params and variables.
        """
        super().__init__(seed=seed, n_neg_per_pos=n_neg_per_pos)
        if 'n_estimators' in model_params:
            self.n_estimators = int(model_params['n_estimators'])
        else:
            self.n_estimators = 100
        if 'max_depth' in model_params:
            self.max_depth = int(model_params['max_depth'])
        else:
            self.max_depth = 30
        if 'min_samples_split' in model_params:
            self.min_samples_split = int(model_params['min_samples_split'])
        else:
            self.min_samples_split = 30
        self.model = RandomForestRegressor(
            n_estimators=self.n_estimators, max_depth=self.max_depth, min_samples_split=self.min_samples_split)
    
    
class XGBoostCBUIRecommender(ContentBasedUserItemRecommender):
    """
    XGBoost recommender class based on user and item features.
    """
    
    def __init__(self, seed=6789, n_neg_per_pos=5, **model_params):
        """
        Initialize base recommender params and variables.
        """
        super().__init__(seed=seed, n_neg_per_pos=n_neg_per_pos)
        if 'n_estimators' in model_params:
            self.n_estimators = int(model_params['n_estimators'])
        else:
            self.n_estimators = 100
        if 'max_depth' in model_params:
            self.max_depth = int(model_params['max_depth'])
        else:
            self.max_depth = 30
        if 'min_samples_split' in model_params:
            self.min_samples_split = int(model_params['min_samples_split'])
        else:
            self.min_samples_split = 30
        if 'learning_rate' in model_params:
            self.learning_rate = model_params['learning_rate']
        else:
            self.learning_rate = 30
        self.model = GradientBoostingRegressor(
            n_estimators=self.n_estimators, max_depth=self.max_depth, min_samples_split=self.min_samples_split,
            learning_rate=self.learning_rate)    

# Quick test of the recommender

In [16]:
items_df = interactions_df.loc[:, ['item_id'] + base_item_features].drop_duplicates()

In [17]:
# Fit method
cb_user_item_recommender = RandomForestCBUIRecommender()
cb_user_item_recommender.fit(interactions_df, None, None)

In [18]:
# Recommender method
#for tests
#cb_user_item_recommender = RandomForestCBUIRecommender()
#cb_user_item_recommender.fit(interactions_df, None, None)
# !!!!!!
recommendations = cb_user_item_recommender.recommend(pd.DataFrame([[1], [2], [3], [4], [14503]], columns=['user_id']), interactions_df, 10)

recommendations = pd.merge(recommendations, items_df, on='item_id', how='left')
display(recommendations)

Unnamed: 0,user_id,item_id,score,term,length_of_stay_bucket,rate_plan,room_segment,n_people_bucket,weekend_stay
0,1.0,0,1.0,WinterVacation,[2-3],Standard,[260-360],[5-inf],True
1,1.0,150,1.0,OffSeason,[4-7],Nonref,[160-260],[1-1],True
2,1.0,54,1.0,OffSeason,[2-3],Nonref,[160-260],[2-2],True
3,1.0,21,1.0,OffSeason,[2-3],Standard,[160-260],[2-2],False
4,1.0,50,1.0,OffSeason,[2-3],Nonref,[160-260],[3-4],True
5,1.0,137,1.0,OffSeason,[2-3],Nonref,[160-260],[1-1],True
6,1.0,54,1.0,OffSeason,[2-3],Nonref,[160-260],[2-2],True
7,1.0,272,1.0,OffSeason,[0-1],Nonref,[160-260],[3-4],True
8,1.0,272,1.0,OffSeason,[0-1],Nonref,[160-260],[3-4],True
9,1.0,50,1.0,OffSeason,[2-3],Nonref,[160-260],[3-4],True


# Tuning method

In [13]:
from evaluation_and_testing.testing import evaluate_train_test_split_implicit

seed = 6789

In [13]:
from hyperopt import hp, fmin, tpe, Trials
import traceback

def tune_recommender(recommender_class, interactions_df, items_df, 
                     param_space, max_evals=1, show_progressbar=True, seed=6789):
    # Split into train_validation and test sets

    shuffle = np.arange(len(interactions_df))
    rng = np.random.RandomState(seed=seed)
    rng.shuffle(shuffle)
    shuffle = list(shuffle)

    train_test_split = 0.8
    split_index = int(len(interactions_df) * train_test_split)

    train_validation = interactions_df.iloc[shuffle[:split_index]]
    test = interactions_df.iloc[shuffle[split_index:]]

    # Tune

    def loss(tuned_params):
        recommender = recommender_class(seed=seed, **tuned_params)
        hr1, hr3, hr5, hr10, ndcg1, ndcg3, ndcg5, ndcg10 = evaluate_train_test_split_implicit(
            recommender, train_validation, items_df, seed=seed)
        return -hr10

    n_tries = 1
    succeded = False
    try_id = 0
    while not succeded and try_id < n_tries:
        try:
            trials = Trials()
            best_param_set = fmin(loss, space=param_space, algo=tpe.suggest, 
                                  max_evals=max_evals, show_progressbar=show_progressbar, trials=trials, verbose=True)
            succeded = True
        except:
            traceback.print_exc()
            try_id += 1
            
    if not succeded:
        return None
        
    # Validate
    
    recommender = recommender_class(seed=seed, **best_param_set)

    results = [[recommender_class.__name__] + list(evaluate_train_test_split_implicit(
        recommender, {'train': train_validation, 'test': test}, items_df, seed=seed))]

    results = pd.DataFrame(results, 
                           columns=['Recommender', 'HR@1', 'HR@3', 'HR@5', 'HR@10', 'NDCG@1', 'NDCG@3', 'NDCG@5', 'NDCG@10'])

    display(results)
    
    return best_param_set

## Tuning of the recommender

<span style="color:red"><font size="4">**Task:**</font></span><br> 
Tune your models using the code below. You only need to put the class name of your recommender and choose an appropriate parameter space.

In [14]:
param_space = {
    'n_neg_per_pos': hp.quniform('n_neg_per_pos', 1, 10, 1)
}

best_param_set = tune_recommender(LinearRegressionCBUIRecommender, interactions_df, items_df,
                                  param_space, max_evals=10, show_progressbar=True, seed=seed)

print("Best parameters:")
print(best_param_set)

  0%|          | 0/10 [00:00<?, ?trial/s, best loss=?]

Unnamed: 0,user_id,item_id,term,length_of_stay_bucket,rate_plan,room_segment,n_people_bucket,weekend_stay
83,1,32,OffSeason,[2-3],Standard,[160-260],[2-2],True
0,1,0,WinterVacation,[2-3],Standard,[260-360],[5-inf],True
11267,1,54,OffSeason,[2-3],Nonref,[160-260],[2-2],True
5509,1,32,OffSeason,[2-3],Standard,[160-260],[2-2],True
4127,1,23,OffSeason,[4-7],Standard,[160-260],[2-2],True
10178,1,54,OffSeason,[2-3],Nonref,[160-260],[2-2],True
84,1,32,OffSeason,[2-3],Standard,[160-260],[2-2],True
318,1,59,WinterVacation,[2-3],Nonref,[160-260],[2-2],True
9532,1,23,OffSeason,[4-7],Standard,[160-260],[2-2],True
9972,1,21,OffSeason,[2-3],Standard,[160-260],[2-2],False


Unnamed: 0,user_id,item_id,count,term,length_of_stay_bucket,rate_plan,room_segment,n_people_bucket,weekend_stay,term_WinterVacation,...,term_OffSeason,term_HighSeason,term_LowSeason,term_MayLongWeekend,term_NewYear,term_Christmas,rate_plan_Standard,rate_plan_Nonref,weekend_stay_True,weekend_stay_False
0,1,0,1,WinterVacation,[2-3],Standard,[260-360],[5-inf],True,True,...,False,False,False,False,False,False,True,False,True,False
1,1,0,1,WinterVacation,[2-3],Standard,[260-360],[5-inf],True,True,...,False,False,False,False,False,False,True,False,True,False
2,1,0,1,WinterVacation,[2-3],Standard,[260-360],[5-inf],True,True,...,False,False,False,False,False,False,True,False,False,True
3,1,0,1,WinterVacation,[2-3],Standard,[260-360],[5-inf],True,,...,,,,,,,,,,
4,1,0,1,WinterVacation,[2-3],Standard,[260-360],[5-inf],True,True,...,False,False,False,False,False,False,True,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
922322,14460,738,1,OffSeason,[2-3],Standard,[0-160],[5-inf],True,,...,,,,,,,,,,
922323,14460,738,1,OffSeason,[2-3],Standard,[0-160],[5-inf],True,,...,,,,,,,,,,
922324,14490,738,1,OffSeason,[2-3],Standard,[0-160],[5-inf],True,,...,,,,,,,,,,
922325,14490,738,1,OffSeason,[2-3],Standard,[0-160],[5-inf],True,,...,,,,,,,,,,


Unnamed: 0,user_id,term_prob_WinterVacation,term_prob_Easter,term_prob_OffSeason,term_prob_HighSeason,term_prob_LowSeason,term_prob_MayLongWeekend,term_prob_NewYear,term_prob_Christmas,rate_plan_prob_Standard,rate_plan_prob_Nonref,weekend_stay_prob_True,weekend_stay_prob_False,length_of_stay_avg,room_segment_avg,n_people_avg
0,1,0.133333,0.0,0.8,0.066667,0.0,0.0,0.0,0.0,0.533333,0.466667,0.8,0.2,2.5,210,2.0
1,2,1.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,1.000000,0.000000,1.0,0.0,2.5,210,3.5
2,3,1.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,1.000000,0.000000,0.0,1.0,2.5,210,2.0
3,5,1.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,1.000000,0.000000,1.0,0.0,5.5,80,2.0
4,6,0.000000,1.0,0.0,0.000000,0.0,0.0,0.0,0.0,1.000000,0.000000,1.0,0.0,5.5,310,5.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9031,14498,0.000000,0.0,1.0,0.000000,0.0,0.0,0.0,0.0,1.000000,0.000000,1.0,0.0,2.5,210,1.0
9032,14499,0.000000,0.0,1.0,0.000000,0.0,0.0,0.0,0.0,1.000000,0.000000,1.0,0.0,10.0,210,2.0
9033,14500,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,1.0,1.000000,0.000000,1.0,0.0,2.5,310,3.5
9034,14501,0.000000,0.0,1.0,0.000000,0.0,0.0,0.0,0.0,1.000000,0.000000,0.0,1.0,2.5,210,3.5


job exception: 'NoneType' object is not iterable



  0%|          | 0/10 [00:09<?, ?trial/s, best loss=?]
Best parameters:
None


Traceback (most recent call last):
  File "C:\Users\Michał\AppData\Local\Temp\ipykernel_23628\3138901096.py", line 33, in tune_recommender
    best_param_set = fmin(loss, space=param_space, algo=tpe.suggest,
  File "c:\Users\Michał\AppData\Local\Programs\Python\Python39\lib\site-packages\hyperopt\fmin.py", line 540, in fmin
    return trials.fmin(
  File "c:\Users\Michał\AppData\Local\Programs\Python\Python39\lib\site-packages\hyperopt\base.py", line 671, in fmin
    return fmin(
  File "c:\Users\Michał\AppData\Local\Programs\Python\Python39\lib\site-packages\hyperopt\fmin.py", line 586, in fmin
    rval.exhaust()
  File "c:\Users\Michał\AppData\Local\Programs\Python\Python39\lib\site-packages\hyperopt\fmin.py", line 364, in exhaust
    self.run(self.max_evals - n_done, block_until_done=self.asynchronous)
  File "c:\Users\Michał\AppData\Local\Programs\Python\Python39\lib\site-packages\hyperopt\fmin.py", line 300, in run
    self.serial_evaluate()
  File "c:\Users\Michał\AppData\Local\P

In [None]:
param_space = {
    'n_neg_per_pos': hp.quniform('n_neg_per_pos', 1, 10, 1),
    'C': hp.loguniform('C', np.log(0.01), np.log(100.0))
}

best_param_set = tune_recommender(SVRCBUIRecommender, interactions_df, items_df,
                                  param_space, max_evals=100, show_progressbar=True, seed=seed)

print("Best parameters:")
print(best_param_set)

In [None]:
param_space = {
    'n_neg_per_pos': hp.quniform('n_neg_per_pos', 1, 10, 1),
    'n_estimators': hp.quniform('n_estimators', 30, 300, 1),
    'max_depth': hp.quniform('max_depth', 2, 10, 1),
    'min_samples_split': hp.quniform('min_samples_split', 2, 30, 1)
}

best_param_set = tune_recommender(RandomForestCBUIRecommender, interactions_df, items_df,
                                  param_space, max_evals=100, show_progressbar=True, seed=seed)

print("Best parameters:")
print(best_param_set)

In [None]:
# This tuning may take around 12 hours

param_space = {
    'n_neg_per_pos': hp.quniform('n_neg_per_pos', 1, 10, 1),
    'n_estimators': hp.quniform('n_estimators', 10, 300, 1),
    'max_depth': hp.quniform('max_depth', 2, 10, 1),
    'min_samples_split': hp.quniform('min_samples_split', 2, 30, 1),
    'learning_rate': hp.loguniform('learning_rate', np.log(0.001), np.log(0.1))
}

best_param_set = tune_recommender(XGBoostCBUIRecommender, interactions_df, items_df,
                                  param_space, max_evals=300, show_progressbar=True, seed=seed)

print("Best parameters:")
print(best_param_set)

# Final evaluation

<span style="color:red"><font size="4">**Task:**</font></span><br> 
Run the final evaluation of your recommender and present its results against the Amazon recommender's results. You can present results for several of your recommenders. You just need to give the class name of your recommender and its tuned parameters below. If you present results for several recommenders, you should add a separate cell for each recommender and change the names of the DataFrames containing results.

In [16]:
cb_user_item_recommender = LinearRegressionCBUIRecommender(
    **{'n_neg_per_pos': 7})  # Initialize your recommender here with the best params from tuning

# Give the name of your recommender in the line below
linear_cbui_tts_results = [['LinearRegressionCBUIRecommender'] + list(evaluate_train_test_split_implicit(
    cb_user_item_recommender, interactions_df, items_df))]

linear_cbui_tts_results = pd.DataFrame(
    linear_cbui_tts_results, columns=['Recommender', 'HR@1', 'HR@3', 'HR@5', 'HR@10', 'NDCG@1', 'NDCG@3', 'NDCG@5', 'NDCG@10'])

display(linear_cbui_tts_results)

1
loop1
loop2
loop3
loop4
loop5
loop6
loop7
loop8
7
loop1
loop2
loop3
loop4
loop5
loop6
loop7
loop8
8
loop1
loop2
loop3
loop4
loop5
loop6
loop7
loop8
9
loop1
loop2
loop3
loop4
loop5
loop6
loop7
loop8
15
loop1
loop2
loop3
loop4
loop5
loop6
loop7
loop8
16
loop1
loop2
loop3
loop4
loop5
loop6
loop7
loop8
25
loop1
loop2
loop3
loop4
loop5
loop6
loop7
loop8
27
loop1
loop2
loop3
loop4
loop5
loop6
loop7
loop8
29
loop1
loop2
loop3
loop4
loop5
loop6
loop7
loop8
35
loop1
loop2
loop3
loop4
loop5
loop6
loop7
loop8
42
loop1
loop2
loop3
loop4
loop5
loop6
loop7
loop8
44
loop1
loop2
loop3
loop4
loop5
loop6
loop7
loop8
45
loop1
loop2
loop3
loop4
loop5
loop6
loop7
loop8
50
loop1
loop2
loop3
loop4
loop5
loop6
loop7
loop8
57
loop1
loop2
loop3
loop4
loop5
loop6
loop7
loop8
58
loop1
loop2
loop3
loop4
loop5
loop6
loop7
loop8
60
loop1
loop2
loop3
loop4
loop5
loop6
loop7
loop8
62
loop1
loop2
loop3
loop4
loop5
loop6
loop7
loop8
65
loop1
loop2
loop3
loop4
loop5
loop6
loop7
loop8
69
loop1
loop2
loop3
loop4
loop5
lo

Unnamed: 0,Recommender,HR@1,HR@3,HR@5,HR@10,NDCG@1,NDCG@3,NDCG@5,NDCG@10
0,LinearRegressionCBUIRecommender,0.049219,0.147658,0.246096,0.492193,0.049219,0.104883,0.145121,0.223631


In [None]:
from recommenders.amazon_recommender import AmazonRecommender

amazon_recommender = AmazonRecommender()

amazon_tts_results = [['AmazonRecommender'] + list(evaluate_train_test_split_implicit(
    amazon_recommender, interactions_df, items_df))]

amazon_tts_results = pd.DataFrame(
    amazon_tts_results, columns=['Recommender', 'HR@1', 'HR@3', 'HR@5', 'HR@10', 'NDCG@1', 'NDCG@3', 'NDCG@5', 'NDCG@10'])

display(amazon_tts_results)

In [None]:
tts_results = pd.concat([linear_cbui_tts_results, amazon_tts_results]).reset_index(drop=True)
display(tts_results)