# Guide Recommendation Pipeline

## Define working environment

In [1]:
# Import libraries

import numpy as np
import pandas as pd
import scipy.sparse as sps
import matplotlib.pyplot as pyplot
import ast

from tqdm import tqdm

## Load and preprocess data

In [2]:
# Load dataframe for tourist attributes

# Read inside a context manager so the file handle is closed as soon as
# pandas has consumed it (the bare open() used before was never closed).
# No converters are passed, so `languages` and `keywords` stay raw strings.
with open('Data/tourists_200.csv') as tourist_file:
    tourist_df = pd.read_csv(
        filepath_or_buffer = tourist_file,
        sep = ';',
        header = 0
    )

# The first CSV column carries the tourist identifier; give it an explicit name.
tourist_df.rename(columns={tourist_df.columns[0]: 'id'}, inplace=True)

tourist_df

Unnamed: 0,id,languages,keywords
0,0,['bulgarian'],['cinema']
1,1,['deutsche'],[]
2,2,['spanish'],"['rafting', 'museums']"
3,3,['deutsche'],"['cinema', 'history', 'food']"
4,4,['chinese'],"['wine', 'rafting']"
...,...,...,...
195,195,['french'],['cinema']
196,196,['dutch'],"['food', 'tracking', 'history']"
197,197,['dutch'],"['museums', 'art', 'history']"
198,198,['italian'],"['rafting', 'literature']"


In [3]:
# Load dataframe for guide attributes

# Read inside a context manager so the file handle is closed as soon as
# pandas has consumed it (the bare open() used before was never closed).
# The two list-valued columns are parsed from their string form into Python
# lists with ast.literal_eval.
with open('Data/guides_40.csv') as guide_file:
    guide_df = pd.read_csv(
        filepath_or_buffer = guide_file,
        sep = ';',
        header = 0,
        converters = {'languages_spoken':ast.literal_eval, 'keywords':ast.literal_eval}
    )

# The first CSV column carries the guide identifier; give it an explicit name.
guide_df.rename(columns={guide_df.columns[0]: 'id'}, inplace=True)

guide_df

Unnamed: 0,id,gender,name,birth_date,now_available,languages_spoken,price,education,biography,keywords,current_location,experience
0,0,male,Jeffrey Carroll,1960-10-18,True,[english],25,high-school,Sports coach,[museums],"{'lat': 40.342693584880706, 'lon': 18.16438078...",30
1,1,female,Regina Thomas,1989-11-21,True,"[italian, dutch]",36,middle-school,Stage manager,"[cinema, rafting, history, wine]","{'lat': 40.3367551413547, 'lon': 18.1569120995...",13
2,2,female,Brianna Nicholson,1962-05-03,True,"[chinese, french, english]",34,phd,"Psychotherapist, dance movement","[food, archeology, art]","{'lat': 40.362447049660204, 'lon': 18.14225129...",28
3,3,male,Jordan Ali,1992-10-15,True,[bulgarian],46,bachelor,"Scientist, clinical (histocompatibility and im...","[countryside, rafting, art]","{'lat': 40.36584086550253, 'lon': 18.183002910...",5
4,4,male,Steven Taylor,1985-01-25,True,"[deutsche, french]",30,master,Trading standards officer,"[countryside, tracking, beer]","{'lat': 40.354724889812935, 'lon': 18.20308322...",17
5,5,male,Ronald Mitchell,1959-06-15,True,"[deutsche, dutch, bulgarian]",27,middle-school,"Teacher, secondary school",[],"{'lat': 40.35854623058413, 'lon': 18.183459879...",1
6,6,male,John Lloyd,1983-04-08,True,"[english, chinese, french]",34,middle-school,"Engineer, electrical","[rafting, art, sport]","{'lat': 40.35774331848761, 'lon': 18.163273185...",18
7,7,male,Jason Hartman,1971-12-09,True,"[spanish, dutch, french, italian]",33,bachelor,Fish farm manager,"[museums, art, countryside, wine]","{'lat': 40.365193043282794, 'lon': 18.18548169...",22
8,8,male,Cody Stevens,1992-11-27,True,"[bulgarian, english, chinese]",31,middle-school,"Therapist, art","[literature, cinema, music]","{'lat': 40.35265552781134, 'lon': 18.151723396...",2
9,9,male,Daniel Carey,1988-09-18,True,"[chinese, bulgarian, dutch, english]",37,phd,Fish farm manager,[museums],"{'lat': 40.33774657696426, 'lon': 18.182064664...",2


In [4]:
# Show the dtype pandas assigned to each guide column
guide_df.dtypes

id                   int64
gender              object
name                object
birth_date          object
now_available         bool
languages_spoken    object
price                int64
education           object
biography           object
keywords            object
current_location    object
experience           int64
dtype: object

In [5]:
# Load dataframe for ratings

# Read inside a context manager so the file handle is closed as soon as
# pandas has consumed it (the bare open() used before was never closed).
with open('Data/ratings_200_40.csv') as rating_file:
    rating_df = pd.read_csv(
        filepath_or_buffer = rating_file,
        sep = ';',
        header = 0
    )

rating_df

Unnamed: 0,0,1,2
0,0,32,5.0
1,0,10,4.0
2,1,20,5.0
3,1,15,4.0
4,2,34,4.0
...,...,...,...
300,195,14,4.0
301,196,18,3.0
302,197,5,4.0
303,198,18,4.0


In [6]:
# Give the first three rating columns meaningful names (by position)

rating_df = rating_df.rename(
    columns={
        rating_df.columns[0]: 'tourist_id',
        rating_df.columns[1]: 'guide_id',
        rating_df.columns[2]: 'rating',
    }
)

rating_df

Unnamed: 0,tourist_id,guide_id,rating
0,0,32,5.0
1,0,10,4.0
2,1,20,5.0
3,1,15,4.0
4,2,34,4.0
...,...,...,...
300,195,14,4.0
301,196,18,3.0
302,197,5,4.0
303,198,18,4.0


In [7]:
# Drop repeated (tourist, guide) interactions, keeping the first occurrence of each pair

rating_df = rating_df.drop_duplicates(subset=['tourist_id', 'guide_id'])
rating_df

Unnamed: 0,tourist_id,guide_id,rating
0,0,32,5.0
1,0,10,4.0
2,1,20,5.0
3,1,15,4.0
4,2,34,4.0
...,...,...,...
300,195,14,4.0
301,196,18,3.0
302,197,5,4.0
303,198,18,4.0


## Print statistics

In [8]:
# Dataset statistics: distinct tourists and guides, number of ratings, and
# how sparse the tourist x guide interaction grid is
arr_tourists = tourist_df["id"].unique()
arr_guides = guide_df["id"].unique()

n_tourists, n_guides = len(arr_tourists), len(arr_guides)
n_interactions = len(rating_df)

print("Number of tourists: {:d}".format(n_tourists))
print("Number of guides: {:d}".format(n_guides))
print("Number of interactions: {:d}".format(n_interactions))

print("Average interaction per tourist: {:.2f}".format(n_interactions / n_tourists))
print("Average interaction per guide: {:.2f}".format(n_interactions / n_guides))
# Sparsity = share of (tourist, guide) cells that hold no rating
print("Sparsity: {:.2f} %".format((1 - n_interactions / (n_guides * n_tourists)) * 100))

Number of tourists: 200
Number of guides: 40
Number of interactions: 305
Average interaction per tourist: 1.52
Average interaction per guide: 7.62
Sparsity: 96.19 %


## Create the URM

In [9]:
# Create the User Rating Matrix (rows: tourist ids, columns: guide ids)

# Pin the shape explicitly. When it is left out, scipy infers it from the
# largest row/column index that actually has a rating, so tourists or guides
# with the highest ids and no ratings would be missing from the matrix.
urm_n_rows = int(max(arr_tourists.max(), rating_df["tourist_id"].max())) + 1
urm_n_cols = int(max(arr_guides.max(), rating_df["guide_id"].max())) + 1

URM_all = sps.csr_matrix(
    (rating_df["rating"].values,
    (rating_df["tourist_id"].values, rating_df["guide_id"].values)),
    shape = (urm_n_rows, urm_n_cols)
)

URM_all

<200x40 sparse matrix of type '<class 'numpy.float64'>'
	with 305 stored elements in Compressed Sparse Row format>

In [10]:
# No hold-out split is made here: the training matrix is the full URM (same object)
URM_train = URM_all

## Create the ICM

In [11]:
# Work on a deep copy so the transformations below leave guide_df untouched
icm_df = guide_df.copy(deep=True)

In [12]:
def replace_birth_year(x):
    """Map a birth year to an age-band label.

    Years strictly greater than 1984 yield '20-40'; every other value
    yields '40+'.
    """
    return '20-40' if x > 1984 else '40+'

# Parse the 'YYYY-MM-DD' birth dates in one go, then bucket each birth year
birth_years = pd.to_datetime(icm_df['birth_date'], format="%Y-%m-%d").dt.year
icm_df['birth_date'] = birth_years.apply(replace_birth_year)

In [13]:
def replace_experience(x):
    """Map a numeric experience value to a seniority label.

    Below 5 -> 'junior'; from 5 up to (but not including) 10 ->
    'experienced'; anything else -> 'senior'.
    """
    if x < 5:
        return 'junior'
    if x < 10:
        return 'experienced'
    return 'senior'

# Replace the numeric experience with its seniority label, element by element
icm_df['experience'] = icm_df['experience'].map(replace_experience)

In [14]:
# Average guide price, shown for reference before the price is bucketed
icm_df[['price']].mean(axis=0)

price    29.175
dtype: float64

In [15]:
def replace_price(x):
    """Map a numeric price to a cost-band label.

    Below 25 -> 'low_cost'; from 25 up to (but not including) 35 ->
    'medium_cost'; anything else -> 'high_cost'.
    """
    # Bands are checked in ascending order; the first upper bound that the
    # price falls under decides the label.
    for upper_bound, label in ((25, 'low_cost'), (35, 'medium_cost')):
        if x < upper_bound:
            return label
    return 'high_cost'

# Replace the numeric price with its cost-band label, element by element
icm_df['price'] = icm_df['price'].map(replace_price)

In [16]:
# Inspect the frame after birth_date, experience and price were turned into labels
icm_df

Unnamed: 0,id,gender,name,birth_date,now_available,languages_spoken,price,education,biography,keywords,current_location,experience
0,0,male,Jeffrey Carroll,40+,True,[english],medium_cost,high-school,Sports coach,[museums],"{'lat': 40.342693584880706, 'lon': 18.16438078...",senior
1,1,female,Regina Thomas,20-40,True,"[italian, dutch]",high_cost,middle-school,Stage manager,"[cinema, rafting, history, wine]","{'lat': 40.3367551413547, 'lon': 18.1569120995...",senior
2,2,female,Brianna Nicholson,40+,True,"[chinese, french, english]",medium_cost,phd,"Psychotherapist, dance movement","[food, archeology, art]","{'lat': 40.362447049660204, 'lon': 18.14225129...",senior
3,3,male,Jordan Ali,20-40,True,[bulgarian],high_cost,bachelor,"Scientist, clinical (histocompatibility and im...","[countryside, rafting, art]","{'lat': 40.36584086550253, 'lon': 18.183002910...",experienced
4,4,male,Steven Taylor,20-40,True,"[deutsche, french]",medium_cost,master,Trading standards officer,"[countryside, tracking, beer]","{'lat': 40.354724889812935, 'lon': 18.20308322...",senior
5,5,male,Ronald Mitchell,40+,True,"[deutsche, dutch, bulgarian]",medium_cost,middle-school,"Teacher, secondary school",[],"{'lat': 40.35854623058413, 'lon': 18.183459879...",junior
6,6,male,John Lloyd,40+,True,"[english, chinese, french]",medium_cost,middle-school,"Engineer, electrical","[rafting, art, sport]","{'lat': 40.35774331848761, 'lon': 18.163273185...",senior
7,7,male,Jason Hartman,40+,True,"[spanish, dutch, french, italian]",medium_cost,bachelor,Fish farm manager,"[museums, art, countryside, wine]","{'lat': 40.365193043282794, 'lon': 18.18548169...",senior
8,8,male,Cody Stevens,20-40,True,"[bulgarian, english, chinese]",medium_cost,middle-school,"Therapist, art","[literature, cinema, music]","{'lat': 40.35265552781134, 'lon': 18.151723396...",junior
9,9,male,Daniel Carey,20-40,True,"[chinese, bulgarian, dutch, english]",high_cost,phd,Fish farm manager,[museums],"{'lat': 40.33774657696426, 'lon': 18.182064664...",junior


In [17]:
# Remove the columns that are not encoded as item attributes below
icm_df = icm_df.drop(columns=['name', 'now_available', 'current_location'])

In [18]:
# Columns to one-hot encode. List-valued columns (languages_spoken, keywords)
# contribute one indicator column per distinct list element.
multiclass_attributes = ['gender', 'price', 'experience', 'birth_date', 'education', 'biography', 'languages_spoken', 'keywords']

for attribute in multiclass_attributes:
    # explode() puts every list element on its own row (same index label);
    # scalar values pass through unchanged.
    exploded = icm_df[attribute].explode()

    # Count table: one row per guide index, one column per distinct value.
    one_hot = pd.crosstab(exploded.index, exploded)

    # Guides with no value for this attribute (e.g. an empty list) are absent
    # from the crosstab; add them back as all-zero rows. Doing it here keeps
    # the indicators as integers and avoids a frame-wide fillna(0), which
    # depended on pandas' deprecated object-dtype downcasting and also
    # overwrote NaN in columns that had not been encoded yet.
    one_hot = one_hot.reindex(icm_df.index, fill_value=0)

    icm_df = icm_df.drop(columns=attribute).join(one_hot)

In [19]:
# Inspect the one-hot encoded attribute frame
icm_df

Unnamed: 0,id,female,male,high_cost,low_cost,medium_cost,experienced,junior,senior,20-40,...,countryside,food,history,literature,museums,music,rafting,sport,tracking,wine
0,0,0,1,0,0,1,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0
1,1,1,0,1,0,0,0,0,1,1,...,0,0,1,0,0,0,1,0,0,1
2,2,1,0,0,0,1,0,0,1,0,...,0,1,0,0,0,0,0,0,0,0
3,3,0,1,1,0,0,1,0,0,1,...,1,0,0,0,0,0,1,0,0,0
4,4,0,1,0,0,1,0,0,1,1,...,1,0,0,0,0,0,0,0,1,0
5,5,0,1,0,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
6,6,0,1,0,0,1,0,0,1,0,...,0,0,0,0,0,0,1,1,0,0
7,7,0,1,0,0,1,0,0,1,0,...,1,0,0,0,1,0,0,0,0,1
8,8,0,1,0,0,1,0,1,0,1,...,0,0,0,1,0,1,0,0,0,0
9,9,0,1,1,0,0,0,1,0,1,...,0,0,0,0,1,0,0,0,0,0


In [20]:
# Snapshot of the column labels; a label's position in this list is later used as its numeric id
attribute_list = list(icm_df.columns)
attribute_list

['id',
 'female',
 'male',
 'high_cost',
 'low_cost',
 'medium_cost',
 'experienced',
 'junior',
 'senior',
 '20-40',
 '40+',
 'bachelor',
 'high-school',
 'master',
 'middle-school',
 'phd',
 'Accountant, chartered certified',
 'Accountant, chartered management',
 'Accountant, chartered public finance',
 'Actuary',
 'Advertising account planner',
 'Broadcast presenter',
 'Chartered certified accountant',
 'Chief Operating Officer',
 'Child psychotherapist',
 'Community pharmacist',
 'Corporate treasurer',
 'Designer, blown glass/stained glass',
 'Development worker, international aid',
 'Engineer, electrical',
 'Financial controller',
 'Fish farm manager',
 'Fisheries officer',
 'Geneticist, molecular',
 'Land',
 'Magazine journalist',
 'Probation officer',
 'Producer, television/film/video',
 'Product designer',
 'Psychotherapist, dance movement',
 'Public house manager',
 'Scientist, clinical (histocompatibility and immunogenetics)',
 'Secondary school teacher',
 'Software engineer'

In [21]:
def convert_index(x):
    """Translate a column label into its position in `attribute_list`.

    The label 'id' is returned unchanged. Any other label is looked up with
    `list.index`, which raises ValueError when the label is not in the list.
    """
    return x if x == 'id' else attribute_list.index(x)

In [22]:
# Swap every attribute label for its numeric position (the 'id' column keeps its name)
icm_df = icm_df.rename(columns=convert_index)
icm_df

Unnamed: 0,id,1,2,3,4,5,6,7,8,9,...,66,67,68,69,70,71,72,73,74,75
0,0,0,1,0,0,1,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0
1,1,1,0,1,0,0,0,0,1,1,...,0,0,1,0,0,0,1,0,0,1
2,2,1,0,0,0,1,0,0,1,0,...,0,1,0,0,0,0,0,0,0,0
3,3,0,1,1,0,0,1,0,0,1,...,1,0,0,0,0,0,1,0,0,0
4,4,0,1,0,0,1,0,0,1,1,...,1,0,0,0,0,0,0,0,1,0
5,5,0,1,0,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
6,6,0,1,0,0,1,0,0,1,0,...,0,0,0,0,0,0,1,1,0,0
7,7,0,1,0,0,1,0,0,1,0,...,1,0,0,0,1,0,0,0,0,1
8,8,0,1,0,0,1,0,1,0,1,...,0,0,0,1,0,1,0,0,0,0
9,9,0,1,1,0,0,0,1,0,1,...,0,0,0,0,1,0,0,0,0,0


In [23]:
# Reshape wide -> long: one (id, label, value) row per guide/attribute pair,
# then keep only the pairs whose indicator equals 1
new_icm_df = (
    icm_df
    .melt(id_vars='id', var_name='label')
    .loc[lambda frame: frame["value"] == 1]
)
new_icm_df

Unnamed: 0,id,label,value
1,1,1,1
2,2,1,1
10,10,1,1
11,11,1,1
12,12,1,1
...,...,...,...
2948,28,74,1
2949,29,74,1
2961,1,75,1
2967,7,75,1


In [24]:
# Create the Item Content Matrix (rows: guide ids, columns: attribute labels)

# Pin the shape explicitly instead of letting scipy infer it from the largest
# index present: one row per guide id up to the highest id in guide_df, one
# column per entry of attribute_list (label positions index into that list).
icm_shape = (int(guide_df["id"].max()) + 1, len(attribute_list))

# Cast values and labels to a numeric dtype so the sparse constructor never
# receives object-dtype arrays coming out of the melt step.
ICM_all = sps.csr_matrix(
    (new_icm_df["value"].values.astype(np.int64),
    (new_icm_df["id"].values, new_icm_df["label"].values.astype(np.int64))),
    shape = icm_shape
)

ICM_all

<40x76 sparse matrix of type '<class 'numpy.int64'>'
	with 412 stored elements in Compressed Sparse Row format>

## Build the model

### Collaborative Filtering

In [25]:
from Recommenders.Compute_Similarity_Python import Compute_Similarity_Python

In [26]:
class ItemKNNCFRecommender(object):
    """Item-based KNN recommender driven by the user rating matrix alone.

    `fit` builds an item-item similarity matrix from the URM; `recommend`
    scores items for a user as the product of that user's URM row with the
    similarity matrix.
    """

    def __init__(self, URM):
        """Store the user rating matrix (rows: users, columns: items).

        URM may be any input accepted by `scipy.sparse.csr_matrix`.
        """
        # Row slicing in `recommend` and the indptr/indices lookup in
        # `filter_seen` are only meaningful on a CSR matrix. Normalise the
        # format once here: a CSC input would otherwise mask the wrong items
        # silently, and a COO input would fail on row slicing.
        self.URM = sps.csr_matrix(URM)

    def fit(self, topK=5, shrink=3, normalize=True, similarity="cosine"):
        """Compute and store the item-item similarity matrix in `self.W_sparse`.

        All arguments are forwarded unchanged to `Compute_Similarity_Python`.
        NOTE(review): presumably `topK` is the number of neighbours kept per
        item and `shrink` the shrinkage term -- confirm against that helper.
        """
        similarity_object = Compute_Similarity_Python(self.URM, shrink=shrink,
                                                      topK=topK, normalize=normalize,
                                                      similarity=similarity)

        self.W_sparse = similarity_object.compute_similarity()

    def recommend(self, user_id, at=None, exclude_seen=True):
        """Return item indices ranked by descending score for one user.

        user_id: row index of the user in the URM.
        at: number of items to return; None returns the full ranking.
        exclude_seen: when True, items the user already interacted with get
            a score of -inf, so they sort last (they are not removed).
        """
        # compute the scores using the dot product
        user_profile = self.URM[user_id]
        scores = user_profile.dot(self.W_sparse).toarray().ravel()

        if exclude_seen:
            scores = self.filter_seen(user_id, scores)

        # rank items
        ranking = scores.argsort()[::-1]

        return ranking[:at]

    def filter_seen(self, user_id, scores):
        """Set the scores of the user's stored URM items to -inf, in place.

        Returns the same `scores` array that was passed in.
        """
        # In CSR, indices[indptr[u]:indptr[u + 1]] lists the column indices
        # stored for row u.
        start_pos = self.URM.indptr[user_id]
        end_pos = self.URM.indptr[user_id + 1]

        user_profile = self.URM.indices[start_pos:end_pos]

        scores[user_profile] = -np.inf

        return scores

### Content-based Filtering

In [27]:
class ItemKNNCBFRecommender(object):
    """Item-based KNN recommender whose similarity comes from item content.

    `fit` builds an item-item similarity matrix from the ICM; `recommend`
    scores items for a user as the product of that user's URM row with the
    similarity matrix.
    """

    def __init__(self, URM, ICM):
        """Store the user rating matrix and the item content matrix.

        URM: users x items; any input accepted by `scipy.sparse.csr_matrix`.
        ICM: stored as given; its transpose is what `fit` hands to the
            similarity helper.
        """
        # Row slicing in `recommend` and the indptr/indices lookup in
        # `filter_seen` are only meaningful on a CSR matrix. Normalise the
        # format once here: a CSC input would otherwise mask the wrong items
        # silently, and a COO input would fail on row slicing.
        self.URM = sps.csr_matrix(URM)
        self.ICM = ICM

    def fit(self, topK=50, shrink=100, normalize = True, similarity = "cosine"):
        """Compute and store the item-item similarity matrix in `self.W_sparse`.

        The transposed ICM and all arguments are forwarded unchanged to
        `Compute_Similarity_Python`.
        NOTE(review): presumably `topK` is the number of neighbours kept per
        item and `shrink` the shrinkage term -- confirm against that helper.
        """
        similarity_object = Compute_Similarity_Python(self.ICM.T, shrink=shrink,
                                                      topK=topK, normalize=normalize,
                                                      similarity=similarity)

        self.W_sparse = similarity_object.compute_similarity()

    def recommend(self, user_id, at=None, exclude_seen=True):
        """Return item indices ranked by descending score for one user.

        user_id: row index of the user in the URM.
        at: number of items to return; None returns the full ranking.
        exclude_seen: when True, items the user already interacted with get
            a score of -inf, so they sort last (they are not removed).
        """
        # compute the scores using the dot product
        user_profile = self.URM[user_id]
        scores = user_profile.dot(self.W_sparse).toarray().ravel()

        if exclude_seen:
            scores = self.filter_seen(user_id, scores)

        # rank items
        ranking = scores.argsort()[::-1]

        return ranking[:at]

    def filter_seen(self, user_id, scores):
        """Set the scores of the user's stored URM items to -inf, in place.

        Returns the same `scores` array that was passed in.
        """
        # In CSR, indices[indptr[u]:indptr[u + 1]] lists the column indices
        # stored for row u.
        start_pos = self.URM.indptr[user_id]
        end_pos = self.URM.indptr[user_id + 1]

        user_profile = self.URM.indices[start_pos:end_pos]

        scores[user_profile] = -np.inf

        return scores

## Fit the model

In [40]:
# Which recommender to fit below: 'cf' (collaborative filtering) or 'cbf' (content-based filtering)
model_type = 'cf'

In [41]:
# Collaborative-filtering branch: build and fit the recommender on the training URM
if model_type == 'cf':
    recommender = ItemKNNCFRecommender(URM=URM_train)
    recommender.fit(topK=5, shrink=0.5)

Similarity column 40 (100.0%), 20018.15 column/sec. Elapsed time 0.00 sec


In [42]:
# Content-based branch: build and fit the recommender on the training URM plus the ICM
if model_type == 'cbf':
    recommender = ItemKNNCBFRecommender(URM=URM_train, ICM=ICM_all)
    recommender.fit(topK=5, shrink=0.5)

## Generate outputs

In [43]:
# For every tourist, store the top-3 recommended guide indices as one
# space-separated string (same order as arr_tourists).
recommendations = []

# Iterate the array directly: tqdm can then show a total, and the loop no
# longer rebinds the builtin `id` in the notebook namespace.
for tourist_id in tqdm(arr_tourists):
    top_guides = recommender.recommend(tourist_id, at=3, exclude_seen=True)
    recommendations.append(' '.join(str(guide) for guide in top_guides))

200it [00:00, 8334.19it/s]


In [44]:
# Preview the recommendation strings of the first ten tourists
recommendations[:10]

['5 29 4',
 '30 6 32',
 '28 29 39',
 '33 15 4',
 '28 2 34',
 '27 7 35',
 '39 38 17',
 '32 34 10',
 '8 23 39',
 '39 12 37']

In [45]:
# Pair each tourist id with its recommendation string (both sequences share the same order)
result_columns = {'tourist_id': arr_tourists, 'guides': recommendations}
result_df = pd.DataFrame(result_columns)

result_df

Unnamed: 0,tourist_id,guides
0,0,5 29 4
1,1,30 6 32
2,2,28 29 39
3,3,33 15 4
4,4,28 2 34
...,...,...
195,195,2 22 39
196,196,37 27 35
197,197,10 26 17
198,198,37 27 35


In [52]:
# Show some examples: pick a tourist and display its attributes as a single column.
# Note: .loc selects by index label here, not by the 'id' column.
sample_tourist = 0
tourist_df.loc[sample_tourist].to_frame()

Unnamed: 0,0
id,0
languages,['bulgarian']
keywords,['cinema']


In [53]:
# Turn the tourist's recommendation string back into integers and show the
# matching guide rows (looked up by index label, in recommendation order)
sample_guide_list = [int(guide) for guide in recommendations[sample_tourist].split(" ")]
guide_df.loc[sample_guide_list]

Unnamed: 0,id,gender,name,birth_date,now_available,languages_spoken,price,education,biography,keywords,current_location,experience
5,5,male,Ronald Mitchell,1959-06-15,True,"[deutsche, dutch, bulgarian]",27,middle-school,"Teacher, secondary school",[],"{'lat': 40.35854623058413, 'lon': 18.183459879...",1
29,29,female,Melissa Hart,1992-06-23,True,"[bulgarian, chinese]",28,master,"Geneticist, molecular","[archeology, literature, tracking]","{'lat': 40.35465462623879, 'lon': 18.172666005...",9
4,4,male,Steven Taylor,1985-01-25,True,"[deutsche, french]",30,master,Trading standards officer,"[countryside, tracking, beer]","{'lat': 40.354724889812935, 'lon': 18.20308322...",17
