# Guide Recommendation Pipeline

## Define working environment

In [91]:
# Import libraries

import numpy as np
import pandas as pd
import scipy.sparse as sps
import matplotlib.pyplot as pyplot
import ast

from tqdm import tqdm

## Load and preprocess data

In [92]:
# Load dataframe for tourist attributes

tourist_file = open('Data/tourists_200.csv')

tourist_df = pd.read_csv(
    filepath_or_buffer = tourist_file,
    sep = ';',
    header = 0
)

tourist_df.rename(columns={tourist_df.columns[0]: 'id'}, inplace=True)

tourist_df

Unnamed: 0,id,languages,keywords
0,0,['bulgarian'],['cinema']
1,1,['deutsche'],[]
2,2,['spanish'],"['rafting', 'museums']"
3,3,['deutsche'],"['cinema', 'history', 'food']"
4,4,['chinese'],"['wine', 'rafting']"
...,...,...,...
195,195,['french'],['cinema']
196,196,['dutch'],"['food', 'tracking', 'history']"
197,197,['dutch'],"['museums', 'art', 'history']"
198,198,['italian'],"['rafting', 'literature']"


In [93]:
# Load dataframe for guide attributes

guide_file = open('Data/guides_40.csv')

guide_df = pd.read_csv(
    filepath_or_buffer = guide_file,
    sep = ';',
    header = 0,
    converters = {'languages_spoken':ast.literal_eval, 'keywords':ast.literal_eval}
)

guide_df.rename(columns={guide_df.columns[0]: 'id'}, inplace=True)

guide_df

Unnamed: 0,id,gender,name,birth_date,now_available,languages_spoken,price,education,biography,keywords,current_location,experience
0,0,male,Ronald Oneill,1961-02-13,True,[english],25,high-school,Set designer,[museums],"{'lat': 40.342693584880706, 'lon': 18.16438078...",9
1,1,female,Anna Weaver,1964-03-25,True,"[italian, dutch]",36,middle-school,Quality manager,"[cinema, rafting, history, wine]","{'lat': 40.3367551413547, 'lon': 18.1569120995...",28
2,2,female,Evelyn Pacheco,1981-10-22,True,"[chinese, french, english]",34,phd,"Production designer, theatre/television/film","[food, archeology, art]","{'lat': 40.362447049660204, 'lon': 18.14225129...",24
3,3,male,Paul Richards,1982-07-23,True,[bulgarian],46,bachelor,Records manager,"[countryside, rafting, art]","{'lat': 40.36584086550253, 'lon': 18.183002910...",9
4,4,male,Jason Gomez,1999-10-14,True,"[deutsche, french]",30,master,"Exhibitions officer, museum/gallery","[countryside, tracking, beer]","{'lat': 40.354724889812935, 'lon': 18.20308322...",7
5,5,male,Timothy Smith,1959-08-06,True,"[deutsche, dutch, bulgarian]",27,middle-school,Financial manager,[],"{'lat': 40.35854623058413, 'lon': 18.183459879...",2
6,6,male,Joseph Moore,1997-01-02,True,"[english, chinese, french]",34,middle-school,English as a foreign language teacher,"[rafting, art, sport]","{'lat': 40.35774331848761, 'lon': 18.163273185...",6
7,7,male,Curtis Snow,1980-11-27,True,"[spanish, dutch, french, italian]",33,bachelor,Educational psychologist,"[museums, art, countryside, wine]","{'lat': 40.365193043282794, 'lon': 18.18548169...",10
8,8,male,Christopher Simon,1981-01-08,True,"[bulgarian, english, chinese]",31,middle-school,"Therapist, occupational","[literature, cinema, music]","{'lat': 40.35265552781134, 'lon': 18.151723396...",1
9,9,male,James Dean,1992-04-11,True,"[chinese, bulgarian, dutch, english]",37,phd,Barrister,[museums],"{'lat': 40.33774657696426, 'lon': 18.182064664...",13


In [95]:
guide_df.dtypes

id                   int64
gender              object
name                object
birth_date          object
now_available         bool
languages_spoken    object
price                int64
education           object
biography           object
keywords            object
current_location    object
experience           int64
dtype: object

In [96]:
# Load dataframe for ratings

rating_file = open('Data/ratings_200_40.csv')

rating_df = pd.read_csv(
    filepath_or_buffer = rating_file,
    sep = ';',
    header = 0
)

rating_df

Unnamed: 0,0,1,2
0,0,32,5.0
1,0,33,2.0
2,0,10,4.0
3,0,33,2.0
4,1,31,4.0
...,...,...,...
583,198,16,2.0
584,199,36,4.0
585,199,26,1.0
586,199,7,5.0


In [97]:
# Format the rating dataframe

rating_df.rename(columns={rating_df.columns[0]: 'tourist_id',
                          rating_df.columns[1]: 'guide_id',
                          rating_df.columns[2]: 'rating',
                         }, inplace=True)

rating_df

Unnamed: 0,tourist_id,guide_id,rating
0,0,32,5.0
1,0,33,2.0
2,0,10,4.0
3,0,33,2.0
4,1,31,4.0
...,...,...,...
583,198,16,2.0
584,199,36,4.0
585,199,26,1.0
586,199,7,5.0


In [98]:
# Check if there are duplicated interactions

rating_df.drop_duplicates(subset=['tourist_id','guide_id'],inplace=True)
rating_df

Unnamed: 0,tourist_id,guide_id,rating
0,0,32,5.0
1,0,33,2.0
2,0,10,4.0
4,1,31,4.0
5,1,1,2.0
...,...,...,...
581,198,16,1.0
582,198,34,4.0
584,199,36,4.0
585,199,26,1.0


## Print statistics

In [99]:
# Statistics about data
arr_tourists = tourist_df["id"].unique()
arr_guides = guide_df["id"].unique()

n_tourists = len(arr_tourists)
n_guides = len(arr_guides)
n_interactions = len(rating_df)

print("Number of tourists: {:d}".format(n_tourists))
print("Number of guides: {:d}".format(n_guides))
print("Number of interactions: {:d}".format(n_interactions))

print("Average interaction per tourist: {:.2f}".format(n_interactions/n_tourists))
print("Average interaction per guide: {:.2f}".format(n_interactions/n_guides))
print("Sparsity: {:.2f} %".format((1-float(n_interactions)/(n_guides*n_tourists))*100))

Number of tourists: 200
Number of guides: 40
Number of interactions: 494
Average interaction per tourist: 2.47
Average interaction per guide: 12.35
Sparsity: 93.83 %


## Create the URM

In [100]:
# Create the User Rating Matrix
URM_all = sps.csr_matrix(
    (rating_df["rating"].values,
    (rating_df["tourist_id"].values, rating_df["guide_id"].values))
)

URM_all

<200x40 sparse matrix of type '<class 'numpy.float64'>'
	with 494 stored elements in Compressed Sparse Row format>

In [101]:
URM_train = URM_all

## Create the ICM

In [102]:
icm_df = guide_df.copy(deep=True)

In [103]:
multiclass_attributes = ['gender', 'education', 'biography', 'languages_spoken', 'keywords']

for n in multiclass_attributes:
    s = icm_df[n].explode()
    icm_df = icm_df.join(pd.crosstab(s.index, s))
    icm_df.drop(labels=n,axis=1,inplace=True)

In [104]:
icm_df

Unnamed: 0,id,name,birth_date,now_available,price,current_location,experience,female,male,bachelor,...,countryside,food,history,literature,museums,music,rafting,sport,tracking,wine
0,0,Ronald Oneill,1961-02-13,True,25,"{'lat': 40.342693584880706, 'lon': 18.16438078...",9,0,1,0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,1,Anna Weaver,1964-03-25,True,36,"{'lat': 40.3367551413547, 'lon': 18.1569120995...",28,1,0,0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
2,2,Evelyn Pacheco,1981-10-22,True,34,"{'lat': 40.362447049660204, 'lon': 18.14225129...",24,1,0,0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,Paul Richards,1982-07-23,True,46,"{'lat': 40.36584086550253, 'lon': 18.183002910...",9,0,1,1,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,4,Jason Gomez,1999-10-14,True,30,"{'lat': 40.354724889812935, 'lon': 18.20308322...",7,0,1,0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
5,5,Timothy Smith,1959-08-06,True,27,"{'lat': 40.35854623058413, 'lon': 18.183459879...",2,0,1,0,...,,,,,,,,,,
6,6,Joseph Moore,1997-01-02,True,34,"{'lat': 40.35774331848761, 'lon': 18.163273185...",6,0,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
7,7,Curtis Snow,1980-11-27,True,33,"{'lat': 40.365193043282794, 'lon': 18.18548169...",10,0,1,1,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
8,8,Christopher Simon,1981-01-08,True,31,"{'lat': 40.35265552781134, 'lon': 18.151723396...",1,0,1,0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
9,9,James Dean,1992-04-11,True,37,"{'lat': 40.33774657696426, 'lon': 18.182064664...",13,0,1,0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


## Build the model

### Collaborative Filtering

In [10]:
from Recommenders.Compute_Similarity_Python import Compute_Similarity_Python

In [11]:
class ItemKNNCFRecommender(object):
    
    def __init__(self, URM):
        self.URM = URM
        
            
    def fit(self, topK=5, shrink=3, normalize=True, similarity="cosine"):
        
        similarity_object = Compute_Similarity_Python(self.URM, shrink=shrink, 
                                                  topK=topK, normalize=normalize, 
                                                  similarity = similarity)
        
        self.W_sparse = similarity_object.compute_similarity()

        
    def recommend(self, user_id, at=None, exclude_seen=True):
        # compute the scores using the dot product
        user_profile = self.URM[user_id]
        scores = user_profile.dot(self.W_sparse).toarray().ravel()

        if exclude_seen:
            scores = self.filter_seen(user_id, scores)

        # rank items
        ranking = scores.argsort()[::-1]
            
        return ranking[:at]
    
    
    def filter_seen(self, user_id, scores):

        start_pos = self.URM.indptr[user_id]
        end_pos = self.URM.indptr[user_id+1]

        user_profile = self.URM.indices[start_pos:end_pos]
        
        scores[user_profile] = -np.inf

        return scores

### Content-based Filtering

## Fit the model

In [12]:
recommender = ItemKNNCFRecommender(URM_train)
recommender.fit(shrink=0.5, topK=5)

Similarity column 40 (100.0%), 6418.46 column/sec. Elapsed time 0.01 sec


## Generate outputs

In [13]:
recommendations = []

for i,id in tqdm(enumerate(arr_tourists)):
    rec = recommender.recommend(id, at=3, exclude_seen=True)
    rec_list = rec
    rec_row = ' '.join(str(s) for s in rec_list)
    recommendations.append(rec_row)

200it [00:00, 3572.45it/s]


In [14]:
recommendations[:10]

['20 31 23',
 '4 10 38',
 '12 38 18',
 '10 12 19',
 '19 21 12',
 '26 38 34',
 '33 21 14',
 '16 14 21',
 '16 13 15',
 '6 18 29']

In [15]:
result_df = pd.DataFrame(
    data = {'tourist_id': arr_tourists,
            'guides': recommendations}
)

result_df

Unnamed: 0,tourist_id,guides
0,0,20 31 23
1,1,4 10 38
2,2,12 38 18
3,3,10 12 19
4,4,19 21 12
...,...,...
195,195,9 32 18
196,196,24 32 37
197,197,16 14 21
198,198,26 20 3


In [24]:
# Show some examples: select a tourist by id to visualize the received recommendations
sample_tourist = 10
pd.DataFrame(tourist_df.loc[sample_tourist,:])

Unnamed: 0,10
id,10
languages,['chinese']
keywords,['rafting']


In [25]:
sample_guide_list = list(map(int, recommendations[sample_tourist].split(" ")))
guide_df.loc[sample_guide_list,:]

Unnamed: 0,id,gender,name,birth_date,now_available,languages_spoken,price,education,biography,keywords,current_location,experience
13,13,female,Maria Crawford,2001-10-13,True,"['chinese', 'english']",27,master,"Editor, film/video","['sport', 'archeology', 'beer']","{'lat': 40.35377358678542, 'lon': 18.174728239...",5
29,29,female,Valerie Ferrell,1970-06-06,True,"['bulgarian', 'chinese']",28,master,Dramatherapist,"['archeology', 'literature', 'tracking']","{'lat': 40.35465462623879, 'lon': 18.172666005...",15
6,6,male,Joseph Moore,1997-01-02,True,"['english', 'chinese', 'french']",34,middle-school,English as a foreign language teacher,"['rafting', 'art', 'sport']","{'lat': 40.35774331848761, 'lon': 18.163273185...",6
