# Guide Recommendation Pipeline

In [1]:
# Import libraries

import numpy as np
import pandas as pd
import scipy.sparse as sps
import matplotlib.pyplot as pyplot

from tqdm import tqdm

## Load and preprocess data

In [2]:
# Load dataframe for tourist attributes

# Open the CSV inside a context manager so the file handle is closed as soon
# as pandas has finished parsing (the bare open() call left it dangling).
with open('Data/tourists_200.csv') as tourist_file:
    tourist_df = pd.read_csv(
        filepath_or_buffer=tourist_file,
        sep=';',
        header=0,
    )

# Name the first column 'id'; later cells read tourist ids from that column.
tourist_df = tourist_df.rename(columns={tourist_df.columns[0]: 'id'})

tourist_df

Unnamed: 0,id,languages,keywords
0,0,"['italian', 'english']","['cinema', 'music']"
1,1,['chinese'],['beer']
2,2,"['bulgarian', 'spanish']","['history', 'museums', 'beer']"
3,3,"['deutsche', 'dutch']","['tracking', 'literature', 'music']"
4,4,['spanish'],"['tracking', 'art']"
...,...,...,...
195,195,['bulgarian'],[]
196,196,"['french', 'spanish']","['tracking', 'cinema']"
197,197,"['chinese', 'spanish']","['music', 'literature', 'art']"
198,198,['italian'],[]


In [3]:
# Load dataframe for guide attributes

# Open the CSV inside a context manager so the file handle is closed as soon
# as pandas has finished parsing (the bare open() call left it dangling).
with open('Data/guides_40.csv') as guide_file:
    guide_df = pd.read_csv(
        filepath_or_buffer=guide_file,
        sep=';',
        header=0,
    )

# Name the first column 'id'; later cells read guide ids from that column.
guide_df = guide_df.rename(columns={guide_df.columns[0]: 'id'})

guide_df

Unnamed: 0,id,gender,name,birth_date,now_available,languages_spoken,price,education,biography,keywords,current_location,experience
0,0,male,Sean Lewis,1982-10-10,True,['english'],25,high-school,Multimedia programmer,['museums'],"{'lat': 40.342693584880706, 'lon': 18.16438078...",8
1,1,female,Leah Blackwell,1959-10-31,True,"['italian', 'dutch']",36,middle-school,"Education officer, museum","['cinema', 'rafting', 'history', 'wine']","{'lat': 40.3367551413547, 'lon': 18.1569120995...",26
2,2,female,Tracey Lopez,1963-09-25,True,"['chinese', 'french', 'english']",34,phd,Child psychotherapist,"['food', 'archeology', 'art']","{'lat': 40.362447049660204, 'lon': 18.14225129...",31
3,3,male,Charles Young,1959-06-19,True,['bulgarian'],46,bachelor,Scientific laboratory technician,"['countryside', 'rafting', 'art']","{'lat': 40.36584086550253, 'lon': 18.183002910...",16
4,4,male,David Burgess,1966-09-09,True,"['deutsche', 'french']",30,master,Equality and diversity officer,"['countryside', 'tracking', 'beer']","{'lat': 40.354724889812935, 'lon': 18.20308322...",5
5,5,male,Connor Davis,1997-06-04,True,"['deutsche', 'dutch', 'bulgarian']",27,middle-school,Purchasing manager,[],"{'lat': 40.35854623058413, 'lon': 18.183459879...",3
6,6,male,Michael Lee,1966-12-10,True,"['english', 'chinese', 'french']",34,middle-school,"Journalist, broadcasting","['rafting', 'art', 'sport']","{'lat': 40.35774331848761, 'lon': 18.163273185...",3
7,7,male,Derrick Davidson,1998-03-17,True,"['spanish', 'dutch', 'french', 'italian']",33,bachelor,Mental health nurse,"['museums', 'art', 'countryside', 'wine']","{'lat': 40.365193043282794, 'lon': 18.18548169...",2
8,8,male,Christopher Smith,1958-10-09,True,"['bulgarian', 'english', 'chinese']",31,middle-school,Ergonomist,"['literature', 'cinema', 'music']","{'lat': 40.35265552781134, 'lon': 18.151723396...",42
9,9,male,Albert Moore,1973-07-20,True,"['chinese', 'bulgarian', 'dutch', 'english']",37,phd,International aid/development worker,['museums'],"{'lat': 40.33774657696426, 'lon': 18.182064664...",11


In [4]:
# Load dataframe for ratings

# Open the CSV inside a context manager so the file handle is closed as soon
# as pandas has finished parsing (the bare open() call left it dangling).
with open('Data/ratings_200_40.csv') as rating_file:
    rating_df = pd.read_csv(
        filepath_or_buffer=rating_file,
        sep=';',
        header=0,
    )

rating_df

Unnamed: 0,0,1,2
0,0,7,4.0
1,0,3,3.0
2,1,23,5.0
3,2,4,4.0
4,2,27,4.0
...,...,...,...
351,197,23,2.0
352,198,35,4.0
353,199,18,4.0
354,199,21,4.0


In [5]:
# Format the rating dataframe: give the first three columns descriptive names
rating_column_names = ('tourist_id', 'guide_id', 'rating')

rating_df.rename(
    columns={
        rating_df.columns[position]: new_name
        for position, new_name in enumerate(rating_column_names)
    },
    inplace=True,
)

rating_df

Unnamed: 0,tourist_id,guide_id,rating
0,0,7,4.0
1,0,3,3.0
2,1,23,5.0
3,2,4,4.0
4,2,27,4.0
...,...,...,...
351,197,23,2.0
352,198,35,4.0
353,199,18,4.0
354,199,21,4.0


In [6]:
# Remove repeated (tourist, guide) interactions, keeping the first occurrence
# of each pair
rating_df = rating_df.drop_duplicates(
    subset=['tourist_id', 'guide_id'],
    keep='first',
)

rating_df

Unnamed: 0,tourist_id,guide_id,rating
0,0,7,4.0
1,0,3,3.0
2,1,23,5.0
3,2,4,4.0
4,2,27,4.0
...,...,...,...
351,197,23,2.0
352,198,35,4.0
353,199,18,4.0
354,199,21,4.0


In [7]:
# Statistics about data: entity counts, interactions per entity and sparsity
# of the tourist-by-guide interaction grid
arr_tourists = tourist_df["id"].unique()
arr_guides = guide_df["id"].unique()

n_tourists, n_guides = len(arr_tourists), len(arr_guides)
n_interactions = len(rating_df)

print("Number of tourists: {:d}".format(n_tourists))
print("Number of guides: {:d}".format(n_guides))
print("Number of interactions: {:d}".format(n_interactions))

interactions_per_tourist = n_interactions / n_tourists
print("Average interaction per tourist: {:.2f}".format(interactions_per_tourist))

interactions_per_guide = n_interactions / n_guides
print("Average interaction per guide: {:.2f}".format(interactions_per_guide))

# Fraction of (tourist, guide) cells that carry a rating
filled_fraction = n_interactions / (n_guides * n_tourists)
print("Sparsity: {:.2f} %".format((1 - filled_fraction) * 100))

Number of tourists: 200
Number of guides: 40
Number of interactions: 356
Average interaction per tourist: 1.78
Average interaction per guide: 8.90
Sparsity: 95.55 %


## Create data matrix

In [8]:
# Create the User Rating Matrix
#
# The shape is given explicitly instead of letting scipy infer it from the
# largest id that happens to appear in the ratings: if the highest-id tourist
# or guide has no rating, the inferred matrix would be too small and looking
# up that tourist's row later would fail.
n_urm_rows = int(max(tourist_df["id"].max(), rating_df["tourist_id"].max())) + 1
n_urm_cols = int(max(guide_df["id"].max(), rating_df["guide_id"].max())) + 1

URM_all = sps.csr_matrix(
    (rating_df["rating"].values,
     (rating_df["tourist_id"].values, rating_df["guide_id"].values)),
    shape=(n_urm_rows, n_urm_cols),
)

URM_all

<200x40 sparse matrix of type '<class 'numpy.float64'>'
	with 356 stored elements in Compressed Sparse Row format>

In [9]:
# No train/test split is made here: URM_train is bound to the same matrix
# object as URM_all (an alias, not a copy).
URM_train = URM_all

## Build and fit the models

### Collaborative Filtering

In [10]:
from Recommenders.Compute_Similarity_Python import Compute_Similarity_Python

In [11]:
class ItemKNNCFRecommender(object):
    """Item-based KNN collaborative-filtering recommender.

    Each item is scored for a user as the dot product between the user's row
    of the rating matrix and the item-item similarity matrix built by ``fit``.
    """

    def __init__(self, URM):
        """Store the user rating matrix.

        Args:
            URM: user-by-item rating matrix. Anything accepted by
                ``scipy.sparse.csr_matrix`` (sparse matrix of any format,
                dense array) is converted to CSR.
        """
        # recommend() slices single rows and filter_seen() reads indptr/indices
        # row-wise; both are only valid for CSR layout, so enforce it here.
        self.URM = sps.csr_matrix(URM)

    def fit(self, topK=5, shrink=3, normalize=True, similarity="cosine"):
        """Compute the similarity matrix and store it as ``self.W_sparse``.

        All arguments are forwarded unchanged to ``Compute_Similarity_Python``
        together with the stored rating matrix.
        """
        similarity_object = Compute_Similarity_Python(self.URM, shrink=shrink,
                                                      topK=topK, normalize=normalize,
                                                      similarity=similarity)

        self.W_sparse = similarity_object.compute_similarity()

    def recommend(self, user_id, at=None, exclude_seen=True):
        """Return item indices ranked by decreasing score for one user.

        Args:
            user_id: row index of the user in the rating matrix.
            at: maximum number of items to return; ``None`` returns every
                ranked item.
            exclude_seen: when True, items that have a stored entry in the
                user's row are removed from the result.

        Returns:
            1-D array of item indices, best first, with at most ``at`` entries.
            ``fit`` must have been called first so that ``W_sparse`` exists.
        """
        # compute the scores using the dot product
        user_profile = self.URM[user_id]
        scores = user_profile.dot(self.W_sparse).toarray().ravel()

        if exclude_seen:
            scores = self.filter_seen(user_id, scores)

        # rank items
        ranking = scores.argsort()[::-1]

        if exclude_seen:
            # Seen items were pushed to -inf; drop them so they can never be
            # returned, even when `at` is None or exceeds the unseen count.
            ranking = ranking[scores[ranking] != -np.inf]

        return ranking[:at]

    def filter_seen(self, user_id, scores):
        """Set the score of every item stored in the user's row to ``-inf``.

        ``scores`` is modified in place and also returned.
        """
        # CSR layout: indices[indptr[u]:indptr[u + 1]] are the column indices
        # of the entries stored for row u.
        start_pos = self.URM.indptr[user_id]
        end_pos = self.URM.indptr[user_id + 1]

        user_profile = self.URM.indices[start_pos:end_pos]

        scores[user_profile] = -np.inf

        return scores

In [12]:
# Build the item-based KNN recommender on the training matrix and compute its
# similarity matrix (5 neighbours per item, shrink term 0.5)
recommender = ItemKNNCFRecommender(URM=URM_train)
recommender.fit(topK=5, shrink=0.5)

Similarity column 40 (100.0%), 39983.79 column/sec. Elapsed time 0.00 sec


## Generate outputs

In [13]:
# Build one space-separated string of the top-3 recommended guide indices per
# tourist, in the order of arr_tourists.
# NOTE(review): each tourist id is passed to recommend() as a matrix row index,
# which assumes ids coincide with URM rows — confirm for new datasets.
recommendations = []

# Iterate the id array directly: the loop variable no longer shadows the
# builtin `id`, and tqdm can read the total from the array length.
for tourist_id in tqdm(arr_tourists):
    top_guides = recommender.recommend(tourist_id, at=3, exclude_seen=True)
    recommendations.append(' '.join(str(guide) for guide in top_guides))

200it [00:00, 9090.88it/s]


In [14]:
# Preview the first ten entries of the recommendation list
recommendations[:10]

['10 1 27',
 '11 26 4',
 '36 8 1',
 '24 11 33',
 '38 23 28',
 '24 31 9',
 '14 24 34',
 '30 19 9',
 '14 34 35',
 '13 17 34']

In [15]:
# Pair every tourist id with its recommendation string in a two-column table
result_columns = {
    'tourist_id': arr_tourists,
    'guides': recommendations,
}
result_df = pd.DataFrame(result_columns)

result_df

Unnamed: 0,tourist_id,guides
0,0,10 1 27
1,1,11 26 4
2,2,36 8 1
3,3,24 11 33
4,4,38 23 28
...,...,...
195,195,2 23 27
196,196,9 25 17
197,197,27 8 31
198,198,18 33 10


In [16]:
# Show some examples: pick a tourist and display that tourist's attributes as
# a one-column table
# NOTE(review): .loc selects by index label, which matches the 'id' column only
# while ids equal the default row index — confirm.
sample_tourist = 1
tourist_df.loc[sample_tourist, :].to_frame()

Unnamed: 0,1
id,1
languages,['chinese']
keywords,['beer']


In [17]:
# Parse the sample tourist's recommendation string back into integer guide
# labels and display the matching guide rows
sample_guide_list = [
    int(token) for token in recommendations[sample_tourist].split(" ")
]
guide_df.loc[sample_guide_list, :]

Unnamed: 0,id,gender,name,birth_date,now_available,languages_spoken,price,education,biography,keywords,current_location,experience
11,11,female,Stephanie Hampton,2001-02-13,True,['bulgarian'],23,phd,"Radiographer, therapeutic","['cinema', 'beer', 'rafting']","{'lat': 40.367394960036485, 'lon': 18.17553586...",5
26,26,male,Julian Lopez,1958-06-11,True,['dutch'],30,phd,Water engineer,"['tracking', 'beer', 'museums']","{'lat': 40.37981446263931, 'lon': 18.178854600...",45
4,4,male,David Burgess,1966-09-09,True,"['deutsche', 'french']",30,master,Equality and diversity officer,"['countryside', 'tracking', 'beer']","{'lat': 40.354724889812935, 'lon': 18.20308322...",5
