# Guide Recommendation Pipeline

In [1]:
# Import libraries

import numpy as np
import pandas as pd
import scipy.sparse as sps
import matplotlib.pyplot as pyplot

from tqdm import tqdm

## Data loading and preprocessing

In [2]:
# Load dataframe for tourist attributes

# The context manager closes the file handle as soon as parsing is done
# (the bare open() call left it open for the lifetime of the kernel).
with open('tourists.csv') as tourist_file:
    tourist_df = pd.read_csv(
        filepath_or_buffer = tourist_file,
        sep = ';',
        header = 0
    )

# The first CSV column holds the row identifier; expose it under the name 'id'.
tourist_df = tourist_df.rename(columns={tourist_df.columns[0]: 'id'})

tourist_df

Unnamed: 0,id,languages,keywords
0,0,['spanish'],"['history', 'sport', 'art', 'countryside']"
1,1,['italian'],"['history', 'wine']"
2,2,"['italian', 'spanish']","['countryside', 'tracking']"
3,3,['spanish'],[]
4,4,['spanish'],['rafting']
...,...,...,...
95,95,['english'],"['art', 'rafting', 'beer']"
96,96,['bulgarian'],['music']
97,97,['english'],['rafting']
98,98,['italian'],"['beer', 'rafting', 'rafting', 'rafting']"


In [3]:
# Load dataframe for guide attributes

# The context manager closes the file handle as soon as parsing is done
# (the bare open() call left it open for the lifetime of the kernel).
with open('guides.csv') as guide_file:
    guide_df = pd.read_csv(
        filepath_or_buffer = guide_file,
        sep = ';',
        header = 0
    )

# The first CSV column holds the row identifier; expose it under the name 'id'.
guide_df = guide_df.rename(columns={guide_df.columns[0]: 'id'})

guide_df

Unnamed: 0,id,gender,name,birth_date,now_available,languages_spoken,price,education,biography,keywords,current_location,experience
0,0,female,Chelsea Martinez,1937-02-01,True,"['dutch', 'chinese']",29,bachelor,"Designer, textile",['wine'],"{'lat': 40.360583803214006, 'lon': 18.17052799...",10
1,1,male,Daniel Ibarra,1916-10-03,True,['bulgarian'],29,phd,Social researcher,['archeology'],"{'lat': 40.351701701073104, 'lon': 18.17094077...",6
2,2,female,Wendy Anderson,1945-02-24,True,"['french', 'italian']",28,master,English as a second language teacher,"['museums', 'food', 'wine']","{'lat': 40.35688994865342, 'lon': 18.178920648...",4
3,3,male,Robert Adams,1915-04-10,True,['english'],31,elementary,"Librarian, public",[],"{'lat': 40.352684858988965, 'lon': 18.17184990...",4
4,4,female,Grace Sparks,1940-04-08,True,"['spanish', 'bulgarian']",33,master,Copy,['literature'],"{'lat': 40.353073531674546, 'lon': 18.17082903...",4
5,5,male,Michael Christian,1995-03-25,True,['chinese'],28,high-school,Marine scientist,"['music', 'history']","{'lat': 40.36338197601472, 'lon': 18.181614978...",6
6,6,male,George Reeves,2022-07-16,True,"['english', 'chinese']",29,high-school,Public relations account executive,[],"{'lat': 40.345630619548274, 'lon': 18.16613237...",3
7,7,female,Cindy Molina,1954-05-16,True,['chinese'],36,bachelor,Insurance account manager,"['literature', 'museums']","{'lat': 40.36147220160234, 'lon': 18.172627891...",5
8,8,female,Amanda Roberts,1977-09-25,True,['bulgarian'],33,bachelor,Land,"['cinema', 'museums', 'literature', 'history']","{'lat': 40.344907525808964, 'lon': 18.18945843...",7
9,9,female,Alyssa Hill,2018-03-27,True,['deutsche'],32,bachelor,"Scientist, physiological","['food', 'music', 'cinema', 'literature', 'mus...","{'lat': 40.33631630303364, 'lon': 18.175822508...",1


In [4]:
# Load dataframe for ratings

# The context manager closes the file handle as soon as parsing is done
# (the bare open() call left it open for the lifetime of the kernel).
with open('ratings.csv') as rating_file:
    rating_df = pd.read_csv(
        filepath_or_buffer = rating_file,
        sep = ';',
        header = 0
    )

rating_df

Unnamed: 0.1,Unnamed: 0,0,1,2
0,0,0,13,3.0
1,1,0,8,2.0
2,2,1,6,4.0
3,3,1,9,4.0
4,4,1,18,2.0
...,...,...,...,...
167,167,96,17,4.0
168,168,96,8,4.0
169,169,97,0,4.0
170,170,98,3,4.0


In [5]:
# Drop the leftover first column and give the remaining columns meaningful names.
# The guard makes the cell idempotent: once the frame already has the target
# schema, re-running must not drop 'tourist_id' and shift the other names.
RATING_COLUMNS = ['tourist_id', 'guide_id', 'rating']

if list(rating_df.columns) != RATING_COLUMNS:
    rating_df = rating_df.drop(columns=rating_df.columns[0])
    rating_df = rating_df.rename(columns={rating_df.columns[0]: RATING_COLUMNS[0],
                                          rating_df.columns[1]: RATING_COLUMNS[1],
                                          rating_df.columns[2]: RATING_COLUMNS[2],
                                         })

rating_df

Unnamed: 0,tourist_id,guide_id,rating
0,0,13,3.0
1,0,8,2.0
2,1,6,4.0
3,1,9,4.0
4,1,18,2.0
...,...,...,...
167,96,17,4.0
168,96,8,4.0
169,97,0,4.0
170,98,3,4.0


In [6]:
# Distinct tourist and guide ids, plus basic interaction statistics
arr_tourists = tourist_df["id"].unique()
arr_guides = guide_df["id"].unique()

n_tourists, n_guides = len(arr_tourists), len(arr_guides)
n_interactions = len(rating_df)

# Mean number of ratings per tourist / per guide
print("Average interaction per tourist {:.2f}".format(n_interactions / n_tourists))
print("Average interaction per guide {:.2f}".format(n_interactions / n_guides))
# Share of (tourist, guide) pairs that have no rating, as a percentage
print("Sparsity {:.2f} %".format((1 - n_interactions / (n_guides * n_tourists)) * 100))

Average interaction per tourist 1.72
Average interaction per guide 8.60
Sparsity 91.40 %


In [7]:
# Create the User Rating Matrix (rows: tourist ids, columns: guide ids).
#
# The shape is passed explicitly. Without it the constructor infers the shape
# from the largest ids that appear in the ratings, so a tourist or guide with the
# highest id but no ratings would have no row/column and could not be scored.
# Ids are used directly as matrix positions, so they are assumed to be
# non-negative integers.
urm_n_rows = int(max(arr_tourists.max(), rating_df["tourist_id"].max())) + 1
urm_n_cols = int(max(arr_guides.max(), rating_df["guide_id"].max())) + 1

URM_all = sps.csr_matrix(
    (rating_df["rating"].values,
     (rating_df["tourist_id"].values, rating_df["guide_id"].values)),
    shape=(urm_n_rows, urm_n_cols),
)

URM_all

<100x20 sparse matrix of type '<class 'numpy.float64'>'
	with 172 stored elements in Compressed Sparse Row format>

In [8]:
# No hold-out split is made here: the training matrix is the full rating matrix
# (URM_train refers to the same object as URM_all, it is not a copy).
URM_train = URM_all

## Build and fit the recommender

### Collaborative Filtering

In [9]:
from Recommenders.Compute_Similarity_Python import Compute_Similarity_Python

In [10]:
class ItemKNNCFRecommender(object):
    """Item-based KNN collaborative-filtering recommender.

    Item scores for a user are the product of the user's row of the rating
    matrix with the similarity matrix produced by ``fit``.
    """

    def __init__(self, URM):
        """Store the user rating matrix.

        Parameters
        ----------
        URM : scipy sparse matrix or array-like
            User x item rating matrix. It is converted to CSR because
            ``recommend`` slices rows and ``filter_seen`` reads ``indptr`` /
            ``indices``; a matrix already in CSR format is kept as is.
        """
        self.URM = URM.tocsr() if sps.issparse(URM) else sps.csr_matrix(URM)

    def fit(self, topK=5, shrink=3, normalize=True, similarity="cosine"):
        """Compute the similarity matrix and store it in ``self.W_sparse``.

        All arguments are forwarded unchanged to ``Compute_Similarity_Python``;
        see that class for their exact meaning.
        """
        similarity_object = Compute_Similarity_Python(self.URM, shrink=shrink,
                                                      topK=topK, normalize=normalize,
                                                      similarity=similarity)

        self.W_sparse = similarity_object.compute_similarity()

    def recommend(self, user_id, at=None, exclude_seen=True):
        """Return item indices ranked by decreasing score for ``user_id``.

        Parameters
        ----------
        user_id : int
            Row index of the user in the rating matrix.
        at : int or None
            Maximum number of items to return; ``None`` returns every eligible item.
        exclude_seen : bool
            When true, items stored in the user's row are removed from the
            result, so it may hold fewer than ``at`` items.

        Returns
        -------
        numpy.ndarray
            Item indices, best first. ``fit`` must have been called before.
        """
        # compute the scores using the dot product
        user_profile = self.URM[user_id]
        scores = user_profile.dot(self.W_sparse).toarray().ravel()

        if exclude_seen:
            scores = self.filter_seen(user_id, scores)

        # rank items
        ranking = scores.argsort()[::-1]

        if exclude_seen:
            # Seen items carry a score of -inf and sort last, but they would still
            # be returned when `at` is None or exceeds the number of unseen items.
            ranking = ranking[~np.isin(ranking, self._seen_items(user_id))]

        return ranking[:at]

    def _seen_items(self, user_id):
        """Return the column indices stored in the CSR row of ``user_id``."""
        start_pos = self.URM.indptr[user_id]
        end_pos = self.URM.indptr[user_id + 1]

        return self.URM.indices[start_pos:end_pos]

    def filter_seen(self, user_id, scores):
        """Set the score of every item stored in the user's row to ``-inf``.

        ``scores`` is modified in place and also returned.
        """
        scores[self._seen_items(user_id)] = -np.inf

        return scores

In [11]:
# Build the item-based KNN model on the training matrix and compute similarities
recommender = ItemKNNCFRecommender(URM=URM_train)
recommender.fit(topK=5, shrink=0.5)

Similarity column 20 (100.0%), 20001.43 column/sec. Elapsed time 0.00 sec


## Generate recommendations

In [12]:
# One space-separated string of recommended guide ids per tourist,
# in the same order as arr_tourists.
recommendations = []

# Iterating the array directly lets tqdm show the total, and the loop variable
# no longer shadows the builtin `id` for the rest of the notebook.
for tourist_id in tqdm(arr_tourists):
    top_guides = recommender.recommend(tourist_id, at=3, exclude_seen=True)
    recommendations.append(' '.join(str(guide_id) for guide_id in top_guides))

100it [00:00, 9091.37it/s]


In [13]:
# Preview the first ten recommendation strings
recommendations[:10]

['6 5 17',
 '13 7 15',
 '5 14 3',
 '13 7 11',
 '16 19 18',
 '14 12 19',
 '1 19 13',
 '14 3 16',
 '5 14 3',
 '17 10 5']

In [14]:
# Pair every tourist id with its recommendation string
result_columns = {
    'tourist_id': arr_tourists,
    'guides': recommendations,
}
result_df = pd.DataFrame(data=result_columns)

result_df

Unnamed: 0,tourist_id,guides
0,0,6 5 17
1,1,13 7 15
2,2,5 14 3
3,3,13 7 11
4,4,16 19 18
...,...,...
95,95,8 5 9
96,96,13 12 18
97,97,3 7 19
98,98,1 8 19


In [16]:
# Inspect the guides recommended to one sample tourist.
# `sample_tourist` is a position in `recommendations`. It is defined here so the
# cell runs on a fresh kernel; the value 1 reproduces the output recorded below
# (guides 13, 7, 15).
sample_tourist = 1

sample_guide_list = [int(guide_id) for guide_id in recommendations[sample_tourist].split()]

# .loc looks rows up by index label, so this relies on guide_df's index matching
# the guide ids (as it does in the displayed frame).
guide_df.loc[sample_guide_list, :]

Unnamed: 0,id,gender,name,birth_date,now_available,languages_spoken,price,education,biography,keywords,current_location,experience
13,13,female,Pamela Maldonado,2014-01-09,True,['italian'],22,bachelor,"Sound technician, broadcasting/film/video","['music', 'museums']","{'lat': 40.34084957083049, 'lon': 18.187107603...",10
7,7,female,Cindy Molina,1954-05-16,True,['chinese'],36,bachelor,Insurance account manager,"['literature', 'museums']","{'lat': 40.36147220160234, 'lon': 18.172627891...",5
15,15,male,Dylan Miller,1975-07-05,True,"['deutsche', 'dutch']",31,phd,Product designer,[],"{'lat': 40.35998434462983, 'lon': 18.170060702...",1
