# Correlation Recommendation System

### An example of measuring the correlation between items based on their ratings


In [1]:
import numpy as np
import pandas as pd
import os
# Directory that holds the restaurant-rating CSV files, relative to the notebook.
# Built with os.path.join so the separator is correct on every OS (a literal
# backslash is only a path separator on Windows).
base_dir = os.path.join("datasets", "RCdata")


In [2]:
# Load the three source tables from base_dir: user ratings, cuisine types,
# and place metadata (the place file is read as ISO-8859-1).
ratings_csv = os.path.join(base_dir, "rating_final.csv")
cuisine_csv = os.path.join(base_dir, "chefmozcuisine.csv")
geoplaces_csv = os.path.join(base_dir, "geoplaces2.csv")

frame = pd.read_csv(ratings_csv)
cuisine = pd.read_csv(cuisine_csv)
geodata = pd.read_csv(geoplaces_csv, encoding="ISO-8859-1")


In [3]:
# Preview the user ratings of places, keyed by placeID
# (ratings in this dataset range from 0 to 2, which keeps it simple)
frame.head()


Unnamed: 0,userID,placeID,rating,food_rating,service_rating
0,U1077,135085,2,2,2
1,U1077,135038,2,2,1
2,U1077,132825,2,2,2
3,U1077,135060,1,2,2
4,U1068,135104,1,1,2


In [4]:
# Keep only the ID and name of each place; this is the lookup table used
# further down to translate a placeID into a readable name.
# No separate geodata.head() here: a cell displays only its last expression,
# so a bare preview in the middle of the cell has no effect.
places = geodata[['placeID', 'name']]
places.head()


Unnamed: 0,placeID,name
0,134999,Kiku Cuernavaca
1,132825,puesto de tacos
2,135106,El Rincón de San Francisco
3,132667,little pizza Emilio Portes Gil
4,132613,carnitas_mata


In [5]:
# Preview the cuisine table, which maps each placeID to a cuisine type (Rcuisine)
cuisine.head()


Unnamed: 0,placeID,Rcuisine
0,135110,Spanish
1,135109,Italian
2,135107,Latin_American
3,135106,Mexican
4,135105,Fast_Food


## Grouping and Ranking Data


In [6]:
# Average rating per place, stored as a one-column frame indexed by placeID
mean_by_place = frame.groupby('placeID')['rating'].mean()
rating = mean_by_place.to_frame()
rating.head()


Unnamed: 0_level_0,rating
placeID,Unnamed: 1_level_1
132560,0.5
132561,0.75
132564,1.25
132572,1.0
132583,1.0


In [7]:
# Add the number of reviews each place received.
# The groupby result is already a Series indexed by placeID, so it is assigned
# directly and aligned on the index; wrapping it in a DataFrame is unnecessary.
rating['rating_count'] = frame.groupby('placeID')['rating'].count()
rating.head()


Unnamed: 0_level_0,rating,rating_count
placeID,Unnamed: 1_level_1,Unnamed: 2_level_1
132560,0.5,4
132561,0.75,4
132564,1.25,4
132572,1.0,15
132583,1.0,4


In [8]:
# Summary statistics for the average rating and the review count per place
rating.describe()
# There are 130 unique places that have been reviewed in the dataframe
# The max value means that the most popular place in the dataset got 36 reviews


Unnamed: 0,rating,rating_count
count,130.0,130.0
mean,1.179622,8.930769
std,0.349354,6.124279
min,0.25,3.0
25%,1.0,5.0
50%,1.181818,7.0
75%,1.4,11.0
max,2.0,36.0


In [9]:
# Rank places by review count, most-reviewed first, and show the top rows
by_review_count = rating.sort_values(by='rating_count', ascending=False)
by_review_count.head()


Unnamed: 0_level_0,rating,rating_count
placeID,Unnamed: 1_level_1,Unnamed: 2_level_1
135085,1.333333,36
132825,1.28125,32
135032,1.178571,28
135052,1.28,25
132834,1.0,25


In [10]:
# Look up the name of the most-reviewed place (placeID 135085)
name_row_mask = places['placeID'] == 135085
places[name_row_mask]


Unnamed: 0,placeID,name
121,135085,Tortas Locas Hipocampo


In [11]:
# Look up the cuisine type listed for the most-reviewed place (placeID 135085)
cuisine_row_mask = cuisine['placeID'] == 135085
cuisine[cuisine_row_mask]


Unnamed: 0,placeID,Rcuisine
44,135085,Fast_Food


## Preparing Data for Analysis


In [13]:
# Build a user-by-place matrix of ratings (rows: userID, columns: placeID)
places_crosstab = frame.pivot_table(
    values='rating', index='userID', columns='placeID')
places_crosstab.head()
# The crosstab is full of NaN values - NaN means the user in that row didn't review the restaurant in that column


placeID,132560,132561,132564,132572,132583,132584,132594,132608,132609,132613,...,135080,135081,135082,135085,135086,135088,135104,135106,135108,135109
userID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
U1001,,,,,,,,,,,...,,,,0.0,,,,,,
U1002,,,,,,,,,,,...,,,,1.0,,,,1.0,,
U1003,,,,,,,,,,,...,2.0,,,,,,,,,
U1004,,,,,,,,,,,...,,,,,,,,2.0,,
U1005,,,,,,,,,,,...,,,,,,,,,,


In [17]:
# Isolate the user ratings of the most popular restaurant (tortas, placeID 135085)
tortas_ratings = places_crosstab[135085]
# Show only the users who actually rated it. dropna() states that intent
# directly instead of relying on `>= 0` evaluating to False for NaN.
tortas_ratings.dropna()
# Appending .size gives 36 reviews - as expected from the rating_count above


userID
U1001    0.0
U1002    1.0
U1007    1.0
U1013    1.0
U1016    2.0
U1027    1.0
U1029    1.0
U1032    1.0
U1033    2.0
U1036    2.0
U1045    2.0
U1046    1.0
U1049    0.0
U1056    2.0
U1059    2.0
U1062    0.0
U1077    2.0
U1081    1.0
U1084    2.0
U1086    2.0
U1089    1.0
U1090    2.0
U1092    0.0
U1098    1.0
U1104    2.0
U1106    2.0
U1108    1.0
U1109    2.0
U1113    1.0
U1116    2.0
U1120    0.0
U1122    2.0
U1132    2.0
U1134    2.0
U1135    0.0
U1137    2.0
Name: 135085, dtype: float64

## Evaluating Similarity Based on Correlation


In [21]:
# Pearson correlation between every place's ratings and the tortas ratings
similar_to_tortas = places_crosstab.corrwith(tortas_ratings)
# Wrap the result in a single-column frame and discard places with no defined correlation
corr_tortas = similar_to_tortas.to_frame(name='PearsonR').dropna()
corr_tortas.head()
# These correlations might not be significant, so we still need to filter out data that could distort the results


  c = cov(x, y, rowvar, dtype=dtype)
  c *= np.true_divide(1, fact)


Unnamed: 0_level_0,PearsonR
placeID,Unnamed: 1_level_1
132572,-0.428571
132723,0.301511
132754,0.930261
132825,0.700745
132834,0.814823


In [28]:
# Attach each place's review count to its correlation with tortas
tortas_corr_summary = corr_tortas.join(rating['rating_count'])
# Keep places with at least 10 reviews, ordered from most to least correlated; show the top 10
has_enough_reviews = tortas_corr_summary['rating_count'] >= 10
tortas_corr_summary = tortas_corr_summary[has_enough_reviews].sort_values(
    'PearsonR', ascending=False)
tortas_corr_summary.head(10)
# The PearsonR values of 1.0 are not meaningful here: 135085 is tortas itself, and the
# others presumably share too few reviewers with tortas (unverified - taken on trust from the original tutorial)


Unnamed: 0_level_0,PearsonR,rating_count
placeID,Unnamed: 1_level_1,Unnamed: 2_level_1
135076,1.0,13
135085,1.0,36
135066,1.0,12
132754,0.930261,13
135045,0.912871,13
135062,0.898933,21
135028,0.892218,15
135042,0.881409,20
135046,0.867722,11
132872,0.840168,12


In [47]:
# The seven places from the ranking above whose correlation with tortas is below 1.0
top_corr_place_ids = [132754, 135045, 135062, 135028, 135042, 135046, 132872]
places_corr_tortas_real = pd.DataFrame(
    {'placeID': top_corr_place_ids}, index=np.arange(len(top_corr_place_ids)))
# Attach the cuisine type of each of those places
summary = places_corr_tortas_real.merge(cuisine, on='placeID')
summary
# Only places that also appear in the cuisine dataset survive the merge, which is why we see 5 rows instead of 7


Unnamed: 0,placeID,Rcuisine
0,132754,Mexican
1,135028,Mexican
2,135042,Chinese
3,135046,Fast_Food
4,132872,American


In [50]:
# Look up the name of placeID 135046, picked from the list above because it shares tortas' cuisine (Fast_Food)
match_mask = places['placeID'] == 135046
places[match_mask]


Unnamed: 0,placeID,name
42,135046,Restaurante El Reyecito


In [52]:
# Summarize the cuisine labels: row count, number of distinct values, and the most frequent label
cuisine['Rcuisine'].describe()


count         916
unique         59
top       Mexican
freq          239
Name: Rcuisine, dtype: object