# Groeps Opdracht CI
## Joost Vledder, Sadjia Safdari, Simon Kreulen & Jasper van Eck

### Inhoudsopgave <a name='Top'></a>

[Imports](#import)

[Load data](#loaddata)

[Data Clean Up](#cleanup)

[Basic Data Stats](#basicdata)

[Plots](#plots)

[Cosine Similarity](#cossim)

[Query Vector](#queryvector)

[SVD](#svd)

[K-Means](#kmeans)

[Cohen's Kappa](#cohenkappa)

### Imports <a name='import'></a>

[Top](#Top)

In [1]:
import os
import pandas as pd
import numpy as np
np.seterr(divide='raise', over='raise', under='raise', invalid='raise')
import matplotlib.pyplot as plt
from collections import defaultdict
import json
#pip install geopy
from geopy import geocoders
from geopy.geocoders import Nominatim
from sklearn.cluster import KMeans
from sklearn.decomposition import TruncatedSVD

### Load data <a name='loaddata'></a>

[Top](#Top)

In [None]:
# Folder that is walked for the per-city Yelp files
#rootdir = './yelp/data'
rootdir = './yelp/yelp/data'

# Empty frames that the walk below extends, one per file type
df_business, df_users, df_reviews = (pd.DataFrame() for _ in range(3))

# Number of directories visited so far by the walk
count = 0

def load_jsons(data_path, file):
    """Load a JSON-lines file (one JSON object per line) into a DataFrame.

    The Yelp '.json' files are not single JSON documents, so every
    non-blank line is parsed on its own.

    Parameters:
        data_path: directory that contains the file.
        file: file name inside ``data_path``.

    Returns:
        A DataFrame with one row per parsed line. If the file cannot be
        decoded or any line is not valid JSON, the file path is printed
        and an empty DataFrame is returned (the file is skipped as a whole).
    """
    # Build the path from the argument instead of the caller's loop variable
    file_path = os.path.join(data_path, file)
    lines = []
    with open(file_path, encoding='utf-8') as jsons:
        try:
            lines = [json.loads(json_line) for json_line in jsons if json_line.strip()]
        except ValueError:
            # json.JSONDecodeError and UnicodeDecodeError are both ValueErrors
            print(file_path)
    return pd.DataFrame(lines)

# Frames are collected per file type and concatenated once after the walk.
# DataFrame.append was removed in pandas 2.0 and re-copied the whole
# accumulated frame on every call.
business_frames, user_frames, review_frames = [], [], []
frames_by_file = {
    'business.json': business_frames,
    'user.json': user_frames,
    'review.json': review_frames,
}

for subdir, dirs, files in os.walk(rootdir):
    # Stop after 200 visited directories (every directory yielded by
    # os.walk is counted, including rootdir itself)
    if count == 200: #1078 is the total amount of cities
        break

    for file in files:
        target_frames = frames_by_file.get(os.path.basename(file))
        if target_frames is not None:
            target_frames.append(load_jsons(subdir, file))
    count += 1

# Starting each list with the existing frame keeps pd.concat valid even
# when no matching file was found
df_business = pd.concat([df_business, *business_frames])
df_users = pd.concat([df_users, *user_frames])
df_reviews = pd.concat([df_reviews, *review_frames])

# Use the Yelp identifiers as row index
df_business = df_business.set_index('business_id')
df_users = df_users.set_index('user_id')
df_reviews = df_reviews.set_index('review_id')

display(df_business.head())
display(df_users.head())
display(df_reviews.head())

In [None]:
# Write the loaded frames to disk so later cells can reload them
# without re-reading the JSON files
df_business.to_pickle('./business_pickle.pkl')
df_users.to_pickle('./users_pickle.pkl')
df_reviews.to_pickle('./reviews_pickle.pkl')

In [None]:
# Reload the three frames from their pickle files
# NOTE(review): unpickling can execute arbitrary code; only load pickle
# files that were produced by this notebook
df_business = pd.read_pickle('./business_pickle.pkl')
df_users = pd.read_pickle('./users_pickle.pkl')
df_reviews = pd.read_pickle('./reviews_pickle.pkl')

### Clean up Data For Content Based <a name='cleanup'></a>

[Top](#Top)

In [None]:
# Category flags per business: cat_dict[business_id][category] == 1
cat_dict = defaultdict(lambda: defaultdict(int))

# Drop na categories. The explicit copy makes the column assignments below
# write to an independent frame instead of a filtered slice, which avoids
# pandas' SettingWithCopyWarning.
df_business = df_business[df_business['categories'].notna()].copy()

# Min-max scale stars & review_count to the range [0, 1].
# Latitude and longitude are left as raw values; the x/y projection below
# is disabled.
# NOTE(review): np.cos/np.sin expect radians, while the coordinates look
# like degrees (e.g. 36.1, -115.1) - convert before re-enabling.
#x = cos(lat) * cos(lon)
#y = cos(lat) * sin(lon)
df_business['stars'] = (df_business['stars']-df_business['stars'].min()) / (df_business['stars'].max()-df_business['stars'].min())
df_business['review_count'] = (df_business['review_count']-df_business['review_count'].min()) / (df_business['review_count'].max()-df_business['review_count'].min())
#df_business['x_axis'] = np.cos(df_business['latitude']) * np.cos(df_business['longitude'])
#df_business['y_axis'] = np.cos(df_business['latitude']) * np.sin(df_business['longitude'])

# Split the comma separated category string into one column per position
# (not one-hot yet; shorter rows are padded with missing values)
df_tmp = df_business['categories'].str.split(pat=', ',expand=True)

# Create dict of categories; padding and empty strings are skipped
for index, row in df_tmp.iterrows():
    for _,elem in row.items():
        if elem and elem not in cat_dict[index]:
            cat_dict[index][elem] += 1

In [None]:
# One row per business, one column per category; a missing flag becomes 0
# and a literal 'NaN' column is removed if it exists
df_cats = (
    pd.DataFrame.from_dict(cat_dict, orient='index')
    .drop(columns='NaN', errors='ignore')
    .fillna(0)
)

# Attach the category columns to the businesses, then remove the columns
# that are not used further on
df_business = (
    df_business
    .join(df_cats, on='business_id')
    .drop(
        columns=['address','state','postal_code','attributes','categories','hours'],
        errors='ignore',
    )
)

# elite & friends are not used, so they are removed from the users
df_users = df_users.drop(columns=['elite','friends'], errors='ignore')

In [None]:
# Preview the first rows of the business frame
display(df_business.head())

In [None]:
# Persist the current frames. The business/users/reviews paths are the same
# ones used by the earlier save cell, so those files are overwritten.
df_business.to_pickle('./business_pickle.pkl')
df_cats.to_pickle('./cats_pickle.pkl')
df_users.to_pickle('./users_pickle.pkl')
df_reviews.to_pickle('./reviews_pickle.pkl')

In [None]:
# Reload the four frames from their pickle files
# NOTE(review): unpickling can execute arbitrary code; only load pickle
# files that were produced by this notebook
df_business = pd.read_pickle('./business_pickle.pkl')
df_cats = pd.read_pickle('./cats_pickle.pkl')
df_users = pd.read_pickle('./users_pickle.pkl')
df_reviews = pd.read_pickle('./reviews_pickle.pkl')

### Basic Data Stats for Content Based <a name='basicdata'></a>

[Top](#Top)

In [None]:
# Category counts per business

# Row sums of the category matrix: number of categories of each business
cats_business = df_cats.sum(axis='columns')
display(cats_business)

avg_cats_bus, max_cats_bus, min_cats_bus = (
    cats_business.mean(),
    cats_business.max(),
    cats_business.min(),
)

print('Average amount of categories for businesses: ', avg_cats_bus)
print('Minimum amount of categories for businesses: ', min_cats_bus)
print('Maximum amount of categories for businesses: ', max_cats_bus)

# Index: number of categories, value: how many businesses have that number
cat_bus_distribution = cats_business.value_counts()
display(cat_bus_distribution)

In [None]:
# series of number of businesses with index as category
cats_presences = df_cats.sum(axis=0).sort_values(ascending=False)
display(cats_presences)

avg_cats = cats_presences.mean()
max_cats = cats_presences.max()
min_cats = cats_presences.min()

### Plots <a name='plots'></a>

[Top](#Top)

In [None]:
# Histograms of the stars and review_count columns
hist_bus = df_business.hist(column=['stars','review_count'])

# Scatter of the business coordinates
# NOTE(review): latitude is on the x-axis, so the picture is transposed
# compared to a usual map - confirm this is intended
df_business.plot.scatter(x='latitude', y='longitude')
plt.show()

# Number of businesses per category, in the sorted order of cats_presences
cats_presences.plot()

In [None]:
# Reload the frames from disk; in-memory changes made since they were
# pickled are discarded
# NOTE(review): unpickling can execute arbitrary code; only load pickle
# files that were produced by this notebook
df_business = pd.read_pickle('./business_pickle.pkl')
df_cats = pd.read_pickle('./cats_pickle.pkl')
df_users = pd.read_pickle('./users_pickle.pkl')
df_reviews = pd.read_pickle('./reviews_pickle.pkl')

### Basic Data Stats for Collaborative Based <a name='basicdatacollab'></a>

[Top](#Top)

In [None]:
# reset index
df_reviews = df_reviews.reset_index()

# count reviews per user and reviews per business
reviews_per_user = df_reviews.groupby("user_id")["review_id"].count().sort_values(ascending=False).to_frame()
reviews_per_business = df_reviews.groupby("business_id")["review_id"].count().sort_values(ascending=False).to_frame()

# plot both review counts
reviews_per_user.plot()
reviews_per_business.plot()

In [2]:
# Reload the frames from disk; in-memory changes made since they were
# pickled (such as the reset_index on df_reviews) are discarded
# NOTE(review): unpickling can execute arbitrary code; only load pickle
# files that were produced by this notebook
df_business = pd.read_pickle('./business_pickle.pkl')
df_cats = pd.read_pickle('./cats_pickle.pkl')
df_users = pd.read_pickle('./users_pickle.pkl')
df_reviews = pd.read_pickle('./reviews_pickle.pkl')

### Cosine Similarity <a name='cossim'></a>

[Top](#Top)

In [3]:
# Input vectors may be Series or any array-like, both are converted to np arrays
def cosineSim(vector, docVector):
    """Return the cosine similarity of two equally long numeric vectors.

    The vectors are compared by position, labels of a Series are ignored.

    Parameters:
        vector: pandas Series or array-like of numbers.
        docVector: pandas Series or array-like of numbers, same length.

    Returns:
        dot(vector, docVector) / (|vector| * |docVector|), or np.nan when
        one of the vectors has length zero.
    """
    vector = np.asarray(vector, dtype=float)
    docVector = np.asarray(docVector, dtype=float)

    # Product of the vector lengths; the dot products run inside numpy
    # instead of summing element by element with the builtin sum
    sim = np.sqrt(vector.dot(vector))*np.sqrt(docVector.dot(docVector))

    # Ensure no division by 0
    if sim == 0.:
        return np.nan

    # Calculate cosine sim
    return vector.dot(docVector)/sim

In [4]:
def rankedBusinessList(df_business, queryVector):
    """Score every row against the query vector and return the rows ranked.

    The feature columns are all columns except 'name', 'city', 'stars',
    'review_count', 'is_open' and 'Score Cos' (those that exist). They are
    compared with queryVector by position, so queryVector must list its
    values in the same order as the feature columns.

    Parameters:
        df_business: DataFrame with numeric feature columns and a 'stars'
            column. A 'Score Cos' column is added to (or overwritten in)
            this frame.
        queryVector: pandas Series or array-like with one value per
            feature column.

    Returns:
        df_business sorted by 'Score Cos' and then 'stars', both descending.
        Rows for which the similarity is undefined (zero-length row or
        query vector) get NaN and are sorted last.
    """
    features = df_business.drop(
        columns=['name','city','stars','review_count','is_open','Score Cos'],
        errors='ignore',
    ).to_numpy(dtype=float)
    query = np.asarray(queryVector, dtype=float)

    # All cosine similarities with one matrix-vector product instead of a
    # Python loop over the rows
    dot_products = features @ query
    norm_products = np.linalg.norm(features, axis=1) * np.sqrt(query.dot(query))

    # Only divide where the lengths are non-zero; the rest stays NaN
    scoreList = np.full(len(df_business), np.nan)
    defined = norm_products != 0.
    scoreList[defined] = dot_products[defined] / norm_products[defined]

    df_business['Score Cos'] = scoreList
    return df_business.sort_values(by=['Score Cos','stars'],ascending=False)

### Create Query Vector <a name='queryvector'></a>

[Top](#Top)

This first cell gets the latitude and longitude of a given city and state and stores them in the query vector. The values are used as-is: the x/y normalization is currently commented out.

In [5]:
# Look up the coordinates of the requested place
loc_string = 'Las Vegas NV'
gn = Nominatim(user_agent='WalterGKurtz')
location = gn.geocode(loc_string)

# geocode() returns None when nothing was found; fail with a clear message
# instead of an AttributeError on the lines below
if location is None:
    raise ValueError('Could not geocode location: {!r}'.format(loc_string))

# Query vector with one entry per business column, all 0 except the location
queryVector = pd.Series(data=0.,index=df_business.columns)
queryVector.at['latitude'] = location.latitude
queryVector.at['longitude'] = location.longitude
# NOTE(review): the raw coordinates are much larger than the 0/1 category
# flags and therefore dominate the cosine score (all top scores in the
# output are ~0.9997) - consider scaling them
#x = cos(lat) * cos(lon)
#y = cos(lat) * sin(lon)
#queryVector.at['x_axis'] = np.cos(location.latitude) * np.cos(location.longitude)
#queryVector.at['y_axis'] = np.cos(location.latitude) * np.sin(location.longitude)


The second cell inputs the preferences of the user.

In [6]:
# Preferences
# Remove the entries that are not features. 'Score Cos' is removed as well:
# the ranking cell adds that column to df_business, so after a re-run the
# query vector would otherwise be one entry longer than the feature columns.
queryVector = queryVector.drop(labels=['name','city','stars','review_count','is_open','Score Cos'],errors='ignore')

# Switch on the preferred categories
# NOTE(review): assigning through .at with a label that does not exist yet
# appends a new entry, which would also break the alignment with the
# feature columns - verify that every category below is a business column
for category in (
    'Automotive',
    'Auto Repair',
    'Fast Food',
    'Restaurants',
    'Pizza',
    'Men\'s Clothing',
    'Women\'s Clothing',
    'Fashion',
    'Bars',
):
    queryVector.at[category] = 1.
display(queryVector)
queryVector.to_csv('test.csv', header=False)

latitude                      36.167256
longitude                   -115.148516
Restaurants                    1.000000
Mexican                        0.000000
Canadian (New)                 0.000000
Seafood                        0.000000
Nightlife                      0.000000
Bars                           1.000000
Event Planning & Services      0.000000
Venues & Event Spaces          0.000000
Lounges                        0.000000
Hotels                         0.000000
Hotels & Travel                0.000000
Cajun/Creole                   0.000000
Shaved Ice                     0.000000
Desserts                       0.000000
Food                           0.000000
Fashion                        1.000000
Department Stores              0.000000
Shopping                       0.000000
Auto Repair                    1.000000
Automotive                     1.000000
Barbers                        0.000000
Hair Salons                    0.000000
Local Flavor                   0.000000


In [7]:
# Rank all businesses against the query vector and show the first 20 rows
# (rankedBusinessList also stores the scores in df_business['Score Cos'])
display(rankedBusinessList(df_business,queryVector).head(20))

Unnamed: 0_level_0,city,is_open,latitude,longitude,name,review_count,stars,Restaurants,Mexican,Canadian (New),...,Virtual Reality Centers,Horse Racing,Fireworks,Marinas,Planetarium,Glass Blowing,Land Surveying,Sugar Shacks,Surf Schools,Score Cos
business_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0mpL21omHLi4mHWs3Ee29g,"Las Vegas, NV",1,36.127524,-115.1715,Diesel,0.003385,0.75,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.999725
ot9iwwY_oSzf9VWkEl4XkA,Scottsdale,1,33.590424,-111.835572,Papa John's Pizza,0.009188,0.375,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.999715
6xzOn_fxkIaHVxjtb_n4RQ,Scottsdale,1,33.625153,-111.925243,Express,0.002418,0.625,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.999714
xAl2BxNhlimcXZZQzdNRzw,Scottsdale,1,33.585413,-111.925029,MOD Pizza,0.031431,0.75,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.999712
qVtPl0PtlynFEKz8Qa3RcQ,Mesa,1,33.472347,-111.701723,MOD Pizza,0.010638,0.75,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.999707
StnWWcLsXBAhEqsbQkZgiw,Scottsdale,1,33.479058,-111.909497,Little Caesars Pizza,0.008704,0.375,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.999701
phslBiJbYRg0xxXUkk1baQ,Scottsdale,0,33.470445,-111.925769,Papa John's Pizza,0.011122,0.375,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.999699
WQAaGMguMmnxQdgMeEFY9g,Mesa,1,33.436141,-111.815404,Little Caesar's Pizza,0.004352,0.75,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.999699
ZI6CBpptCb1Nw-udZ8wonw,Mesa,0,33.417347,-111.788637,Pizza Hut,0.0,0.125,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.999698
mJ0CA86roChYwaDkDxFGwg,Mesa,1,33.432942,-111.843711,Papa John's Pizza,0.002418,0.5,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.999698


### SVD <a name='svd'></a>

[Top](#Top)

In [8]:
# Keep the columns that are re-attached to the reduced matrix later on
stars = df_business['stars'].values
cities = df_business['city'].values
bus_ids = df_business.index

# Feature matrix: everything except the descriptive and score columns
# (drop already returns a new frame, so no extra copy is needed)
df_tmp_svd = df_business.drop(columns=['name','city', 'stars','review_count','is_open', 'Score Cos'],errors='ignore').to_numpy()

# Reduce to 500 components. The fixed random_state makes the randomized
# solver reproducible between runs, like the seeded KMeans further down.
trunc_svd = TruncatedSVD(n_components=500, random_state=0).fit(df_tmp_svd)
reduced_x = trunc_svd.transform(df_tmp_svd)
singles = trunc_svd.singular_values_

In [9]:
# Project the query vector (as a single-row matrix) into the reduced space
queryVectorReduced = trunc_svd.transform(queryVector.to_numpy().reshape(1, -1))

# Reduced business matrix with city and stars as its first two columns
df_reduced_x = pd.DataFrame(data=reduced_x, index=bus_ids)
df_reduced_x.insert(0, 'city', cities)
df_reduced_x.insert(1, 'stars', stars)

# Rank the businesses in the reduced space and show the first 20 rows
df_queryVectorReduced = pd.Series(data=queryVectorReduced[0])
display(rankedBusinessList(df_reduced_x, df_queryVectorReduced).head(20))

Unnamed: 0_level_0,city,stars,0,1,2,3,4,5,6,7,...,491,492,493,494,495,496,497,498,499,Score Cos
business_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0mpL21omHLi4mHWs3Ee29g,"Las Vegas, NV",0.75,120.411027,-8.436954,-0.535224,-0.900672,0.333026,0.616635,-0.135435,-0.124738,...,0.003471,0.004009,0.000947,-0.004215,-0.00111,0.000247,0.001601,-0.001218,-0.00254,0.999726
ot9iwwY_oSzf9VWkEl4XkA,Scottsdale,0.375,116.380048,-9.561375,0.915494,-0.008312,-0.143732,0.246353,0.07397,-0.544503,...,0.000569,0.000426,-0.000437,-7.5e-05,-0.000477,-0.000402,-0.00078,0.001707,-0.000418,0.999716
6xzOn_fxkIaHVxjtb_n4RQ,Scottsdale,0.625,116.477761,-9.564413,0.264081,-0.9158,0.260436,0.84199,-0.039561,-0.447271,...,0.000187,0.002923,0.001805,-0.001021,0.002023,-0.00099,0.001605,-0.001647,-0.002866,0.999714
xAl2BxNhlimcXZZQzdNRzw,Scottsdale,0.75,116.461492,-9.598709,0.916003,-0.008394,-0.143616,0.246373,0.07401,-0.544537,...,0.000567,0.000426,-0.000437,-7.7e-05,-0.000478,-0.000403,-0.000781,0.001708,-0.000419,0.999713
qVtPl0PtlynFEKz8Qa3RcQ,Mesa,0.75,116.212321,-9.622388,0.916805,-0.008713,-0.143254,0.246683,0.074203,-0.544611,...,0.000566,0.000431,-0.000434,-7.4e-05,-0.000474,-0.000406,-0.000777,0.001711,-0.000421,0.999708
StnWWcLsXBAhEqsbQkZgiw,Scottsdale,0.375,116.40819,-9.692022,0.917684,-0.008827,-0.14308,0.246677,0.074253,-0.544667,...,0.000562,0.000431,-0.000433,-7.8e-05,-0.000477,-0.000407,-0.000779,0.001712,-0.000424,0.999702
phslBiJbYRg0xxXUkk1baQ,Scottsdale,0.375,116.420191,-9.705981,0.917904,-0.008874,-0.143019,0.246703,0.074278,-0.544683,...,0.000561,0.000431,-0.000433,-7.9e-05,-0.000477,-0.000408,-0.000779,0.001713,-0.000424,0.9997
WQAaGMguMmnxQdgMeEFY9g,Mesa,0.75,116.304923,-9.697602,0.917944,-0.008942,-0.142952,0.246794,0.074323,-0.544693,...,0.000562,0.000433,-0.000432,-7.7e-05,-0.000475,-0.000409,-0.000777,0.001713,-0.000424,0.9997
ZI6CBpptCb1Nw-udZ8wonw,Mesa,0.125,116.273142,-9.705319,0.918127,-0.009002,-0.142882,0.246846,0.074358,-0.544708,...,0.000562,0.000434,-0.000431,-7.7e-05,-0.000474,-0.000409,-0.000777,0.001714,-0.000425,0.999699
mJ0CA86roChYwaDkDxFGwg,Mesa,0.5,116.330106,-9.710917,0.918132,-0.008975,-0.142907,0.246805,0.074339,-0.544706,...,0.000561,0.000433,-0.000432,-7.8e-05,-0.000475,-0.000409,-0.000777,0.001714,-0.000425,0.999699


### K-means <a name='kmeans'></a>

[Top](#Top)

In [None]:
# Fit 200 clusters with a fixed seed
# NOTE(review): `recombo` is not defined in any cell visible here, so this
# line raises NameError unless it is created elsewhere - presumably the
# SVD-reduced matrix (`reduced_x`) was meant; confirm
kmeans = KMeans(n_clusters=200,random_state=0).fit(recombo)

In [None]:
# Predict query vector

### Cohen's Kappa <a name='cohenkappa'></a>

[Top](#Top)

In [None]:
# Dataformat
# 1 | 0
# 1 | 1
# 0 | 1

def PE(data):
    '''Return P(E), the expected agreement, for the judgements in data.

    data is a sequence of rows, one row per document, holding the 0/1
    judgement of each of the two judges. Values other than 0 and 1 are
    not counted.
    '''
    # All judgements of both judges in one flat list
    judgements = [label for row in data for label in row]
    relevant = sum(1 for label in judgements if label == 1)
    nonrelevant = sum(1 for label in judgements if label == 0)

    # Two judgements per document
    total = len(data)*2

    # Pooled marginals
    rel = relevant/total
    nonrel = nonrelevant/total

    # Chance that both judges pick the same label
    return nonrel**2 + rel**2


def kappa(data, P_E):
    '''Return the kappa statistic for the judgement rows in data.

    data holds one row of judgements per document, P_E is the expected
    agreement. A row adds one agreement for every judgement that equals
    the judgement before it in the same row. When P_E is 1 the result is 1.
    '''
    agreements = 0
    for row in data:
        previous = None
        for judgement in row:
            agreements += 1 if previous == judgement else 0
            previous = judgement

    # Observed agreement: share of documents on which the judges agree
    observed = agreements / len(data)
    return 1 if P_E == 1 else (observed - P_E)/(1 - P_E)

In [None]:
def AveragePrecision(ranked_list_of_results, list_of_relevant_objects):
    '''Return the average precision of a ranked result list.

    For every result that is relevant, the precision at its rank
    (relevant results so far / rank) is added up; the sum is divided by
    the number of relevant objects.

    Parameters:
        ranked_list_of_results: results, best first.
        list_of_relevant_objects: the objects judged relevant.

    Returns:
        The average precision, or 0.0 when there are no relevant objects.
    '''
    total = len(list_of_relevant_objects)

    # Without relevant objects nothing can be retrieved correctly;
    # return 0.0 instead of dividing by zero
    if total == 0:
        return 0.0

    # A set gives constant time lookups; keep the original container when
    # the objects are not hashable
    try:
        relevant_lookup = set(list_of_relevant_objects)
    except TypeError:
        relevant_lookup = list_of_relevant_objects

    sumPk = 0
    relevant = 0
    for rank, result in enumerate(ranked_list_of_results, start=1):
        if result in relevant_lookup:
            relevant += 1
            sumPk += relevant/rank

    return sumPk/total