# Groeps Opdracht CI
## Joost Vledder, Sadjia Safdari, Simon Kreulen & Jasper van Eck

### Inhoudsopgave <a name='Top'></a>

[Imports](#import)

[Load data](#loaddata)

[Data Clean Up](#cleanup)

[Basic Data Stats](#basicdata)

[Plots](#plots)

[Cosine Similarity](#cossim)

[Query Vector](#queryvector)

[SVD](#svd)

[K-Means](#kmeans)

### Imports <a name='import'></a>

[Top](#Top)

In [1]:
import os
import pandas as pd
import numpy as np
np.seterr(divide='raise', over='raise', under='raise', invalid='raise')
import matplotlib.pyplot as plt
from collections import defaultdict
import json
#pip install geopy
from geopy import geocoders
from geopy.geocoders import Nominatim
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.decomposition import TruncatedSVD

### Load data <a name='loaddata'></a>

[Top](#Top)

In [None]:
#rootdir = './yelp/data'
rootdir = './yelp/data'

df_business = pd.DataFrame()
df_users = pd.DataFrame()
df_reviews = pd.DataFrame()

count = 0

def load_jsons(data_path, file):
    # function to help load json files, since sometimes they give utf8 encoding errors, sometimes they don't
    file_path = os.path.join(subdir, file)
    lines = []
    with open(file_path) as jsons:
        try:
            lines = [json.loads(json_line) for json_line in jsons]
        except:
            print(file_path)
    return pd.DataFrame(lines)

for subdir, dirs, files in os.walk(rootdir):
    if count == 200: #1078 is the total amount of cities
        break
        
    for file in files:
        if os.path.basename(file) == 'business.json':
            df_business = df_business.append(load_jsons(subdir, file))
        if os.path.basename(file) == 'user.json':
            df_users = df_users.append(load_jsons(subdir, file))
        if os.path.basename(file) == 'review.json':
            df_reviews = df_reviews.append(load_jsons(subdir, file))
    count += 1

df_business = df_business.set_index('business_id')
df_users = df_users.set_index('user_id')
df_reviews = df_reviews.set_index('review_id')

display(df_business.head())
display(df_users.head())
display(df_reviews.head())

In [None]:
df_business.to_pickle('./business_pickle.pkl')
df_users.to_pickle('./users_pickle.pkl')
df_reviews.to_pickle('./reviews_pickle.pkl')

In [None]:
df_business = pd.read_pickle('./business_pickle.pkl')
df_users = pd.read_pickle('./users_pickle.pkl')
df_reviews = pd.read_pickle('./reviews_pickle.pkl')

### Clean up Data For Content Based <a name='cleanup'></a>

[Top](#Top)

In [None]:
cat_dict = defaultdict(lambda: defaultdict(int))

# Drop na categories
df_business = df_business[df_business['categories'].notna()]

# Normalize long, lat, stars & review_count
#x = cos(lat) * cos(lon)
#y = cos(lat) * sin(lon)
df_business['stars'] = (df_business['stars']-df_business['stars'].min()) / (df_business['stars'].max()-df_business['stars'].min())
df_business['review_count'] = (df_business['review_count']-df_business['review_count'].min()) / (df_business['review_count'].max()-df_business['review_count'].min())
#df_business['x_axis'] = np.cos(df_business['latitude']) * np.cos(df_business['longitude'])
#df_business['y_axis'] = np.cos(df_business['latitude']) * np.sin(df_business['longitude'])

# Dataframe of categories one hot encoded per business_id
df_tmp = df_business['categories'].str.split(pat=', ',expand=True)

# Create dict of categories
for index, row in df_tmp.iterrows():
    for _,elem in row.items():
        if elem and elem not in cat_dict[index]:
            cat_dict[index][elem] += 1  

In [None]:
# Create DF from dict
df_cats = pd.DataFrame.from_dict(cat_dict, orient='index')

#Drop NaN column
df_cats = df_cats.drop(columns='NaN',errors='ignore')

# Fillna with 0
df_cats = df_cats.fillna(0)

# Join back into business
df_business = df_business.join(df_cats,on='business_id')

# Drop cols
df_business = df_business.drop(columns=['address','state','postal_code','attributes','categories','hours'],errors='ignore')

# Remove elite & friends columns because of non use
df_users = df_users.drop(columns=['elite','friends'],axis=1,errors='ignore')

In [None]:
display(df_business.head())

In [None]:
df_business.to_pickle('./business_pickle.pkl')
df_cats.to_pickle('./cats_pickle.pkl')
df_users.to_pickle('./users_pickle.pkl')
df_reviews.to_pickle('./reviews_pickle.pkl')

In [None]:
df_business = pd.read_pickle('./business_pickle.pkl')
df_cats = pd.read_pickle('./cats_pickle.pkl')
df_users = pd.read_pickle('./users_pickle.pkl')
df_reviews = pd.read_pickle('./reviews_pickle.pkl')

### Basic Data Stats for Content Based <a name='basicdata'></a>

[Top](#Top)

In [None]:
# Some information about the category amounts per business

# series of amount of categories per business
cats_business = df_cats.sum(axis=1)
display(cats_business)

avg_cats_bus = cats_business.mean()
max_cats_bus = cats_business.max()
min_cats_bus = cats_business.min()

print('Average amount of categories for businesses: ', avg_cats_bus)
print('Minimum amount of categories for businesses: ', min_cats_bus)
print('Maximum amount of categories for businesses: ', max_cats_bus)

# series of amount of businesses with the index amount of categories
cat_bus_distribution = cats_business.value_counts()
display(cat_bus_distribution)

In [None]:
# series of number of businesses with index as category
cats_presences = df_cats.sum(axis=0).sort_values(ascending=False)
display(cats_presences)

avg_cats = cats_presences.mean()
max_cats = cats_presences.max()
min_cats = cats_presences.min()

### Plots <a name='plots'></a>

[Top](#Top)

In [None]:
hist_bus = df_business.hist(column=['stars','review_count'])
df_business.plot(x='latitude',y='longitude',kind='scatter')
plt.show()
cats_presences.plot()

In [None]:
df_business = pd.read_pickle('./business_pickle.pkl')
df_cats = pd.read_pickle('./cats_pickle.pkl')
df_users = pd.read_pickle('./users_pickle.pkl')
df_reviews = pd.read_pickle('./reviews_pickle.pkl')

### Basic Data Stats for Collaborative Based <a name='basicdata'></a>

[Top](#Top)

In [None]:
# reset index
df_reviews = df_reviews.reset_index()

# count reviews per user and reviews per business
reviews_per_user = df_reviews.groupby("user_id")["review_id"].count().sort_values(ascending=False).to_frame()
reviews_per_business = df_reviews.groupby("business_id")["review_id"].count().sort_values(ascending=False).to_frame()

# plot both review counts
reviews_per_user.plot()
reviews_per_business.plot()

In [None]:
"""""
userplot --> the distribution of the reviews over the users (long-tail): considering the fact that there are more
reviews on business than users (one user can rate more than 1 business), it is probably better to choose
item-based CF than user-based CF. 

"""""
user_series = df_users['review_count'].sort_values(ascending=False).drop_duplicates(keep='first')

display(users_series)
userplot = users_series.plot()

userplot.set(xlabel='user_id', ylabel='review_count')
plt.show()

In [2]:
df_business = pd.read_pickle('./business_pickle.pkl')
df_cats = pd.read_pickle('./cats_pickle.pkl')
df_users = pd.read_pickle('./users_pickle.pkl')
df_reviews = pd.read_pickle('./reviews_pickle.pkl')

### Cosine Similarity <a name='cossim'></a>

[Top](#Top)

In [3]:
# Input vectors are series, convert to np array
def cosineSim(vector, docVector):
    vector = vector.to_numpy()
    docVector = docVector.to_numpy()
    
    # Get lengths of vectors
    sim = (np.sqrt(sum(vector**2))*np.sqrt(sum(docVector**2)))
    
    # Ensure no division by 0
    if sim == 0.:
        return np.nan
    
    # Calculate cosine sim
    return vector.dot(docVector)/sim

In [4]:
def rankedBusinessList(df_business, queryVector):
    df_bus_features = df_business.copy().drop(columns=['name','city','stars','review_count','is_open','Score Cos'],errors='ignore')
    scoreList = np.zeros(len(df_business))
    for i in range(len(df_bus_features)):
        scoreList[i] = cosineSim(queryVector, df_bus_features.iloc[i])
    
    df_business['Score Cos'] = scoreList
    return df_business.sort_values(by=['Score Cos','stars'],ascending=False)

### Create Query Vector <a name='queryvector'></a>

[Top](#Top)

This first cell get the longitude and latitude of a given city and state, and normalizes both values.

In [5]:
loc_string = 'Las Vegas NV'
gn = Nominatim(user_agent='WalterGKurtz')
location = gn.geocode(loc_string)
queryVector = pd.Series(data=0.,index=df_business.columns)
queryVector.at['latitude'] = location.latitude
queryVector.at['longitude'] = location.longitude
#x = cos(lat) * cos(lon)
#y = cos(lat) * sin(lon)
#queryVector.at['x_axis'] = np.cos(location.latitude) * np.cos(location.longitude)
#queryVector.at['y_axis'] = np.cos(location.latitude) * np.sin(location.longitude)


The second cell inputs the preferences of the user.

In [6]:
# Preferences
queryVector = queryVector.drop(labels=['name','city','stars','review_count','is_open'],errors='ignore')
queryVector.at['Automotive'] = 0
queryVector.at['Auto Repair'] = 0
queryVector.at['Fast Food'] = 0
queryVector.at['Restaurants'] = 0
queryVector.at['Pizza'] = 1.
queryVector.at['Men\'s Clothing'] = 0
queryVector.at['Women\'s Clothing'] = 0
queryVector.at['Fashion'] = 0
queryVector.at['Bars'] = 0
display(queryVector)
#queryVector.to_csv('test.csv', header=False)

latitude                      36.167256
longitude                   -115.148516
Restaurants                    0.000000
Mexican                        0.000000
Canadian (New)                 0.000000
Seafood                        0.000000
Nightlife                      0.000000
Bars                           0.000000
Event Planning & Services      0.000000
Venues & Event Spaces          0.000000
Lounges                        0.000000
Hotels                         0.000000
Hotels & Travel                0.000000
Cajun/Creole                   0.000000
Shaved Ice                     0.000000
Desserts                       0.000000
Food                           0.000000
Fashion                        0.000000
Department Stores              0.000000
Shopping                       0.000000
Auto Repair                    0.000000
Automotive                     0.000000
Barbers                        0.000000
Hair Salons                    0.000000
Local Flavor                   0.000000


In [None]:
display(rankedBusinessList(df_business,queryVector).head(20))

### SVD <a name='svd'></a>

[Top](#Top)

In [7]:
stars = df_business['stars'].values
cities = df_business['city'].values
names = df_business['name'].values
bus_ids = df_business.index
df_tmp_svd = df_business.copy().drop(columns=['name','city', 'stars','review_count','is_open', 'Score Cos'],errors='ignore').to_numpy()

In [None]:
# This code can be used to find the optimal amount of dimension by the SVD

# trunc_svd = TruncatedSVD(n_components=len(df_tmp_svd[0])-1).fit(df_tmp_svd)
# reduced_x = trunc_svd.transform(df_tmp_svd)
# singles = trunc_svd.singular_values_

# eigvals = singles**2 / np.sum(singles**2)
# fig = plt.figure(figsize=(8,5))
# sing_vals = np.arange(len(df_tmp_svd[0])-1) + 1
# plt.plot(sing_vals[2:200], eigvals[2:200], 'ro-', linewidth=2)
# plt.title('Scree Plot')
# plt.xlabel('Component number')
# plt.ylabel('Eigenvalue')
# plt.show()

In [8]:
optimal_dims = 25
trunc_svd = TruncatedSVD(n_components=optimal_dims).fit(df_tmp_svd)
reduced_x = trunc_svd.transform(df_tmp_svd)

In [9]:
queryVectorReduced = trunc_svd.transform(np.array([queryVector]))

df_reduced_x = pd.DataFrame(data=reduced_x, index=bus_ids)
df_reduced_x.insert(0, 'stars', stars)
df_reduced_x.insert(0, 'city', cities)
df_reduced_x.insert(0, 'name', names)
df_queryVectorReduced = pd.Series(data=queryVectorReduced[0])
display(rankedBusinessList(df_reduced_x,df_queryVectorReduced).head(20))

Unnamed: 0_level_0,name,city,stars,0,1,2,3,4,5,6,...,16,17,18,19,20,21,22,23,24,Score Cos
business_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
k-tg18rWRibZ4jKpoqhb0Q,Bonchon - SW Las Vegas,South Las Vegas,0.625,120.457864,-8.497728,0.774275,0.006914,-0.148181,0.209628,0.039618,...,-0.091224,-0.032573,-0.093541,-0.016698,0.021572,-0.012635,-0.016225,-0.035609,0.016276,0.999951
kZh2Auqu-e22DNiiRXMMLA,Rimowa Flagship Store,South Las Vegas,1.0,120.406613,-8.456108,-0.370709,-0.541531,0.176289,0.323276,-0.061555,...,-0.011263,-0.126616,0.004486,-0.171199,-0.142841,-0.144545,0.011386,0.049827,0.100029,0.999948
RCKf_8W4mkQzjffgYmJK1A,Portofino On the Lake,Lake Las Vegas,0.25,120.168199,-8.376705,-0.380255,-0.573755,0.169418,0.356738,-0.094092,...,0.051344,-0.3047,-0.077807,-0.35579,-0.47794,0.213182,0.13256,-0.117002,-0.35679,0.999929
G3VVtNuIZCYndh3OwoGrKQ,"Ryan C Shipp, DMD MS",Hendserson,0.875,120.30729,-8.512477,-0.178458,0.429247,0.053527,-0.044053,-0.726774,...,0.017566,0.105975,-0.045871,-0.032894,-0.006787,-0.009928,-0.008578,-0.000421,-0.003919,0.999927
ArE3m3mTQ9rkP8NnQaG3HQ,St Rose Catholic Church,Anthem,0.375,116.737049,-9.448789,-0.026413,0.066268,-0.064992,-0.092451,-0.046144,...,-0.046504,-0.103139,-0.091875,0.062593,-0.118272,-0.043137,0.041893,0.039769,-0.083352,0.999916
jutfk7U4GV899q6qfYp5dA,Christ's Church of the Valley - Anthem Campus,Anthem,1.0,116.750513,-9.451571,-0.026387,0.066268,-0.06499,-0.092457,-0.046145,...,-0.04651,-0.103148,-0.091886,0.062599,-0.118282,-0.043142,0.041895,0.039772,-0.083359,0.999916
nWr55hE4c_qT4tI9hq_AlA,Wells Fargo Bank,Scottsdale,0.75,116.539646,-9.421697,-0.036498,0.068006,-0.100152,-0.121199,-0.031612,...,-0.063815,-0.186666,-0.143455,0.0439,-0.23868,-0.123554,0.10529,0.126774,-0.117452,0.999914
Y_HzJ7yJU3YJlOze5lByPw,BMO Harris Bank,Scottsdale,0.375,116.533391,-9.428206,-0.036918,0.067189,-0.10026,-0.121032,-0.031894,...,-0.064192,-0.189133,-0.144464,0.042188,-0.242265,-0.126306,0.106523,0.128785,-0.117142,0.999914
dxRJt46lcGLdVb_3Xq5mfw,Chase Bank,Anthem,0.5,116.745958,-9.454381,-0.036268,0.068054,-0.10018,-0.121315,-0.031653,...,-0.0639,-0.186813,-0.143615,0.043993,-0.23885,-0.12363,0.10533,0.126828,-0.117561,0.999913
5M4zz4CI9q67luyWnFBVhw,Allstate Insurance Agent: Debra K. Atkinson,Scottsdale,0.625,116.533969,-9.428583,-0.036009,0.070519,-0.098183,-0.122054,-0.040034,...,-0.072411,-0.2023,-0.162552,0.063705,-0.259014,-0.135922,0.129467,0.156277,-0.119824,0.999913


### K-means <a name='kmeans'></a>

[Top](#Top)

In [None]:
# # This block of code was used to find the optimal K, it takes very long to run, so that's why it is commented now.

# from yellowbrick.cluster import KElbowVisualizer

# df_bus_kmeans = df_reduced_x.copy().drop(columns=['name','city','stars','Score Cos'],errors='ignore').to_numpy()

# # choose k

# model = KMeans(random_state=0)
# visualizer = KElbowVisualizer(model, k=[100, 200, 300, 400, 500, 600, 700, 800, 900, 1000])
# visualizer.fit(df_bus_kmeans)        # Fit the data to the visualizer
# visualizer.show() 

In [10]:
df_bus_kmeans = df_reduced_x.copy().drop(columns=['name','city','stars','Score Cos'],errors='ignore').to_numpy()
kmeans = KMeans(n_clusters=300,random_state=0).fit(df_bus_kmeans)

In [11]:
# Predict query vector
labels = np.array(kmeans.predict(df_bus_kmeans))
prediction = kmeans.predict([df_queryVectorReduced.to_numpy()])

matches = [y[0] for y,val in np.ndenumerate(labels) if val == prediction]

df_matched = df_business.iloc[matches]

# display(df_matched.sort_values(by=['stars'],ascending=False).head(20))

display_df = df_matched.sort_values(by=['stars'],ascending=False).head(20)
display_display_df = display_df[['city','latitude', 'longitude', 'name', 'stars', 'Score Cos']]
display(display_display_df)

KeyError: "['Score Cos'] not in index"

### Cohen's Kappa <a name='cohenkappa'></a>

[Top](#Top)

In [None]:
# Dataformat
# 1 | 0
# 1 | 1
# 0 | 1

def PE(data):
    '''On input data, return the P(E) (expected agreement).'''
    relevant = 0
    nonrelevant = 0
    # Iterate over the data
    for i in data:
        for j in i:
            
            # Top up the relevant documents by one if 1 is encountered
            if j == 1:
                relevant += 1
            # Top up the nonrelevant documents by one if 0 is encountered
            if j == 0:
                nonrelevant += 1

    # Calculates the total of inspected documents for the judges combined
    total = len(data)*2

    # Calculates the pooled marginals
    rel = relevant/total
    nonrel = nonrelevant/total

    # Calculates the P(E)
    P_E = nonrel**2 + rel **2    
    return    P_E 


def kappa(data, P_E):
    agree = 0
    for i in data:
        temp = None
        for j in i:
            if temp == j:
                agree += 1
            temp = j
    P_A = agree / len(data)
    if P_E == 1:
        kappa = 1
    else:
        kappa = (P_A - P_E)/(1 - P_E)   
    return kappa

In [None]:
def AveragePrecision(ranked_list_of_results, list_of_relevant_objects):
    total = len(list_of_relevant_objects)
    sumPk = 0
    rank = 0
    relevant = 0
    for result in ranked_list_of_results:
        rank += 1
        if result in list_of_relevant_objects:
            relevant += 1
            sumPk += relevant/rank
            
    aprecision = sumPk/total
    return aprecision