# <center> XOKind - Machine Learning/Data Science Intern Interview </center>
## <center> Yelp rating predictions </center>
### <center> Traditional Machine Learning vs. Graph Machine Learning </center>

#### Creating a relational database that will be converted into a graph database using Neo4j
- Users.json
- Restaurants.json
- Categories.json
- User-To-Restaurant.json
- Restaurant-To-Category.json 

with all the relevant node and edge properties

In [None]:
import re

import pandas as pd

In [None]:
# Locations of the raw Yelp dump files consumed by the cells below.
business_json_path, review_json_path, user_json_path = (
    'dataset/business.json',
    'dataset/review.json',
    'dataset/user.json',
)

In [None]:
# =============================================================================
# Business data
# =============================================================================
# Stream the business file in chunks and keep only businesses that are
# restaurants AND belong to at least one of the top 10 restaurant categories.
# Result: df_restaurants with columns business_id, business_stars,
# business_review_count, is_open, categories.

size = 500000

business = pd.read_json(business_json_path, lines=True,
                        dtype={'business_id': str, 'name': str,
                               'address': str, 'city': str,
                               'latitude': float, 'longitude': float,
                               'state': str, 'postal_code': str,
                               'stars': float, 'review_count': int,
                               'is_open': int,
                               'attributes': object, 'categories': object,
                               'hours': object},
                        chunksize=size)


business_drop_columns = ['name', 'address', 'city', 'state', 'postal_code',
                         'latitude', 'longitude', 'attributes', 'hours']
chunk_list_business = []

top_10_categories = ['Food', 'Nightlife', 'Bars', 'American (Traditional)',
                     'American (New)', 'Breakfast & Brunch',
                     'Sandwiches', 'Mexican', 'Burgers', 'Pizza']

# Regex alternation over the category names. Each name is escaped because
# several contain regex metacharacters (the parentheses in
# 'American (Traditional)' would otherwise be read as a capture group and the
# literal category text would never match).
my_top_10_restaurants = '|'.join(re.escape(name) for name in top_10_categories)


for chunk_business in business:
    # Drop columns that aren't needed
    chunk_business = chunk_business.drop(columns=business_drop_columns)

    # Renaming column name to avoid conflicts with the review/user columns
    chunk_business = chunk_business.rename(
        columns={'stars': 'business_stars',
                 'review_count': 'business_review_count',
                 'review_stars': 'business_review_stars'})

    category_text = chunk_business['categories']

    # A regex has no AND operator, so the two conditions are evaluated
    # separately and combined; na=False treats missing categories as no match.
    is_restaurant = category_text.str.contains('Restaurants', case=True,
                                               regex=False, na=False)
    in_top_10 = category_text.str.contains(my_top_10_restaurants, case=True,
                                           regex=True, na=False)

    chunk_business = chunk_business[is_restaurant & in_top_10]

    chunk_list_business.append(chunk_business)


df_restaurants = pd.concat(chunk_list_business, ignore_index=True)

In [None]:
# Export the restaurant rows (identified by business_id) as a JSON array of
# records; the raw 'categories' string is left out of this file.
df_restaurants.drop(columns='categories').to_json(
    'neo4j_dataset/restaurants.json',
    orient='records',
)

In [None]:
# Release the per-chunk business frames; df_restaurants already holds the
# concatenated result.
del chunk_business, chunk_list_business

In [None]:
# =============================================================================
# Reviews data
# =============================================================================
# Stream the review file chunk by chunk and keep only the reviews whose
# business_id is present in df_restaurants.
size = 500000

review = pd.read_json(
    review_json_path,
    lines=True,
    chunksize=size,
    dtype={
        'review_id': str, 'user_id': str, 'business_id': str,
        'stars': int, 'date': str, 'text': str,
        'useful': int, 'funny': int, 'cool': int,
    },
)

chunk_list = []
for chunk_review in review:
    # Keep only user_id / business_id / stars, and rename the star column so
    # it does not collide with the restaurant-level one.
    chunk_review = (
        chunk_review
        .drop(columns=['text', 'date', 'review_id', 'useful', 'funny', 'cool'])
        .rename(columns={'stars': 'review_stars'})
    )

    # Inner join: reviews of businesses outside df_restaurants are discarded.
    chunk_merged = df_restaurants.merge(chunk_review, on='business_id', how='inner')

    # Progress feedback (the denominator is the configured chunk size).
    print(f"{chunk_merged.shape[0]} out of {size:,} related reviews")

    chunk_list.append(chunk_merged)


# Stitch the filtered chunks back into a single dataframe.
df_restaurant_reviews = pd.concat(chunk_list, ignore_index=True)

In [None]:
# Release the per-chunk review frames; df_restaurant_reviews holds the
# concatenated result.
del chunk_review, chunk_merged, chunk_list

In [None]:
# =============================================================================
# User data
# =============================================================================
# Stream the user file in chunks, drop the columns that are not used and
# prefix the generic column names so they cannot clash with the restaurant
# columns after merging.

size = 500000

user = pd.read_json(user_json_path, lines=True,
                    dtype={'user_id': str, 'name': str,
                           'yelping_since': str, 'review_count': int,
                           'friends': object, 'useful': int,
                           'funny': int, 'cool': int, 'fans': int,
                           'elite': object, 'average_stars': float,
                           'compliment_hot': int, 'compliment_more': int,
                           'compliment_profile': int, 'compliment_cute': int,
                           'compliment_list': int, 'compliment_note': int,
                           'compliment_plain': int, 'compliment_cool': int,
                           'compliment_funny': int, 'compliment_writer': int,
                           'compliment_photos': int},
                    chunksize=size)

user_drop_columns = ['name', 'yelping_since', 'friends', 'elite']

chunk_list_user = []

for chunk_user in user:
    # Drop columns that aren't needed
    chunk_user = chunk_user.drop(columns=user_drop_columns)

    # Renaming column name to avoid conflicts. rename() returns a new frame,
    # so the result has to be assigned back for the rename to take effect.
    chunk_user = chunk_user.rename(columns={'review_count': 'user_review_count',
                                            'average_stars': 'user_average_stars'})

    chunk_list_user.append(chunk_user)


# concatenate to one dataframe
df_user = pd.concat(chunk_list_user, ignore_index=True)

In [None]:
# Release the per-chunk user frames; df_user holds the concatenated result.
del chunk_user, chunk_list_user

In [None]:
# Inner-join users with the restaurant reviews on user_id, so each row carries
# the user columns, the restaurant columns and the review rating. Users with
# no review in df_restaurant_reviews are dropped by the inner join.
df_users_top10 = pd.merge(
    df_user,
    df_restaurant_reviews,
    on='user_id',
    how='inner',
)

In [None]:
#Delete non-essential data to save memory

# The review table is the only source of (user_id, business_id, review_stars)
# rows, and the later user-to-restaurant export reads it under the name
# df_user_restaurant_reviews. Rebind it to that name before removing the old
# one, otherwise that export fails with a NameError. No copy is made: both
# names refer to the same dataframe until the old name is deleted.
df_user_restaurant_reviews = df_restaurant_reviews

del df_user
del df_restaurant_reviews

In [None]:
# Create a new feature mean compliment score for each users

# Reduce the merged frame to user-level columns:
# - user_id is kept, because the users export that follows needs it as the key
# - restaurant / review columns are removed so that identical user rows can be
#   collapsed to one row per user afterwards
# - 'elite' was already dropped while loading the users, so errors='ignore'
#   is required to avoid a KeyError on columns that are no longer present
merged_drop_columns = ['business_id', 'business_stars', 'business_review_count',
                       'is_open', 'categories', 'review_stars', 'elite']

df_users_top10.drop(columns=merged_drop_columns, inplace=True, errors='ignore')

compliment_columns = ['compliment_cool', 'compliment_cute', 'compliment_funny',
                      'compliment_hot', 'compliment_list', 'compliment_more',
                      'compliment_note', 'compliment_photos', 'compliment_plain',
                      'compliment_profile', 'compliment_writer']

# Row-wise average over the eleven compliment counters
df_users_top10['mean_compliment_score'] = df_users_top10[compliment_columns].mean(axis=1)

# The individual counters are replaced by the aggregated score
df_users_top10.drop(columns=compliment_columns, inplace=True)

In [None]:
# Collapse fully identical rows (renumbering the index), then export the user
# table as a JSON array of records. The original note names user_id as the
# intended primary key of this file.
df_users_top10 = df_users_top10.drop_duplicates(ignore_index=True)

df_users_top10.to_json(path_or_buf='neo4j_dataset/users.json', orient='records')

In [None]:
# Expand by restaurant category to investigate restaurant categories and their overall count in data

# Split the comma-separated category string into a list and emit one row per
# (restaurant, category) pair. The source frame is df_restaurants.
df_yelp_expand_by_category = (
    df_restaurants
    .assign(categories=df_restaurants['categories'].str.split(', '))
    .explode('categories')
)

# Number of restaurants per category
df_yelp_category_count = df_yelp_expand_by_category['categories'].value_counts()


top10 = ['Food', 'Nightlife', 'Bars', 'American (Traditional)', 'American (New)',
         'Breakfast & Brunch', 'Sandwiches', 'Mexican', 'Burgers', 'Pizza']

#Dict with mapping between category name and category id (ids start at 1)
top_10_dict = {name: category_id for category_id, name in enumerate(top10, start=1)}


In [None]:

# Every restaurant carries several categories, so after the explode there are
# rows for categories outside the top 10. Keep only the top-10 rows; otherwise
# those rows would keep their category *name* in the id column and point at
# category nodes that do not exist.
df_yelp_expand_by_category = df_yelp_expand_by_category[
    df_yelp_expand_by_category['categories'].isin(top10)
]

# Only business_id and the category are needed for the edge list
df_yelp_expand_by_category = df_yelp_expand_by_category.drop(
    columns=['business_stars', 'business_review_count', 'is_open'])

#Replace category names with category id as it is needed to create edges in the graph database
df_restaurants_category = (
    df_yelp_expand_by_category
    .assign(category_id=df_yelp_expand_by_category['categories'].map(top_10_dict))
    .drop(columns='categories')
)


In [None]:
# Export the restaurant -> category edge list. reset_index() moves the current
# index into a regular 'index' column, which is therefore part of each record.
df_restaurants_category = df_restaurants_category.reset_index()

df_restaurants_category.to_json(
    path_or_buf='neo4j_dataset/restaurant_to_category.json',
    orient='records',
)

In [None]:
# Export the user -> restaurant edge list (one record per user_id/business_id
# review row). Expects df_user_restaurant_reviews to have been bound by an
# earlier cell and to still carry the restaurant-level columns removed here.
df_user_restaurant_reviews = df_user_restaurant_reviews.drop(
    columns=['business_stars', 'business_review_count', 'is_open', 'categories'],
)

df_user_restaurant_reviews.to_json(
    path_or_buf='neo4j_dataset/user_to_restaurant.json',
    orient='records',
)

In [None]:
# Convert the prepared category/id CSV into the JSON records file used for the
# category nodes.
imported_categories = pd.read_csv(filepath_or_buffer='dataset/categories_with_id.csv')

imported_categories.to_json(
    path_or_buf='neo4j_dataset/categories.json',
    orient='records',
)