In [49]:
import pandas as pd
import numpy as np 
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial.distance import mahalanobis
from scipy.stats import spearmanr
from sklearn.metrics.pairwise import euclidean_distances
import random


In [50]:
# One-off preprocessing, kept commented out for reference only. When enabled it reads the raw
# restaurant export, removes every row whose short_name contains "DNU" or "dnu", and writes
# the remaining rows to All_Restauants.csv (the file the data-loading cell reads into id_df).
# id_df = pd.read_csv('Restauant-CusineUpdated_-_results-20230110-115819.csv')
# id_df = id_df[~id_df['short_name'].str.contains("DNU")]
# id_df = id_df[~id_df['short_name'].str.contains("dnu")]
# id_df.to_csv("All_Restauants.csv", index=False)
# print(id_df.shape)


In [6]:
# Restaurant id / skip-score table and the cuisine profiles for restaurants and customers.
id_df = pd.read_csv('All_Restauants.csv')
restauraunt_profiles = pd.read_csv('combined_restaurant_order_profile.csv')
customer_profiles = pd.read_csv('customer_profiles_more_context2.csv')

# Show the raw shapes (the customer profiles still contain 'Unnamed: 0' at this point).
print(customer_profiles.shape, restauraunt_profiles.shape, sep='\n')

# Order history and a second restaurant table (presumably cuisine scores, judging by the
# file name), loaded for later use.
restaraunt_scores = pd.read_csv('Restuarant_Cuisine(Updated-Unique).csv')
orders = pd.read_csv('orders_3.5mil.csv')
# restaraunt_names = pd.read_csv('restaurant_id_and_names.csv')

# 'Unnamed: 0' carries no profile information, so remove it from the customer profiles.
customer_profiles = customer_profiles.drop(columns=['Unnamed: 0'])

(106251, 56)
(2355, 55)


In [8]:
# Keep only the restaurant profiles whose short_name also appears in id_df.
restauraunt_profiles = restauraunt_profiles.loc[
    restauraunt_profiles['short_name'].isin(id_df['short_name'])
]

##### Here we try four different metrics for measuring the similarity between vectors: cosine similarity, cosine similarity after a square-root transformation of the data, Euclidean distance, and the Spearman correlation coefficient. Cosine similarity seems to be a strong choice in terms of speed and results, with square-root cosine being an interesting contender.

In [9]:
def recommend_cosine_sim(customer_vector, restaraunt_profiles, num_recommendations=5):
    """Rank restaurants by cosine similarity to a customer's cuisine preferences.

    Parameters
    ----------
    customer_vector : pandas.Series
        One customer row; its last 54 entries are used as the cuisine vector.
    restaraunt_profiles : pandas.DataFrame
        Restaurant profiles. The first column is skipped and every remaining
        column is used as the cuisine vector; a 'short_name' column is required.
    num_recommendations : int, default 5
        Maximum number of restaurants to return.

    Returns
    -------
    pandas.DataFrame
        A copy of the most similar rows, best match first, with 'restaraunt_id'
        inserted at column position 1 and 'cosine similarity score' appended as
        the last column.

    Notes
    -----
    Reads the notebook-level ``id_df`` table ('short_name', 'restaurant_id',
    'skip_score'): each recommended short_name is mapped to the restaurant_id
    of the row with the highest skip_score among rows sharing that name.
    """
    customer_cuisines = customer_vector.iloc[-54:].to_numpy().reshape(1, -1)
    # Use the profiles that were passed in, not whatever global happens to exist.
    restaurant_cuisines = restaraunt_profiles.iloc[:, 1:].to_numpy()
    similarity = cosine_similarity(customer_cuisines, restaurant_cuisines).flatten()
    # Positions of the best matches, highest similarity first.
    ranked_positions = np.argsort(similarity)[::-1][:num_recommendations]

    # Copy so the column inserts below never touch (or warn about) the source frame.
    top_X_restaraunts = restaraunt_profiles.iloc[ranked_positions].copy()

    top_X_restaraunts_id = []
    for curr_name in top_X_restaraunts['short_name']:
        skip_score_same_name = id_df[id_df['short_name'] == curr_name]
        best_row_label = skip_score_same_name['skip_score'].idxmax()
        top_X_restaraunts_id.append(skip_score_same_name.loc[best_row_label, 'restaurant_id'])

    # Append the score at the end regardless of how many profile columns there are.
    top_X_restaraunts.insert(len(top_X_restaraunts.columns), 'cosine similarity score',
                             similarity[ranked_positions])
    top_X_restaraunts.insert(1, 'restaraunt_id', top_X_restaraunts_id)
    return top_X_restaraunts

In [None]:
def recommend_sqrt_cosine_sim(customer_vector, restaraunt_profiles):
    """Return the 5 restaurants most similar to a customer after a square-root transform.

    Both the customer's cuisine vector (its last 54 entries) and the restaurant
    cuisine matrix (every column after the first) are element-wise square-rooted
    before cosine similarity is computed.

    Parameters
    ----------
    customer_vector : pandas.Series
        One customer row.
    restaraunt_profiles : pandas.DataFrame
        Restaurant profiles; the first column is skipped.

    Returns
    -------
    pandas.DataFrame
        A copy of the 5 best-matching rows, best first, with
        'cosine similarity score' appended as the last column.
    """
    customer_cuisines = np.sqrt(customer_vector.iloc[-54:].to_numpy().astype(float))
    customer_cuisines = customer_cuisines.reshape(1, -1)
    # Use the profiles that were passed in; cast so object-typed columns also work with sqrt.
    restaurant_cuisines = np.sqrt(restaraunt_profiles.iloc[:, 1:].to_numpy().astype(float))
    similarity = cosine_similarity(customer_cuisines, restaurant_cuisines).flatten()
    ranked_positions = np.argsort(similarity)[::-1][:5]
    # Copy so the insert below does not operate on a slice of the caller's frame.
    top_5_restaraunts = restaraunt_profiles.iloc[ranked_positions].copy()
    top_5_restaraunts.insert(len(top_5_restaraunts.columns), 'cosine similarity score',
                             similarity[ranked_positions])

    return top_5_restaraunts

In [None]:
def recommend_euclidean(customer_vector, restaraunt_profiles):
    """Return the 5 restaurants closest to a customer by Euclidean distance.

    Parameters
    ----------
    customer_vector : pandas.Series
        One customer row; its last 54 entries are used as the cuisine vector.
    restaraunt_profiles : pandas.DataFrame
        Restaurant profiles; the first column is skipped and the remaining
        columns are used as the cuisine vectors.

    Returns
    -------
    pandas.DataFrame
        The 5 nearest rows of ``restaraunt_profiles``, smallest distance first.
        No distance column is added.
    """
    customer_cuisines = customer_vector.iloc[-54:].to_numpy().reshape(1, -1)
    # Use the profiles that were passed in, not a notebook global.
    restaurant_cuisines = restaraunt_profiles.iloc[:, 1:].to_numpy()
    distances = euclidean_distances(customer_cuisines, restaurant_cuisines).flatten()
    # Ascending order: a smaller distance means a closer match.
    nearest_positions = np.argsort(distances)[:5]

    return restaraunt_profiles.iloc[nearest_positions]

In [None]:
def spearman_correlation_coefficent(customer_vector, restaraunt_profiles):
    """Return the 5 restaurants whose cuisine profile best rank-correlates with a customer.

    Parameters
    ----------
    customer_vector : pandas.Series
        One customer row; its last 54 entries, converted to numbers, are the
        cuisine preferences (indexed by cuisine name).
    restaraunt_profiles : pandas.DataFrame
        Restaurant profiles; the first column is skipped and the remaining
        columns are treated as cuisine columns.

    Returns
    -------
    pandas.DataFrame
        Up to 5 rows of ``restaraunt_profiles``, highest Spearman correlation first.

    Notes
    -----
    Only cuisines present in both inputs are compared, in the same (sorted)
    order on both sides. A NaN correlation is scored as -1. The best
    correlation found is printed.
    """
    customer_cuisines = pd.to_numeric(customer_vector.iloc[-54:], errors='raise')
    # Use the profiles that were passed in, not a notebook global.
    restaurant_cuisines = restaraunt_profiles.iloc[:, 1:]

    # Align both sides on the shared cuisine names so the vectors have equal length
    # and the i-th entry refers to the same cuisine in each.
    shared_cuisines = sorted(set(customer_cuisines.index) & set(restaurant_cuisines.columns))
    customer_numpy = customer_cuisines[shared_cuisines].to_numpy()
    restaurant_numpy = restaurant_cuisines[shared_cuisines].to_numpy()

    correlations = []
    for restaurant_row in restaurant_numpy:
        corr, _ = spearmanr(customer_numpy, restaurant_row)
        correlations.append(-1 if np.isnan(corr) else corr)

    # np.max raises on an empty list, so only report when there is something to report.
    if correlations:
        print(np.max(correlations))
    value = np.argsort(correlations)[::-1]
    top_5_restaraunts = restaraunt_profiles.iloc[value[:5]]
    return top_5_restaraunts

In [18]:
def pretty_print_recommendation(recommendation, customer_vector):
    """Print each recommended restaurant next to the customer's strongest preferences.

    For up to the first 5 rows of ``recommendation`` this prints the restaurant's
    short_name, then a two-column table ("Customer Preference" vs
    "Restauraunt Offers") limited to its first 5 rows, then a separator line.

    Parameters
    ----------
    recommendation : pandas.DataFrame
        Recommended restaurants; must contain 'short_name'. The columns
        'restaraunt_id', 'cosine similarity score' and 'adjusted score' are
        ignored when present; every other column must be numeric.
    customer_vector : pandas.Series
        One customer row containing 'customer_id' plus numeric preferences.

    Returns
    -------
    None
    """
    c_row = pd.to_numeric(customer_vector.drop('customer_id'), errors='raise')
    c_top = c_row.nlargest(55)
    c_top.name = "Customer Preference"

    # Bookkeeping columns that are not cuisine values; some recommenders add only a few of them.
    non_cuisine_columns = ['restaraunt_id', 'short_name', 'cosine similarity score', 'adjusted score']

    # Never index past the end when fewer than 5 restaurants were recommended.
    for i in range(min(5, len(recommendation))):
        r_row = recommendation.iloc[i]
        restaraunt_name = r_row['short_name']
        r_row = pd.to_numeric(r_row.drop(non_cuisine_columns, errors='ignore'), errors='raise')
        r_top = r_row.nlargest(56)
        r_top.name = "Restauraunt Offers"
        # Align the two series on cuisine name; the customer's ordering comes first.
        compare = pd.concat([c_top, r_top], axis=1)
        print(restaraunt_name)
        print(compare[:5])
        print("-------------")

##### Here we try the different methods and compare their results.

In [None]:
# Try a recommender on a randomly chosen customer and print the result.
# `top_restaraunts` and `momin` are also read by later cells.
top_restaraunts = None
momin = 2024
for i in range (1):
    # Draw from the actual number of customers instead of a hard-coded upper bound.
    random_number = random.randint(0, len(customer_profiles) - 1)
#     top_restaraunts = recommend_euclidean(customer_profiles.iloc[random_number], restauraunt_profiles)
#     pretty_print_recommendation(top_restaraunts, customer_profiles.iloc[random_number])
#     print("======================")

    # recommend_cosine_sim takes the number of recommendations as its third argument.
    top_restaraunts = recommend_cosine_sim(customer_profiles.iloc[random_number], restauraunt_profiles, 5)
    pretty_print_recommendation(top_restaraunts, customer_profiles.iloc[random_number])
    print("======================")
#     top_restaraunts2 = recommend_sqrt_cosine_sim(customer_profiles.iloc[random_number], restauraunt_profiles)
#     pretty_print_recommendation(top_restaraunts2, customer_profiles.iloc[random_number])
#     print("======================")
#     top_restaraunts = spearman_correlation_coefficent(customer_profiles.iloc[random_number], restauraunt_profiles)
#     pretty_print_recommendation(top_restaraunts, customer_profiles.iloc[random_number])
#     print("======================")



##### Here we add variety to the system by taking the top 2 cuisines a customer prefers and replacing each with a similar cuisine. The similar cuisines are computed from a similarity matrix, and we randomly pick one of the top 3 most similar cuisines as the replacement for the current cuisine. E.g. if a customer likes Fries and the top 3 similar cuisines are Pork, Soup and Coffee/Tea, we select one of those. (Note: the code below, `update_3_top_cuisines`, swaps the top 3 cuisines and draws each replacement from 5 candidate cuisines.)

In [27]:
# Build, for every cuisine in the similarity matrix, a Series of candidate replacement cuisines.
rf = pd.read_csv("similarity_matrix_final.csv")
similar_cuisines = {}
for _, matrix_row in rf.iterrows():
    # The first entry of each row is used as that row's cuisine name.
    cuisine_name = matrix_row.iloc[0]
    scores = pd.to_numeric(matrix_row.drop("Unnamed: 0"), errors='raise')
    # Take the 7 highest similarities and keep the last 5 of them (the top 2 are skipped).
    similar_cuisines[cuisine_name] = scores.nlargest(7).tail(5)

In [34]:
def update_3_top_cuisines(customer_vector, similarity_matrix):
    """Swap a customer's 3 strongest cuisine preferences with similar cuisines.

    For each of the 3 highest-valued cuisines, one similar cuisine is chosen at
    random and the two preference values are exchanged, e.g. a customer with
    Chicken 0.25 and Rice 0.2 may end up with Fast Food 0.25 and Lamb 0.2.
    Progress is printed along the way.

    Parameters
    ----------
    customer_vector : pandas.Series
        One customer row: the first entry is taken as the customer id and the
        last 54 entries as the cuisine preferences. It is not modified.
    similarity_matrix : mapping
        Maps a cuisine name to an object with ``to_dict()`` (e.g. a Series)
        giving candidate replacement cuisines and their similarity.

    Returns
    -------
    pandas.Series
        'customer_id' followed by the updated cuisine preferences.

    Notes
    -----
    A replacement is never one of the 3 top cuisines nor a cuisine already used
    as a replacement, so each swap is independent and the preference total is
    preserved. A top cuisine with no entry in ``similarity_matrix`` or with no
    eligible candidate is left unchanged.
    """
    customer_vector_id = customer_vector.iloc[0]
    # Copy so the swaps below never write through to the caller's row.
    customer_cuisines = customer_vector.iloc[-54:].copy()
    org_sum = str(customer_cuisines.sum())
    numeric_cuisines = pd.to_numeric(customer_cuisines, errors='raise')
    top3 = numeric_cuisines.nlargest(3)
    print(numeric_cuisines.nlargest(5))

    # Cuisines that must not be picked as a replacement.
    unavailable = set(top3.index)
    for old_cuisine in top3.index.to_list():
        if old_cuisine not in similarity_matrix:
            continue
        similar_cuisines_dict = similarity_matrix[old_cuisine].to_dict()
        print(similar_cuisines_dict)
        old_cuisine_preference = top3[old_cuisine]
        print(old_cuisine, old_cuisine_preference)

        candidates = [name for name in similar_cuisines_dict
                      if name in customer_cuisines.index and name not in unavailable]
        if not candidates:
            continue
        similar_cuisine = random.choice(candidates)
        print(similar_cuisine, similar_cuisines_dict[similar_cuisine])

        # Exchange the two preference values.
        customer_cuisines[old_cuisine] = customer_cuisines[similar_cuisine]
        customer_cuisines[similar_cuisine] = old_cuisine_preference
        unavailable.add(similar_cuisine)

    print("Sums", org_sum, " ", customer_cuisines.sum())
    top5 = pd.to_numeric(customer_cuisines, errors='raise').nlargest(5)
    print(top5)
    customer_id_series = pd.Series({"customer_id": customer_vector_id})
    final_test_customer = pd.concat([customer_id_series, customer_cuisines])
    return final_test_customer

In [None]:
# Build a varied preference vector for customer `momin` and recommend from it.
# The swap helper defined in this notebook is update_3_top_cuisines, and
# recommend_cosine_sim takes the number of recommendations as its third argument.
new_customer_vector = update_3_top_cuisines(customer_profiles.iloc[momin], similar_cuisines)
top_restaraunts_new = recommend_cosine_sim(new_customer_vector, restauraunt_profiles, 5)
pretty_print_recommendation(top_restaraunts_new, new_customer_vector)

##### Now we include the skip score as part of the equation when recommending restaurants. The new formula for ranking the top restaurants is <b>score = (weight)*(skip_score/100)+(1-weight)*(cosine_similarity)</b>, where weight is a user-set variable between 0 and 1. In the calls below we set weight to 0.1.

In [41]:
def adjust_recommendations_using_skip_score(top_restaraunts, weight):
    """Re-rank recommendations by blending skip score with cosine similarity.

    The blended value is ``weight * skip_score / 100 + (1 - weight) * cosine``.

    Parameters
    ----------
    top_restaraunts : pandas.DataFrame
        Recommendations containing 'restaraunt_id' and 'cosine similarity score'.
        It is not modified.
    weight : float
        Share given to the skip score; must lie in [0, 1].

    Returns
    -------
    pandas.DataFrame or None
        A copy of ``top_restaraunts`` with an 'adjusted score' column, sorted by
        that column in descending order. ``None`` when ``weight`` is outside [0, 1].

    Raises
    ------
    IndexError
        If a 'restaraunt_id' has no matching 'restaurant_id' in ``id_df``.

    Notes
    -----
    Reads the notebook-level ``id_df`` table ('restaurant_id', 'skip_score').
    When an id occurs more than once there, its first row is used.
    """
    if weight < 0 or weight > 1:
        return

    # One skip score per restaurant id (first occurrence wins).
    skip_score_by_id = id_df.drop_duplicates('restaurant_id').set_index('restaurant_id')['skip_score']
    unknown_ids = top_restaraunts.loc[
        ~top_restaraunts['restaraunt_id'].isin(skip_score_by_id.index), 'restaraunt_id'
    ]
    if len(unknown_ids) > 0:
        raise IndexError(f"restaurant ids missing from id_df: {unknown_ids.tolist()}")

    # Work on a copy so calling this twice on the same frame does not fail on a duplicate column.
    adjusted_restaraunts = top_restaraunts.copy()
    skip_scores = adjusted_restaraunts['restaraunt_id'].map(skip_score_by_id)
    adjusted_scores = (weight * skip_scores / 100
                       + (1 - weight) * adjusted_restaraunts['cosine similarity score'])

    # Position 56 is kept for full-width frames; narrower frames get the column at the end.
    insert_at = min(56, len(adjusted_restaraunts.columns))
    adjusted_restaraunts.insert(insert_at, 'adjusted score', adjusted_scores.to_numpy())
    new_top_restaraunts = adjusted_restaraunts.sort_values(by='adjusted score', ascending=False)
    return new_top_restaraunts

In [None]:
# Re-rank the latest cosine-similarity recommendations, giving the skip score a weight of 0.1.
new_top_restaraunts = adjust_recommendations_using_skip_score(top_restaraunts, weight=0.1)

##### Here we combine all the steps above into one solution: find the top 5 restaurants using cosine similarity, then apply the skip-score weighting. Replace the bottom two restaurants with varied restaurants found via the similarity matrix for similar foods (also applying the skip-score weighting to find the top two varied restaurants), then finally recommend the resulting top 5.

In [54]:
def recommend_pipeline(customer_index=None):
    """Produce 5 recommendations for one customer: 3 direct matches plus 2 varied ones.

    Steps:
      1. Take the 5 restaurants most cosine-similar to the customer, re-rank
         them with the skip score (weight 0.1) and keep the best 3.
      2. Swap the customer's top cuisines for similar ones, take the most
         similar restaurants for that varied vector, drop any that repeat a
         short_name from step 1, re-rank the best 5 of the remainder with the
         skip score (weight 0.1) and keep the best 2.
      3. Concatenate both selections, print them and return them.

    Parameters
    ----------
    customer_index : int, optional
        Row position in ``customer_profiles``. When omitted, a random customer
        is chosen.

    Returns
    -------
    pandas.DataFrame
        The 3 direct picks followed by the 2 varied picks.

    Notes
    -----
    Reads the notebook-level ``customer_profiles``, ``restauraunt_profiles``
    and ``similar_cuisines``.
    """
    if customer_index is None:
        # Cover every customer and never index past the end of the table.
        customer_index = random.randint(0, len(customer_profiles) - 1)
    customer_vector = customer_profiles.iloc[customer_index]

    top_restaraunts = recommend_cosine_sim(customer_vector, restauraunt_profiles, 5)
    # Hand over a copy so the swap step cannot alter the vector printed at the end.
    new_customer_vector = update_3_top_cuisines(customer_vector.copy(), similar_cuisines)

    top_restaraunts_adjusted = adjust_recommendations_using_skip_score(top_restaraunts, 0.1)
    intermediate_top_3 = top_restaraunts_adjusted.head(3)

    # Ask for 3 spare candidates: even if all 3 direct picks reappear, 5 remain.
    varied_candidates = recommend_cosine_sim(new_customer_vector, restauraunt_profiles, 5 + 3)
    is_repeat = varied_candidates['short_name'].isin(intermediate_top_3['short_name'])
    top_restaraunts_varied = varied_candidates[~is_repeat].head(5).copy()
    top_restaraunts_varied_adjusted = adjust_recommendations_using_skip_score(top_restaraunts_varied, 0.1)

    print(top_restaraunts_adjusted[['cosine similarity score', 'adjusted score']])
    print(top_restaraunts_varied_adjusted[['cosine similarity score', 'adjusted score']])
    intermediate_top_2 = top_restaraunts_varied_adjusted.head(2)
    final_recommendations = pd.concat([intermediate_top_3, intermediate_top_2])
    print(final_recommendations.shape)
    pretty_print_recommendation(final_recommendations, customer_vector)
    return final_recommendations

In [57]:
# Run the whole pipeline once and keep the resulting recommendations.
recommendations = recommend_pipeline()
# Check when reading the output: the top 3 and bottom 2 picks must not repeat a restaurant
# (repeats were observed here and need fixing if they appear).

Indian      1.0
African     0.0
Alcohol     0.0
Bakery      0.0
Barbecue    0.0
Name: 20513, dtype: float64
{'Chicken': 0.7176293519438741, 'Lamb': 0.6357881673627198, 'Healthy': 0.6109596859772877, 'Vegetarian': 0.5956379905349253, 'Noodles': 0.5363554750098376}
Indian 1.0
Vegetarian 0.5956379905349253
{'Halal': 0.4741995204185406, 'Beef': 0.4723741071825232, 'Chinese': 0.4616668936912207, 'Rice': 0.4486607259135435, 'Soup': 0.445026511764928}
African 0.0
Chinese 0.4616668936912207
{'Chicken': 0.4375423773630279, 'Burgers': 0.4373175001866742, 'Fast food': 0.4303329974296635, 'Butcher': 0.4275678479441326, 'Sandwiches & Subs': 0.4245562842028815}
Alcohol 0.0
Butcher 0.4275678479441326
Sums 1.0   1.0
Vegetarian    1.0
African       0.0
Alcohol       0.0
Bakery        0.0
Barbecue      0.0
Name: 20513, dtype: float64
      cosine similarity score  adjusted score
1037                 0.993639        0.979276
1657                 0.999056        0.979151
1540                 0.993533     