### This file is used to create a normalized vector for all customers from a csv containing orders. The vector represents that customers preference in cuisines/food types

##### Input: Csv file containing skip the dishes orders (with customer_id, restaraunt_id and itemlist). Csv file containing restaraunt cuisines, created from AssignRestarauntCuisines.ipynb
##### Output: Csv file where each row is a specific customers preference for food, one column being the customers id and the rest being the preference for a specific cuisine from 0.0-1.0. Row is normalized so it adds up to 1.0.

In [1]:
import pandas as pd 
import numpy as np 
import math

In [2]:
# df is order csv
df = pd.read_csv("./from_skip_files/orders_3.5mil.csv")
# rf is restaraunt cuisines. 
rf = pd.read_csv('./derived_files/Cuisine_Restaraunt.csv')

In [3]:
print(df.shape)

(3500000, 13)


In [4]:
# List of ethnic or cultural cuisines
# Plus convenience
ethnic_or_cultural_items = [
    "Convenience",
    "Indian",
    "Italian",
    "Chinese",
    "Vietnamese",
    "Japanese",
    "Mediterranean",
    "Middle Eastern",
    "Mexican",
    "Korean",
    "Thai",
    "French",
    "African",
    "Latin American",
    "Ethiopian",
    "Caribbean",
    "Filipino",
    "Spanish",
    "German",
]

In [5]:
# Map of cuisine type to keywords that fit under that cuisine
cuisine_map = {
    "Chicken" : ["chicken"],
    "Fries" : ["fries"],
    "Beef" : ["beef"],
    "Pork" : ["pork", "bacon", "pepperoni"],
    "Rice" : ["rice"],
    "Lamb" : ["lamb"],
    "Vegetarian" : ["vegan", "vegetarian", "veggie", "beyond meat"],
    "Sandwiches & Subs" : ["sandwich", "sub", "wrap", "blt"],
    "Desserts" : ["blizzard", "ice cream", "frozen", "dessert", "chocolate", "drizzle", "desserts", "milkshake", "candy", 
                  "candies", "sundae", "oreo", "skor", "brownie", "shake", "tiramisu", "timbits", "ben and jerry", 
                  "cheesecake", "cookie"],
    "Canadian" : ["canadian", "canadien", "alberta","poutine"],
    "Fast food" : ["fast food","combo", "meal"],
    "Burgers" : ["burger", "patty", "mcdouble", "big mac", "quarter pounder"],
    "Seafood" : ["fish", "seafood", "shrimp", "crab", "lobster", "prawn", "seaweed", 
                 "salmon", "tuna", "poke", "calamari", "squid", "fish and chips"],
    "Healthy" : ["organic", "health", "protein", "salad", "fresh", "tofu",  "fruit", "water", "vegetable", "smoothie", "parfait"],
    "Pizza" : ["pizza"],
    "Breakfast & Brunch" : ["egg", "toast", "benedict", "breakfast", "brunch", "cereal", "pancake", "waffle", "hash brown"],
    "Coffee/Tea" : ["coffee", "tea", "americano", "cappuccino", "latte", "cafe", "chai", "london fog"],
    "Alcohol" : ["beer", "wine", "liquor", "budweiser", "bud light", "spirits", "corona", "stella artois", 
                 "michelob ultra", "mike's hard", "labatt", "sauvignon", "smirnoff", "vodka", "whisky", "cognac",
                 "white claw", "pinot noir"],
    "Noodles" : ["noodle", "vermicelli"],
    "Pub food" : ["wing", "onion ring", "wedge", "mac & cheese", "mac and cheese", "gravy", "mashed potato", "breadsticks"], 
    "Indian" : ["indian", "naan", "nan", "samosa", "masala", "aloo", "paneer", "biryani", "tandoori", "roti", "tikka"],
    "Italian" : ["italian", "pasta", "spaghetti", "penne", "fettuccini", "lasagna", "lasagne", "linguini", "ravioli", "tortellini", "meatball", "canoli"],
    "Bakery" : ["danish", "cake", "bun", "donut", "muffin","bagel", "doughnut", "pie", "scone", "rolls", "loaf"],
    "Barbecue" : ["barbecue", "bbq", "grill", "buffalo"],
    "Chinese" : ["chinese", "china", "hot pot", "wonton", "cantonese", "mein", "gyoza"],
    "Vietnamese" : ["vietnamese", "pho", "viet", "bun cha","ca kho to"],
    "Japanese" : ["japanese","japan", "ramen", "sashimi", "teriyaki", "katsu", "tempura", "edamame", "bento", "takoyaki"],
    "Tacos": ["taco"],
    "Sushi" : ["sushi"],
    "Mediterranean" : ["mediterranean","pita", "damascus", "greek", "greece", "briam","taramasalata", "opa"],
    "Hot Dogs & Sausages" : ["hot dogs","sausage", "weiner"],
    "Middle Eastern" : ["middle eastern","falafel", "hummus","shawarma", "baklava", "donair", "tzatziki"],
    "Convenience" : ["convenience","pre-made","grocery", "slurpee"],
    "Mexican" : ["mexican", "chilaquiles", "burrito", "nacho", "quesadilla", "queso", "taquito", "salsa"],
    "Steakhouse" : ["steakhouse", "steak"],
    "Halal" : ["halal", "zabiha"],
    "Korean" : ["korean", "kimchi", "bulgogi","bibimbap","tteokbokki","jjambbong","doenjang"],
    "Thai" : ["thai", "tom yum goong", "green curry"],
    "Soup" : ["soup"],
    "Gluten Free" : ["gluten free", "no gluten"],
    "Popcorn": ["popcorn"], 
    "Pet Food" : ["pet", "dog", "cat"],
    "Bubble Tea" : ["bubble tea", "boba", "milk tea", "taro milk"],
    "French" : ["french", "francais", "crepe", "foie gras","coq au vin","cassoulet","baguette","croissant","gougeres","cajun & creole", "creole"],
    "African" : ["african","pap en vleis", "shisa nyama","bunny chow","koshari"],
    "Latin American" : ["latin","asado","saltena","feijoada","empanada","bandeja paisa",
                        "gallo pinto","ropa vieja","mangu","encebollado", "pupusas","pepian","peruvian"],
    "Haute Cuisine" : ["haute", "high class", "expensive", "champagne"],
    "Ethiopian" : ["ethiopian","tibs","kitfo","beyainatu","fuul"],
    "Caribbean" : ["caribbean", "jamaica","barbados","bahamas"],
    "Filipino" : ["filipino","adobo","lechon","sisig","bulalo"],
    "Spanish" : ["spanish","paella valenciana","patatas bravas","gazpacho","pimientos de padron","jamon","tapas","churro"],
    "Butcher" : ["raw", "butcher","delicatessen"],
    "Kosher" : ["kosher", 'kashrut', 'jewish'],
    "German" : ["german","schnitzel","rouladen","eintopf","sauerbraten"]
}
print(len(list(cuisine_map.keys())))

54


In [6]:
# Checks for any repeats in value lists accross cuisine types in the map above
freq = {}
for key in cuisine_map.keys():
    for food in cuisine_map[key]:
        if food in freq:
            freq[food] += 1    
        else:
            freq[food] = 1
for key in freq.keys():
    if freq[key] > 1:
        print(key)

In [7]:
# # Calculates the amount of items that the cusine map catches
# hit = 0
# all = 0
# test = []
# test2 = []
# def f(x, y):
#   global hit, all
#   ret_val = False
#   res = isinstance(x, str)
#   if not res:
#     print(x)
#     return False
#   items = x.split(", ")
#   any_items_found = False
#   for item in items:
#     item_found = False
#     all +=1
#     temp_item = item.lower()
#     flag = False
#     for key in cuisine_map:
#       if flag: 
#         break
#       for food in cuisine_map[key]:
#         if food in temp_item:
#           hit += 1
#           ret_val = True
#           any_items_found = True
#           flag = True
#           item_found = True
#           break
#     if not item_found:
#       test.append(temp_item)
#       test2.append(y) 
#           # return True
#   # if not any_items_found:
#   #   r_name = y.lower()
#   #   flag = False
#   #   for key in cuisine_map:
#   #     if flag:
#   #       break
#   #     for food in cuisine_map[key]:
#   #       if food in r_name:
#   #         hit +=1 
#   #         flag = True
#   #         ret_val = True
#   #         break
#   return ret_val

# result = [f(x,y) for x,y in zip(df['item_list'],df['restaurant_short_name'])]
# print(sum(result))
# print(len(result))
# print(hit)
# print(all)
# print(hit/all*100)

In [8]:
print(df['customer_id'].value_counts().describe())
orders_by_customer = df.groupby('customer_id').size().reset_index(name='order_count')
customers_with_many_orders = orders_by_customer[orders_by_customer['order_count'] > 10]
print(customers_with_many_orders['customer_id'].shape)


count    107828.000000
mean         32.459102
std          73.573687
min           1.000000
25%           1.000000
50%           5.000000
75%          28.000000
max        2069.000000
Name: customer_id, dtype: float64
(42096,)


In [9]:
import datetime
import pytz
import dateutil.parser

#This function takes in the order date, and returns weight based on the recency of the order
#Under 3 months it has weight of 1
#Between 3 months to 6 months, it has weight between 1 to 0.7
#Between 6 months to 12 months, it has weight between 0.7 to 0.3
#Between 24 months to 12 months, it has weight between 0.3 to 0.0
#After 24 months the weight is 0, meaning we don't consider orders after 2 years from now

def get_order_weight(order_date_str):
    order_date = dateutil.parser.parse(order_date_str)
    delta = datetime.datetime.now(pytz.utc) - order_date
    days_since_order = delta.days
    weight_0to3months = 1
    weight_3to6months = 0.7
    weight_6to12months = 0.3
    weight_12to24months = 0.2
    if days_since_order < 90:
        return weight_0to3months
    elif days_since_order < 180:
        slope = (weight_3to6months - weight_0to3months) / (180 - 90)
        return 1.0 + slope * (days_since_order - 90)
    elif days_since_order < 365:
        slope = (weight_6to12months - weight_3to6months) / (365 - 180)
        return 0.7 + slope * (days_since_order - 180)
    elif days_since_order < 730:
        slope = (weight_12to24months - weight_6to12months) / (730 - 365)
        return 0.3 + slope * (days_since_order - 365)
    else:
        return 0.1

In [12]:
# Main method that counts frequency of cuisine types occuring in users order history and uses that
# as a metric to calculate their preference by dividing that specific cuisine by total cuisines
# if a user orders an item that has the word "chicken" in it 3 times and had 20 items total their
# prefence for chicken would be 3/20 or 0.15.
#Instead of doing cuisine + 1, for every cuisine matched, this code now does cuisine + weight. With weight, it will
#It will now take into account of the recency of the order
overall_cuisine = dict(zip(cuisine_map.keys(), [0] * 80))
grouped = df.groupby('customer_id')
customer_profile_list = []
miss = 0
def process_group(group):
    global overall_cuisine
    global miss 
    customer_profile = dict(zip(cuisine_map.keys(), [0] * 80))
    total_filtered = 0
    for index, row in group.iterrows():
        items = row['item_list'].split(", ")
        order_date =  row['created_time']
        weight = get_order_weight(order_date)
        any_items_found = False
        for item in items:
            lower_item = item.lower()
            cuisine_identified_item = False
            for key in cuisine_map:
                for cuisine in cuisine_map[key]:
                    if cuisine in lower_item:    
                        if key in ethnic_or_cultural_items:
                            cuisine_identified_item = True  
                        customer_profile[key] += weight
                        overall_cuisine[key] += weight
                        total_filtered += weight
                        any_items_found = True
                        break
            if not cuisine_identified_item:
                restaraunt_cuisines_df = rf[rf['short_name'] == row['restaurant_short_name']]['cussine_list']
                if restaraunt_cuisines_df.shape[0] > 0:
                    restaraunt_cuisines_list = restaraunt_cuisines_df.str.split(", ")
                    restaraunt_cuisines = restaraunt_cuisines_list.iloc[0]
                    for cuisine in restaraunt_cuisines:
                        customer_profile[cuisine] +=weight 
                        overall_cuisine[cuisine] +=weight
                        total_filtered += weight
    map_total_filtered = 0
    for value in customer_profile.values():
        map_total_filtered += value
    if math.isclose(total_filtered, map_total_filtered, abs_tol=0.003) and total_filtered != 0:
        for key in customer_profile:
            customer_profile[key] = customer_profile[key]/total_filtered
        customer_profile['customer_id'] = group.name
        customer_profile_list.append(customer_profile)
    else:
        miss += 1
#         print(map_total_filtered, " ", total_filtered)
#         print("missed customer: ", group.name)
    if len(customer_profile_list) % 5000 == 0:
        print(len(customer_profile_list)/107800) 
        print(miss)


grouped.apply(process_group)

0.04638218923933209
67
0.09276437847866419
141
0.1391465677179963
203
0.18552875695732837
277
0.23191094619666047
342
0.2782931354359926
423
0.3246753246753247
501
0.37105751391465674
567
0.4174397031539889
613
0.46382189239332094
694
0.5102040816326531
756
0.5565862708719852
827
0.6029684601113172
909
0.6493506493506493
978
0.6957328385899815
1043
0.7421150278293135
1108
0.7884972170686456
1177
0.8348794063079777
1269
0.8812615955473099
1344
0.9276437847866419
1424
0.974025974025974
1501


In [13]:
sorted_dict = sorted(overall_cuisine.items(), key=lambda x: x[1], reverse=True)
for item in enumerate(sorted_dict):
    print(item[1])

('Chicken', 449846.40652071993)
('Fast food', 345028.4771926414)
('Japanese', 305475.7541257607)
('Burgers', 275867.6701847636)
('Bakery', 250570.17622218424)
('Sandwiches & Subs', 217261.64164852054)
('Vietnamese', 177960.08990071583)
('Pork', 175209.160357341)
('Chinese', 166387.22696720387)
('Italian', 156382.27137416598)
('Desserts', 154925.72950070564)
('Beef', 153665.2864314957)
('Coffee/Tea', 147873.33181211248)
('Indian', 139349.236964521)
('Breakfast & Brunch', 137078.1542395359)
('Healthy', 134270.1181911875)
('Seafood', 128032.19303755386)
('Pizza', 122388.85381002461)
('Convenience', 117200.31610402772)
('Fries', 103876.59630894964)
('Mexican', 88674.64594479327)
('Barbecue', 80289.1663470585)
('Pub food', 76007.16387016559)
('Noodles', 75411.24137479119)
('Middle Eastern', 74653.97322471676)
('Thai', 73454.1198963136)
('Rice', 70841.4509292446)
('Mediterranean', 66714.35968769962)
('Soup', 52960.142977855154)
('Korean', 45334.25581756343)
('Canadian', 44532.39406018304)
('

In [14]:
# Checks how many customers which the filter couldn't match any of their items so they have no 
# information on their preferences.
customer_profile_df = pd.DataFrame(customer_profile_list)
print(df['customer_id'].nunique())
print(customer_profile_df['customer_id'].nunique())

107828
106306


In [15]:
# Reorganizes the columns so id appears first and cuisines are sorted in alphabetical order
customer_profile_df = customer_profile_df.reindex(columns=['customer_id'] + list(customer_profile_df.columns.drop('customer_id')))
cols = customer_profile_df.columns.tolist()
cols = cols[:-54] + sorted(cols[-54:])
customer_profile_df = customer_profile_df.reindex(columns=cols)
print(customer_profile_df)

                                 customer_id  African   Alcohol    Bakery  \
0       0000000d-9a20-4580-85aa-f4ca9062388c      0.0  0.000000  0.026200   
1       00000782-7adf-4924-a5d3-a2c24fef785d      0.0  0.000000  0.000000   
2       0000267e-c83e-4a48-9776-8163eab97b6a      0.0  0.003465  0.104122   
3       000034cf-8586-4214-8239-7a05453cff98      0.0  0.000000  0.069767   
4       00004011-f1d0-42c2-8909-60d0e4620f7d      0.0  0.000000  0.000000   
...                                      ...      ...       ...       ...   
106301  1ee2d4da-a834-4064-a101-9f139d0bfbf5      0.0  0.003647  0.086415   
106302  1ee2d643-15bc-45a0-bdc3-8c8d8aab2e1d      0.0  0.000000  1.000000   
106303  1ee2dd88-1776-43da-9043-0591419f37ea      0.0  0.000000  0.142316   
106304  1ee2f0e4-8f54-4a00-8fbc-57ad13431b12      0.0  0.166228  0.000000   
106305  1ee2fa78-a89d-4783-8705-4f8f33eea1ba      0.0  0.000000  0.030170   

        Barbecue      Beef  Breakfast & Brunch  Bubble Tea   Burgers  \
0  

In [16]:
# Make sure each row adds in table adds to 1, since each row is normalized this should always happen.
for index, row in customer_profile_df.iterrows():
    if row[1:].sum() > 1.05 or row[1:].sum() < 0.95:
      print(row[1:].sum())
      print("Something is wrong at index: ", index)

In [1]:
# Converts to CSV file
print(customer_profile_df)
customer_profile_df.to_csv('./derived_files/customer_profiles_recency.csv', index=False)

NameError: name 'customer_profile_df' is not defined