In [1]:
import pandas as pd
import numpy as np
import datetime

import matplotlib.pyplot as plt
import sklearn


# Import necessary modules
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt

import tensorflow as tf

# Keras specific
import keras
from keras.models import Sequential
from keras.layers import Dense

#load data from data files and place into panda dataframes
custOrders = pd.read_csv('data/customer_orders.csv', low_memory=False)
itemID=pd.read_csv('../SQL/item.csv')
print(custOrders)

      order_id  customer_id      order_timestamp weather    item_id  \
0            1           92   2020-11-3 13:24:56   snowy  640405339   
1            2           11  2017-10-16 02:13:38   snowy  640404963   
2            3           53   2020-6-18 12:36:16  cloudy  640404963   
3            4           43    2018-9-2 07:50:42   rainy  640405389   
4            5           49  2017-12-26 02:51:48   rainy  640405323   
...        ...          ...                  ...     ...        ...   
3995      3996           76   2015-10-6 10:51:52   rainy  640405339   
3996      3997           36  2019-12-13 02:36:28   sunny  640404923   
3997      3998           69   2020-4-11 00:53:16  cloudy  640405323   
3998      3999           45   2019-5-27 02:01:56   rainy  640405058   
3999      4000           25   2018-8-19 04:00:30   sunny  640405331   

                item_name taste_profile  
0       Large Curly Fries        savory  
1            Cheeseburger        savory  
2            Cheesebu

In [2]:
#manually arranges menu items onto a numberline
itemID=itemID.reindex([5,6,2,1,4,0,3,19,13,14,15,16,17,18, 7,8,9,10,11,12,20])

#
itemID['item_score'] = range(1, len(itemID) + 1)
itemID = itemID.drop("item_description", axis=1, errors='ignore')
itemID = itemID.drop("item_image", axis=1, errors='ignore')


#creates dictionary where key is a menu's id number and the value is a 
#tuple with the new menu id embedding(for ML model to predict on) and the menu item's name
itemID_Dict={}
for i, row in itemID.iterrows():
    itemID_Dict[row.loc["item_id"]]=(row.loc["item_score"], row.loc["item_name"])

#print out dictionary and itemID dataframe
print(itemID_Dict)
itemID.head(100)


{640405112: (1, 'Double Cheeseburger'), 640405172: (2, 'Double Bacon Cheeseburger'), 640405025: (3, 'Bacon Cheeseburger'), 640404963: (4, 'Cheeseburger'), 640405085: (5, 'Double Hamburger'), 640404923: (6, 'Hamburger'), 640405058: (7, 'Veggie Burger'), 640405347: (8, 'Onion Rings'), 640405296: (9, 'Small Fries'), 640405307: (10, 'Regular Fries'), 640405315: (11, 'Large Fries'), 640405323: (12, 'Small Curly Fries'), 640405331: (13, 'Regular Curly Fries'), 640405339: (14, 'Large Curly Fries'), 640405355: (15, 'Small Drink'), 640405371: (16, 'Regular Drink'), 640405380: (17, 'Large Drink'), 640405389: (18, 'Small Shake'), 640405395: (19, 'Regular Shake'), 640405399: (20, 'Large Shake'), 640405348: (21, 'Coffee')}


Unnamed: 0,item_id,item_name,price,taste_profile,item_type,item_score
5,640405112,Double Cheeseburger,3.29,savory,burger,1
6,640405172,Double Bacon Cheeseburger,3.79,savory,burger,2
2,640405025,Bacon Cheeseburger,2.79,savory,burger,3
1,640404963,Cheeseburger,2.29,savory,burger,4
4,640405085,Double Hamburger,2.99,savory,burger,5
0,640404923,Hamburger,3.07,savory,burger,6
3,640405058,Veggie Burger,2.49,healthy,burger,7
19,640405347,Onion Rings,2.19,sweet,side,8
13,640405296,Small Fries,1.49,savory,side,9
14,640405307,Regular Fries,1.79,savory,side,10


In [3]:
#function converts a cyclical data point(like time) into a 2 point representation on a circle
#parameters: maximum = highest possible value of reading(ex. 24 for hours in a day, 365 for days in year)
#reading = the data reading that should be converted to an embedding
# returns = pair of floats, which represent the reading's point on a circle
def cycleEmbed(maximum, reading):
    maxi=maximum

    sin = np.sin(2*np.pi*reading/maxi)
    cos = np.cos(2*np.pi*reading/maxi)
    return (sin, cos)

#This functions takes a dataframe and embed time values to cycles on a circle in-place
#this is a wrapper which uses the values returned from cycleEmbed() to create 2 new columns in df
#parameters:
#df = dataframe that is going to be manipulated
#colName = column name which will be used as reading parameter in cycleEmbed
#maxi = value to be used as the maximum parameter in cycleEmbed

def timeEmbed(df, colName, maxi):
    name1=colName+"1"
    name2=colName+"2"
    print(name1)
    print(name2)
    df[name1]=None
    df[name2]=None
    for i, row in df.iterrows():
        temp=cycleEmbed(maxi, row[colName])
        df.at[i, name1]=temp[0]
        df.at[i, name2]=temp[1]


#function takes a column name, dataframe, and then converts the column in the dataframe into a integers, 
#where each integer represents a unique value from that column
#function is pass by reference, and changes the dataFrame in place
def enumerateCol(colName, dataFrame):
    count=0
    diction={}
    for i, row in dataFrame.iterrows():
        currentVal=row.loc[colName]
        if currentVal in diction:
            dataFrame.at[i, colName] = diction[currentVal]
        else:
            diction[currentVal]=count
            count+=1
            dataFrame.at[i, colName] = diction[currentVal]
            
#function is a variation of the function above, and enumerates column based on the itemID_Dict  
def enumerateItemID(colName, dataFrame):
    count=0
    diction=itemID_Dict
    print(diction)
    for i, row in dataFrame.iterrows():
        currentVal=row.loc[colName]
        dataFrame.at[i, colName] = diction[currentVal][0]

In [4]:
#split timestamp in "order_timestamp" into a number of individual columns for each part of the timestamp
if "order_timestamp" in custOrders: 
    custOrders["order_timestamp"]=pd.to_datetime(custOrders["order_timestamp"])
    custOrders["order_date"]=custOrders["order_timestamp"].dt.date
    custOrders["order_year"]=custOrders["order_timestamp"].dt.year
    custOrders["order_month"]=custOrders["order_timestamp"].dt.month
    custOrders["order_day"]=custOrders["order_timestamp"].dt.day
    custOrders["order_hour"]=custOrders["order_timestamp"].dt.hour
    custOrders["order_minute"]=custOrders["order_timestamp"].dt.minute
    custOrders["order_second"]=custOrders["order_timestamp"].dt.second

    #create a column for the weekday(mon,tues, weds, ...etc)
    custOrders["order_weekday"]=custOrders["order_day"]

    for i, row in custOrders.iterrows():
        custOrders.at[i, "order_weekday"] = datetime.datetime(row["order_year"], row["order_month"],row["order_day"],0,0,0).weekday()

    #drop original column; information moved into other columns added above
    custOrders = custOrders.drop("order_timestamp", axis=1, errors='ignore') 
    #convert every time based column into a point on a circle with timeEmbed()    
    timeEmbed(custOrders, "order_month", 12)
    timeEmbed(custOrders, "order_day", 31)
    timeEmbed(custOrders, "order_hour", 24)
    timeEmbed(custOrders, "order_minute", 60)
    timeEmbed(custOrders, "order_second", 60)
    timeEmbed(custOrders, "order_weekday", 7)


    custOrders = custOrders.drop("order_month", axis=1, errors='ignore')
    custOrders = custOrders.drop("order_day", axis=1, errors='ignore')
    custOrders = custOrders.drop("order_hour", axis=1, errors='ignore')
    custOrders = custOrders.drop("order_year", axis=1, errors='ignore')
    custOrders = custOrders.drop("order_minute", axis=1, errors='ignore')
    custOrders = custOrders.drop("order_second", axis=1, errors='ignore')
    custOrders = custOrders.drop("order_weekday", axis=1, errors='ignore')


#Enumerate columns that are categorical
enumerateCol("weather", custOrders)
enumerateItemID("item_id", custOrders)
enumerateCol("taste_profile", custOrders)
enumerateCol("item_type", custOrders)
    
#drop columns which are optional(not all customers will use it), clearly not useful, or redundant

custOrders = custOrders.drop("item_image", axis=1, errors='ignore')
custOrders = custOrders.drop("customization_id", axis=1, errors='ignore') #customization is no in every order
custOrders = custOrders.drop("customization", axis=1, errors='ignore')
custOrders = custOrders.drop("opt_in", axis=1, errors='ignore')
custOrders = custOrders.drop("item_description", axis=1, errors='ignore')


custOrders = custOrders.drop("birthday", axis=1, errors='ignore') #check if part of opt in
custOrders = custOrders.drop("gender", axis=1, errors='ignore') #check if part of opt in
custOrders = custOrders.drop("item_name", axis=1) #reduntance since we have item id? we need 

custOrders = custOrders.drop("order_date", axis=1, errors='ignore') #check if part of opt in
custOrders = custOrders.drop("item_image", axis=1, errors='ignore')
custOrders = custOrders.drop("birthday", axis=1, errors='ignore')

custOrders.head()

order_month1
order_month2
order_day1
order_day2
order_hour1
order_hour2
order_minute1
order_minute2
order_second1
order_second2
order_weekday1
order_weekday2
{640405112: (1, 'Double Cheeseburger'), 640405172: (2, 'Double Bacon Cheeseburger'), 640405025: (3, 'Bacon Cheeseburger'), 640404963: (4, 'Cheeseburger'), 640405085: (5, 'Double Hamburger'), 640404923: (6, 'Hamburger'), 640405058: (7, 'Veggie Burger'), 640405347: (8, 'Onion Rings'), 640405296: (9, 'Small Fries'), 640405307: (10, 'Regular Fries'), 640405315: (11, 'Large Fries'), 640405323: (12, 'Small Curly Fries'), 640405331: (13, 'Regular Curly Fries'), 640405339: (14, 'Large Curly Fries'), 640405355: (15, 'Small Drink'), 640405371: (16, 'Regular Drink'), 640405380: (17, 'Large Drink'), 640405389: (18, 'Small Shake'), 640405395: (19, 'Regular Shake'), 640405399: (20, 'Large Shake'), 640405348: (21, 'Coffee')}


Unnamed: 0,customer_id,order_id,weather,item_id,price,taste_profile,item_type,order_month1,order_month2,order_day1,order_day2,order_hour1,order_hour2,order_minute1,order_minute2,order_second1,order_second2,order_weekday1,order_weekday2
0,170,390,0,5,2.99,0,0,-1.0,-0.0,0.201299,0.97953,-0.707107,0.707107,0.309017,-0.951057,0.951057,-0.309017,-0.781831,0.62349
1,152,1620,1,2,3.79,0,0,0.5,-0.866025,-0.968077,-0.250653,-0.5,-0.866025,0.743145,0.669131,0.669131,0.743145,0.0,1.0
2,186,782,2,2,3.79,0,0,0.5,0.866025,-0.101168,-0.994869,0.866025,0.5,-0.866025,0.5,-0.5,0.866025,0.974928,-0.222521
3,150,1571,3,5,2.99,0,0,0.0,-1.0,-0.651372,-0.758758,0.0,1.0,0.0,-1.0,0.104528,-0.994522,0.974928,-0.222521
4,40,240,2,4,2.29,0,0,-0.5,-0.866025,0.998717,-0.050649,-0.258819,0.965926,-0.866025,0.5,-0.809017,-0.587785,0.974928,-0.222521


In [5]:
# Get a Dictionary containing the pairs of column names & data type objects.
print('Data type of each column of Dataframe :')
print(dict(custOrders.dtypes))

#cast all columns to float64 since some columns are python objects, specifically the time embedding columns
for col in custOrders.columns:
    custOrders[col] = custOrders[col].astype(np.float64)
custOrders.head(10)

Data type of each column of Dataframe :
{'customer_id': dtype('int64'), 'order_id': dtype('int64'), 'weather': dtype('O'), 'item_id': dtype('int64'), 'price': dtype('float64'), 'taste_profile': dtype('O'), 'item_type': dtype('O'), 'order_month1': dtype('O'), 'order_month2': dtype('O'), 'order_day1': dtype('O'), 'order_day2': dtype('O'), 'order_hour1': dtype('O'), 'order_hour2': dtype('O'), 'order_minute1': dtype('O'), 'order_minute2': dtype('O'), 'order_second1': dtype('O'), 'order_second2': dtype('O'), 'order_weekday1': dtype('O'), 'order_weekday2': dtype('O')}


Unnamed: 0,customer_id,order_id,weather,item_id,price,taste_profile,item_type,order_month1,order_month2,order_day1,order_day2,order_hour1,order_hour2,order_minute1,order_minute2,order_second1,order_second2,order_weekday1,order_weekday2
0,170.0,390.0,0.0,5.0,2.99,0.0,0.0,-1.0,-1.83697e-16,0.201299,0.97953,-0.707107,0.707107,0.309017,-0.9510565,0.9510565,-0.309017,-0.781831,0.62349
1,152.0,1620.0,1.0,2.0,3.79,0.0,0.0,0.5,-0.8660254,-0.968077,-0.250653,-0.5,-0.866025,0.7431448,0.6691306,0.6691306,0.743145,0.0,1.0
2,186.0,782.0,2.0,2.0,3.79,0.0,0.0,0.5,0.8660254,-0.101168,-0.994869,0.866025,0.5,-0.8660254,0.5,-0.5,0.866025,0.974928,-0.222521
3,150.0,1571.0,3.0,5.0,2.99,0.0,0.0,1.224647e-16,-1.0,-0.651372,-0.758758,0.0,1.0,5.665539e-16,-1.0,0.1045285,-0.994522,0.974928,-0.222521
4,40.0,240.0,2.0,4.0,2.29,0.0,0.0,-0.5,-0.8660254,0.998717,-0.050649,-0.258819,0.965926,-0.8660254,0.5,-0.809017,-0.587785,0.974928,-0.222521
5,46.0,1841.0,1.0,2.0,3.79,0.0,0.0,-2.449294e-16,1.0,0.394356,0.918958,-0.5,-0.866025,-0.8660254,0.5,5.665539e-16,-1.0,0.0,1.0
6,123.0,1159.0,0.0,3.0,2.79,0.0,0.0,-0.8660254,-0.5,-0.790776,-0.612106,0.965926,-0.258819,-0.9781476,-0.2079117,0.5877853,0.809017,0.433884,-0.900969
7,38.0,1333.0,0.0,1.0,3.29,0.0,0.0,1.0,6.123234000000001e-17,-0.848644,0.528964,0.0,1.0,1.0,2.832769e-16,-0.8660254,-0.5,0.0,1.0
8,22.0,1132.0,0.0,7.0,2.49,1.0,0.0,1.224647e-16,-1.0,0.968077,-0.250653,-0.5,0.866025,0.8660254,0.5,0.5,-0.866025,0.433884,-0.900969
9,92.0,334.0,0.0,3.0,2.79,0.0,0.0,0.5,0.8660254,0.937752,0.347305,0.965926,-0.258819,0.9781476,0.2079117,0.5,-0.866025,0.0,1.0


In [15]:
d={}
s=set()
for i, row in custOrders.iterrows():
    #print(custOrders.at[i, "order_id"])
    orderID=custOrders.at[i, "order_id"]
    menuitemindex=round(custOrders.at[i, "item_id"])-1
    if(custOrders.at[i, "order_id"] not in d):
        d[orderID]=np.zeros(len(itemID))
        d[orderID][menuitemindex]+=1
    else:
        d[orderID][menuitemindex]+=1
        s.add(orderID)


for val in s:
    print(d[val])

[0. 0. 0. 2. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 3. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 2. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 2. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[2. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 2. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 2. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 3. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 2. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 2. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[2. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 2. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 2. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[2. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 2. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[3. 0. 0. 0. 0. 0. 0. 0. 

In [2]:
d[1744]

NameError: name 'd' is not defined