In [1]:
import pandas as pd
import numpy as np
from numpy import linalg as la
import sys
np.set_printoptions(threshold=sys.maxsize)
import datetime

import matplotlib.pyplot as plt
import sklearn


# Import necessary modules
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt

import tensorflow as tf
#load data from data files and place into panda dataframes
custOrders = pd.read_csv('../FeatureEngineering/data/customer_orders.csv', low_memory=False)
itemID=pd.read_csv('../SQL/item.csv')

#turn off panda max column display limit
print(pd.options.display.max_columns)
pd.options.display.max_columns
mns = None



20


In [2]:
#define functions from article:  
#https://towardsdatascience.com/predict-ratings-with-svd-in-collaborative-filtering-recommendation-system-733aaa768b14

#measure simularity with Euclidian Distance
def ecludSim(inA,inB): 
    return 1.0/(1.0+la.norm(inA-inB))

#Decide k: The value of k is determined by the percentage of the sum of squares of the first k singular 
#values to the sum of squares of the total singular values.
def get_k(sigma,percentage):
    sigma_sqr=sigma**2 
    sum_sigma_sqr=sum(sigma_sqr) 
    k_sum_sigma=0 
    k=0
    for i in sigma:
        k_sum_sigma+=i**2
        k+=1
        if k_sum_sigma>=sum_sigma_sqr*percentage:            
            return k

#Convert the original SVD to k dimensions spaces:        
def svdEst(testdata,user,simMeas,item,percentage):
    n=np.shape(testdata)[1]
    sim_total=0.0;
    rat_sim_total=0.0
    u,sigma,vt=la.svd(testdata)
    k=get_k(sigma,percentage)    
    #Construct the diagonal matrix     
    sigma_k=np.diag(sigma[:k])    
    #Convert the original data to k-dimensional space (lower dimension) according to the value of k. formed_items represents the value of item in k-dimensional space after conversion.
    formed_items=np.around(np.dot(np.dot(u[:,:k], sigma_k),vt[:k, :]),decimals=3)     
    for j in range(n):
        user_rating=testdata[user,j]
        if user_rating==0 or j==item:continue
        # the similarity between item and item j
        similarity=simMeas(formed_items[item,:].T,formed_items[j,:].T) 
        sim_total+=similarity 
        # product of similarity and the rating of user to item j, then sum
        rat_sim_total+=similarity*user_rating 
    if sim_total==0:
        return 0
    else:
        return np.around(rat_sim_total/sim_total,decimals=3) 
     
def recommend(testdata,user,sim_meas,est_method, percentage=0.9):
#     print("given SVD testData")
#     print(testdata)
    unrated_items=np.nonzero(testdata[user,:]==0)[0].tolist()
    rated_items=np.nonzero(testdata[user,:]!=0)[0].tolist()
    
    if len(unrated_items)==0:
        return print('everything is rated')
    item_scores=[]
    #put rated items into the item_scores list
    for item in rated_items:
        #want to ignore the column 0, it holds the userID associated with the row and should not be put in the same
        #category as the items
        if item==0:
            continue
        
        derived_score=testdata[user][item]
        #itemID-itemName replacement here?
        item_scores.append((item,derived_score))

    #predict score of unrated items them put them into the item_scores list

    for item in unrated_items:
        estimated_score=est_method(testdata,user,sim_meas,item,percentage)
        item_scores.append((item,estimated_score))
    
    item_scores=sorted(item_scores,key=lambda x:x[1],reverse=True)
    return item_scores


#==========Functions from NN ==========
#function is a variation of the function above, and enumerates column based on the itemID_Dict  
def enumerateItemID(colName, dataFrame):
    count=0
    diction=itemID_Dict
    print(diction)
    for i, row in dataFrame.iterrows():
        currentVal=row.loc[colName]
        dataFrame.at[i, colName] = diction[currentVal][0]
        
        


In [3]:
#manually arranges menu items onto a numberline
itemID=itemID.reindex([5,6,2,1,4,0,3,19,13,14,15,16,17,18, 7,8,9,10,11,12,20])

#
itemID['item_score'] = range(1, len(itemID) + 1)
itemID = itemID.drop("item_description", axis=1, errors='ignore')
itemID = itemID.drop("item_image", axis=1, errors='ignore')


#creates dictionary where key is a menu's id number and the value is a 
#tuple with the new menu id embedding(for ML model to predict on) and the menu item's name
itemID_Dict={}
for i, row in itemID.iterrows():
    itemID_Dict[row.loc["item_id"]]=(row.loc["item_score"], row.loc["item_name"])

#print out dictionary and itemID dataframe
print(itemID_Dict)
#itemID.head(100)

#save item_id to item name
code_to_item={}
for pair in itemID_Dict:
    code_to_item[itemID_Dict[pair][0]]=itemID_Dict[pair][1]
print(code_to_item)

{640405112: (1, 'Double Cheeseburger'), 640405172: (2, 'Double Bacon Cheeseburger'), 640405025: (3, 'Bacon Cheeseburger'), 640404963: (4, 'Cheeseburger'), 640405085: (5, 'Double Hamburger'), 640404923: (6, 'Hamburger'), 640405058: (7, 'Veggie Burger'), 640405347: (8, 'Onion Rings'), 640405296: (9, 'Small Fries'), 640405307: (10, 'Regular Fries'), 640405315: (11, 'Large Fries'), 640405323: (12, 'Small Curly Fries'), 640405331: (13, 'Regular Curly Fries'), 640405339: (14, 'Large Curly Fries'), 640405355: (15, 'Small Drink'), 640405371: (16, 'Regular Drink'), 640405380: (17, 'Large Drink'), 640405389: (18, 'Small Shake'), 640405395: (19, 'Regular Shake'), 640405399: (20, 'Large Shake'), 640405348: (21, 'Coffee')}
{1: 'Double Cheeseburger', 2: 'Double Bacon Cheeseburger', 3: 'Bacon Cheeseburger', 4: 'Cheeseburger', 5: 'Double Hamburger', 6: 'Hamburger', 7: 'Veggie Burger', 8: 'Onion Rings', 9: 'Small Fries', 10: 'Regular Fries', 11: 'Large Fries', 12: 'Small Curly Fries', 13: 'Regular Curl

In [4]:
def loadExData(rows, cols):
    a= np.zeros((rows,cols))    
    return a

In [5]:

#recommend(testdata,0,sim_meas=ecludSim,est_method=svdEst, percentage=0.9)

In [6]:
#change original itemID from DB into a new itemID for SVD, 
#itemID will double as the item's column in the User-Item Matrix  
enumerateItemID("item_id", custOrders)
#orderData is a numpy 2d array, whose number of columns=# of items + 1   (that +1 column is to for the user id )
orderData=loadExData(0, len(itemID_Dict)+1)
print(orderData)

#the remainder of the cell will iterate through the DB customer Orders and combine items in the same order into a single row
#NOTE: custOrders rows are items, and this loop assumes that items in the same order are sequential and will use that 
#assumption to append the order into the orderData matrix

#This is not an issue because in a later cell we iterate through the orderData and combine all the orders
#made by the same user into a single row, just be mindful that if we do not get DB data in this specific format we wont get 
#all items correctly place into the correct order

#TODO: just make a  dictionary rather than a 2D array? Would avoid the DB format assumption
#TODO: change to multiple items in same order get larger score?
order=np.zeros(orderData.shape[1]) #1d array
#NOTE: orders must have items listed sequentially 

prevOrder=-1
for i, row in custOrders.iterrows():
    
    currentUser=row.loc["customer_id"]
    currentOrder=row.loc["order_id"]
    currentItemIndex=row.loc["item_id"]
    
    if (currentOrder!=prevOrder) and (prevOrder>0):
        #new order is up, append data to array
        orderData=np.append(orderData, [order], axis=0)
        #make new order
        order=np.zeros(orderData.shape[1])
    
    #add to order
    order[0]=currentUser
    order[currentItemIndex]=1#change to +=1 if you want to count multiple of the same items in a single order
        
    
    prevOrder=currentOrder

orderData=np.append(orderData, [order], axis=0)


{640405112: (1, 'Double Cheeseburger'), 640405172: (2, 'Double Bacon Cheeseburger'), 640405025: (3, 'Bacon Cheeseburger'), 640404963: (4, 'Cheeseburger'), 640405085: (5, 'Double Hamburger'), 640404923: (6, 'Hamburger'), 640405058: (7, 'Veggie Burger'), 640405347: (8, 'Onion Rings'), 640405296: (9, 'Small Fries'), 640405307: (10, 'Regular Fries'), 640405315: (11, 'Large Fries'), 640405323: (12, 'Small Curly Fries'), 640405331: (13, 'Regular Curly Fries'), 640405339: (14, 'Large Curly Fries'), 640405355: (15, 'Small Drink'), 640405371: (16, 'Regular Drink'), 640405380: (17, 'Large Drink'), 640405389: (18, 'Small Shake'), 640405395: (19, 'Regular Shake'), 640405399: (20, 'Large Shake'), 640405348: (21, 'Coffee')}
[]


In [7]:
print(orderData)

[[ 52.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   1.   1.
    0.   0.   0.   0.   0.   0.   0.   0.]
 [194.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   1.   0.   0.
    0.   0.   0.   0.   0.   0.   0.   0.]
 [  1.   0.   0.   0.   1.   0.   0.   0.   1.   0.   1.   0.   0.   0.
    0.   1.   0.   0.   0.   0.   0.   0.]
 [ 47.   0.   0.   0.   0.   1.   0.   0.   0.   0.   0.   0.   0.   0.
    0.   0.   1.   0.   0.   1.   0.   0.]
 [192.   0.   0.   0.   1.   0.   0.   0.   0.   0.   0.   1.   0.   0.
    0.   0.   0.   0.   0.   0.   0.   1.]
 [157.   0.   0.   0.   0.   0.   0.   1.   0.   1.   0.   0.   0.   0.
    0.   0.   0.   0.   0.   0.   0.   1.]
 [ 79.   0.   0.   0.   0.   1.   0.   0.   0.   0.   0.   0.   0.   0.
    1.   0.   0.   0.   0.   0.   0.   0.]
 [ 32.   1.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
    0.   0.   0.   1.   1.   0.   1.   0.]
 [ 29.   0.   0.   0.   1.   0.   0.   0.   0.   0.   0.   0.   0.   0.
    0.  

In [8]:
print(orderData.shape)
custOrders.head(10000)

(2000, 22)


Unnamed: 0,customer_id,order_id,weather,order_timestamp,item_id,item_name,taste_profile,item_type
0,52,1752,snowy,2015-01-02 20:38:35,12,Small Curly Fries,savory,side
1,52,1752,snowy,2015-01-02 20:38:35,13,Regular Curly Fries,savory,side
2,194,244,snowy,2015-01-04 16:51:57,11,Large Fries,savory,side
3,1,714,cloudy,2015-01-05 20:58:57,10,Regular Fries,savory,side
4,1,714,cloudy,2015-01-05 20:58:57,15,Small Drink,chilling,beverage
...,...,...,...,...,...,...,...,...
4804,178,1088,rainy,2020-12-26 08:02:53,19,Regular Shake,sweet,beverage
4805,122,1290,sunny,2020-12-28 04:46:26,6,Hamburger,savory,burger
4806,122,1290,sunny,2020-12-28 04:46:26,4,Cheeseburger,savory,burger
4807,122,1290,sunny,2020-12-28 04:46:26,5,Double Hamburger,savory,burger


In [9]:
userDict={}
userCount=1

#NOTE: combines all orders made by a user into a single row, representing their orderhistory
for i, row in enumerate(orderData):
    if row[0] not in userDict:
        userDict[row[0]]=row
    else:

        
        temp=np.sum([userDict[row[0]],row], axis=0)
        temp[0]=row[0]
        userDict[row[0]]=temp
        


In [10]:
print(userDict)

{52.0: array([52.,  3.,  2.,  2.,  3.,  3.,  1.,  1.,  5.,  0.,  1.,  0.,  3.,
        1.,  1.,  0.,  0.,  2.,  1.,  1.,  2.,  0.]), 194.0: array([194.,   1.,   1.,   1.,   1.,   0.,   0.,   1.,   1.,   4.,   0.,
         1.,   1.,   4.,   0.,   1.,   0.,   2.,   0.,   2.,   1.,   2.]), 1.0: array([1., 1., 1., 0., 2., 1., 3., 3., 4., 2., 2., 0., 0., 0., 3., 3., 0.,
       1., 1., 0., 4., 0.]), 47.0: array([47.,  0.,  1.,  1.,  0.,  1.,  1.,  1.,  0.,  0.,  0.,  1.,  1.,
        1.,  0.,  0.,  2.,  0.,  1.,  1.,  1.,  1.]), 192.0: array([192.,   1.,   2.,   1.,   1.,   2.,   0.,   0.,   1.,   2.,   0.,
         1.,   1.,   0.,   3.,   0.,   1.,   1.,   1.,   0.,   2.,   2.]), 157.0: array([157.,   2.,   0.,   1.,   0.,   0.,   2.,   2.,   0.,   3.,   0.,
         1.,   0.,   1.,   3.,   2.,   2.,   2.,   0.,   1.,   1.,   1.]), 79.0: array([79.,  1.,  0.,  2.,  2.,  2.,  2.,  0.,  2.,  0.,  2.,  1.,  0.,
        1.,  1.,  0.,  2.,  1.,  1.,  1.,  2.,  0.]), 32.0: array([32.,  4.,  0.,  

In [11]:
#turn dictionary of arrays into a 2d numpy array
finVal=np.vstack(list(userDict.values()))
#save the row number of the user's data 
user_to_row_Dict={}
for i, row in enumerate(finVal):
    if(row[0] not in user_to_row_Dict):
        user_to_row_Dict[ int(row[0])]=i
    
print(user_to_row_Dict)
    

{52: 0, 194: 1, 1: 2, 47: 3, 192: 4, 157: 5, 79: 6, 32: 7, 29: 8, 98: 9, 132: 10, 76: 11, 151: 12, 46: 13, 97: 14, 43: 15, 121: 16, 183: 17, 99: 18, 33: 19, 181: 20, 7: 21, 63: 22, 22: 23, 172: 24, 38: 25, 140: 26, 164: 27, 103: 28, 196: 29, 34: 30, 185: 31, 87: 32, 171: 33, 53: 34, 40: 35, 69: 36, 23: 37, 122: 38, 128: 39, 193: 40, 134: 41, 93: 42, 117: 43, 173: 44, 135: 45, 45: 46, 96: 47, 102: 48, 68: 49, 57: 50, 124: 51, 26: 52, 9: 53, 126: 54, 182: 55, 184: 56, 174: 57, 179: 58, 112: 59, 165: 60, 149: 61, 17: 62, 59: 63, 5: 64, 191: 65, 150: 66, 10: 67, 48: 68, 55: 69, 12: 70, 71: 71, 80: 72, 167: 73, 16: 74, 177: 75, 168: 76, 20: 77, 85: 78, 120: 79, 180: 80, 123: 81, 142: 82, 113: 83, 197: 84, 108: 85, 41: 86, 58: 87, 107: 88, 95: 89, 195: 90, 119: 91, 64: 92, 110: 93, 6: 94, 65: 95, 137: 96, 152: 97, 131: 98, 145: 99, 114: 100, 11: 101, 84: 102, 27: 103, 61: 104, 54: 105, 18: 106, 148: 107, 2: 108, 176: 109, 111: 110, 104: 111, 166: 112, 92: 113, 66: 114, 169: 115, 88: 116, 44:

In [12]:
finVal


array([[ 52.,   3.,   2.,   2.,   3.,   3.,   1.,   1.,   5.,   0.,   1.,
          0.,   3.,   1.,   1.,   0.,   0.,   2.,   1.,   1.,   2.,   0.],
       [194.,   1.,   1.,   1.,   1.,   0.,   0.,   1.,   1.,   4.,   0.,
          1.,   1.,   4.,   0.,   1.,   0.,   2.,   0.,   2.,   1.,   2.],
       [  1.,   1.,   1.,   0.,   2.,   1.,   3.,   3.,   4.,   2.,   2.,
          0.,   0.,   0.,   3.,   3.,   0.,   1.,   1.,   0.,   4.,   0.],
       [ 47.,   0.,   1.,   1.,   0.,   1.,   1.,   1.,   0.,   0.,   0.,
          1.,   1.,   1.,   0.,   0.,   2.,   0.,   1.,   1.,   1.,   1.],
       [192.,   1.,   2.,   1.,   1.,   2.,   0.,   0.,   1.,   2.,   0.,
          1.,   1.,   0.,   3.,   0.,   1.,   1.,   1.,   0.,   2.,   2.],
       [157.,   2.,   0.,   1.,   0.,   0.,   2.,   2.,   0.,   3.,   0.,
          1.,   0.,   1.,   3.,   2.,   2.,   2.,   0.,   1.,   1.,   1.],
       [ 79.,   1.,   0.,   2.,   2.,   2.,   2.,   0.,   2.,   0.,   2.,
          1.,   0.,   1.,   1., 

In [13]:

#finVal= finVal[:,1:]
#print(finVal)

In [14]:
for i, row in enumerate(finVal):
    print(row)


[52.  3.  2.  2.  3.  3.  1.  1.  5.  0.  1.  0.  3.  1.  1.  0.  0.  2.
  1.  1.  2.  0.]
[194.   1.   1.   1.   1.   0.   0.   1.   1.   4.   0.   1.   1.   4.
   0.   1.   0.   2.   0.   2.   1.   2.]
[1. 1. 1. 0. 2. 1. 3. 3. 4. 2. 2. 0. 0. 0. 3. 3. 0. 1. 1. 0. 4. 0.]
[47.  0.  1.  1.  0.  1.  1.  1.  0.  0.  0.  1.  1.  1.  0.  0.  2.  0.
  1.  1.  1.  1.]
[192.   1.   2.   1.   1.   2.   0.   0.   1.   2.   0.   1.   1.   0.
   3.   0.   1.   1.   1.   0.   2.   2.]
[157.   2.   0.   1.   0.   0.   2.   2.   0.   3.   0.   1.   0.   1.
   3.   2.   2.   2.   0.   1.   1.   1.]
[79.  1.  0.  2.  2.  2.  2.  0.  2.  0.  2.  1.  0.  1.  1.  0.  2.  1.
  1.  1.  2.  0.]
[32.  4.  0.  1.  0.  2.  2.  2.  0.  2.  2.  1.  0.  2.  1.  0.  2.  2.
  2.  1.  3.  2.]
[29.  0.  2.  3.  2.  1.  0.  1.  2.  0.  1.  1.  0.  2.  3.  0.  0.  0.
  0.  0.  1.  1.]
[98.  2.  1.  2.  1.  2.  2.  0.  2.  0.  1.  1.  0.  1.  3.  0.  1.  0.
  1.  2.  0.  1.]
[132.   0.   1.   0.   2.   1.   1.   0.   2.  

In [15]:
# ununrated=np.nonzero(finVal[2,:]!=0)
# unrated=np.nonzero(finVal[2,:]==0)

# print(unrated)
# print(ununrated)

#np.nonzero(testdata[user,:]==0)[0].tolist()

In [19]:
finVal_onlyItems=finVal[:,1:]

finVal_normed = finVal_onlyItems / finVal_onlyItems.max(axis=0)
targetUser=194
recommend(finVal_normed,user_to_row_Dict[targetUser],sim_meas=ecludSim,est_method=svdEst, percentage=0.9)

given SVD testData
[[0.6        0.4        0.5        0.75       0.75       0.25
  0.16666667 0.83333333 0.         0.25       0.         0.75
  0.2        0.2        0.         0.         0.28571429 0.16666667
  0.25       0.33333333 0.        ]
 [0.2        0.2        0.25       0.25       0.         0.
  0.16666667 0.16666667 1.         0.         0.14285714 0.25
  0.8        0.         0.2        0.         0.28571429 0.
  0.5        0.16666667 0.5       ]
 [0.2        0.2        0.         0.5        0.25       0.75
  0.5        0.66666667 0.5        0.5        0.         0.
  0.         0.6        0.6        0.         0.14285714 0.16666667
  0.         0.66666667 0.        ]
 [0.         0.2        0.25       0.         0.25       0.25
  0.16666667 0.         0.         0.         0.14285714 0.25
  0.2        0.         0.         0.5        0.         0.16666667
  0.25       0.16666667 0.25      ]
 [0.2        0.4        0.25       0.25       0.5        0.
  0.         0.166666

[(8, 1.0),
 (12, 0.8),
 (18, 0.5),
 (20, 0.5),
 (13, 0.335),
 (4, 0.334),
 (17, 0.333),
 (9, 0.33),
 (15, 0.329),
 (5, 0.327),
 (16, 0.2857142857142857),
 (2, 0.25),
 (3, 0.25),
 (11, 0.25),
 (1, 0.2),
 (14, 0.2),
 (6, 0.16666666666666666),
 (7, 0.16666666666666666),
 (19, 0.16666666666666666),
 (10, 0.14285714285714285)]

In [17]:
finVal_normed[1,:]


array([0.2       , 0.2       , 0.25      , 0.25      , 0.        ,
       0.        , 0.16666667, 0.16666667, 1.        , 0.        ,
       0.14285714, 0.25      , 0.8       , 0.        , 0.2       ,
       0.        , 0.28571429, 0.        , 0.5       , 0.16666667,
       0.5       ])