<a href="https://colab.research.google.com/github/Msourabh/Recommendation_system/blob/master/rs_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import pandas as pd
import numpy as np
import sys
from itertools import combinations, groupby
from collections import Counter
from IPython.display import display


In [0]:
data = pd.read_csv('order_products__prior.csv')

In [0]:
orders = data.set_index('order_id')['product_id'].rename('item_id')

In [0]:
print(orders.shape)
display(orders.head(10))
type(orders)

(1775174,)


order_id
2    33120
2    28985
2     9327
2    45918
2    30035
2    17794
2    40141
2     1819
2    43668
3    33754
Name: item_id, dtype: int64

pandas.core.series.Series

In [0]:
# return item pairs.
def get_item_pairs(order_item):
    order_item = order_item.reset_index().as_matrix()
    for order_id, order_object in groupby(order_item, lambda x: x[0]):
        item_list = [item[1] for item in order_object]
              
        for item_pair in combinations(item_list, 2):
            yield item_pair
# return frequency of items          
def freq(iterable):
    if type(iterable) == pd.core.series.Series:
        return iterable.value_counts().rename("freq")
    else: 
        return pd.Series(Counter(iterable)).rename("freq")
      
# Returns number of unique orders
def order_count(order_item):
    return len(set(order_item.index))

In [0]:
# find the frequency of each item
item_freq = freq(orders).to_frame()
type(item_freq)
print(item_freq.shape)
item_freq.head()

(39604, 1)


Unnamed: 0,freq
24852,25969
13176,20776
21137,14462
21903,13204
47209,11632


In [0]:
# find number of unique orders
unique_orders = order_count(orders)
# calculate the support value for each item.
item_freq['support'] = (item_freq['freq']/unique_orders)*100
print(item_freq.shape)
item_freq.head()


(39604, 2)


Unnamed: 0,freq,support
24852,25969,14.768204
13176,20776,11.815018
21137,14462,8.224335
21903,13204,7.508928
47209,11632,6.614954


In [0]:
support_t = 0.01 #(0.01%)
#filter items which have support more than the threashold value.
quali_items= item_freq[item_freq['support']>=support_t].index
print(item_freq.shape)
item_freq.head()
item_freq.support.describe()
orders = orders[orders.isin(quali_items)]
print(orders.shape)
#print(quali_items)

(39604, 2)
(1639026,)


In [0]:
# find orders with less than 2 items.
order_size = freq(orders.index).to_frame()
print(order_size.head())
print(order_size.shape)
quali_orders = order_size[order_size.freq>=2].index
orders = orders[orders.index.isin(quali_orders)]
print(orders.shape)
print(orders.head())

   freq
2     8
3     8
4    12
5    21
7     2
(174787, 1)
(1629116,)
order_id
2    33120
2    28985
2     9327
2    45918
2    30035
Name: item_id, dtype: int64


In [0]:
# find item_pair usinf orders
items = Counter(get_item_pairs(orders))


  


In [0]:
# find support anc create dataframe for this.
item_a = []
item_b = []
freq_c = []
support = []

for i in items:
  item_a.append(i[0])
  item_b.append(i[1])
  freq_c.append(items[i])
  support.append((items[i]/len(quali_orders))*100)

result = pd.DataFrame(columns=['item_a','item_b','freq','support'])
result['item_a'] = item_a
result['item_b'] = item_b
result['freq'] = freq_c
result['support'] = support

In [0]:
print(result.shape)
result.head()

(5592646, 4)


Unnamed: 0,item_a,item_b,freq,support
0,33120,28985,23,0.01395
1,33120,9327,3,0.00182
2,33120,45918,2,0.001213
3,33120,30035,1,0.000607
4,33120,17794,18,0.010917


In [0]:
# filter the result based on the support threashold
result = result[result.support>=support_t]
print(result.shape)
print(result.head())

(51081, 4)
    item_a  item_b  freq   support
0    33120   28985    23  0.013950
4    33120   17794    18  0.010917
10   28985   17794   157  0.095222
28   33754   24838    21  0.012737
30   33754   21903    93  0.056406


In [0]:
# measure confidence score.
result['confidence_a->b'] = np.array(result['support'])/np.array(item_freq.loc[result['item_a'],'support'])
result['confidence_b->a'] = np.array(result['support'])/np.array(item_freq.loc[result['item_b'],'support'])

In [0]:
result.head()

Unnamed: 0,item_a,item_b,freq,support,confidence_a->b,confidence_b->a
0,33120,28985,23,0.01395,0.021574,0.006635
4,33120,17794,18,0.010917,0.016884,0.004748
10,28985,17794,157,0.095222,0.045292,0.041416
28,33754,24838,21,0.012737,0.012991,0.008156
30,33754,21903,93,0.056406,0.057532,0.007512


In [0]:
# measure lift score
result['lift'] = np.array(result['support'])/(np.array(item_freq.loc[result['item_a'],'support'])*np.array(item_freq.loc[result['item_b'],'support']))

In [0]:
result.head()

Unnamed: 0,item_a,item_b,freq,support,confidence_a->b,confidence_b->a,lift
0,33120,28985,23,0.01395,0.021574,0.006635,0.010262
4,33120,17794,18,0.010917,0.016884,0.004748,0.007344
10,28985,17794,157,0.095222,0.045292,0.041416,0.019699
28,33754,24838,21,0.012737,0.012991,0.008156,0.008319
30,33754,21903,93,0.056406,0.057532,0.007512,0.007662


In [0]:
# sort result based on lift value
result = result.sort_values('lift', ascending=False)
print(result.head())
print(result.shape)

        item_a  item_b  freq  ...  confidence_a->b  confidence_b->a       lift
456627   29479   16508    20  ...         0.410199         0.313681  10.607493
654638   29126   36361    17  ...         0.262765         0.377725   9.626172
68911    23953   27553    19  ...         0.273835         0.397330   9.441625
496971   12820   11212    21  ...         0.306806         0.361239   8.701613
79737     8186   42085    20  ...         0.260126         0.387824   8.316651

[5 rows x 7 columns]
(51081, 7)


In [0]:
# import product data to get product name
product_names = pd.read_csv('products.csv')
product_names.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id
0,1,Chocolate Sandwich Cookies,61,19
1,2,All-Seasons Salt,104,13
2,3,Robust Golden Unsweetened Oolong Tea,94,7
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1
4,5,Green Chile Anytime Sauce,5,13


In [0]:
# filter products using qualified orders 
product_name=product_names[product_names['product_id'].isin(quali_items)]


In [0]:
# make copy to convert item_id into item_name
final_result = result.copy()
final_result['item_a'] = pd.Series(result['item_a']).replace(list(product_names['product_id']),list(product_names['product_name']))
final_result['item_b'] = pd.Series(result['item_b']).replace(list(product_names['product_id']),list(product_names['product_name']))
final_result.head()

Unnamed: 0,item_a,item_b,freq,support,confidence_a->b,confidence_b->a,lift
456627,Gobble Gobble Turkey Pouch Stage 3,Stage 3 Hearty Meals - Chick Chick Organic Bab...,20,0.01213,0.410199,0.313681,10.607493
654638,Organic Strawberry Chia Lowfat 2% Cottage Cheese,Organic Cottage Cheese Blueberry Acai Chia,17,0.010311,0.262765,0.377725,9.626172
68911,Cream on Top Strawberry Organic Yogurt,Organic Blueberry Cream On Top Whole Milk Yogurt,19,0.011524,0.273835,0.39733,9.441625
496971,Organic Fruit Yogurt Smoothie Mixed Berry,Apple Blueberry Fruit Yogurt Smoothie,21,0.012737,0.306806,0.361239,8.701613
79737,"Baby Food Pouch - Butternut Squash, Carrot & C...",Baby Food Pouch - Spinach Pumpkin & Chickpea,20,0.01213,0.260126,0.387824,8.316651


In [0]:
# function to recommend using item_id
def get_recommendation_by_id(item):
  selected_item_a = result[result['item_a'] == item]
  selected_item_b = result[result['item_b'] == item]
  if selected_item_a.shape[0] == 0 and selected_item_b.shape[0] == 0:
    return {'message':'There is no recoomendation for this item'}
  elif selected_item_a.shape[0] != 0 and selected_item_b.shape[0] != 0:
    #items = list(selected_item_a['item_b'])+list(selected_item_b['item_a'])
    selected_item_a = selected_item_a.loc[:,['item_b','support','confidence_a->b','lift']].rename(columns={'item_b':'item_id','confidence_a->b':'confidence'})
    selected_item_b = selected_item_b.loc[:,['item_a','support','confidence_b->a','lift']].rename(columns={'item_a':'item_id','confidence_b->a':'confidence'})
    final = pd.concat([selected_item_a,selected_item_b],axis=0)
    return final
  if selected_item_a.shape[0] != 0:
    return selected_item_a.loc[:,['item_b','support','confidence_a->b','lift']].rename(columns={'item_b':'item_id','confidence_a->b':'confidence'})
  if selected_item_b.shape[0] != 0:
    return selected_item_b.loc[:,['item_a','support','confidence_b->a','lift']].rename(columns={'item_a':'item_id','confidence_b->a':'confidence'})
  

In [0]:
get_recommendation_by_id(21903)

Unnamed: 0,item_id,support,confidence,lift
416836,13457,0.014556,0.001939,0.034432
1504456,9000,0.013343,0.001777,0.028151
387269,6860,0.012130,0.001615,0.028125
23750,12597,0.012737,0.001696,0.027364
8217,40311,0.010311,0.001373,0.025416
461445,32412,0.011524,0.001535,0.025221
207887,30953,0.011524,0.001535,0.024533
292890,20760,0.013950,0.001858,0.024198
522734,2780,0.010311,0.001373,0.023442
19157,33401,0.071569,0.009531,0.022834


In [0]:
product_names[product_names['product_id'] == 21903]

Unnamed: 0,product_id,product_name,aisle_id,department_id
21902,21903,Organic Baby Spinach,123,4


In [0]:
# function to recommend using item_name
def get_recommendation_by_name(item):
  selected_item_a = final_result[final_result['item_a'] == item]
  selected_item_b = final_result[final_result['item_b'] == item]
  if selected_item_a.shape[0] == 0 and selected_item_b.shape[0] == 0:
    return {'message':'There is no recoomendation for this item'}
  elif selected_item_a.shape[0] != 0 and selected_item_b.shape[0] != 0:
    #items = list(selected_item_a['item_b'])+list(selected_item_b['item_a'])
    selected_item_a = selected_item_a.loc[:,['item_b','support','confidence_a->b','lift']].rename(columns={'item_b':'item_name','confidence_a->b':'confidence'})
    selected_item_b = selected_item_b.loc[:,['item_a','support','confidence_b->a','lift']].rename(columns={'item_a':'item_name','confidence_b->a':'confidence'})
    final = pd.concat([selected_item_a,selected_item_b],axis=0)
    return final
  if selected_item_a.shape[0] != 0:
    return selected_item_a.loc[:,['item_b','support','confidence_a->b','lift']].rename(columns={'item_b':'item_name','confidence_a->b':'confidence'})
  if selected_item_b.shape[0] != 0:
    return selected_item_b.loc[:,['item_a','support','confidence_b->a','lift']].rename(columns={'item_a':'item_name','confidence_b->a':'confidence'})
  

In [0]:
name = 'Organic Baby Spinach'
get_recommendation_by_name(name)

Unnamed: 0,item_name,support,confidence,lift
416836,Pad Thai Brown Rice Noodles,0.014556,0.001939,0.034432
1504456,Organic Yellow Zucchini,0.013343,0.001777,0.028151
387269,Organic Balsamic Vinaigrette,0.012130,0.001615,0.028125
23750,Organic Bakery Tortillas Whole Spelt,0.012737,0.001696,0.027364
8217,Organic Mixed Baby Kale Salad,0.010311,0.001373,0.025416
461445,Garbanzo Beans No Salt Added,0.011524,0.001535,0.025221
207887,Sliced Beets No Salt Added,0.011524,0.001535,0.024533
292890,Pork Breakfast Sausage,0.013950,0.001858,0.024198
522734,Diced Red Onions,0.010311,0.001373,0.023442
19157,Goat Cheese Crumbles,0.071569,0.009531,0.022834


In [0]:
#item_a = []
#item_b = []
#freq_c = []
#support = []
c = 0
result_item = pd.DataFrame(columns=['item_a','item_b','freq'])
for i in items:
  if result_item[(result_item['item_a']==i[1])&(result_item['item_b']==i[0])].shape[0] == 0:
    result_item.loc[c,'item_a'] = i[0]
    result_item.loc[c,'item_b'] = i[1]
    result_item.loc[c,'freq'] = items[i]
    c=c+1
  else:
    result_item[(result_item['item_a']==i[1])&(result_item['item_b']==i[0])]['freq'] = result_item[(result_item['item_a']==i[1])&(result_item['item_b']==i[0])]['freq'] + items[i]
  
print(result_item.shape)
result_item.head()

#result_item['item_a'] = item_a
#result_item['item_b'] = item_b
#result_item['freq'] = freq_c
#support.append((items[i]/len(quali_orders))*100)
#result['support'] = support


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.
