# Product Recommendation System

## Concept
We want to build a model that will learn from previous transactions to assess the relationship between items. The model will then be available to recommend items to new incomplete transactions.

In [135]:
import pandas as pd
import numpy as np

In [136]:
df = pd.read_csv("groceries.csv")

In [137]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9835 entries, 0 to 9834
Data columns (total 33 columns):
Item(s)    9835 non-null int64
Item 1     9835 non-null object
Item 2     7676 non-null object
Item 3     6033 non-null object
Item 4     4734 non-null object
Item 5     3729 non-null object
Item 6     2874 non-null object
Item 7     2229 non-null object
Item 8     1684 non-null object
Item 9     1246 non-null object
Item 10    896 non-null object
Item 11    650 non-null object
Item 12    468 non-null object
Item 13    351 non-null object
Item 14    273 non-null object
Item 15    196 non-null object
Item 16    141 non-null object
Item 17    95 non-null object
Item 18    66 non-null object
Item 19    52 non-null object
Item 20    38 non-null object
Item 21    29 non-null object
Item 22    18 non-null object
Item 23    14 non-null object
Item 24    8 non-null object
Item 25    7 non-null object
Item 26    7 non-null object
Item 27    6 non-null object
Item 28    5 non-null object
It

In [138]:
df.head()

Unnamed: 0,Item(s),Item 1,Item 2,Item 3,Item 4,Item 5,Item 6,Item 7,Item 8,Item 9,...,Item 23,Item 24,Item 25,Item 26,Item 27,Item 28,Item 29,Item 30,Item 31,Item 32
0,4,citrus fruit,semi-finished bread,margarine,ready soups,,,,,,...,,,,,,,,,,
1,3,tropical fruit,yogurt,coffee,,,,,,,...,,,,,,,,,,
2,1,whole milk,,,,,,,,,...,,,,,,,,,,
3,4,pip fruit,yogurt,cream cheese,meat spreads,,,,,,...,,,,,,,,,,
4,4,other vegetables,whole milk,condensed milk,long life bakery product,,,,,,...,,,,,,,,,,


### Data Cleaning
For our model to learn properly, we should format our data.
We will convert the "Item n" columns to boolean features: one for each item.

In [139]:
def translate_to_array(row):
    columns = ["Item " + str(i) for i in range(1, 33)]
    return [row[column] for column in columns if type(row[column]) != float]

In [140]:
df["items"] = df.apply(translate_to_array, axis=1)

In [141]:
df.head()

Unnamed: 0,Item(s),Item 1,Item 2,Item 3,Item 4,Item 5,Item 6,Item 7,Item 8,Item 9,...,Item 24,Item 25,Item 26,Item 27,Item 28,Item 29,Item 30,Item 31,Item 32,items
0,4,citrus fruit,semi-finished bread,margarine,ready soups,,,,,,...,,,,,,,,,,"[citrus fruit, semi-finished bread, margarine,..."
1,3,tropical fruit,yogurt,coffee,,,,,,,...,,,,,,,,,,"[tropical fruit, yogurt, coffee]"
2,1,whole milk,,,,,,,,,...,,,,,,,,,,[whole milk]
3,4,pip fruit,yogurt,cream cheese,meat spreads,,,,,,...,,,,,,,,,,"[pip fruit, yogurt, cream cheese, meat spreads]"
4,4,other vegetables,whole milk,condensed milk,long life bakery product,,,,,,...,,,,,,,,,,"[other vegetables, whole milk, condensed milk,..."


In [142]:
def get_unique_items(df):
    transactions = list(df["items"])
    items = []
    for t in transactions:
        for item in t:
            items.append(item)
    return list(set(items))

In [143]:
items = get_unique_items(df)

In [144]:
def categorize_items(df, items):
    for item in items:
        df[item] = df["items"].apply(lambda transaction: int(item in transaction))

In [145]:
categorize_items(df, items)

In [146]:
df.head()

Unnamed: 0,Item(s),Item 1,Item 2,Item 3,Item 4,Item 5,Item 6,Item 7,Item 8,Item 9,...,organic products,nuts/prunes,rum,baking powder,beverages,pasta,ketchup,sliced cheese,organic sausage,misc. beverages
0,4,citrus fruit,semi-finished bread,margarine,ready soups,,,,,,...,0,0,0,0,0,0,0,0,0,0
1,3,tropical fruit,yogurt,coffee,,,,,,,...,0,0,0,0,0,0,0,0,0,0
2,1,whole milk,,,,,,,,,...,0,0,0,0,0,0,0,0,0,0
3,4,pip fruit,yogurt,cream cheese,meat spreads,,,,,,...,0,0,0,0,0,0,0,0,0,0
4,4,other vegetables,whole milk,condensed milk,long life bakery product,,,,,,...,0,0,0,0,0,0,0,0,0,0


In [147]:
def drop_previous_item_columns(df):
    columns = ["Item " + str(i) for i in range(1, 33)]
    return df.drop(columns=columns)

In [148]:
df = drop_previous_item_columns(df)

In [149]:
df.head(6)

Unnamed: 0,Item(s),items,flour,sweet spreads,dish cleaner,salt,chocolate,frankfurter,brandy,margarine,...,organic products,nuts/prunes,rum,baking powder,beverages,pasta,ketchup,sliced cheese,organic sausage,misc. beverages
0,4,"[citrus fruit, semi-finished bread, margarine,...",0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,3,"[tropical fruit, yogurt, coffee]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,[whole milk],0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,"[pip fruit, yogurt, cream cheese, meat spreads]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4,"[other vegetables, whole milk, condensed milk,...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,5,"[whole milk, butter, yogurt, rice, abrasive cl...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Output of our model
Client A
T1.[onions, salt]
T2.[beer]
T3.[salt, waffles] -> next time, the client will probably want x

What is our goal? 1. Make him stay in our shop 2.Promotion 3.Suggesting what he needs the most to add to his physical shopping list=convenience 4.Could use a pricing model to find the best promotion

We promote new items as well as items he is used to buying

How to find which items to promote?
* we can look into what similar client buy (k-neareast-neighbors)
* we can look into what this client tends to buy (probabilistic, Naive bayes?)

### Reducing Bias Towards Common Items
Our model will inherently be biased towards common items. We will therefore prioritize rare items to balance it.

In [150]:
def get_occurences(item, df):
    return df[item].sum()
    
item_occurences = {item: get_occurences(item, df) for item in items}
max_occurences = max(item_occurences.values())
item_lift_factors = {}
for item in item_occurences.keys():
    item_lift_factors[item] = max_occurences/item_occurences[item]
item_lift_factors

{'flour': 14.695906432748538,
 'sweet spreads': 28.235955056179776,
 'dish cleaner': 24.398058252427184,
 'salt': 23.70754716981132,
 'chocolate': 5.149590163934426,
 'frankfurter': 4.332758620689655,
 'brandy': 61.292682926829265,
 'margarine': 4.362847222222222,
 'processed cheese': 15.417177914110429,
 'decalcifier': 167.53333333333333,
 'snack products': 83.76666666666667,
 'cereals': 44.875,
 'liquor': 23.05504587155963,
 'soap': 96.65384615384616,
 'turkey': 31.4125,
 'spread cheese': 22.845454545454544,
 'pip fruit': 3.377688172043011,
 'chocolate marshmallow': 28.235955056179776,
 'popcorn': 35.394366197183096,
 'herbs': 15.70625,
 'abrasive cleaner': 71.8,
 'hard cheese': 10.427385892116183,
 'baby cosmetics': 418.8333333333333,
 'seasonal products': 17.95,
 'bathroom cleaner': 93.07407407407408,
 'cleaner': 50.26,
 'rubbing alcohol': 251.3,
 'potted plants': 14.782352941176471,
 'specialty vegetables': 147.8235294117647,
 'artif. sweetener': 78.53125,
 'chewing gum': 12.14009

In [151]:
max_occurences

2513

In [152]:
# Returns the proportion of unique items that are in both transactions
def get_transaction_similarity(transaction_1, transaction_2):
    n_unique_items = len(set(transaction_1).union(set(transaction_2)))
    n_common_items = len(set(transaction_1).intersection(set(transaction_2)))
    return n_common_items / n_unique_items

In [255]:
def row_number_to_transaction(row_number, df):
    output = df[df.index == row_number]["items"].iloc[0]
    return output

# Sorts a dictionary by its values
def sort_dictio(d, descending=True):
    return {k: v for k, v in sorted(d.items(), key=lambda item: item[1], reverse = descending)}

In [154]:
# To be usable, a transaction must not be a subset of the current transaction
def transaction_is_usable(current_tr, tr, transaction_size_factor_threshold):
    is_not_subset = len(set(tr).difference(set(current_transaction))) > 0
#     if is_not_subset:
#         return transaction_size_factor_threshold < abs(len(current_tr) - len(tr)) / len(current_tr) < 1
#     return False
    return is_not_subset

# Returns every transaction that is usable, as defined in the previous method
def get_usable_transactions(df, current_transaction, transaction_size_factor_threshold):
    usable_transactions = {}
    for index, row in df.iterrows():
        tr = row["items"]
        if transaction_is_usable(current_transaction, tr, transaction_size_factor_threshold):
            usable_transactions[index] = tr
    return usable_transactions

# Returns a dictionary associating each transaction to its similarity with the current transaction
def get_similarity_per_transaction(current_transaction, usable_transactions):
    similarity_per_transaction = {}
    for row_number, u_tr in usable_transactions.items():
        similarity_per_transaction[row_number] = get_transaction_similarity(current_transaction, u_tr)
    similarity_per_transaction = sort_dictio(similarity_per_transaction)
    return similarity_per_transaction

def select_k_most_similar(similarity_per_transaction, k):
    k_nearest_neighbors = {}
    for k, v in tuple(similarity_per_transaction.items())[:k]:
        k_nearest_neighbors[k] = v
    return k_nearest_neighbors 

# Returns a dictionary containing the k most similar transactions as
# {transaction_row_number: similarity_factor}
def select_k_nearest_transactions(df, k, current_transaction, transaction_size_factor_threshold):
    usable_transactions = get_usable_transactions(df, current_transaction, transaction_size_factor_threshold)    
    similarity_per_transaction = get_similarity_per_transaction(current_transaction, usable_transactions)
    k_nearest_neighbors = select_k_most_similar(similarity_per_transaction, k)
    return k_nearest_neighbors

In [155]:
current_transaction = list(df["items"].iloc[10])
print(current_transaction)
current_transaction = current_transaction[:-1]
k_nearest_neighbors_dict = select_k_nearest_transactions(df, 10, current_transaction, .3)
k_nearest_neighbors_dict

neighbor_transactions = [row_number_to_transaction(row_number, df) for row_number in k_nearest_neighbors_dict.keys()]
neighbor_transactions

['tropical fruit', 'other vegetables', 'white bread', 'bottled water', 'chocolate']


[['tropical fruit',
  'other vegetables',
  'white bread',
  'bottled water',
  'chocolate'],
 ['tropical fruit', 'other vegetables', 'whipped/sour cream', 'white bread'],
 ['tropical fruit',
  'other vegetables',
  'yogurt',
  'bottled water',
  'male cosmetics'],
 ['tropical fruit',
  'other vegetables',
  'bottled water',
  'bottled beer',
  'napkins'],
 ['tropical fruit',
  'other vegetables',
  'whole milk',
  'mustard',
  'bottled water'],
 ['tropical fruit',
  'other vegetables',
  'whole milk',
  'dessert',
  'white bread',
  'bottled water',
  'misc. beverages',
  'cake bar'],
 ['tropical fruit',
  'other vegetables',
  'packaged fruit/vegetables',
  'butter',
  'bottled water'],
 ['tropical fruit',
  'berries',
  'root vegetables',
  'other vegetables',
  'bottled water',
  'shopping bags'],
 ['tropical fruit',
  'other vegetables',
  'rolls/buns',
  'white bread',
  'bottled beer',
  'potted plants'],
 ['tropical fruit',
  'pip fruit',
  'root vegetables',
  'other vegetable

In [236]:
import random

def recommend_from_neighbors(current_transaction, k_nearest_neighbors_dict, df, apply_lift=False, debug=False):
    neighbor_transactions = [row_number_to_transaction(row_number, df) for row_number in k_nearest_neighbors_dict.keys()]
    flatten_items = [item for transaction in neighbor_transactions for item in transaction]
#     if len(flatten_items) == 0 or type(flatten_items[0]) == list:
#         return np.nan
    if debug:
        print(neighbor_transactions)
        print(flatten_items)
    
    # sorting new items by occurences
    item_counts = {}
    for item in set(flatten_items):
        if item not in current_transaction:
            # count number of occurences
            item_counts[item] = flatten_items.count(item)
    if len(item_counts) == 0:
        return np.nan
            
    if debug:
        print(item_counts)
            
    # implementing "lift"
    if apply_lift:
        for item, occurences in item_counts.items():
            item_counts[item] *= item_lift_factors[item]
        return [item for item in item_counts.keys() if item_counts[item] == max(item_counts.values())]
     
    # without lift, returns one of the items that have the most occurences
    max_occurences = max(item_counts.values())
    return random.choice([item for item in item_counts.keys() if item_counts[item] == max_occurences])

In [157]:
recommend_from_neighbors(current_transaction, k_nearest_neighbors_dict, df)

'whole milk'

In [229]:
def recommend(current_transaction, df, k, transaction_size_factor_threshold=.3, debug=False):
    if len(current_transaction) == 0:
        return np.nan
    k_nearest_neighbors_dict = select_k_nearest_transactions(df, k, current_transaction, transaction_size_factor_threshold)
    if debug:
        print("Current transaction: " + str(current_transaction))
        print(k_nearest_neighbors_dict)
        
    if len(k_nearest_neighbors_dict) == 0:
        return np.nan
    return recommend_from_neighbors(current_transaction, k_nearest_neighbors_dict, df, debug=debug)

In [159]:
current_transaction = list(df["items"].iloc[11])
k = 10
transaction_size_factor_threshold = .3
recommend(current_transaction, df, k, transaction_size_factor_threshold, True)

Current transaction: ['citrus fruit', 'tropical fruit', 'whole milk', 'butter', 'curd', 'yogurt', 'flour', 'bottled water', 'dishes']
{3692: 0.5, 4096: 0.5, 6974: 0.5, 2489: 0.45454545454545453, 2550: 0.45454545454545453, 7120: 0.45454545454545453, 8756: 0.42857142857142855, 7816: 0.4166666666666667, 707: 0.4, 740: 0.4}
{'whipped/sour cream': 2, 'root vegetables': 2, 'shopping bags': 1, 'zwieback': 1, 'pudding powder': 1, 'frankfurter': 1, 'domestic eggs': 1, 'coffee': 1, 'rolls/buns': 2, 'spread cheese': 1, 'bottled beer': 1, 'pastry': 1, 'pork': 1, 'soda': 1, 'onions': 1, 'semi-finished bread': 1, 'jam': 1, 'other vegetables': 1, 'sliced cheese': 1, 'soft cheese': 1}


'whipped/sour cream'

## Testing

In [185]:
def recommend_row(row, df, k, transaction_size_factor_threshold=.3):
#     items = [item for item in row["items"] if item not in [random.choice(row["items"])]]
    items = row["items"][:-1]
    return recommend(items, df, k, transaction_size_factor_threshold)

# test_sample = df.sample(n=30, random_state=1)
test_sample = df.loc[:200, :].copy()
test_sample["recommendation"] = test_sample.apply(lambda row: recommend_row(row, test_sample, k, transaction_size_factor_threshold), axis=1)

In [161]:
test_sample["recommendation_is_accurate"] = test_sample.apply(lambda row: row["recommendation"] in row["items"], axis=1)
test_sample[["items", "recommendation", "recommendation_is_accurate"]]

Unnamed: 0,items,recommendation,recommendation_is_accurate
0,"[citrus fruit, semi-finished bread, margarine,...",curd,False
1,"[tropical fruit, yogurt, coffee]",coffee,True
2,[whole milk],,False
3,"[pip fruit, yogurt, cream cheese, meat spreads]",tropical fruit,False
4,"[other vegetables, whole milk, condensed milk,...",root vegetables,False
...,...,...,...
196,[canned beer],,False
197,"[pork, beef, pip fruit, herbs, spices]",whipped/sour cream,False
198,"[frankfurter, citrus fruit, UHT-milk, margarin...",whole milk,False
199,"[sausage, bottled beer, liquor (appetizer)]",rolls/buns,False


In [162]:
accuracy = test_sample["recommendation_is_accurate"].sum() / len(test_sample)
str(accuracy * 100) + "%"

'9.950248756218906%'

In [164]:
def get_accuracy(df, sample_size, k, transaction_size_factor_threshold):
    test_sample = df.loc[:sample_size, :].copy()
    test_sample["recommendation"] = test_sample.apply(lambda row: recommend_row(row, test_sample, k, transaction_size_factor_threshold), axis=1)
    test_sample["recommendation_is_accurate"] = test_sample.apply(lambda row: row["recommendation"] in row["items"], axis=1)
    accuracy = test_sample["recommendation_is_accurate"].sum() / len(test_sample)
    return accuracy

# Returns a dictionary giving the accuracy obtained for each k value
# For each k value, the accuracy is tested [repeats] times to get an average
def optimize_hyperparameters(df, k_values, sample_size=50, repeats=10):
    accuracy_per_k = {}
    for j, k in enumerate(k_values):
        accuracies = []
        for i in range(repeats):
            accuracy = get_accuracy(df, sample_size, k, .1)
            accuracies.append(accuracy)
        accuracy_per_k[k] = np.mean(accuracies)
        print(str(round((j + 1) / len(k_values) * 100, 1)) + "% done")
    return accuracy_per_k

In [165]:
# accuracy_per_k = optimize_hyperparameters(df, [k for k in range(1, 11)], 300)
accuracy_per_k

{1: 0.6013289036544851,
 2: 0.3803986710963455,
 3: 0.24651162790697678,
 4: 0.19966777408637876,
 5: 0.17906976744186048,
 6: 0.17176079734219268,
 7: 0.14285714285714288,
 8: 0.12890365448504984,
 9: 0.0983388704318937,
 10: 0.0893687707641196}

In [None]:
recommend(["pip fruit", "yogurt", "cream cheese", "meat spreads"], df, k, transaction_size_factor_threshold)

In [166]:
def generate_random_transaction_of_size(size, items):
    transaction = []
    for i in range(size):
        it = random.choice(items)
        while it in transaction:
            it = random.choice(items)
        transaction.append(it)
    return transaction

def generate_random_transaction(items):
    size = random.choice([1, 2, 3, 4, 5])
    return generate_random_transaction_of_size(size, items)

def generate_random_transactions_of_size(n, size, items):
    return [generate_random_transaction_of_size(size, items) for i in range(n)]

def generate_random_transactions(n, items):
    return [generate_random_transaction(items) for i in range(n)]

In [None]:
generate_random_transaction(items)

In [288]:
def get_accuracy_from_generated(tr_size, df, k, n_tr, debug=False):
    baskets = generate_random_transactions_of_size(n_tr, tr_size+1, items)
    baskets_one_missing = [basket[:-1] for basket in baskets]
    recommendations = [recommend(basket, df, k, debug=debug) for basket in baskets_one_missing]
#     recommendations = [basket[-1] for basket in baskets]

    result_df = pd.DataFrame({"basket": baskets, "baskets_one_missing": baskets_one_missing, "recommendation": recommendations})
    accuracy = (result_df.apply(lambda row: row["recommendation"] in row["basket"], axis=1)).sum() / len(baskets)
    return accuracy, result_df

In [289]:
accuracy, result_df = get_accuracy_from_generated(1, df, 1, 10)
print(accuracy)
result_df

0.0


Unnamed: 0,basket,baskets_one_missing,recommendation
0,"[frozen potato products, coffee]",[frozen potato products],
1,"[soap, toilet cleaner]",[soap],
2,"[canned beer, dishes]",[canned beer],
3,"[organic sausage, ice cream]",[organic sausage],rolls/buns
4,"[dog food, baby cosmetics]",[dog food],
5,"[cocoa drinks, specialty cheese]",[cocoa drinks],
6,"[yogurt, grapes]",[yogurt],berries
7,"[curd, popcorn]",[curd],rolls/buns
8,"[nut snack, cookware]",[nut snack],
9,"[mustard, candy]",[mustard],


In [293]:
tr_sizes = {}
for size in range(1, 33):
    tr_sizes[size] = (df["items"].apply(len) == size).sum() / len(df)
tr_sizes

{1: 0.21952211489578038,
 2: 0.16705643111337062,
 3: 0.13207930859176412,
 4: 0.10218607015760041,
 5: 0.08693441789527198,
 6: 0.06558210472801221,
 7: 0.05541433655312659,
 8: 0.04453482460599898,
 9: 0.03558718861209965,
 10: 0.025012709710218607,
 11: 0.018505338078291814,
 12: 0.011896288764616167,
 13: 0.007930859176410779,
 14: 0.007829181494661922,
 15: 0.005592272496187087,
 16: 0.004677173360447382,
 17: 0.0029486527707168276,
 18: 0.0014234875444839859,
 19: 0.0014234875444839859,
 20: 0.0009150991357397051,
 21: 0.0011184544992374173,
 22: 0.0004067107269954245,
 23: 0.0006100660904931368,
 24: 0.00010167768174885612,
 25: 0.0,
 26: 0.00010167768174885612,
 27: 0.00010167768174885612,
 28: 0.00010167768174885612,
 29: 0.0003050330452465684,
 30: 0.0,
 31: 0.0,
 32: 0.00010167768174885612}

In [206]:
def recommend_train_test(train, test, k):
    test["recommendation"] = test.apply(lambda row: recommend_row(row, train, k), axis=1)
    test["recommendation_is_accurate"] = test_sample.apply(lambda row: row["recommendation"] in row["items"], axis=1)
    accuracy = test["recommendation_is_accurate"].sum() / len(test)
    return accuracy

# Returns a train and a test set, 
def get_subsets(k, test_fold_index, df):
    fold_size = round(len(df) / k)
    low_index = (test_fold_index-1) * fold_size
    high_index = test_fold_index * fold_size + 1
    test = df.iloc[low_index:high_index, :].copy()
    train = df[~df.isin(test)].copy().dropna()
    return train, test

def fold_validate(k, test_fold_index, df):
    train, test = get_subsets(k, test_fold_index, df)
    accuracy = recommend_train_test(train, test, k)
    return accuracy

def k_fold_cross_validate(k, df):
    accuracies = []
    for i in range(1, k + 1):
        accuracy = fold_validate(k, i, df)
        accuracies.append(accuracy)
    return accuracies, np.mean(accuracies)

In [183]:
train, test = get_subsets(10, 8, df)

In [184]:
len(train), len(test)


(8850, 985)

In [204]:
fold_validate(10, 1, df.iloc[:500, :])

0.0196078431372549

In [223]:
k_fold_cross_validate(10, df.iloc[:150, :])

([0.0, 0.0625, 0.0, 0.1875, 0.0625, 0.125, 0.0625, 0.0, 0.1875, 0.2], 0.08875)

In [103]:
fold_validate(10, 10, [i for i in range(1, 101)])

[91, 92, 93, 94, 95, 96, 97, 98, 99, 100]

In [189]:
df.head()
df[df.index == 0]

Unnamed: 0,Item(s),items,flour,sweet spreads,dish cleaner,salt,chocolate,frankfurter,brandy,margarine,...,organic products,nuts/prunes,rum,baking powder,beverages,pasta,ketchup,sliced cheese,organic sausage,misc. beverages
0,4,"[citrus fruit, semi-finished bread, margarine,...",0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
