In [191]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import pandas as pd
import numpy as np

In [192]:
df = pd.read_csv("groceries.csv")

In [193]:
df.head()

Unnamed: 0,Item(s),Item 1,Item 2,Item 3,Item 4,Item 5,Item 6,Item 7,Item 8,Item 9,...,Item 23,Item 24,Item 25,Item 26,Item 27,Item 28,Item 29,Item 30,Item 31,Item 32
0,4,citrus fruit,semi-finished bread,margarine,ready soups,,,,,,...,,,,,,,,,,
1,3,tropical fruit,yogurt,coffee,,,,,,,...,,,,,,,,,,
2,1,whole milk,,,,,,,,,...,,,,,,,,,,
3,4,pip fruit,yogurt,cream cheese,meat spreads,,,,,,...,,,,,,,,,,
4,4,other vegetables,whole milk,condensed milk,long life bakery product,,,,,,...,,,,,,,,,,


In [194]:
# # A B C:
# A B -> C
# A C -> B
# B C -> A

In [195]:
def translate_to_array(row):
    columns = ["Item " + str(i) for i in range(1, 33)]
    return list(set([row[column] for column in columns if type(row[column]) != float]))

df["basket"] = df.apply(translate_to_array, axis=1)

In [196]:
def get_unique_items(df):
    transactions = list(df["basket"])
    items = []
    for t in transactions:
        for item in t:
            items.append(item)
    return list(set(items))

In [197]:
items = get_unique_items(df)

In [198]:
def get_dummies_of_list_feature(df, list_feature, items):
    for item in items:
        df[item] = list_feature.apply(lambda l: int(item in l))

In [199]:
get_dummies_of_list_feature(df, df["basket"], items)

In [200]:
def drop_previous_item_columns(df):
    columns = ["Item " + str(i) for i in range(1, 33)]
    return df.drop(columns=columns)

In [201]:
df = drop_previous_item_columns(df)

In [202]:
df.head()

Unnamed: 0,Item(s),basket,sauces,baking powder,tidbits,cereals,cake bar,meat spreads,citrus fruit,cookware,...,turkey,syrup,chocolate marshmallow,frozen fruits,UHT-milk,baby cosmetics,roll products,baby food,dessert,canned vegetables
0,4,"[ready soups, margarine, semi-finished bread, ...",0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,3,"[yogurt, tropical fruit, coffee]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,[whole milk],0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,"[pip fruit, meat spreads, yogurt, cream cheese]",0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4,"[whole milk, long life bakery product, other v...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [203]:
# returns a list or tuples that are rules
# [A, B] -> C
()
def get_rules(basket):
    if len(basket) < 2:
        return []
    rules = []
    for i in range(len(basket)):
        to_recommend = basket[i]
        premise = basket[:i] + basket[i+1:]
        rules.append((premise, to_recommend))
    return rules

In [204]:
# returns a DataFrame containing all the rules infered from baskets
def get_rules_df(df):
    rules_per_row = df["basket"].apply(get_rules)
    premises = [rule[0] for rules in rules_per_row for rule in rules]
    recommendations = [rule[1] for rules in rules_per_row for rule in rules]
    rules_df = pd.DataFrame({
        "premise": premises,
        "recommendation": recommendations
    })
    get_dummies_of_list_feature(rules_df, rules_df["premise"], items)
    return rules_df.drop(columns=["premise"])

In [205]:
rules_df = get_rules_df(df)
rules_df

Unnamed: 0,recommendation,sauces,baking powder,tidbits,cereals,cake bar,meat spreads,citrus fruit,cookware,yogurt,...,turkey,syrup,chocolate marshmallow,frozen fruits,UHT-milk,baby cosmetics,roll products,baby food,dessert,canned vegetables
0,ready soups,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,margarine,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,semi-finished bread,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,citrus fruit,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,yogurt,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41203,shopping bags,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
41204,tropical fruit,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
41205,chicken,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
41206,vinegar,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Model

In [206]:
def remove_rare_occurrences(df, feature, min_occurrences):
    occurrences = df[feature].value_counts()
    for item in df[feature].unique():
        if occurrences[item] < min_occurrences:
            df = df[df[feature] != item]
    return df

In [207]:
# rules_df = remove_rare_occurrences(rules_df, "recommendation", 50)
# len(rules_df)

In [215]:
from sklearn.model_selection import train_test_split

def get_train_test_split(df):
    target = "recommendation"
    X = df.drop([target], axis=1)
    y = df[target]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=1)
    return X_train, X_test, y_train, y_test 

In [171]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression()
scores = cross_val_score(lr, X_train, y_train, cv=10)
accuracy = scores.mean()
print(accuracy) # accuracy is measured as R²: coefficient of determination [0:1]



0.1049285549060474


In [209]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

def get_best_model(X_train, y_train, models): 
    for model in models:
        print(model['name'])
        print('-'*len(model['name']))

        grid = GridSearchCV(model["estimator"],
                            param_grid=model["hyperparameters"],
                            cv=10)
        grid.fit(X_train, y_train)
        model["best_params"] = grid.best_params_
        model["best_score"] = grid.best_score_
        model["best_model"] = grid.best_estimator_

        print("Best Score: {}".format(model["best_score"]))
        print("Best Parameters: {}\n".format(model["best_params"]))

    return models

In [216]:
sub_df = rules_df[:10000].copy()
sub_df = remove_rare_occurrences(sub_df, "recommendation", 50)
print(sub_df.shape[0])
X_train, X_test, y_train, y_test = get_train_test_split(sub_df)

8206


In [219]:
# Dictionary telling the function which model types to use and which hyperparameter combinations to try
# n_jobs = -1 tells the model that it can be trained using all cores of the CPU, reducing computing time
models = [
    {
        "name": "LogisticRegression",
        "estimator": LogisticRegression(),
        "hyperparameters":
            {
                "n_jobs": [-1]
            }
    },
    {
        "name": "KNeighborsClassifier",
        "estimator": KNeighborsClassifier(),
        "hyperparameters":
            {
                "n_neighbors": range(1, 20),
                "weights": ["distance", "uniform"],
                "algorithm": ["ball_tree", "kd_tree", "brute"],
                "p": [1,2],
                "n_jobs": [-1],
                "solver": ["sag"]
            }
    },
    {
        "name": "DecisionTreeClassifier",
        "estimator": DecisionTreeClassifier(),
        "hyperparameters":
        {
                "max_depth": [2, 5, 10, 20, 40, 80],
                "max_features": ["log2", "sqrt"],
                "min_samples_leaf": range(1, 6),
                "min_samples_split": [2, 3, 4, 5, 8, 12, 16]
        }
    },
    {
        "name": "RandomForestRegressor",
        "estimator": RandomForestClassifier(random_state=1),
        "hyperparameters":
            {
                "n_estimators": [100],
                "max_depth": [2, 5, 10, 20, 40, 80],
                "max_features": ["log2", "sqrt"],
                "min_samples_leaf": range(1, 6),
                "min_samples_split": [2, 3, 4, 5, 8, 12, 16],
                "n_jobs": [-1]
            }
    }
]

models = get_best_model(X_train, y_train, models)

LogisticRegression
------------------


  " = {}.".format(effective_n_jobs(self.n_jobs)))
  " = {}.".format(effective_n_jobs(self.n_jobs)))
  " = {}.".format(effective_n_jobs(self.n_jobs)))
  " = {}.".format(effective_n_jobs(self.n_jobs)))
  " = {}.".format(effective_n_jobs(self.n_jobs)))
  " = {}.".format(effective_n_jobs(self.n_jobs)))
  " = {}.".format(effective_n_jobs(self.n_jobs)))
  " = {}.".format(effective_n_jobs(self.n_jobs)))
  " = {}.".format(effective_n_jobs(self.n_jobs)))
  " = {}.".format(effective_n_jobs(self.n_jobs)))
  " = {}.".format(effective_n_jobs(self.n_jobs)))


Best Score: 0.12583790371724557
Best Parameters: {'n_jobs': -1}

KNeighborsClassifier
--------------------




Best Score: 0.059567336989640465
Best Parameters: {'algorithm': 'kd_tree', 'n_jobs': -1, 'n_neighbors': 18, 'p': 1, 'weights': 'uniform'}

DecisionTreeClassifier
----------------------




Best Score: 0.0888177940280317
Best Parameters: {'max_depth': 5, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 4}

RandomForestRegressor
---------------------




Best Score: 0.11090798293723339
Best Parameters: {'max_depth': 80, 'max_features': 'sqrt', 'min_samples_leaf': 3, 'min_samples_split': 16, 'n_estimators': 100, 'n_jobs': -1}

