In [1]:
import pandas as pd
import numpy as np

import random

import plotly.offline as py
import plotly.graph_objects as go

import networkx as nx
from networkx.readwrite import json_graph
import matplotlib.pyplot as plt
import seaborn as sns
import itertools
import collections

from tqdm import tqdm_notebook as tqdm

import pickle

In [3]:
# import dash_html_components

In [277]:
train_file ='instacart_data/order_products__train.csv'
products_file = 'instacart_data/products.csv'


def _build_segment(orders, segment_order_ids, product_ids, products):
    ''' Helper that builds the named-product dataframe for one basket-size segment
    Args:   orders - dataframe with order_id / product_id columns
            segment_order_ids - the order ids belonging to this segment
            product_ids - the product ids that survived the frequency cutoff
            products - products dataframe with product_id / product_name columns
    Returns: dataframe with order_id, product_id and a simplified product_name
    '''
    segment = orders[orders['order_id'].isin(segment_order_ids)]
    segment = segment[segment['product_id'].isin(product_ids)]
    # Left merge so rows are kept even when a product id has no entry in the products file
    segment = segment.merge(products.loc[:, ['product_id', 'product_name']],
                            how = 'left', on = 'product_id')
    # To simplify what the orders look like, 'Bag of Organic Bananas' becomes just 'Banana'
    segment['product_name'] = segment['product_name'].replace({'Bag of Organic Bananas': 'Banana'})
    # Literal (non-regex) removal of the 'Organic ' prefix/infix
    segment['product_name'] = segment['product_name'].str.replace('Organic ', '', regex = False)
    return segment


def return_dfs(train_file, products_file, train_percent, products_cutoff = 0, orders_q1 = 5,
              orders_q2 = 9):
    ''' Function that returns two dataframes for 2 segments of users based on basket size
    Args:   train_file - the training csv file
            products_file - the products csv file
            train_percent - fraction (0 to 1) of the orders sampled (smaller makes viz possible)
            products_cutoff - only products appearing MORE often than this are included
            orders_q1 - first cutoff point for number of items in a basket
            orders_q2 - second cutoff point for number of items in a basket
    Returns: (orders_small, orders_large)
            orders_small - rows of baskets with at most orders_q1 items
            orders_large - rows of baskets with more than orders_q1 and at most orders_q2 items
            Both frames have the columns order_id, product_id, product_name.
    Raises: ValueError if train_percent is outside [0, 1]
    '''
    if not 0 <= train_percent <= 1:
        raise ValueError('train_percent must be a fraction between 0 and 1')

    orders = pd.read_csv(train_file, usecols = ['order_id', 'product_id'])
    products = pd.read_csv(products_file)

    # Sample whole orders (not rows) so every sampled basket stays complete.
    # random.sample needs a sequence: sets are rejected from Python 3.11 onwards.
    unique_order_ids = orders['order_id'].unique().tolist()
    sample_size = int(len(unique_order_ids) * train_percent)
    sampled_order_ids = random.sample(unique_order_ids, sample_size)
    orders = orders[orders['order_id'].isin(sampled_order_ids)]

    # Keep the products occurring more often than products_cutoff within the sample
    product_counts = orders.groupby('product_id')['order_id'].count()
    product_ids = product_counts[product_counts > products_cutoff].index

    # Basket size is measured before the product filter is applied
    basket_sizes = orders.groupby('order_id')['product_id'].count()
    order_ids_Q1 = basket_sizes[basket_sizes <= orders_q1].index
    order_ids_Q2 = basket_sizes[(basket_sizes > orders_q1) & (basket_sizes <= orders_q2)].index

    orders_small = _build_segment(orders, order_ids_Q1, product_ids, products)
    orders_large = _build_segment(orders, order_ids_Q2, product_ids, products)

    return orders_small, orders_large

In [145]:
def _pair_products(basket_orders):
    ''' Build every unordered pair of products that share a basket
    Args:   basket_orders - dataframe with 'order_id' and 'product_name' columns
    Returns: list of (product_a, product_b) tuples, one per co-occurrence, with the
             two names always in alphabetical order
    '''
    pairs = []
    # Products without a name cannot be graph nodes (and cannot be sorted with strings)
    named_orders = basket_orders.dropna(subset = ['product_name'])
    # One groupby pass instead of re-filtering the whole frame for every order id
    for _, names in tqdm(named_orders.groupby('order_id')['product_name']):
        # Sorting makes (A, B) and (B, A) the same key; set() drops the self-pairs that
        # appear when two products collapse to the same simplified name
        pairs.extend(itertools.combinations(sorted(set(names)), 2))
    return pairs


# Pairwise product combinations for each basket-size segment
paired_products_small = _pair_products(orders_small)
paired_products_large = _pair_products(orders_large)

# Number of baskets in which each pair appears together
counts_small = collections.Counter(paired_products_small)
counts_large = collections.Counter(paired_products_large)

# Keep only the most frequent pairs so the resulting graph stays small enough to plot
food_df_small = pd.DataFrame(counts_small.most_common(1000),
                      columns = ['products', 'counts'])

food_df_large = pd.DataFrame(counts_large.most_common(4000),
                      columns = ['products', 'counts'])

HBox(children=(IntProgress(value=0, max=3875), HTML(value='')))

In [292]:
# d = food_df_small.set_index('products').T.to_dict('records')
# d = food_df_large.set_index('products').T.to_dict('records')


In [293]:
# Undirected graph with one edge per product pair; the pair's count is stored as 'weight'.
# NOTE(review): `d` is only created by the commented-out lines in the previous cell -
# one of them must be run first, otherwise this cell fails on a fresh kernel.
G = nx.Graph()
G.add_weighted_edges_from(
    (pair[0], pair[1], pair_count) for pair, pair_count in d[0].items()
)

nodes = list(G.nodes)
len(nodes)

In [255]:
# Prune the plot so we only have items that are matched with at least two others.
# Removing a node lowers the degree of its neighbours, so a single pass can leave
# behind nodes with one (or zero) connections; repeat until nothing is left to remove.
# Candidates come from the graph itself rather than the notebook-level `nodes` list,
# which other cells rebind, so no lookup can fail and no try/except is needed.
while True:
    sparse_nodes = [node for node, degree in G.degree() if degree <= 1]
    if not sparse_nodes:
        break
    G.remove_nodes_from(sparse_nodes)

nodes = list(G.nodes)
len(nodes)

312

In [2]:
# Restore the graph stored in 'large_graph.pickle'.
# pickle.load can execute arbitrary code: only open pickle files you created yourself.
with open('large_graph.pickle', mode='rb') as graph_file:
    G_init = pickle.load(graph_file)

In [37]:
# Rank the neighbours of 'India Pale Ale' by the 'weight' of the connecting edge.
# The adjacency view is read directly, so no loop variable overwrites the
# notebook-level `nodes` list that the pruning and plotting cells rely on.
weights = {
    neighbour: edge_attrs['weight']
    for neighbour, edge_attrs in G_init['India Pale Ale'].items()
}
# Heaviest edge first; dicts keep insertion order, so the keys come out ranked
weights_sorted = dict(sorted(weights.items(), key=lambda item: item[1], reverse=True))
list(weights_sorted.keys())[0:1000]

['Banana',
 'Brown Ale',
 'Chardonnay',
 'Premium Belgian Lager',
 'Beer',
 'Limes',
 "Mixed 12 Pack Lion's Share Ale",
 'Amber Ale',
 'Sauvignon Blanc',
 'Prosecco Sparkling Wine',
 'Premium Lager Beer',
 'Variety Pack Hard Cider',
 'Belgian White Wheat Ale']

# Build the Word2Vec model

In [16]:
from gensim.models import Word2Vec

In [131]:
# Every product that shares an edge with 'Beer'.
# NOTE(review): G_large is loaded in a cell further down - run that cell first.
[*nx.all_neighbors(G_large, 'Beer')]

['Banana',
 'India Pale Ale',
 'Hass Avocado',
 'Brown Ale',
 'Premium Belgian Lager',
 "Little Sumpin' Sumpin' Ale",
 'Half & Half',
 'Premium Lager Beer',
 'Crisp Hard Cider Crisp Apple',
 'Belgian White Wheat Ale',
 'Belgium Beer',
 'Variety Pack Hard Cider']

In [3]:
# Uncomment to overwrite the cached graph with the current `G`:
# with open('large_graph.pickle', 'wb') as f:
#     pickle.dump(G, f)

# Load the cached graph instead of rebuilding it.
# pickle.load can execute arbitrary code: only open pickle files you created yourself.
with open('large_graph.pickle', mode='rb') as graph_file:
    G_large = pickle.load(graph_file)

In [97]:
# For every node, collect the 'weight' of each incident edge.
# The list follows the node's adjacency order, so weights[node][k] belongs to the
# k-th neighbour of `node`.
# Start from an empty dict: the cell must not depend on (or keep stale entries from)
# a `weights` variable left behind by another cell.
weights = {}
for node in tqdm(G_large.nodes()):
    weights[node] = [edge_attrs['weight'] for edge_attrs in G_large[node].values()]


In [100]:
def random_walk(graph, node, weighted=False, n_steps = 5):
    ''' Function that takes a random walk along a graph
    Args:   graph - undirected networkx graph (any mapping of
                    node -> {neighbour: edge-attribute dict} works)
            node - the node the walk starts from
            weighted - if True, neighbours are drawn proportionally to the edge 'weight'
                       (edges without a 'weight' attribute count as 1); otherwise uniformly
            n_steps - maximum number of steps taken away from the start node
    Returns: list of visited nodes as strings, starting with `node`. It has n_steps + 1
             entries unless the walk reaches a node without neighbours and stops early.
    '''
    local_path = [str(node),]
    current_node = node

    # Take up to n_steps random steps away from the node (can return to the node)
    for _ in range(n_steps):
        adjacency = graph[current_node]
        neighbours = list(adjacency)
        if not neighbours:
            # Dead end (isolated node): stop instead of failing on an empty choice
            break
        if weighted:
            # Weights are read from the graph that was passed in, not from a notebook
            # global, so they always match `neighbours` in length and order
            edge_weights = [adjacency[neighbour].get('weight', 1) for neighbour in neighbours]
            current_node = random.choices(neighbours, weights=edge_weights)[0]
        else:
            current_node = random.choice(neighbours)
        local_path.append(str(current_node))

    return local_path
    

In [108]:
# Seed the stdlib RNG so the same walk corpus (and therefore the same embeddings)
# is regenerated every time this cell runs.
random.seed(42)

# 10 unweighted walks are started from every node; each walk is one training "sentence".
walk_paths_unweighted = []
for node in tqdm(G_large.nodes()):
    walk_paths_unweighted.extend(
        random_walk(G_large, node, weighted=False) for _ in range(10)
    )

HBox(children=(IntProgress(value=0, max=6028), HTML(value='')))

In [102]:
# Word2Vec model configured for skip-gram (sg=1) with 10 negative samples and a
# context window of 4 tokens.
embedder = Word2Vec(
    window=4,
    sg=1,
    negative=10,
    alpha=0.03,
    min_alpha=0.0001,
    seed=42,
)
# Collect the vocabulary from the walks.
# NOTE(review): `walk_paths` is not defined in any visible cell - presumably the
# weighted counterpart of walk_paths_unweighted; confirm before a clean re-run.
embedder.build_vocab(walk_paths, progress_per=2)
# Train the embedder to build the word embeddings
embedder.train(
    walk_paths,
    total_examples=embedder.corpus_count,
    epochs=20,
    report_delay=1,
)

(5649395, 7233600)

In [109]:
# Word2Vec model for the unweighted walks, using the same hyper-parameters as the
# weighted embedder so the two models can be compared.
embedder_unweighted = Word2Vec(window = 4, sg=1, negative=10, alpha=0.03, min_alpha=0.0001, seed=42)
# Build the vocab
embedder_unweighted.build_vocab(walk_paths_unweighted, progress_per=2)
# Train the embedder to build the word embeddings.
# total_examples must be this model's own corpus size: reading it from `embedder`
# made the cell depend on the other model and on both corpora having the same length.
embedder_unweighted.train(walk_paths_unweighted, total_examples=embedder_unweighted.corpus_count,
                          epochs=20, report_delay=1)

(5943178, 7233600)

In [9]:
# The 20 closest products in the weighted embedding space; show the names only.
# NOTE(review): embedder_weighted is loaded in a cell further down - run that cell first.
x_ = embedder_weighted.wv.most_similar("Italian Sausage Pizza", topn=20)
[product for product, _similarity in x_]

['Mint Chip Ice Cream',
 'Baba Ghannouge Eggplant Dip',
 'Blackened Chicken Salad',
 'Dark Chocolate Pumpkin Seed Bark',
 'Cream Top Peach on the Bottom Yogurt',
 'Flat Fillets of Anchovies',
 'Soups Lentil',
 'Cocoa Krispies Cereal',
 'Vanilla Yogurt',
 'All Natural Fresh Buttermilk Ranch Dressing Marinade',
 'Medium Roast Ground Coffee',
 'Condensed Milk, Sweetened, Full Cream',
 'Fat Free Vitamin A & D Milk',
 'Berry Bounty Trail Mix',
 'Spam Original Luncheon Meat',
 'European Style Lightly Salted Butter',
 'Chunky Classic Chicken Noodle Soup',
 'Cream Top Blueberry Yogurt',
 'Milk Chocolate Sea Salt Caramels',
 'Earl Grey Tea']

In [139]:
# The 20 products whose unweighted-walk embeddings are closest to 'Sauvignon Blanc'
embedder_unweighted.wv.most_similar(positive='Sauvignon Blanc', topn=20)

[('California Red Wine', 0.8779231905937195),
 ('Pinot Grigio', 0.8489992618560791),
 ('Pinot Noir Wine', 0.829694390296936),
 ('Chardonnay Wine', 0.769227147102356),
 ('Pinot Noir', 0.7511599659919739),
 ('Prosecco Sparkling Wine', 0.7239189743995667),
 ("Mixed 12 Pack Lion's Share Ale", 0.7162939310073853),
 ('Chardonnay', 0.7140623331069946),
 ('India Pale Ale', 0.7064837217330933),
 ('Amber Ale', 0.6999796032905579),
 ('Variety Pack Hard Cider', 0.6870161294937134),
 ('Premium Belgian Lager', 0.6858783960342407),
 ('Brown Ale', 0.6724100112915039),
 ('Malbec', 0.6694297790527344),
 ('Premium Lager Beer', 0.6670681238174438),
 ('Belgian White Wheat Ale', 0.638666033744812),
 ('Crisp Hard Cider Crisp Apple', 0.6262491941452026),
 ("Little Sumpin' Sumpin' Ale", 0.6190401911735535),
 ('Cabernet Sauvignon', 0.6110880374908447),
 ('Belgium Beer', 0.6095533967018127)]

In [138]:
# Direct graph neighbours of 'Sauvignon Blanc', to compare with the embedding matches
[*nx.all_neighbors(G_large, 'Sauvignon Blanc')]

['Limes',
 'Pinot Noir',
 'Cabernet Sauvignon',
 'Sparkling Mineral Water',
 'Banana',
 'India Pale Ale',
 'Prosecco Sparkling Wine',
 'Half & Half',
 'Sour Cream',
 'Pinot Noir Wine',
 'Chardonnay',
 'Jalapeno Peppers',
 'California Red Wine',
 'Strawberries',
 'Pinot Grigio']

In [38]:
# Uncomment to cache the trained models on disk:
# with open('embedder_unweighted.pickle', 'wb') as f:
#     pickle.dump(embedder_unweighted, f)
    
# with open('embedder_weighted.pickle', 'wb') as f:
#     pickle.dump(embedder, f)

# Load the cached weighted model instead of retraining it.
# pickle.load can execute arbitrary code: only open pickle files you created yourself.
with open('embedder_weighted.pickle', mode='rb') as model_file:
    embedder_weighted = pickle.load(model_file)

In [45]:
# find_ingredient('Cream Cheese', nodes)
# Vector arithmetic query: products close to 'India Pale Ale' + 'Beer' - 'Sauvignon Blanc'
embedder_weighted.wv.most_similar(
    positive=['India Pale Ale', 'Beer'],
    negative=['Sauvignon Blanc'],
)

[("Little Sumpin' Sumpin' Ale", 0.7678930759429932),
 ('Crisp Hard Cider Crisp Apple', 0.7571238279342651),
 ('Belgian White Wheat Ale', 0.7442802786827087),
 ('Belgium Beer', 0.7372687458992004),
 ('Variety Pack Hard Cider', 0.7306048274040222),
 ('Brown Ale', 0.7268184423446655),
 ('Premium Belgian Lager', 0.7055443525314331),
 ('Premium Lager Beer', 0.6921089887619019),
 ('Amber Ale', 0.6911778450012207),
 ("Mixed 12 Pack Lion's Share Ale", 0.6248722076416016)]

In [14]:
# list(weighted_neighbours.keys())
# NOTE(review): traverse_graph is not defined in any visible cell - confirm where it
# comes from before a clean re-run.
shopping_list = traverse_graph(
    item='Boneless Skinless Chicken Breast',
    traversals=20,
    random_=True,
)
shopping_list

['Boneless Skinless Chicken Breast',
 'Organic Garlic',
 'Sparkling Water Grapefruit',
 'Strawberries',
 '2% Reduced Fat Organic Milk',
 'Organic Baby Spinach',
 'Organic Red Chard Greens',
 'Organic Lacinato (Dinosaur) Kale',
 'Organic Blueberries',
 'Organic Spring Mix',
 'Organic Whole Milk',
 'Sea Salt Pita Chips',
 'Organic Avocado',
 'Organic Bread with 21 Whole Grains',
 'Banana',
 'Raspberries',
 'Organic Half & Half',
 'Organic 2% Reduced Fat Milk',
 'Organic Fuji Apple',
 'No Salt Added Black Beans',
 'Boneless Skinless Chicken Breasts']

# Using Plotly to make the graph interactive
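The code below draws the product graph as an interactive network plot with Plotly: each node is a product and each edge links two products that were bought together.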

In [272]:
# Node list and 2-D coordinates used by the plotting cells below.
# spring_layout returns a mapping of node -> (x, y) position.
nodes = list(G.nodes())
pos = nx.spring_layout(G)
len(nodes)

306

In [273]:
# --- Edges: one line segment per edge; None breaks the line between segments ---
edge_x = []
edge_y = []
for source, target in G.edges():
    x0, y0 = pos[source]
    x1, y1 = pos[target]
    edge_x.extend([x0, x1, None])
    edge_y.extend([y0, y1, None])

edge_trace = go.Scatter(
    x=edge_x, y=edge_y,
    line=dict(width=0.5, color='#888'),
    hoverinfo='none',
    mode='lines')

# --- Nodes: labels, coordinates and degree all follow the graph's own node order ---
# Labels are taken from G here, not from the notebook-level `nodes` list, which other
# cells rebind; that keeps every label attached to the right marker.
node_labels = list(G.nodes())
node_x = [pos[node][0] for node in node_labels]
node_y = [pos[node][1] for node in node_labels]

# Number of neighbours per node, used for the marker colour and the hover text
node_adjacencies = [len(neighbours) for _, neighbours in G.adjacency()]
node_text = [f'{n_connections} connections' for n_connections in node_adjacencies]

node_trace = go.Scatter(
    x=node_x, y=node_y,
    mode='markers+text',
    hoverinfo='text',
    text=node_labels,
    hovertext=node_text,
    textfont=dict(
        family="sans serif",
        size=10
    ),
    marker=dict(
        showscale=True,
        colorscale='YlGnBu',
        reversescale=True,
        color=node_adjacencies,
        size=10,
        colorbar=dict(
            thickness=10,
            # title.side replaces the deprecated `titleside` attribute
            title=dict(text='Node Connections', side='right'),
            xanchor='left'
        ),
        line_width=2))



In [274]:
# Assemble the figure: edges are drawn first so the node markers sit on top of them.
fig = go.Figure(
    data=[edge_trace, node_trace],
    layout=go.Layout(
        # title.font.size replaces the deprecated `titlefont_size` shortcut
        title=dict(text='<br>Graph of shopping cart items', font=dict(size=16)),
        showlegend=False,
        hovermode='closest',
        margin=dict(b=20, l=5, r=5, t=40),
        # Caption anchored to the bottom-left corner of the plotting area
        annotations=[dict(
            text="some text",
            showarrow=False,
            xref="paper", yref="paper",
            x=0.005, y=-0.002)],
        # Coordinates come from a layout algorithm, so the axes are hidden
        xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
        yaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
    ),
)

fig.show()

In [14]:
# Order the phrases from shortest to longest (by character count) and show the result
a_list = sorted(
    ["the longest one ever", "first", "longer", "even longer still"],
    key=len,
)
print(a_list)

['first', 'longer', 'even longer still', 'the longest one ever']
