In [7]:
import warnings
warnings.filterwarnings('ignore') # Import warnings to avoid SSL cert error

import ast, sys, string, re, os, json # from python standard library
from ast import literal_eval
from collections import OrderedDict

import pandas as pd # data formatting and cleaning
import numpy as np

In [8]:
# Load the cleaned transaction data. The path is relative, so the CSV is looked up
# in the kernel's current working directory and the read fails if it is not there.
# Later cells read the 'InvoiceNo', 'PRODUCT_NAME' and 'PROD_ID' columns of this frame.
sales_data = pd.read_csv('clean_transaction_data.csv') 

FileNotFoundError: [Errno 2] No such file or directory: 'clean_transaction_data.csv'

# Preparing network data structures
* We want to examine the relationships among the top 10 best sellers.
* When looking at the bottom 30 worst sellers, few of them are bought together (i.e. they rarely share a transaction).
* The network should be shown alongside how often each product is bought in absolute terms: the network only highlights how often products are bought together, not the absolute popularity of each product individually.

In [3]:
# Rank products by the number of transaction rows that mention them
# (value_counts sorts most frequent first), then slice both ends of the ranking.
product_counts = sales_data['PRODUCT_NAME'].value_counts()

# Ten most frequently sold products and every transaction row that involves them.
top10 = product_counts.head(10).index
bestsellers = sales_data.loc[sales_data['PRODUCT_NAME'].isin(top10)]

# Forty least frequently sold products and every transaction row that involves them.
bottom40 = product_counts.tail(40).index
worstsellers = sales_data.loc[sales_data['PRODUCT_NAME'].isin(bottom40)]


# take cleaned InvoiceNo data
def encoder(x):
    """Binarise a single basket cell for market basket analysis (MBA).

    A cell normally carries the Product ID when the product appears in an
    InvoiceNo; MBA only needs a purchased / not-purchased flag.

    Returns 1 for any value that is not equal to 0, and 0 otherwise.
    """
    return 1 if x != 0 else 0

def create_transaction_data(dataframe):
    """Turn transaction rows into a one-hot basket matrix for market basket analysis.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        One row per purchased item. Must contain an 'InvoiceNo' column
        (transaction identifier) and a 'PRODUCT_NAME' column (item name).

    Returns
    -------
    pandas.DataFrame
        Indexed by 'InvoiceNo' with one column per 'PRODUCT_NAME'. A cell is 1
        when the invoice contains at least one row for that product, else 0.

    Raises
    ------
    KeyError
        If 'InvoiceNo' or 'PRODUCT_NAME' is missing from ``dataframe``.
    """
    keys = ['InvoiceNo', 'PRODUCT_NAME']
    missing = [col for col in keys if col not in dataframe.columns]
    if missing:
        raise KeyError("dataframe is missing required column(s): {}".format(missing))

    # Count rows per (invoice, product) pair instead of summing PROD_ID: a sum is 0
    # for a product whose ID is 0 or missing, which would wrongly mark it as not bought.
    pair_counts = dataframe.groupby(keys).size()

    # Pivot products into columns; pairs that never occur become 0 directly,
    # and the index is already 'InvoiceNo', so no reset/set_index round trip is needed.
    basket = pair_counts.unstack(fill_value=0)

    # Collapse counts to a 0/1 presence flag (vectorised, no per-cell Python call).
    return basket.gt(0).astype('int64')


def return_adj_matrix(basket, encode):
    """Build a product-by-product adjacency matrix from a one-hot basket matrix.

    Parameters
    ----------
    basket : pandas.DataFrame
        One row per transaction, one column per product, with 0/1 cells
        (the shape produced by ``create_transaction_data``). Missing cells are
        treated as 0 and any non-zero cell as "present".
    encode : bool
        If truthy, binarise the result (1 where two products share at least one
        transaction, else 0). If falsy, return the WEIGHTED adjacency matrix
        whose entries are the number of transactions containing both products.

    Returns
    -------
    pandas.DataFrame
        Square integer matrix whose index and columns are both ``basket.columns``
        (same order). The diagonal is 0: self-links are redundant in a network.
    """
    # 0/1 presence flags as a plain (transactions x products) integer array.
    presence = basket.fillna(0).ne(0).values.astype(int)

    # presence.T . presence counts, for every product pair, the transactions that
    # contain both. This also works for a single-column basket or a product with
    # no purchases, where grouping on each column and selecting label 1 breaks.
    co_counts = presence.T.dot(presence)

    # The diagonal would hold each product's own transaction count; zero it out.
    np.fill_diagonal(co_counts, 0)

    if encode:
        co_counts = (co_counts != 0).astype(int)

    return pd.DataFrame(co_counts, index=basket.columns, columns=basket.columns)


# Build the one-hot basket for the best sellers and its weighted adjacency matrix.
basket = create_transaction_data(dataframe=bestsellers)
adj_matrix = return_adj_matrix(basket=basket, encode=False)

# Node table: one row per product in the adjacency matrix, with its PROD_ID.
# The name -> ID lookup is de-duplicated first, so the merge does not have to
# join against every transaction row (and no placeholder column is needed to
# force merge suffixes).
product_ids = bestsellers[['PRODUCT_NAME', 'PROD_ID']].drop_duplicates(keep='first')
nodes = pd.DataFrame(data=adj_matrix.columns, columns=['PRODUCT_NAME'])
nodes = nodes.merge(product_ids, on='PRODUCT_NAME', how='inner')
nodes = nodes.rename(columns={"PRODUCT_NAME": "NAME"})
nodes["PROD_ID"] = nodes["PROD_ID"].astype(int)
nodes = nodes.drop_duplicates(keep='first').reset_index(drop=True)

# The node rows must line up one-to-one, in order, with the adjacency matrix columns.
# Comparing plain lists is safe when the lengths differ (an element-wise array
# comparison is not) and also catches a product name that maps to several PROD_IDs.
node_names = nodes["NAME"].tolist()
matrix_names = adj_matrix.columns.tolist()
if node_names == matrix_names:
    print("Product team names in 'nodes' object match product team column headers in 'adj_matrix'. \nProceed with network visualisation")
else:
    # Have you selected the right dataframe to match IDs on? Check that you're matching adj. matrix best sellers with best sellers
    print("WARNING: names in 'nodes' ({} rows) do not line up with the column headers in 'adj_matrix' ({} columns). "
          "\nCheck that IDs were matched against the same dataframe the adjacency matrix was built from".format(len(node_names), len(matrix_names)))

Product team names in 'nodes' object match product team column headers in 'adj_matrix'. 
Proceed with network visualisation


In [4]:
# Both exports use the same CSV settings: no index column, a header row,
# and the 'UTF-8-sig' encoding.
csv_options = dict(index=False, header=True, encoding='UTF-8-sig')

# NOTE(review): these are absolute, machine-specific output paths; the notebook
# will only run as-is where this directory exists.
nodes.to_csv(r'C:\Users\NJM\Desktop\Computing\Deployment\Practice run\Recommendation\Network\nodes.csv', **csv_options)
adj_matrix.to_csv(r'C:\Users\NJM\Desktop\Computing\Deployment\Practice run\Recommendation\Network\adj_matrix.csv', **csv_options)