In [None]:
The basics of market basket analysis
Market basket analysis uses lists of transactions to identify useful associations between items. Such associations can be written in the form of a rule that has an antecedent and a consequent. Let's assume a small grocery store has asked you to look at their transaction data. After some analysis, you find the rule given below.

{cereal}--- {milk}


{cereal} is the antecedent, {milk} is the consequent, and both are items.

In [None]:
"""
In [1]:
groceries.head()
Out[1]:

                 Transaction
0         milk,bread,biscuit
1  bread,milk,biscuit,cereal
2                  bread,tea
3             jam,bread,milk
4                tea,biscuit
"""

In [None]:
# Import pandas under the alias pd
import pandas as pd

# Load transactions from pandas
groceries = pd.read_csv(groceries_path)

# Split transaction strings into lists
transactions = groceries['Transaction'].apply(lambda t: t.split(','))

# Convert DataFrame column into list of strings
transactions = list(transactions)

# Print the list of transactions
print(transactions)
'''
[['milk', 'bread', 'biscuit'], ['bread', 'milk', 'biscuit', 'cereal'], ['bread', 'tea'], ['jam', 'bread', 'milk'], ['tea', 'biscuit'], ['bread', 'tea'], ['tea', 'cereal'], ['bread', 'tea', 'biscuit'], ['jam', 'bread', 'tea'], ['bread', 'milk'], ['coffee', 'orange', 'biscuit', 'cereal'], ['coffee', 'orange', 'biscuit', 'cereal'], ['coffee', 'sugar'], ['bread', 'coffee', 'orange'], ['bread', 'sugar', 'biscuit'], ['coffee', 'sugar', 'cereal'], ['bread', 'sugar', 'biscuit'], ['bread', 'coffee', 'sugar'], ['bread', 'coffee', 'sugar'], ['tea', 'milk', 'coffee', 'cereal']]
'''

In [None]:
# Import permutations from the itertools module
from itertools import permutations

# Define the set of groceries
flattened = [i for t in transactions for i in t]
groceries = list(set(flattened))

# Generate all possible rules
rules = list(permutations(groceries, 2))

# Print the set of rules
print(rules)

# Print the number of rules
print(len(rules))


"""
[('cereal', 'biscuit'), ('cereal', 'milk'), ('cereal', 'orange'), ('cereal', 'coffee'), ('cereal', 'bread'), ('cereal', 'jam'), ('cereal', 'sugar'), ('cereal', 'tea'), ('biscuit', 'cereal'), ('biscuit', 'milk'), ('biscuit', 'orange'), ('biscuit', 'coffee'), ('biscuit', 'bread'), ('biscuit', 'jam'), ('biscuit', 'sugar'), ('biscuit', 'tea'), ('milk', 'cereal'), ('milk', 'biscuit'), ('milk', 'orange'), ('milk', 'coffee'), ('milk', 'bread'), ('milk', 'jam'), ('milk', 'sugar'), ('milk', 'tea'), ('orange', 'cereal'), ('orange', 'biscuit'), ('orange', 'milk'), ('orange', 'coffee'), ('orange', 'bread'), ('orange', 'jam'), ('orange', 'sugar'), ('orange', 'tea'), ('coffee', 'cereal'), ('coffee', 'biscuit'), ('coffee', 'milk'), ('coffee', 'orange'), ('coffee', 'bread'), ('coffee', 'jam'), ('coffee', 'sugar'), ('coffee', 'tea'), ('bread', 'cereal'), ('bread', 'biscuit'), ('bread', 'milk'), ('bread', 'orange'), ('bread', 'coffee'), ('bread', 'jam'), ('bread', 'sugar'), ('bread', 'tea'), ('jam', 'cereal'), ('jam', 'biscuit'), ('jam', 'milk'), ('jam', 'orange'), ('jam', 'coffee'), ('jam', 'bread'), ('jam', 'sugar'), ('jam', 'tea'), ('sugar', 'cereal'), ('sugar', 'biscuit'), ('sugar', 'milk'), ('sugar', 'orange'), ('sugar', 'coffee'), ('sugar', 'bread'), ('sugar', 'jam'), ('sugar', 'tea'), ('tea', 'cereal'), ('tea', 'biscuit'), ('tea', 'milk'), ('tea', 'orange'), ('tea', 'coffee'), ('tea', 'bread'), ('tea', 'jam'), ('tea', 'sugar')]
72

"""

## Support 

In [None]:
"""
transactions


[['milk', 'bread', 'biscuit'],
 ['bread', 'milk', 'biscuit', 'cereal'],
 ['bread', 'tea'],
 ['jam', 'bread', 'milk'],
 ['tea', 'biscuit'],
 ['bread', 'tea'],
 ['tea', 'cereal'],
 ['bread', 'tea', 'biscuit'],
 ['jam', 'bread', 'tea'],
 ['bread', 'milk'],
 ['coffee', 'orange', 'biscuit', 'cereal'],
 ['coffee', 'orange', 'biscuit', 'cereal'],
 ['coffee', 'sugar'],
 ['bread', 'coffee', 'orange'],
 ['bread', 'sugar', 'biscuit'],
 ['coffee', 'sugar', 'cereal'],
 ['bread', 'sugar', 'biscuit'],
 ['bread', 'coffee', 'sugar'],
 ['bread', 'coffee', 'sugar'],
 ['tea', 'milk', 'coffee', 'cereal']]
"""

 

In [None]:
# Import the transaction encoder function from mlxtend
from mlxtend.preprocessing import TransactionEncoder
import pandas as pd

# Instantiate transaction encoder and identify unique items
encoder = TransactionEncoder().fit(transactions)

# One-hot encode transactions
onehot = encoder.transform(transactions)

# Convert one-hot encoded data to DataFrame
onehot = pd.DataFrame(onehot, columns = encoder.columns_)

# Print the one-hot encoded transaction dataset
print(onehot)
"""
In [5]:
onehot
Out[5]:

    biscuit  bread  cereal  coffee    jam   milk  orange  sugar    tea
0      True   True   False   False  False   True   False  False  False
1      True   True    True   False  False   True   False  False  False
2     False   True   False   False  False  False   False  False   True
3     False   True   False   False   True   True   False  False  False
4      True  False   False   False  False  False   False  False   True
5     False   True   False   False  False  False   False  False   True
6     False  False    True   False  False  False   False  False   True
7      True   True   False   False  False  False   False  False   True
8     False   True   False   False   True  False   False  False   True
9     False   True   False   False  False   True   False  False  False
10     True  False    True    True  False  False    True  False  False
11     True  False    True    True  False  False    True  False  False
12    False  False   False    True  False  False   False   True  False
13    False   True   False    True  False  False    True  False  False
14     True   True   False   False  False  False   False   True  False
15    False  False    True    True  False  False   False   True  False
16     True   True   False   False  False  False   False   True  False
17    False   True   False    True  False  False   False   True  False
18    False   True   False    True  False  False   False   True  False
19    False  False    True    True  False   True   False  False   True


"""

# Compute the support
support = onehot.mean()

# Print the support
print(support)
"""
biscuit    0.40
bread      0.65
cereal     0.30
coffee     0.40
jam        0.10
milk       0.25
orange     0.15
sugar      0.30
tea        0.35
"""

## How  logical_and works in numpy 

In [None]:
# Python program explaining
# logical_and() function
import numpy as np
  
# input
arr1 = [1, 3, False, 4]
arr2 = [3, 0, True, False]
  
# output
out_arr = np.logical_and(arr1, arr2)
  
print ("Output Array : ", out_arr)
"""
Output Array :  [ True False False False]
"""

## Business Question:- In this exercise, you'll make use of that DataFrame and the support metric to help the store's owner. First, she has asked you to identify frequently purchased items, which you'll do by computing support at the item-level. And second, she asked you to check whether the rule {jam} --- {bread} has a support of over 0.05. Note that onehot has been defined and is available. Additionally, pandas has been imported under the alias pd and numpy has been imported under the alias np.

In [None]:
# Add a jam+bread column to the DataFrame onehot
onehot['jam+bread'] = np.logical_and(onehot['jam'], (onehot['bread']))

# Compute the support
support = onehot.mean()

# Print the support values
print(support)
"""
# Print the support values
print(support)
biscuit      0.40
bread        0.65
cereal       0.30
coffee       0.40
jam          0.10
milk         0.25
orange       0.15
sugar        0.30
tea          0.35
jam+bread    0.10
dtype: float64

"""

# Recommending books with support


In [None]:
"""
books 

  Hunger  Potter  Twilight
0      False    True     False
1      False    True      True
2      False   False     False
3      False    True     False
4      False   False     False
...      ...     ...       ...
8045   False   False     False
8046   False   False     False
8047   False   False      True
8048    True   False      True
8049   False   False     False

[8050 rows x 3 columns]
"""

In [None]:
# Compute support for Hunger and Potter
supportHP = np.logical_and(books['Hunger'], books['Potter']).mean()

# Compute support for Hunger and Twilight
supportHT = np.logical_and(books['Hunger'], books['Twilight']).mean()

# Compute support for Potter and Twilight
supportPT = np.logical_and(books['Potter'], books['Twilight']).mean()

# Print support values
print("Hunger Games and Harry Potter: %.2f" % supportHP)
print("Hunger Games and Twilight: %.2f" % supportHT)
print("Harry Potter and Twilight: %.2f" % supportPT)

"""
Hunger Games and Harry Potter: 0.12
Hunger Games and Twilight: 0.09
Harry Potter and Twilight: 0.14(Max)

------------- Inference -------------------
Based on the support metric, 
Harry Potter and Twilight appear to be the best options for cross-promotion. 
In the next problem, we'll consider whether we should use Harry Potter to promote Twilight or Twilight to promote Harry Potter.
"""

In [None]:
# Compute support for Potter and Twilight
supportPT = np.logical_and(books['Potter'], books['Twilight']).mean()

# Compute support for Potter
supportP = books['Potter'].mean()

# Compute support for Twilight
supportT = books['Twilight'].mean()

# Compute confidence for both rules
confidencePT = supportPT / supportP
confidenceTP = supportPT / supportT

# Print results
print('{0:.2f}, {1:.2f}'.format(confidencePT, confidenceTP))

"""
confidencePT:0.29
confidenceTP:0.55

----- Inference ---------
Even though the support is identical for the two association rules, 
the confidence is much higher for Twilight -> Harry Potter, 
since Harry Potter has a higher support than Twilight.
SO Use Twilight to promote Harry Potter,

"""


In [None]:
# Compute support for Potter and Twilight
supportPT = np.logical_and(books['Potter'], books['Twilight']).mean()

# Compute support for Potter
supportP = books['Potter'].mean()

# Compute support for Twilight
supportT = books['Twilight'].mean()

# Compute lift
lift = supportPT / (supportP * supportT)

# Print lift
print("Lift: %.2f" % lift)

"""
Lift: 1.15

---- Inference------
As it turns out, lift is greater than 1.0.Twilight to promote Harry Potter, 
This could give us some confidence that the association rule we recommended did not arise by random chance.

"""

## Conviction 

In [1]:
def conviction(antecedent, consequent):
	# Compute support for antecedent AND consequent
	supportAC = np.logical_and(antecedent, consequent).mean()

	# Compute support for antecedent
	supportA = antecedent.mean()

	# Compute support for NOT consequent
	supportnC = 1.0 - consequent.mean()

	# Compute support for antecedent and NOT consequent
	supportAnC = supportA - supportAC

    # Return conviction
	return supportA * supportnC / supportAn

In [None]:
# Compute conviction for twilight -> potter and potter -> twilight
convictionTP = conviction(twilight, potter)
convictionPT = conviction(potter, twilight)

# Compute conviction for twilight -> hunger and hunger -> twilight
convictionTH = conviction(twilight, hunger)
convictionHT = conviction(hunger, twilight)

# Compute conviction for potter -> hunger and hunger -> potter
convictionPH = conviction(potter, hunger)
convictionHP = conviction(hunger, potter)

# Print results
print('Harry Potter -> Twilight: ', convictionPT)
print('Twilight -> Potter: ', convictionTP)
'''
Harry Potter -> Twilight:  1.0534570072738598
Twilight -> Potter:  1.1550539077290998 
Inference : Twilight to promote Harry Potter
'''

# Zangs-Metric : is between -1 to 1 , -1 mean strong disassociation and 1 is strong association 

In [None]:
# Compute the support of Twilight and Harry Potter
supportT = books['Twilight'].mean()
supportP = books['Potter'].mean()

# Compute the support of both books
supportTP = np.logical_and(books['Twilight'], books['Potter']).mean()

# Complete the expressions for the numerator and denominator
numerator = supportTP - supportT*supportP
denominator = max(supportTP*(1-supportT), supportT*(supportP-supportTP))

# Compute and print Zhang's metric
zhang = numerator / denominator
print(zhang)
'''
0.17231567178855997
`if Twilight then Harry Potter'' proved robust. 
It had a positive value for Zhang's metric, indicating that the two books are not dissociated.
'''

In [None]:
# Define a function to compute Zhang's metric
def zhang(antecedent, consequent):
	# Compute the support of each book
	supportA = antecedent.mean()
	supportC = consequent.mean()

	# Compute the support of both books
	supportAC = np.logical_and(antecedent, consequent).mean()

	# Complete the expressions for the numerator and denominator
	numerator = supportAC - supportA*supportC
	denominator = max(supportAC*(1-supportA), supportA*(supportC-supportAC))

	# Return Zhang's metric
	return numerator / denominator

## The founder of the ebook start-up has returned for additional consulting services. She has sent you a list of itemsets she's investigating and has asked you to determine whether any of them contain items that are dissociated. When you're finished, she has asked that you add the metric you use to a column in the rules DataFrame, which is available to you, and currently contains columns for antecedents and consequents.

The itemsets are available as a list of lists called itemsets. Each list contains the antecedent first and the consequent second. You also have access to the books DataFrame from previous exercises

In [None]:
"""
books 


      Hunger  Potter  Twilight  Mockingbird  Gatsby
0      False    True     False         True    True
1      False    True      True        False    True
2      False   False     False         True   False
3      False    True     False        False    True
4      False   False     False        False    True
...      ...     ...       ...          ...     ...
8045   False   False     False         True    True
8046   False   False     False         True   False
8047   False   False      True        False   False
8048    True   False      True        False   False
8049   False   False     False         True   False

----------------------------------------------
In [5]:
itemsets
Out[5]:

[['Potter', 'Hunger'],
 ['Twilight', 'Hunger'],
 ['Mockingbird', 'Hunger'],
 ['Gatsby', 'Hunger'],
 ['Potter', 'Twilight'],
 ['Potter', 'Mockingbird'],
 ['Potter', 'Gatsby'],
 ['Mockingbird', 'Twilight'],
 ['Gatsby', 'Twilight'],
 ['Mockingbird', 'Gatsby']]


"""

In [None]:
# Define an empty list for Zhang's metric
zhangs_metric = []

# Loop over lists in itemsets
for itemset in itemsets:
    # Extract the antecedent and consequent columns
	antecedent = books[itemset[0]]
	consequent = books[itemset[1]]
    #print('antecedent',antecedent)
	#print('consequent',consequent)
    # Complete Zhang's metric and append it to the list
	zhangs_metric.append(zhang(antecedent, consequent))
    
# Print results
rules['zhang'] = zhangs_metric
print(rules)
"""

 antecedents  consequents     zhang
    0       Potter       Hunger -0.306049
    1     Twilight       Hunger  0.109357
    2  Mockingbird       Hunger -0.525436
    3       Gatsby       Hunger -0.550446
    4       Potter     Twilight  0.245118
    5       Potter  Mockingbird -0.065537
    6       Potter       Gatsby -0.165572
    7  Mockingbird     Twilight -0.319008
    8       Gatsby     Twilight -0.370875
    9  Mockingbird       Gatsby  0.466460

"""

## Multi-Metric Filtering 

In [None]:
"""
In [1]:
rules
Out[1]:

          antecedents                      consequents  antecedent support  consequent support   support  confidence      lift  leverage  conviction
0            (Hunger)                         (Potter)            0.319130            0.477516  0.123851    0.388089  0.812725 -0.028539    0.853857
1            (Potter)                         (Hunger)            0.477516            0.319130  0.123851    0.259365  0.812725 -0.028539    0.919305
2            (Hunger)                       (Twilight)            0.319130            0.256770  0.089193    0.279486  1.088468  0.007249    1.031527
3          (Twilight)                         (Hunger)            0.256770            0.319130  0.089193    0.347363  1.088468  0.007249    1.043260
4            (Hunger)                    (Mockingbird)            0.319130            0.476522  0.096273    0.301674  0.633075 -0.055799    0.749619
..                ...                              ...                 ...                 ...       ...         ...       ...       ...         ...
145  (Potter, Gatsby)          (Twilight, Mockingbird)            0.127702            0.098261  0.024348    0.190661  1.940360  0.011800    1.114168
146        (Twilight)    (Potter, Mockingbird, Gatsby)            0.256770            0.089814  0.024348    0.094823  1.055779  0.001286    1.005535
147     (Mockingbird)       (Twilight, Potter, Gatsby)            0.476522            0.034161  0.024348    0.051095  1.495687  0.008069    1.017845
148          (Potter)  (Twilight, Mockingbird, Gatsby)            0.477516            0.036273  0.024348    0.050989  1.405678  0.007027    1.015506
149          (Gatsby)  (Twilight, Potter, Mockingbird)            0.295155            0.062981  0.024348    0.082492  1.309778  0.005759    1.021264

[150 rows x 9 columns]
"""

In [None]:
# Preview the rules DataFrame using the .head() method
print(rules.head())

# Select the subset of rules with antecedent support greater than 0.05
rules = rules[rules['antecedent support'] > 0.05]

# Select the subset of rules with a consequent support greater than 0.02
rules = rules[rules['consequent support'] > 0.02]

# Select the subset of rules with a conviction greater than 1.01
rules = rules[rules['conviction'] > 1.01]

# Print remaining rules
print(rules)

In [None]:
"""
 antecedents    consequents  antecedent support  consequent support   support  confidence      lift  leverage  conviction
0    (Hunger)       (Potter)            0.319130            0.477516  0.123851    0.388089  0.812725 -0.028539    0.853857
1    (Potter)       (Hunger)            0.477516            0.319130  0.123851    0.259365  0.812725 -0.028539    0.919305
2    (Hunger)     (Twilight)            0.319130            0.256770  0.089193    0.279486  1.088468  0.007249    1.031527
3  (Twilight)       (Hunger)            0.256770            0.319130  0.089193    0.347363  1.088468  0.007249    1.043260
4    (Hunger)  (Mockingbird)            0.319130            0.476522  0.096273    0.301674  0.633075 -0.055799    0.749619
               antecedents                      consequents  antecedent support  consequent support   support  confidence      lift  leverage  conviction
2                 (Hunger)                       (Twilight)            0.319130            0.256770  0.089193    0.279486  1.088468  0.007249    1.031527
3               (Twilight)                         (Hunger)            0.256770            0.319130  0.089193    0.347363  1.088468  0.007249    1.043260
8               (Twilight)                         (Potter)            0.256770            0.477516  0.140621    0.547654  1.146881  0.018009    1.155054
9                 (Potter)                       (Twilight)            0.477516            0.256770  0.140621    0.294485  1.146881  0.018009    1.053457
18           (Mockingbird)                         (Gatsby)            0.476522            0.295155  0.186087    0.390511  1.323070  0.045439    1.156452
..                     ...                              ...                 ...                 ...       ...         ...       ...       ...         ...
143  (Potter, Mockingbird)               (Twilight, Gatsby)            0.219503            0.053540  0.024348    0.110922  2.071754  0.012596    1.064541
145       (Potter, Gatsby)          (Twilight, Mockingbird)            0.127702            0.098261  0.024348    0.190661  1.940360  0.011800    1.114168
147          (Mockingbird)       (Twilight, Potter, Gatsby)            0.476522            0.034161  0.024348    0.051095  1.495687  0.008069    1.017845
148               (Potter)  (Twilight, Mockingbird, Gatsby)            0.477516            0.036273  0.024348    0.050989  1.405678  0.007027    1.015506
149               (Gatsby)  (Twilight, Potter, Mockingbird)            0.295155            0.062981  0.024348    0.082492  1.309778  0.005759    1.021264

[82 rows x 9 columns]



------ Inference:
ou have now successfully performed multi-metric filtering. 
In the final exercise in this chapter, you'll go even further by including an advanced metric.

"""

In [None]:
# Set the lift threshold to 1.5
rules = rules[rules['lift'] > 1.5]

# Set the conviction threshold to 1.0
rules = rules[rules['conviction']>1]

# Set the threshold for Zhang's rule to 0.65
rules = rules[rules['zhang']>0.65]

# Print rule
print(rules[['antecedents','consequents']])


'''

<script.py> output:
                     antecedents               consequents
    115    (Potter, Mockingbird)          (Hunger, Gatsby)
    119            (Mockingbird)  (Hunger, Potter, Gatsby)
    127    (Hunger, Mockingbird)        (Twilight, Gatsby)
    129  (Twilight, Mockingbird)          (Hunger, Gatsby)
    143    (Potter, Mockingbird)        (Twilight, Gatsby)


'''

## Performing aggregation
After completing minor consulting jobs for a library and an ebook seller, you've finally received your first big market basket analysis project: advising an online novelty gifts retailer on cross-promotions. Since the retailer has never previously hired a data scientist, it would like you to start the project by exploring its transaction data. It has asked you to perform aggregation for all signs in the dataset and also compute the support for this category. Note that pandas has been imported for you as pd. Additionally, the data has been imported in one-hot encoded format as onehot.

In [None]:

"""
onehot

      50'S CHRISTMAS GIFT BAG LARGE   DOLLY GIRL BEAKER   I LOVE LONDON MINI BACKPACK   RED SPOT GIFT BAG LARGE   SPACEBOY BABY GIFT SET  12 MESSAGE CARDS WITH ENVELOPES  12 PENCIL SMALL TUBE WOODLAND  ...  ZINC FOLKART SLEIGH BELLS  ZINC METAL HEART DECORATION  ZINC T-LIGHT HOLDER STAR LARGE  ZINC T-LIGHT HOLDER STARS SMALL  ZINC WILLIE WINKIE  CANDLE STICK  amazon adjust  check
0                              False               False                         False                     False                    False                            False                          False  ...                      False                        False                           False                            False                             False          False  False
1                              False               False                         False                     False                    False                            False                          False  ...                      False                        False                           False                            False                             False          False  False
2                              False               False                         False                     False                    False                            False                          False  ...                      False                        False                           False                            False                             False          False  False
3                              False               False                         False                     False                    False                            False                          False  ...                      False                        False                           False                            False                             False          False  False
4                              False               False                         False                     False                    False                            False                          False  ...                      False                        False                           False                            False                             False          False  False
...                              ...                 ...                           ...                       ...                      ...                              ...                            ...  ...                        ...                          ...                             ...                              ...                               ...            ...    ...
1325                           False               False                         False                     False                    False                            False                          False  ...                      False                        False                           False                            False                             False          False  False
1326                           False               False                         False                     False                    False                            False                          False  ...                      False                        False                           False                            False                             False          False  False
1327                           False               False                         False                     False                    False                            False                          False  ...                      False                        False                           False                            False                             False          False  False
1328                           False               False                         False                     False                    False                            False                          False  ...                      False                        False                           False                            False                             False          False  False
1329                           False               False                         False                     False                    False                            False                          False  ...                      False                        False                           False                            False                             False          False  False

[1330 rows x 1039 columns]

"""

In [None]:
# Select the column headers for sign items
sign_headers = [i for i in onehot.columns if i.lower().find('sign')>=0]
"""
In [6]:
sign_headers
Out[6]:

['60 CAKE CASES DOLLY GIRL DESIGN',
 'AREA PATROLLED METAL SIGN',
 'BAKING SET SPACEBOY DESIGN',
 'BATHROOM METAL SIGN',
 'BEWARE OF THE CAT METAL SIGN ',
 'BIRDS MOBILE VINTAGE DESIGN',
 'CERAMIC BOWL WITH LOVE HEART DESIGN',
 'CERAMIC CAKE DESIGN SPOTTED MUG',
 'CHARLOTTE BAG APPLES DESIGN',
 'CHARLOTTE BAG SUKI DESIGN',
 'CHILDRENS APRON APPLES DESIGN',
 'CHILDRENS APRON SPACEBOY DESIGN',
 'COFFEE MUG PEARS  DESIGN',
 'COOK WITH WINE METAL SIGN ',
 'COTTON APRON PANTRY DESIGN',
 'FAIRY CAKE DESIGN UMBRELLA',
 'FRENCH BLUE METAL DOOR SIGN 3',
 'FRENCH BLUE METAL DOOR SIGN 5',
 'FRENCH BLUE METAL DOOR SIGN 6',
 'FRENCH BLUE METAL DOOR SIGN 9',
 'FRENCH BLUE METAL DOOR SIGN No',
 'FRENCH TOILET SIGN BLUE METAL',
 'FRENCH WC SIGN BLUE METAL',
 'GIN + TONIC DIET METAL SIGN',
 'HAND OVER THE CHOCOLATE   SIGN ',
 'HAND WARMER BABUSHKA DESIGN',
 'HAND WARMER BIRD DESIGN',
 'HAND WARMER OWL DESIGN',
 'HAND WARMER SCOTTY DOG DESIGN',
 'HOME SWEET HOME METAL SIGN ',
 'JUMBO BAG DOLLY GIRL DESIGN',
 'JUMBO BAG SPACEBOY DESIGN',
 'KITTENS DESIGN FLANNEL',
 'LADIES & GENTLEMEN METAL SIGN',
 'LAUNDRY 15C METAL SIGN',
 'LUNCH BAG ALPHABET DESIGN',
 'LUNCH BAG APPLE DESIGN',
 'LUNCH BAG SPACEBOY DESIGN ',
 'LUNCH BAG SUKI DESIGN ',
 'LUNCH BAG VINTAGE LEAF DESIGN',
 'MEMO BOARD COTTAGE DESIGN',
 'METAL SIGN DROP YOUR PANTS',
 'METAL SIGN EMPIRE TEA',
 'METAL SIGN HIS DINNER IS SERVED',
 'MONEY BOX KINGS CHOICE DESIGN',
 'MONEY BOX POCKET MONEY DESIGN',
 'N0 SINGING METAL SIGN',
 'NO JUNK MAIL METAL SIGN',
 'PARTY METAL SIGN ',
 'PEG BAG APPLES DESIGN',
 'PLEASE ONE PERSON METAL SIGN',
 'POTTERING IN THE SHED METAL SIGN',
 'RECIPE BOX PANTRY YELLOW DESIGN',
 'RED CHARLIE+LOLA PERSONAL DOORSIGN',
 'RIBBON REEL HEARTS DESIGN ',
 'RIBBON REEL LACE DESIGN ',
 'SET 20 NAPKINS FAIRY CAKES DESIGN ',
 'SET OF 3 CAKE TINS PANTRY DESIGN ',
 'SET OF 36 DOILIES PANTRY DESIGN',
 'SET OF 36 DOILIES SPACEBOY DESIGN ',
 'SET OF 6 SPICE TINS PANTRY DESIGN',
 'SET OF 60 PANTRY DESIGN CAKE CASES ',
 'SMALL DOLLY MIX DESIGN ORANGE BOWL',
 'STRIPES DESIGN TEDDY',
 'TOILET SIGN OCCUPIED OR VACANT',
 'WASHROOM METAL SIGN',
 'WORLD WAR 2 GLIDERS ASSTD DESIGNS',
 'WRAP ALPHABET DESIGN',
 'WRAP POPPIES  DESIGN',
 "YOU'RE CONFUSING ME METAL SIGN "]

"""



# Select columns of sign items using sign_headers
sign_columns = onehot[sign_headers]


# Perform aggregation of sign items into sign category
signs = sign_columns.sum(axis = 1) >= 1.0

# Print support for signs
print('Share of Signs: %.2f' % signs.mean())

"""
If you look at the printed statement, you'll notice that support for signs is 0.10, 
which suggests that signs are an important category of items for the retailer.
"""

## Defining an aggregation function
Surprised by the high share of sign items in its inventory, the retailer decides that it makes sense to do further aggregation for different categories to explore the data better. This seems trivial to you, but the retailer has not previously been able to perform even a basic descriptive analysis of its transaction and items.

The retailer asks you to perform aggregation for the candles, bags, and boxes categories. To simplify the task, you decide to write a function. It will take a string that contains an item's category. It will then output a DataFrame that indicates whether each transaction includes items from that category. Note that pandas has been imported for you as pd. Additionally, the data has been imported in one-hot encoded format as onehot.

In [None]:
def aggregate(item):
	# Select the column headers for sign items in onehot
	item_headers = [i for i in onehot.columns if i.lower().find(item)>=0]

	# Select columns of sign items
	item_columns = onehot[item_headers]

	# Return category of aggregated items
	return item_columns.sum(axis = 1) >= 1.0

# Aggregate items for the bags, boxes, and candles categories  
bags = aggregate('bag')
boxes = aggregate('box')
candles = aggregate('candle')


"""
In [1]:
bags 
Out[1]:

0       False
1        True
2       False
3       False
4        True
        ...  
1325    False
1326    False
1327    False
1328     True
1329    False
Length: 1330, dtype: bool

"""

## Pruning and Apriori
In the video, we introduced the Apriori algorithm, which made use of the Apriori principle to prune itemsets. The Apriori principle tells us that subsets of frequent itemsets are frequent. Thus, if we find an infrequent itemset, which we'll call {X}, then it must be the case that {X, Y} is also infrequent, so we may eliminate it without computing its support.

In this exercise, you'll be given itemsets and information about the frequency of its subsets. You will need to decide whether the information is sufficient to prune the itemset or whether we need to compute its support.


In [None]:
# Import apriori from mlxtend
from mlxtend.frequent_patterns import apriori

# Compute frequent itemsets using the Apriori algorithm
frequent_itemsets = apriori(onehot, 
                            min_support = 0.006, 
                            max_len = 3, 
                            use_colnames = True)

# Print a preview of the frequent itemsets
print(frequent_itemsets.head())


"""
<script.py> output:
        support                              itemsets
    0  0.006767          (HOT WATER BOTTLE KEEP CALM)
    1  0.007519             (JUMBO BAG RED RETROSPOT)
    2  0.006015     (PAPER CHAIN KIT 50'S CHRISTMAS )
    3  0.006015                      (POPCORN HOLDER)
    4  0.006767  (WHITE HANGING HEART T-LIGHT HOLDER)

"""

## Selecting a support threshold
The manager of the online gift store looks at the results you provided from the previous exercise and commends you for the good work. She does, however, raise an issue: all of the itemsets you identified contain only one item. She asks whether it would be possible to use a less restrictive rule and to generate more itemsets, possibly including those with multiple items.

After agreeing to do this, you think about what might explain the lack of itemsets with more than 1 item. It can't be the max_len parameter, since that was set to three. You decide it must be support and decide to test two different values, each time checking how many additional itemsets are generated. Note that pandas is available as pd and the one-hot encoded data is available as onehot.

In [None]:
# Import apriori from mlxtend
from mlxtend.frequent_patterns import apriori


# Compute frequent itemsets using a support of 0.003 and length of 3
frequent_itemsets_1 = apriori(onehot, min_support = 0.003, 
                            max_len = 3, use_colnames = True)

# Compute frequent itemsets using a support of 0.001 and length of 3
frequent_itemsets_2 = apriori(onehot, min_support = 0.001, 
                            max_len = 3, use_colnames = True)

# Print the number of freqeuent itemsets
print(len(frequent_itemsets_1), len(frequent_itemsets_2))

"""
91 429
>>
generated by the Apriori algorithm using a support value of 0.002

"""

## Pruning with lift
Once again, you report back to the novelty gift store manager. This time, you tell her that you identified no rules when you used a higher support threshold for the Apriori algorithm and only two rules when you used a lower threshold. She commends you for the good work, but asks you to consider using another metric to reduce the two rules to one.

You remember that lift had a simple interpretation: values greater than 1 indicate that items co-occur more than we would expect if they were independently distributed across transactions. You decide to use lift, since that message will be simple to convey. Note that pandas is available as pd and the one-hot encoded transaction data is available as onehot. Additionally, apriori has been imported from mlxtend.

In [None]:
# Import the association rules function
from mlxtend.frequent_patterns import association_rules

# Compute frequent itemsets using the Apriori algorithm
frequent_itemsets = apriori(onehot, min_support = 0.001, 
                            max_len = 2, use_colnames = True)

# Compute all association rules for frequent_itemsets
rules = association_rules(frequent_itemsets, 
                            metric = "lift", 
                         	min_threshold = 1.0)

# Print association rules
print(rules)
"""
<script.py> output:
                       antecedents                  consequents  antecedent support  consequent support   support  confidence       lift  leverage  conviction
    0    (JUMBO BAG RED RETROSPOT)  (BIRTHDAY CARD, RETRO SPOT)            0.007519            0.002256  0.001504    0.200000  88.666667  0.001487    1.247180
    1  (BIRTHDAY CARD, RETRO SPOT)    (JUMBO BAG RED RETROSPOT)            0.002256            0.007519  0.001504    0.666667  88.666667  0.001487    2.977444

It looks like you've ended up with two association rules once again, both with lift values greater than 1.0.

"""

## Pruning with confidence
Once again, you've come up short: you found multiple useful rules, but can't narrow it down to one. Even worse, the two rules you found used the same itemset, but just swapped the antecedents and consequents. You decide to see whether pruning by another metric might allow you to narrow things down to a single association rule.

What would be the right metric? Both lift and support are identical for all rules that can be generated from an itemset, so you decide to use confidence instead, which differs for rules produced from the same itemset. Note that pandas is available as pd and the one-hot encoded transaction data is available as onehot. Additionally, apriori has been imported from mlxtend.

In [None]:
"""

onehot


       50'S CHRISTMAS GIFT BAG LARGE   DOLLY GIRL BEAKER   I LOVE LONDON MINI BACKPACK   RED SPOT GIFT BAG LARGE   SPACEBOY BABY GIFT SET  12 MESSAGE CARDS WITH ENVELOPES  12 PENCIL SMALL TUBE WOODLAND  ...  ZINC FOLKART SLEIGH BELLS  ZINC METAL HEART DECORATION  ZINC T-LIGHT HOLDER STAR LARGE  ZINC T-LIGHT HOLDER STARS SMALL  ZINC WILLIE WINKIE  CANDLE STICK  amazon adjust  check
0                              False               False                         False                     False                    False                            False                          False  ...                      False                        False                           False                            False                             False          False  False
1                              False               False                         False                     False                    False                            False                          False  ...                      False                        False                           False                            False                             False          False  False
2                              False               False                         False                     False                    False                            False                          False  ...                      False                        False                           False                            False                             False          False  False
3                              False               False                         False                     False                    False                            False                          False  ...                      False                        False                           False                            False                             False          False  False
4                              False               False                         False                     False                    False                            False                          False  ...                      False                        False                           False                            False                             False          False  False
...                              ...                 ...                           ...                       ...                      ...                              ...                            ...  ...                        ...                          ...                             ...                              ...                               ...            ...    ...
1325                           False               False                         False                     False                    False                            False                          False  ...                      False                        False                           False                            False                             False          False  False
1326                           False               False                         False                     False                    False                            False                          False  ...                      False                        False                           False                            False                             False          False  False
1327                           False               False                         False                     False                    False                            False                          False  ...                      False                        False                           False                            False                             False          False  False
1328                           False               False                         False                     False                    False                            False                          False  ...                      False                        False                           False                            False                             False          False  False
1329                           False               False                         False                     False                    False                            False                          False  ...                      False                        False                           False                            False                             False          False  False

[1330 rows x 1039 columns]

"""

In [None]:
# Import the association rules function
from mlxtend.frequent_patterns import apriori, association_rules

# Compute frequent itemsets using the Apriori algorithm
frequent_itemsets = apriori(onehot, min_support = 0.0015, 
                            max_len = 2, use_colnames = True)

# Compute all association rules using confidence
rules = association_rules(frequent_itemsets, 
                            metric = "confidence", 
                            min_threshold = 0.5)

# Print association rules
print(rules)
"""
                   antecedents                consequents  antecedent support  consequent support   support  confidence       lift  leverage  conviction
0  (BIRTHDAY CARD, RETRO SPOT)  (JUMBO BAG RED RETROSPOT)            0.002256            0.007519  0.001504    0.666667  88.666667  0.001487    2.977444

Notice that we have narrowed things down to just a single rule. We can recommend this to the manager.

"""


## Aggregation and filtering
In the video, we helped a gift store manager arrange the sections in her physical retail location according to association rules. The layout of the store forced us to group sections into two pairs of product types. After applying advanced filtering techniques, we proposed the floor layout below.

The image shows the store layout that was selected in the video.
The store manager is now asking you to generate another floorplan proposal, but with a different criterion: each pair of sections should contain one high support product and one low support product. The data, aggregated, has been aggregated and one-hot encoded for you. Additionally, apriori() and association_rules() have been imported from mlxtend.

In [None]:
# Apply the apriori algorithm with a minimum support of 0.0001
frequent_itemsets = apriori(aggregated, min_support = 0.0001, use_colnames = True)

# Generate the initial set of rules using a minimum support of 0.0001
rules = association_rules(frequent_itemsets, 
                          metric = "support", min_threshold = 0.0001)

# Set minimum antecedent support to 0.35
rules = rules[rules['antecedent support'] > 0.35]

# Set maximum consequent support to 0.35
rules = rules[rules['consequent support'] < 0.35]

# Print the remaining rules
print(rules)

"""
   antecedents     consequents  antecedent support  consequent support   support  confidence      lift  leverage  conviction
0        (bag)           (box)            0.466307            0.256065  0.021563    0.046243  0.180590 -0.097841    0.780005
2        (bag)        (candle)            0.466307            0.088949  0.010782    0.023121  0.259940 -0.030696    0.932615
8       (sign)           (box)            0.355795            0.256065  0.018868    0.053030  0.207097 -0.072239    0.785596
10      (sign)        (candle)            0.355795            0.088949  0.008086    0.022727  0.255510 -0.023561    0.932238
15      (sign)   (bag, candle)            0.355795            0.010782  0.005391    0.015152  1.405303  0.001555    1.004437
16       (bag)  (sign, candle)            0.466307            0.008086  0.005391    0.011561  1.429672  0.001620    1.003515

"""

In [None]:
"""
----- Inference ---
you'll find both bag -> box and sign -> candles. 
We can tell the store manager that the original proposal is also acceptable under this new criterion.
"""

## Applying Zhang's rule
In Chapter 2, we learned that Zhang's rule is a continuous measure of association between two items that takes values in the [-1,+1] interval. A -1 value indicates a perfectly negative association and a +1 value indicates a perfectly positive association. In this exercise, you'll determine whether Zhang's rule can be used to refine a set of rules a gift store is currently using to promote products.

Note that the frequent itemsets have been computed for you and are available as frequent_itemsets. Additionally, zhangs_rule() has been defined and association_rules() have been imported from mlxtend. You will start by re-computing the original set of rules. After that, you will apply Zhang's metric to select only those rules with a high and positive association.

In [None]:
# Generate the initial set of rules using a minimum lift of 1.00
rules = association_rules(frequent_itemsets, metric = "lift", min_threshold = 1.00)

# Set antecedent support to 0.005
rules = rules[rules['antecedent support'] > 0.005]

# Set consequent support to 0.005
rules = rules[rules['consequent support'] > 0.005]

# Compute Zhang's rule
rules['zhang'] = zhangs_rule(rules)

# Set the lower bound for Zhang's rule to 0.98
rules = rules[rules['zhang'] > 0.98]
print(rules[['antecedents', 'consequents']])

In [None]:
"""
print(rules[['antecedents', 'consequents']])
                              antecedents                           consequents
26                  (BROCADE RING PURSE )      (PANTRY MAGNETIC  SHOPPING LIST)
27       (PANTRY MAGNETIC  SHOPPING LIST)                 (BROCADE RING PURSE )
84           (HAND WARMER RED LOVE HEART)             (JUMBO BAG PINK POLKADOT)
85              (JUMBO BAG PINK POLKADOT)          (HAND WARMER RED LOVE HEART)
88           (HAND WARMER RED LOVE HEART)  (WOOD 2 DRAWER CABINET WHITE FINISH)
89   (WOOD 2 DRAWER CABINET WHITE FINISH)          (HAND WARMER RED LOVE HEART)
148                        (WICKER STAR )                (RED STAR CARD HOLDER)
149                (RED STAR CARD HOLDER)                        (WICKER STAR )
152      (RIBBON REEL CHRISTMAS PRESENT )  (WOODEN TREE CHRISTMAS SCANDINAVIAN)
153  (WOODEN TREE CHRISTMAS SCANDINAVIAN)      (RIBBON REEL CHRISTMAS PRESENT )

"""

In [None]:
"""
Inferences: Notice that 10 items had a Zhang's metric value of over 0.98,
which suggests that the items are nearly perfectly associated in the data. 
In general, when we see such strong associations,
we'll want to think carefully about what explains them. 
We might, for instance, investigate whether the items be purchased separately or whether they are bundled in a way that prevents this.

"""

## Advanced filtering with multiple metrics
Earlier, we used data from an online novelty gift store to find antecedents that could be used to promote a targeted consequent. Since the set of potential rules was large, we had to rely on the Apriori algorithm and multi-metric filtering to narrow it down. In this exercise, we'll examine the full set of rules and find a useful one, rather than targeting a particular antecedent.

Note that the data has been loaded, preprocessed, and one-hot encoded, and is available as onehot. Additionally apriori() and association_rules() have been imported from mlxtend. In this exercise, you'll apply the Apriori algorithm to identify frequent itemsets. You'll then recover the set of association rules from the itemsets and apply multi-metric filtering.

In [None]:
# Apply the Apriori algorithm with a minimum support threshold of 0.001
frequent_itemsets = apriori(onehot, min_support = 0.001, use_colnames = True)

# Recover association rules using a minium support threshold of 0.001
rules = association_rules(frequent_itemsets, metric = 'support', min_threshold = 0.001)

# Apply a 0.002 antecedent support threshold, 0.60 confidence threshold, and 2.50 lift threshold
filtered_rules = rules[(rules['antecedent support'] > 0.002) &
						(rules['consequent support'] > 0.01) &
						(rules['confidence'] > 0.60) &
						(rules['lift'] > 2.50)]

# Print remaining rule
print(filtered_rules[['antecedents','consequents']])

"""
                    antecedents                consequents
23  (BIRTHDAY CARD, RETRO SPOT)  (JUMBO BAG RED RETROSPOT)
"""

