In [121]:
import numpy as np
import pandas as pd
import pyfpgrowth 


df = pd.read_csv('www.csv', sep='\t') 
print(f"Task 1: {df.shape[0]} records with {df.shape[1] - 2} attributes excluding User and Query")

print('Number of rows before removing duplicates:', len(df)) 
df = df.drop_duplicates('Query', keep='first') 
df.reset_index(drop=True, inplace=True) 
print('Number of rows after removing duplicates:', len(df)) 

Task 1: 9999 records with 500 attributes excluding User and Query
Number of rows before removing duplicates: 9999
Number of rows after removing duplicates: 9816


In [122]:
# Convert the Query column to a list of lists 
associations = df['Query'].apply(lambda x: x.split()).tolist()
num_records = len(associations) # list of lists
patterns = pyfpgrowth.find_frequent_patterns(associations,100) #itemsets w/ min-support 100

print(f"Task 2: The most frequent keyword is {max(patterns, key=patterns.get)}", 
      f"occuring in {max(patterns.values())} queries ")

min_support = 100 / num_records 
print(f"Task 2: a support count of 100 records correspond to a support value of {min_support}") 

print(f"Task 2: with this min-support, we find {len(patterns)} frequent itemsets", 
     f"the max size of which is {max(len(i) for i in patterns)}")

Task 2: The most frequent keyword is ('of',) occuring in 955 queries 
Task 2: a support count of 100 records correspond to a support value of 0.010187449062754686
Task 2: with this min-support, we find 28 frequent itemsets the max size of which is 2


In [123]:
min_supports = [0.001, 0.004, 0.007, 0.010]
num_itemsets = []
maxsize_itemsets = []
for min_support in min_supports:
    sigma = num_records * min_support
    patterns = pyfpgrowth.find_frequent_patterns(associations, sigma) 
    num_itemsets.append(len(patterns))
    maxsize_itemsets.append(max(len(i) for i in patterns))
print(f"Task 3 : Min-support values: {min_supports}\n"
      f"Task 3 : Number of frequent itemsets: {num_itemsets}\n"
      f"Task 3 : Max size of frequent itemset: {maxsize_itemsets}\n"
      "Task 3: Choosing min-support to be 0.004 we discover 156 frequent itemsets")


Task 3 : Min-support values: [0.001, 0.004, 0.007, 0.01]
Task 3 : Number of frequent itemsets: [2281, 156, 57, 29]
Task 3 : Max size of frequent itemset: [10, 4, 2, 2]
Task 3: Choosing min-support to be 0.004 we discover 156 frequent itemsets


In [124]:
### TASK 4

min_support = 0.004
support_count = min_support*num_records
confidence = 0.8 

patterns = pyfpgrowth.find_frequent_patterns(associations, support_count) 
rules = pyfpgrowth.generate_association_rules(patterns, confidence) 

print(f"Task 4: Number of rules with min-confidence 0.8: {len(rules)}\n"
        "Task 4: The rule 'york' -> 'new' has higher confidence than 'new' -> 'york', \n"
        "\tthis is because the word 'york' is almost always accompanied by 'new',\n"
        "\twhile the word 'new' more commonly appears alone.\n")

print("========== RULES ===========")
for k, v in rules.items():
    print(f"{k} : {v}")
print("===========================")

Task 4: Number of rules with min-confidence 0.8: 14
Task 4: The rule 'york' -> 'new' has higher confidence than 'new' -> 'york', 
	this is because the word 'york' is almost always accompanied by 'new',
	while the word 'new' more commonly appears alone.

('estate',) : (('real',), 0.9166666666666666)
('york',) : (('new',), 0.9139784946236559)
('sale',) : (('for',), 0.8333333333333334)
('i', 'to') : (('the',), 2.309090909090909)
('the', 'to') : (('i',), 1.5301204819277108)
('i', 'the', 'the') : (('to',), 3.106382978723404)
('i', 'the', 'to') : ((), 1.3804347826086956)
('the', 'the', 'to') : (('i',), 2.4745762711864407)
('a', 'how') : (('to',), 0.9230769230769231)
('in', 'to') : (('the',), 1.0681818181818181)
('in', 'of', 'the') : ((), 0.8679245283018868)
('in', 'the', 'the') : (('of',), 1.0)
('of', 'the', 'the') : (('in',), 0.92)
('in', 'of') : (('the',), 0.828125)


In [125]:
min_confidence_range = [0.1, 0.3, 0.5, 0.7, 0.9]  
num_rules = []

for min_conf in min_confidence_range: 
    rules = pyfpgrowth.generate_association_rules(patterns, min_conf) 
    num_rules.append(len(rules))

print(f"Task 5 : Minimum confidence values: {min_confidence_range}\n"
      f"Task 5 : Number of rules: {num_rules}")

Task 5 : Minimum confidence values: [0.1, 0.3, 0.5, 0.7, 0.9]
Task 5 : Number of rules: [31, 26, 18, 15, 11]


In [200]:
####### TASK 6 ###################

min_support = 0.001
min_conf = 0.5

#### do not touch!
sigma = num_records * min_support
patterns = pyfpgrowth.find_frequent_patterns(associations, sigma) 
rules = pyfpgrowth.generate_association_rules(patterns, min_conf) 
support = {key: value / len(df) for key, value in patterns.items()} 


## OPTIONAL: filter to only keep entries where key is a single word
support = {k[0]:v for k, v in support.items() if len(k) == 1 }
rules = {k[0]:v for k, v in rules.items() if len(k) == 1 }

### print filtered rules
for k, v in rules.items():
    print(f"{k} : {v}")

into : (('the',), 1.1)
chamber : (('of',), 0.9090909090909091)
humane : (('society',), 1.0)
idol : (('american',), 0.9166666666666666)
many : (('how',), 1.0)
african : (('american',), 0.8333333333333334)
dc : (('washington',), 0.9166666666666666)
force : (('air',), 0.8333333333333334)
commerce : (('of',), 0.9230769230769231)
sims : (('2',), 0.9230769230769231)
diego : (('san',), 1.0)
april : (('2006',), 0.7142857142857143)
t : (('the',), 0.6666666666666666)
listen : (('to',), 0.8)
just : (('to',), 0.8)
antonio : (('san',), 0.8666666666666667)
way : (('the',), 0.9333333333333333)
like : (('the',), 0.8666666666666667)
last : (('the',), 0.6470588235294118)
social : (('security',), 0.5555555555555556)
cheat : (('codes',), 0.6666666666666666)
middle : (('school',), 0.6666666666666666)
if : (('you',), 0.5789473684210527)
town : (('of',), 0.5263157894736842)
dept : (('of',), 0.5)
. : (('com',), 0.5)
orange : (('county',), 0.5)
space : (('my',), 0.6666666666666666)
over : (('the',), 0.61904761

In [210]:
#### Unexpected results:
        #f"{rules[keyword][0]}\n"
        #f"{support}\n"
keywords = ['union', 'april', 'town']
for keyword in keywords:
    if keyword in support and keyword in rules:
        print(f"Keyword: '{keyword}'\n"
            f"Support: {support[keyword]}\n"
            f"Rule: '{[w for w in rules[keyword][0]]}' with confidence {rules[keyword][1]}\n")
        for i in range(len(rules[keyword][0])):
            consequent = rules[keyword][0][i]
            if consequent in support:
                lift = rules[keyword][1] / support[consequent]
                print(f"Lift of '{consequent}': {lift}\n")
            else:
                print(f"Support for '{consequent}' not found, unable to calculate lift\n")
    else:
        print(f"Keyword: '{keyword}' not found among frequent patterns or rules")

Keyword: 'union'
Support: 0.003973105134474328
Rule: '['credit']' with confidence 0.5641025641025641
Lift of 'credit': 89.31017369727047

Keyword: 'april'
Support: 0.0014262428687856561
Rule: '['2006']' with confidence 0.7142857142857143
Lift of '2006': 80.59113300492612

Keyword: 'town'
Support: 0.0019356153219233904
Rule: '['of']' with confidence 0.5263157894736842
Lift of 'of': 5.409754753375585



In [187]:
#### Try a specific keyword:
keywords = ['vegas', 'york', 'estate']

for keyword in keywords:
    if keyword in support and keyword in rules:
        print(f"Keyword: '{keyword}'\n"
            f"Support: {support[keyword]}\n"
            f"Rule: '{rules[keyword][0][0]}' with confidence {rules[keyword][1]}\n"
            f"Lift of '{rules[keyword][0][0]}': {rules[keyword][1]/support[rules[keyword][0][0]]}")
    else:
        print(f"Keyword: '{keyword}' not found among frequent patterns or rules")

Keyword: 'vegas'
Support: 0.004278728606356968
Rule: 'las' with confidence 0.9047619047619048
Lift of 'las': 197.35873015873014
Keyword: 'york'
Support: 0.009474327628361858
Rule: 'new' with confidence 0.9139784946236559
Lift of 'new': 40.41267073525138
Keyword: 'estate'
Support: 0.006112469437652812
Rule: 'real' with confidence 0.9166666666666666
Lift of 'real': 128.54285714285714
