In [1]:
#Import required libraries

#GENERAL STRUCTURE:
#Generate candidate set
#Check all subsets of each candidate. If a subset is not frequent, remove that candidate
#Search data for count of each candidate set
#Compare to min_support. If >= min_support, add to master list of freq_sets.

import pandas as pd
import itertools as itertools

In [2]:
# Read the sales CSV into a DataFrame. header=None makes pandas keep the first
# line of the file as data and number the columns from 0 instead of naming them.
salesData = pd.read_csv("Datasets/SuperCenterDataNew.csv", header=None)

In [3]:
# Keep only the first 150 rows; every later cell reads this reduced frame.
salesData = salesData.head(150)

In [4]:
class apriori:
    """Pairs one item set with its occurrence counter.

    Instances are collected in plain lists (candidate arrays and masterArr).

    aSet      -- the set of items this object stands for
    frequency -- counter for that set; checkDBfreq overwrites it with the
                 number of salesData rows that contain aSet
    """

    def __init__(self, aSet, frequency):
        # Both values are stored as given; nothing is copied or validated
        self.aSet, self.frequency = aSet, frequency

In [5]:
# Replace missing cells with 0. main() skips the value 0 when it builds the
# single-item candidates, so these filled cells are never treated as items.
salesData = salesData.fillna(0)

In [6]:
# Master list: holds every apriori object whose frequency met the minimum support.
# buildMasterArr() appends to it and main() resets it to [] at the start of each run.
masterArr = []

In [7]:
def checkDBfreq(objArr):
    """Count how many rows of salesData contain each candidate's item set.

    Input:
        objArr -- array of candidate objects; each needs an aSet attribute (a set)
                  and a writable frequency attribute.

    Every frequency is first reset to 0. The module-level salesData frame is then
    scanned row by row, and a candidate's frequency is incremented for each row
    whose values include all items of its aSet.

    Nothing is returned: when finished, the frequency attribute of every object
    in objArr holds the count for that object's set.
    """
    # Ensure all objects have zero frequency before looping through the data set
    for obj in objArr:
        obj.frequency = 0

    # No candidates means nothing to count, so skip the scan of the data set
    if len(objArr) == 0:
        return

    for index, row in salesData.iterrows():
        # Build the row's item set once per row rather than once per candidate
        rowItems = set(row)
        for obj in objArr:
            if obj.aSet.issubset(rowItems):
                obj.frequency += 1

In [8]:
def buildMasterArr(objArr, min_support):
    """Append every sufficiently frequent candidate to the master array.

    Input:
        objArr      -- array of apriori objects whose frequency is already counted
        min_support -- minimum support chosen by the user; converted with int()
                       for each comparison, so a numeric string is accepted

    Objects with frequency >= int(min_support) are appended, in their original
    order, to the module-level masterArr. Nothing is returned.
    """
    # An empty candidate array leaves the master array untouched
    if len(objArr) < 1:
        return

    masterArr.extend(
        candidate
        for candidate in objArr
        if candidate.frequency >= int(min_support)
    )

In [9]:
def buildCandidateSet(objArr, iteration):
    """Fill objArr with every candidate item set of size `iteration`.

    Input:
        objArr    -- array that receives the new candidate objects
        iteration -- size of the candidate sets to generate

    The items of all masterArr entries whose set has size iteration - 1 are pooled
    together, and one apriori object (frequency 0) is appended to objArr for each
    combination of `iteration` items drawn from that pool. Subsets of the
    candidates are not checked here. Nothing is returned.
    """
    itemPool = set([])
    for frequent in masterArr:
        # Only the frequent sets found in the previous round contribute items
        if len(frequent.aSet) != iteration - 1:
            continue
        itemPool = itemPool.union(frequent.aSet)

    objArr.extend(
        apriori(set(combo), 0)
        for combo in itertools.combinations(itemPool, iteration)
    )

In [10]:
def checkFreqSubsets(objArr, iteration):
    """Prune candidates that have a subset not present in the master array.

    Input:
        objArr    -- array of candidate objects (each with an aSet); filtered in place
        iteration -- size of the candidate sets; subsets of size iteration - 1
                     are generated from each candidate

    A subset counts as frequent when it is contained in the aSet of some object
    in the module-level masterArr. A candidate is kept only if all of its
    subsets of size iteration - 1 are frequent.

    The original loop called objArr.remove() while iterating over objArr, which
    made the iteration skip the element after each removed candidate. The kept
    candidates are now collected first and written back with a slice assignment,
    so every candidate is examined and the caller's list object is still the one
    that gets updated. Nothing is returned.
    """
    # Snapshot the frequent item sets once; they do not change during pruning
    frequentSets = [thing.aSet for thing in masterArr]

    def hasOnlyFrequentSubsets(candidateSet):
        subsets = [
            set(combo)
            for combo in itertools.combinations(candidateSet, iteration - 1)
        ]
        # A candidate that yields no subsets at all is discarded, as before
        if not subsets:
            return False
        return all(
            any(subset.issubset(frequent) for frequent in frequentSets)
            for subset in subsets
        )

    objArr[:] = [obj for obj in objArr if hasOnlyFrequentSubsets(obj.aSet)]

In [11]:
def main(min_support):
    """Run the whole frequent-item-set search on salesData and print the result.

    Input:
        min_support -- minimum support; handed unchanged to buildMasterArr

    The function works in two sections:
      (1) The frequent sets of size 1 are produced. They are the basis for all
          larger set sizes.
      (2) Candidate sets of size 2, 3, ... are generated, pruned and counted until
          a round adds nothing to the master array.

    The module-level masterArr is reset at the start and holds all frequent sets
    when the function is done. Nothing is returned; the sets are printed.
    """
    # Start with a blank master array
    global masterArr
    masterArr = []

    # Section 1: gather every distinct value that appears anywhere in the data set
    uniqueItems = set()
    for _, transaction in salesData.iterrows():
        uniqueItems = uniqueItems.union(set(transaction))

    # One single-item candidate per value; 0 is left out
    singletons = [apriori({value}, 0) for value in uniqueItems if value != 0]

    # Count the single items and keep the frequent ones
    checkDBfreq(singletons)
    buildMasterArr(singletons, min_support)

    # Section 2: grow the set size, starting at 2 because size 1 is already done
    setSize = 2
    grew = True
    while grew:
        candidates = []  # every round starts from an empty candidate array
        buildCandidateSet(candidates, setSize)

        # From size 3 on, drop candidates that contain a non-frequent subset
        if setSize > 2:
            checkFreqSubsets(candidates, setSize)

        # Count the remaining candidates in the data set
        checkDBfreq(candidates)

        # Add the frequent ones and note whether the master array got longer
        sizeBefore = len(masterArr)
        buildMasterArr(candidates, min_support)
        grew = len(masterArr) != sizeBefore
        setSize += 1

    print("Nothing was added to master array. We are finished")

    # Print all frequent sets found in the data set
    print("Freq itemsets of all sizes with min support = ", min_support, ": ")
    for frequent in masterArr:
        print("Set: ", frequent.aSet, " Num of occurences: ", frequent.frequency)

In [12]:
# Interactive driver: keeps asking for a minimum support level and runs main()
# with it until the user answers the final question with anything other than 'Y'.
again = "Y"
while again=="Y":
    #Prompt user for min_support level
    min_support = input("What is the minimum level of support? ")

    #buildMasterArr converts min_support with int(); check the input here so a
    #non-integer answer is rejected before the data set is scanned
    try:
        int(min_support)
    except ValueError:
        print("The minimum level of support must be a whole number. Please try again.")
        continue

    #The answer is passed on exactly as typed, so main() prints it unchanged
    main(min_support)
    again = input("Run again with a different minimum support level? (Enter 'Y' to continue. Anything else to exit) ")
    

What is the minimum level of support? 3
Nothing was added to master array. We are finished
Freq itemsets of all sizes with min support =  3 : 
Set:  {1.0}  Num of occurences:  3
Set:  {9.0}  Num of occurences:  7
Set:  {11.0}  Num of occurences:  3
Set:  {32.0}  Num of occurences:  23
Set:  {36.0}  Num of occurences:  3
Set:  {38.0}  Num of occurences:  16
Set:  {39.0}  Num of occurences:  73
Set:  {48.0}  Num of occurences:  60
Set:  {55.0}  Num of occurences:  3
Set:  {65.0}  Num of occurences:  12
Set:  {76.0}  Num of occurences:  3
Set:  {79.0}  Num of occurences:  7
Set:  {89.0}  Num of occurences:  11
Set:  {101.0}  Num of occurences:  6
Set:  {107.0}  Num of occurences:  3
Set:  {110.0}  Num of occurences:  3
Set:  {123.0}  Num of occurences:  4
Set:  {147.0}  Num of occurences:  6
Set:  {4270.0}  Num of occurences:  3
Set:  {185.0}  Num of occurences:  6
Set:  {204.0}  Num of occurences:  4
Set:  {4307.0}  Num of occurences:  3
Set:  {225.0}  Num of occurences:  4
Set:  {237.0}

Run again with a different minimum support level? (Enter 'Y' to continue. Anything else to exit) Y
What is the minimum level of support? 4
Nothing was added to master array. We are finished
Freq itemsets of all sizes with min support =  4 : 
Set:  {9.0}  Num of occurences:  7
Set:  {32.0}  Num of occurences:  23
Set:  {38.0}  Num of occurences:  16
Set:  {39.0}  Num of occurences:  73
Set:  {48.0}  Num of occurences:  60
Set:  {65.0}  Num of occurences:  12
Set:  {79.0}  Num of occurences:  7
Set:  {89.0}  Num of occurences:  11
Set:  {101.0}  Num of occurences:  6
Set:  {123.0}  Num of occurences:  4
Set:  {147.0}  Num of occurences:  6
Set:  {185.0}  Num of occurences:  6
Set:  {204.0}  Num of occurences:  4
Set:  {225.0}  Num of occurences:  4
Set:  {237.0}  Num of occurences:  5
Set:  {4336.0}  Num of occurences:  4
Set:  {255.0}  Num of occurences:  4
Set:  {271.0}  Num of occurences:  6
Set:  {286.0}  Num of occurences:  4
Set:  {4393.0}  Num of occurences:  4
Set:  {301.0}  Num 

Run again with a different minimum support level? (Enter 'Y' to continue. Anything else to exit) Y
What is the minimum level of support? 7
Nothing was added to master array. We are finished
Freq itemsets of all sizes with min support =  7 : 
Set:  {9.0}  Num of occurences:  7
Set:  {32.0}  Num of occurences:  23
Set:  {38.0}  Num of occurences:  16
Set:  {39.0}  Num of occurences:  73
Set:  {48.0}  Num of occurences:  60
Set:  {65.0}  Num of occurences:  12
Set:  {79.0}  Num of occurences:  7
Set:  {89.0}  Num of occurences:  11
Set:  {310.0}  Num of occurences:  8
Set:  {405.0}  Num of occurences:  7
Set:  {413.0}  Num of occurences:  9
Set:  {475.0}  Num of occurences:  8
Set:  {4698.0}  Num of occurences:  8
Set:  {1486.0}  Num of occurences:  9
Set:  {1594.0}  Num of occurences:  8
Set:  {1677.0}  Num of occurences:  9
Set:  {14098.0}  Num of occurences:  8
Set:  {14386.0}  Num of occurences:  15
Set:  {1486.0, 39.0}  Num of occurences:  7
Set:  {32.0, 39.0}  Num of occurences:  11