In [1]:

###############################################################################
## Importing Libraries
import re
import warnings
import pandas as pd
import json
import numpy as np
import pyfpgrowth
import datetime
import time

# Record the wall-clock start so the total runtime can be reported at the end
x = datetime.datetime.now()
print("Program beginning at :", x)

## Load the raw transactions export (comma-delimited text file)
# NOTE(review): absolute, machine-specific path -- consider a configurable data directory
transactions_path = "D:\\Data Mining and Machine Learning 2\\Project\\Original Dataset\\BigTempRb.txt"
df = pd.read_csv(transactions_path, delimiter=',')

# Narrow the frame to the request/response columns listed here
selected_columns = [
    "RequestTimestamp",
    "ResponseRgBasketId",
    "RequestSiteId",
    "RequestBasketValue",
    "RequestBasketId",
    "RequestNumberBasketItems",
    "RequestBasketJsonString",
]
df = df[selected_columns]

# Empty frame with a per-item column layout; it is not referenced again in
# the visible part of this notebook
df1 = pd.DataFrame(
    columns=['ResponseRgBasketId', 'BasketID', 'ItemID', 'Qty', 'Price']
)


## Parse each basket's JSON string into a list of the "b" values of its items
itemlist = []
for basket_json in df["RequestBasketJsonString"]:
    basket = json.loads(basket_json)
    # dict.fromkeys drops repeated values while keeping first-seen order
    unique_items = dict.fromkeys(entry["b"] for entry in basket['items'])
    itemlist.append(list(unique_items))

## One row per basket; the single "ItemID" column holds that basket's item list
df_tra_g = pd.DataFrame({"ItemID": itemlist})
    
    
# Timestamp marking the start of rule mining
y = datetime.datetime.now()
print("Rule generation started at :", y)

###################################################################
## for creating single rules files with pyfpgrowth algorithm ##
###################################################################

## Input parameters (each is a fraction between 0 and 1)
input_support = 0.0002
input_confidence = 0.001

## Run pyfpgrowth
# The support fraction is turned into a rounded basket count before it is
# handed to find_frequent_patterns.
basket_lists = df_tra_g['ItemID']
support = round(input_support * len(basket_lists))
patterns = pyfpgrowth.find_frequent_patterns(basket_lists, support)
rules = pyfpgrowth.generate_association_rules(patterns, input_confidence)


## Build the single-rule table: rules with exactly one antecedent item and
## exactly one target item.
# `rules` maps an antecedent tuple to a (consequent tuple, confidence) pair.
# Rows are gathered in a list and the frame is constructed once, because
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on
# every call when used inside a loop.
single_rule_rows = [
    {
        'FirstItem': antecedent[0],
        'TargetItem': " ".join(outcome[0]),
        'Support': 0,  # placeholder; overwritten once support is calculated
        'Confidence': outcome[1],
    }
    for antecedent, outcome in rules.items()
    if len(antecedent) == 1 and len(outcome[0]) == 1
]

# Passing `columns` keeps the column order and also yields an empty frame
# with the expected columns when no rule qualifies.
df_FP = pd.DataFrame(
    single_rule_rows,
    columns=['FirstItem', 'TargetItem', 'Support', 'Confidence'],
)

    
    
## Tidy up the single-rule table
df_FP["Confidence"] = df_FP["Confidence"].round(4)

# Swap empty-string item ids for NaN so that dropna() discards those rows
for item_col in ("FirstItem", "TargetItem"):
    df_FP[item_col] = df_FP[item_col].replace('', np.nan, regex=True)
df_FP = df_FP.dropna()

## Number of baskets; used as the denominator when support is computed
N = len(df_tra_g['ItemID'])


## functions for calculating the support
def freq_single(row):
    """Count the baskets in ``df_tra_g['ItemID']`` that contain ``row["FirstItem"]``.

    Only the antecedent item is checked; the rule's target item plays no
    part in the count.
    """
    wanted = {row["FirstItem"]}
    return sum(1 for basket in df_tra_g['ItemID'] if wanted.issubset(set(basket)))

def supp(row):
    """Return ``row["freq"]`` divided by the global basket count ``N``, rounded to 4 decimals."""
    share = row["freq"] / N
    return round(share, 4)


## Compute support for every single-item rule
# "freq" is a scratch column: supp() reads it row by row, after which it is
# discarded again.
df_FP["freq"] = df_FP.apply(freq_single, axis=1)
df_FP["Support"] = df_FP.apply(supp, axis=1)
df_FP = df_FP.drop(columns=["freq"])


## saving the files in txt format
#df_FP.to_csv('Single_Rule.txt', header=True, index=False, sep='\t')    
    

#########################################################################
############### with PyFpgrowth, formation of double rules  ##############

## Thresholds for the two-item rules (each is a fraction between 0 and 1)
input_support = 0.0002
input_confidence = 0.005


## Run pyfpgrowth again with these thresholds
# The support fraction is turned into a rounded basket count first.
basket_lists = df_tra_g['ItemID']
support = round(input_support * len(basket_lists))
patterns = pyfpgrowth.find_frequent_patterns(basket_lists, support)
rules_t = pyfpgrowth.generate_association_rules(patterns, input_confidence)


## Build the double-rule table: rules with exactly two antecedent items and
## exactly one target item.
# `rules_t` maps an antecedent tuple to a (consequent tuple, confidence) pair.
# Rows are gathered in a list and the frame is constructed once, because
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on
# every call when used inside a loop.
double_rule_rows = [
    {
        'FirstItem': antecedent[0],
        'SecondItem': antecedent[1],
        'TargetItem': " ".join(outcome[0]),
        'Support': np.nan,  # placeholder; overwritten once support is calculated
        'Confidence': outcome[1],
    }
    for antecedent, outcome in rules_t.items()
    if len(antecedent) == 2 and len(outcome[0]) == 1
]

# Passing `columns` keeps the column order and also yields an empty frame
# with the expected columns when no rule qualifies.
df_FP_t = pd.DataFrame(
    double_rule_rows,
    columns=['FirstItem', 'SecondItem', 'TargetItem', 'Support', 'Confidence'],
)
    

## Tidy up the two-item rule table
df_FP_t["Confidence"] = df_FP_t["Confidence"].round(4)

# Swap empty-string item ids for NaN in each of the three item columns
for item_col in ("FirstItem", "SecondItem", "TargetItem"):
    df_FP_t[item_col] = df_FP_t[item_col].replace('', np.nan, regex=True)

## Number of baskets; used as the denominator when support is computed
N = len(df_tra_g['ItemID'])


def freq(row):
    """Count the baskets in ``df_tra_g['ItemID']`` holding both antecedent items.

    The two items are ``row["FirstItem"]`` and ``row["SecondItem"]``; the
    rule's target item plays no part in the count.
    """
    pair = {row["FirstItem"], row["SecondItem"]}
    return sum(1 for basket in df_tra_g['ItemID'] if pair.issubset(set(basket)))
## Compute support for every two-item rule, then discard incomplete rows
# "freq" is a scratch column: supp() reads it row by row, after which it is
# dropped; dropna() then removes any row that still has a missing value.
df_FP_t["freq"] = df_FP_t.apply(freq, axis=1)
df_FP_t["Support"] = df_FP_t.apply(supp, axis=1)
df_FP_t = df_FP_t.drop(columns=["freq"]).dropna()



# Timestamp marking the end of rule mining
z = datetime.datetime.now()
print("Rule generation completes at :", z)
print("Total time taken for generating single and double rules was :", z - y)


## Write both rule tables as tab-separated text, with a header row and
## without the index column
for rule_table, out_name in (
    (df_FP, 'Single_Rule_4MB.txt'),
    (df_FP_t, 'Rules_Dataframe_4MB.txt'),
):
    rule_table.to_csv(out_name, header=True, index=False, sep='\t')

# Record the wall-clock end and report the total runtime
a = datetime.datetime.now()
print("Program ending at :", a)
print("Total time taken for program completion :", a - x)


Program beginning at : 2020-05-28 16:05:50.409800
Rule generation started at : 2020-05-28 16:05:52.981195
Rule generation completes at : 2020-05-28 16:06:31.527181
Total time taken for generating single and double rules was : 0:00:38.545986
Program ending at : 2020-05-28 16:06:31.573426
Total time taken for program completion : 0:00:41.163626


In [5]:
df_FP.head()

Unnamed: 0,FirstItem,TargetItem,Support,Confidence
0,145,2083803000000,0.0707,0.0226
1,5000295143975,145,0.0002,1.0
2,5014408214049,5000128861069,0.0002,1.0
3,5000128646413,145,0.0002,1.0
4,5000128839419,5000128839433,0.0002,1.0


In [4]:
df_FP_t.head()

Unnamed: 0,FirstItem,SecondItem,TargetItem,Support,Confidence
0,9770043722443,9770953098270,9770307268922,0.0004,0.5
1,9770043722443,9770954895250,9770953098270,0.0002,1.0
2,9770953098270,9770954895250,9770043722443,0.0002,1.0
3,5000128861069,5000128998031,644,0.0009,0.3333
4,5000128861069,5010026514424,5000128998031,0.0002,1.0


In [8]:
rules

{('0000000000145',): (('2083803000000',), 0.022598870056497175),
 ('5000295143975',): (('0000000000145',), 1.0),
 ('5014408214049',): (('5000128861069',), 1.0),
 ('5000128646413',): (('0000000000145',), 1.0),
 ('5000128839419',): (('5000128839433',), 1.0),
 ('5000128839433',): (('5000128839419',), 0.6666666666666666),
 ('7613034059031',): (('5000128104524',), 1.0),
 ('3046920029759',): (('5016311613414',), 0.5),
 ('5016311613414',): (('3046920029759',), 1.0),
 ('0000050248278',): (('0000000007535',), 1.0),
 ('5000128936712',): (('0000000000145',), 1.0),
 ('5000128971157',): (('0000000000145',), 0.2857142857142857),
 ('5010044007946',): (('5000128971157',), 1.0),
 ('5036589200017',): (('5000128271165',), 1.0),
 ('8719214520601',): (('0000000000145',), 1.0),
 ('1230000022002',): (('1230000022040',), 1.0),
 ('1230000022040',): (('1230000022002',), 1.0),
 ('0294290000004',): (('0295920000005',), 0.4),
 ('0295920000005',): (('0294290000004',), 1.0),
 ('4770608254049',): (('0000000000145',),