In [1]:
# import libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
data = pd.read_excel('Online retail.xlsx')
data

Unnamed: 0,"shrimp,almonds,avocado,vegetables mix,green grapes,whole weat flour,yams,cottage cheese,energy drink,tomato juice,low fat yogurt,green tea,honey,salad,mineral water,salmon,antioxydant juice,frozen smoothie,spinach,olive oil"
0,"burgers,meatballs,eggs"
1,chutney
2,"turkey,avocado"
3,"mineral water,milk,energy bar,whole wheat rice..."
4,low fat yogurt
...,...
7495,"butter,light mayo,fresh bread"
7496,"burgers,frozen vegetables,eggs,french fries,ma..."
7497,chicken
7498,"escalope,green tea"


In [3]:
# naming the columns as transactions

data = data.rename(columns = {data.columns[0] : 'Transactions'})
data.columns

Index(['Transactions'], dtype='object')

## Data Preprocessing:

In [4]:
data.shape

(7500, 1)

In [5]:
data.columns

Index(['Transactions'], dtype='object')

In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7500 entries, 0 to 7499
Data columns (total 1 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Transactions  7500 non-null   object
dtypes: object(1)
memory usage: 58.7+ KB


In [7]:
data.isna().sum()

Transactions    0
dtype: int64

In [8]:
data[data.duplicated()]

Unnamed: 0,Transactions
33,cookies
41,spaghetti
59,spaghetti
63,"turkey,eggs"
64,french fries
...,...
7490,herb & pepper
7491,"chocolate,escalope"
7494,"pancakes,light mayo"
7497,chicken


In [9]:
data.drop_duplicates(ignore_index=True,inplace=True)

In [10]:
# creating a unique column as transacton_id for easy tracing

data['Transaction_ID'] = range(1001,1001 + len(data))
data

Unnamed: 0,Transactions,Transaction_ID
0,"burgers,meatballs,eggs",1001
1,chutney,1002
2,"turkey,avocado",1003
3,"mineral water,milk,energy bar,whole wheat rice...",1004
4,low fat yogurt,1005
...,...,...
5170,"burgers,salmon,pancakes,french fries,frozen sm...",6171
5171,"turkey,burgers,dessert wine,shrimp,pasta,tomat...",6172
5172,"butter,light mayo,fresh bread",6173
5173,"burgers,frozen vegetables,eggs,french fries,ma...",6174


## Association Rule Mining:

In [11]:
import mlxtend
from mlxtend.frequent_patterns import apriori,association_rules
from mlxtend.preprocessing import TransactionEncoder

In [12]:
transaction = data['Transactions'].tolist()

In [13]:
te = TransactionEncoder()
enc_matrix = te.fit_transform(transaction)

In [14]:
te.columns_

[' ',
 '&',
 ',',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z']

In [15]:
df = pd.DataFrame(enc_matrix,columns=te.columns_)
df

Unnamed: 0,Unnamed: 1,&,",",a,b,c,d,e,f,g,...,p,r,s,t,u,v,w,x,y,z
0,False,False,True,True,True,False,False,True,False,True,...,False,True,True,True,True,False,False,False,False,False
1,False,False,False,False,False,True,False,True,False,False,...,False,False,False,True,True,False,False,False,True,False
2,False,False,True,True,False,True,True,True,False,False,...,False,True,False,True,True,True,False,False,True,False
3,True,False,True,True,True,True,False,True,False,True,...,False,True,False,True,False,False,True,False,True,False
4,True,False,False,True,False,False,False,False,True,True,...,False,True,False,True,True,False,True,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5170,True,False,True,True,True,True,True,True,True,True,...,True,True,True,True,True,False,False,False,False,True
5171,True,False,True,True,True,True,True,True,True,True,...,True,True,True,True,True,False,True,False,True,True
5172,True,False,True,True,True,False,True,True,True,True,...,False,True,True,True,True,False,False,False,True,False
5173,True,False,True,True,True,True,False,True,True,True,...,False,True,True,True,True,True,False,False,False,True


In [16]:
freq_Trans = apriori(df,min_support = 0.6,max_len=5,use_colnames=True)
freq_Trans

Unnamed: 0,support,itemsets
0,0.908986,( )
1,0.978744,"(,)"
2,0.950531,(a)
3,0.822222,(c)
4,0.983188,(e)
...,...,...
3256,0.622222,"(n, l, o, r, t)"
3257,0.606763,"(n, l, o, s, t)"
3258,0.634396,"(n, l, s, r, t)"
3259,0.629565,"(l, o, s, r, t)"


In [17]:
rules = association_rules(freq_Trans,min_threshold=1)
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
0,"(h, n, l)","(,)",0.622802,0.978744,0.622802,1.0,1.021718,1.0,0.013238,inf,0.056352,0.636328,1.0,0.818164
1,"(h, n, l, )","(,)",0.607923,0.978744,0.607923,1.0,1.021718,1.0,0.012922,inf,0.054214,0.621125,1.0,0.810563
2,"(h, n, l, a)","(,)",0.616425,0.978744,0.616425,1.0,1.021718,1.0,0.013103,inf,0.055416,0.629812,1.0,0.814906
3,"(e, g, i, o)","(,)",0.638454,0.978744,0.638454,1.0,1.021718,1.0,0.013571,inf,0.058792,0.65232,1.0,0.82616
4,"(e, n, l, h)","(,)",0.621836,0.978744,0.621836,1.0,1.021718,1.0,0.013218,inf,0.056208,0.635341,1.0,0.81767
5,"(h, g, i, n)","(,)",0.602899,0.978744,0.602899,1.0,1.021718,1.0,0.012815,inf,0.053528,0.615992,1.0,0.807996
6,"(r, g, i, o)","(,)",0.617198,0.978744,0.617198,1.0,1.021718,1.0,0.013119,inf,0.055528,0.630602,1.0,0.815301
7,"(g, i, s, o)","(,)",0.615845,0.978744,0.615845,1.0,1.021718,1.0,0.01309,inf,0.055332,0.62922,1.0,0.81461
8,"(h, r, n, l)","(,)",0.603285,0.978744,0.603285,1.0,1.021718,1.0,0.012823,inf,0.05358,0.616387,1.0,0.808193
9,"(h, t, n, l)","(,)",0.605024,0.978744,0.605024,1.0,1.021718,1.0,0.01286,inf,0.053816,0.618164,1.0,0.809082


In [18]:
rules = association_rules(freq_Trans,min_threshold=0)
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
0,"(,)",( ),0.978744,0.908986,0.898744,0.918263,1.010206,1.0,0.009080,1.113499,0.475294,0.908753,0.101930,0.953498
1,( ),"(,)",0.908986,0.978744,0.898744,0.988733,1.010206,1.0,0.009080,1.886574,0.111003,0.908753,0.469939,0.953498
2,(a),( ),0.950531,0.908986,0.878261,0.923968,1.016483,1.0,0.014242,1.197060,0.327798,0.895037,0.164620,0.945084
3,( ),(a),0.908986,0.950531,0.878261,0.966199,1.016483,1.0,0.014242,1.463524,0.178166,0.895037,0.316718,0.945084
4,(c),( ),0.822222,0.908986,0.761353,0.925969,1.018685,1.0,0.013965,1.229418,0.103173,0.785017,0.186607,0.881777
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
66949,(n),"(r, t, s, o)",0.875169,0.701256,0.651787,0.744756,1.062032,1.0,0.038070,1.170425,0.467900,0.704911,0.145610,0.837107
66950,(o),"(r, t, n, s)",0.853140,0.721739,0.651787,0.763986,1.058535,1.0,0.036043,1.179004,0.376539,0.706092,0.151826,0.833533
66951,(s),"(r, t, n, o)",0.920773,0.688502,0.651787,0.707870,1.028130,1.0,0.017833,1.066297,0.345339,0.680727,0.062175,0.827272
66952,(r),"(t, n, s, o)",0.910918,0.673623,0.651787,0.715528,1.062208,1.0,0.038172,1.147308,0.657429,0.698778,0.128395,0.841556


## Interview Questions:

###  1.What is lift and why is it important in Association rules?
     Lift measures how much more likely two items are bought together than by random chance.a Lift > 1 indicates a strong poditive relatioship, making it useful for finding meaningful product associations

### 2.	What is support and Confidence. How do you calculate them?
     Support tells how often a particular item or combination of items appears in all transactions
     Confidence tells how often the consequent (B) is bought when the antecedent (A) is bought - it show the reliability of the rule

### 3.	What are some limitations or challenges of Association rules mining?
    Association rule mining can produce too many rules, making it hard to find meaningful ones.
    Its compuationally expensive for large datasets and depends heavily on chosen thersholds.
    Also, it ignores item order, quantity,and real-world context of purchases.