# Importing libraries and calling the data

In [56]:
import pandas as pd
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
from mlxtend.preprocessing import TransactionEncoder
import matplotlib.pyplot as plt
import networkx as nx

fd=pd.read_csv('Groceries_dataset.csv')

# Exploratory data analysis (EDA)

In [26]:
df.head()

Unnamed: 0,0,1,2
0,Member_number,Date,itemDescription
1,1808,21-07-2015,tropical fruit
2,2552,05-01-2015,whole milk
3,2300,19-09-2015,pip fruit
4,1187,12-12-2015,other vegetables


In [30]:
df.isnull().sum()

Customer_Number    0
Date               0
Description        0
dtype: int64

In [31]:
df.sample(10)

Unnamed: 0,Customer_Number,Date,Description
32512,2996,26-09-2015,hygiene articles
22511,3491,26-10-2015,female sanitary products
24003,3050,11-03-2015,dessert
1688,4776,08-02-2015,sausage
17802,4293,05-07-2015,rolls/buns
32624,3250,12-04-2015,liquor
21118,2063,10-09-2015,soups
4103,3491,06-06-2015,fruit/vegetable juice
37113,2053,21-09-2015,rolls/buns
28298,1205,30-04-2014,oil


In [47]:
df_grouped = df.groupby('Member_number')['itemDescription'].apply(list).reset_index()

# Data preprocessing

In [52]:
df_grouped = df.groupby('Member_number')['itemDescription'].apply(list).reset_index()
te = TransactionEncoder()
te_ary = te.fit(df_grouped['itemDescription']).transform(df_grouped['itemDescription'])
df_encoded = pd.DataFrame(te_ary, columns=te.columns_)


# Applying Apriori Algorithm 

In [92]:
frequent_itemsets = apriori(df_encoded, min_support=0.1, use_colnames=True) 
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.5)

In [93]:
print("Frequent Itemsets:\n", frequent_itemsets)
print("\nAssociation Rules:\n", rules)

Frequent Itemsets:
      support                        itemsets
0   0.119548                          (beef)
1   0.158799                  (bottled beer)
2   0.213699                 (bottled water)
3   0.135967                   (brown bread)
4   0.126475                        (butter)
5   0.165213                   (canned beer)
6   0.100564                       (chicken)
7   0.185480                  (citrus fruit)
8   0.114931                        (coffee)
9   0.120831                          (curd)
10  0.133145                 (domestic eggs)
11  0.137506                   (frankfurter)
12  0.102617             (frozen vegetables)
13  0.124936         (fruit/vegetable juice)
14  0.116983                     (margarine)
15  0.139815                    (newspapers)
16  0.376603              (other vegetables)
17  0.177527                        (pastry)
18  0.170600                     (pip fruit)
19  0.132376                          (pork)
20  0.349666                    (ro

In [95]:
frequent_itemsets=frequent_itemsets.sort_values(by='support')
frequent_itemsets.tail(20)

Unnamed: 0,support,itemsets
41,0.15059,"(yogurt, whole milk)"
39,0.151103,"(soda, whole milk)"
26,0.154695,(whipped/sour cream)
1,0.158799,(bottled beer)
5,0.165213,(canned beer)
23,0.168291,(shopping bags)
18,0.1706,(pip fruit)
17,0.177527,(pastry)
35,0.178553,"(rolls/buns, whole milk)"
7,0.18548,(citrus fruit)


# report
We used Apriori Algorithm to discover the possible patterns of a grocery dataset. Here is what we have got:
## Exploratory Data Analysis
by working with the data set, we (fortunately) found that no missed values (NULL) in our dataset, on addition to noticing that a customer can puchase many items in a day, the thing that helped us understanding the data and preparing for calculating support
## Findings of association rule learning
After deploying Apriori Algorithm to calculate support (with minimum value of 0.1), we resulted that there were some mathches. for example:
1. The people who buy other vegetables will buy whole milk (support: %19.13).
2. The people who buy rolls/buns Will also buy whole milk (support: %17.85).
3. The people who buy soda will buy whole milk (support: %15.11).
We can also notice that the whole milk has the highest support value, which means it's more propable to be bought with other items 