> ## Installing packages for apriori algorithm (apyori)

In [37]:
!pip install apyori



> ## Finding frequent itemsets in groceries data

In [67]:
import pandas as pd
import numpy as np
from apyori import apriori

In [68]:
df = pd.read_csv('groceries_dataset.csv')

In [69]:
df.shape

(9835, 32)

In [70]:
records = []
for i in range(0, 9835):
    records.append([str(df.values[i,u]) for u in range(0, 32)]) #records of all rows computation

In [71]:
rules = apriori(records, min_support=0.0022, min_confidence=0.20, min_lift=3, min_length = 4)

In [72]:
# Store rules in result variable
results = list(rules)# See the items that were bought together with their support
results_list = []
for i in range(0, len(results)):
    results_list.append('RULE:\t' + str(results[i][0]) + '\nSUPPORT:\t' + str(results[i][1]))

In [73]:
print(len(results_list)) #total results 

1072


In [74]:
for i in range(len(results_list)):
  print(results_list[i])

RULE:	frozenset({'hamburger meat', 'Instant food products'})
SUPPORT:	0.003050330452465684
RULE:	frozenset({'baking powder', 'whipped/sour cream'})
SUPPORT:	0.004575495678698526
RULE:	frozenset({'root vegetables', 'beef'})
SUPPORT:	0.017386883579054397
RULE:	frozenset({'berries', 'whipped/sour cream'})
SUPPORT:	0.009049313675648195
RULE:	frozenset({'liquor', 'bottled beer'})
SUPPORT:	0.004677173360447382
RULE:	frozenset({'red/blush wine', 'bottled beer'})
SUPPORT:	0.004880528723945094
RULE:	frozenset({'coffee', 'condensed milk'})
SUPPORT:	0.002541942043721403
RULE:	frozenset({'flour', 'margarine'})
SUPPORT:	0.0037620742247076767
RULE:	frozenset({'flour', 'sugar'})
SUPPORT:	0.00498220640569395
RULE:	frozenset({'whipped/sour cream', 'flour'})
SUPPORT:	0.004067107269954245
RULE:	frozenset({'frankfurter', 'mustard'})
SUPPORT:	0.002541942043721403
RULE:	frozenset({'root vegetables', 'herbs'})
SUPPORT:	0.007015760040671073
RULE:	frozenset({'salty snack', 'popcorn'})
SUPPORT:	0.00223690899847

RULE 1: \t fro
zenset({'Instant food products', 'hamburger meat'})\nSUPPORT:\t0.003050330452465684

If a customer buys "Instant food products" and "hamburger meat", then there is a support of 0.00305, which means that these items are purchased together in approximately 0.305% of all transactions.

> ### Applying apriori algorithm on working people's dataset

In [84]:
#importing data.csv
columns = ["age","workClass","fnlwgt","education","education_num","marital_status","occupation","relationship","race","sex","capital-gain","capital-loss","hours-per-week","native-country","income"]
data = pd.read_csv("DATA.csv",names=columns)
data.head()

Unnamed: 0,age,workClass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [85]:
data = data.head(1000)

In [86]:
print(data.columns)
data.shape

Index(['age', 'workClass', 'fnlwgt', 'education', 'education_num',
       'marital_status', 'occupation', 'relationship', 'race', 'sex',
       'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
       'income'],
      dtype='object')


(1000, 15)

> ### Data Preprocessing

In [87]:
data.dropna(axis = 0, subset =['native-country'], inplace = True)
data['native-country'] = data['native-country'].astype('str')
data = data[~data['native-country'].str.contains("\?")]

missing_values =data.isna().sum()
print("Missing values: \n",missing_values)

Missing values: 
 age               0
workClass         0
fnlwgt            0
education         0
education_num     0
marital_status    0
occupation        0
relationship      0
race              0
sex               0
capital-gain      0
capital-loss      0
hours-per-week    0
native-country    0
income            0
dtype: int64


  data = data[~data['native-country'].str.contains("\?")]


In [88]:
data.shape

(982, 15)

In [89]:
data.head()

Unnamed: 0,age,workClass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [90]:
data_rows = []
for i in range(0, 982):
    data_rows.append([str(data.values[i,u]) for u in range(0, 15)])

> ### Optimizing results rules of apriori data

#### Ranking top 10 rules based on support and confidence combination.

In [91]:
rule = apriori(data_rows, min_support=0.3, min_confidence=0.6)
sorted_rules = sorted(rule, key=lambda rule: rule.ordered_statistics[0].confidence, reverse=True)[:10]
for rule1 in sorted_rules:
    lhs = ', '.join(rule1.ordered_statistics[0].items_base)
    rhs = ', '.join(rule1.ordered_statistics[0].items_add)
    confidence = rule1.ordered_statistics[0].confidence
    support = rule1.support
    print(f"Rule: {lhs} -> {rhs}")
    print(f"Confidence: {confidence}")
    print(f"Support: {support}")


Rule:  -> 0
Confidence: 1.0
Support: 1.0
Rule:  Female -> 0
Confidence: 1.0
Support: 0.33095723014256617
Rule:  HS-grad -> 0
Confidence: 1.0
Support: 0.32179226069246436
Rule:  HS-grad -> 9
Confidence: 1.0
Support: 0.32179226069246436
Rule:  Husband ->  Male
Confidence: 1.0
Support: 0.37169042769857435
Rule:  Husband ->  Married-civ-spouse
Confidence: 1.0
Support: 0.37169042769857435
Rule:  Husband -> 0
Confidence: 1.0
Support: 0.37169042769857435
Rule:  Married-civ-spouse -> 0
Confidence: 1.0
Support: 0.4378818737270876
Rule:  Never-married -> 0
Confidence: 1.0
Support: 0.34826883910386963
Rule: 40 -> 0
Confidence: 1.0
Support: 0.48268839103869654


These rules are generated on min_support 0.3 and confidence of 0.6 , therefore the output has all relations that achieve these rules. Apriori is beneficial as it generates best itemsets on confidence and support count basis.

Usefull:
1. Rule: -> 0
Description: This rule indicates that all instances in the dataset have the target variable set to 0, with a confidence of 1.0 and support of 1.0. This rule suggests a strong association with the majority class.
2. Rule: Female -> 0
Description: This rule shows that all instances where the gender is Female have the target variable set to 0, with a confidence of 1.0 and support of 0.334. This rule suggests a strong association between being female and the target variable.
3. Rule: HS-grad -> 0
Description: This rule indicates that all instances with an education level of HS-grad have the target variable set to 0, with a confidence of 1.0 and support of 0.314. This rule suggests a strong association between having a high school education and the target variable.
4. Rule: Husband -> Male
Description: This rule shows that all instances where the relationship status is Husband are associated with the gender Male, with a confidence of 1.0 and support of 0.378. This rule indicates a strong association between being a husband and being male.
5. Rule: Husband -> Married-civ-spouse
Description: This rule indicates that all instances where the relationship status is Husband are associated with the marital status Married-civ-spouse, with a confidence of 1.0 and support of 0.378. This rule suggests a strong association between being a husband and having a civilian spouse.
6. Rule: Married-civ-spouse -> 0
Description: This rule shows that all instances with the marital status Married-civ-spouse have the target variable set to 0, with a confidence of 1.0 and support of 0.452. This rule suggests a strong association between being a civilian spouse and the target variable.
7. Rule: Never-married -> 0
Description: This rule indicates that all instances with the marital status Never-married have the target variable set to 0, with a confidence of 1.0 and support of 0.34. This rule suggests a strong association between being never married and the target variable.
8. Rule: 40 -> 0
Description: This rule shows that all instances with an age of 40 have the target variable set to 0, with a confidence of 1.0 and support of 0.53. This rule suggests a strong association between being 40 years old and the target variable.

In [92]:

rule2 = apriori(data_rows, min_support=0.5, min_confidence=0.5)
sorted_rules = sorted(rule2, key=lambda rule: rule.ordered_statistics[0].confidence, reverse=True)[:10]

for rule2 in sorted_rules:
    lhs = ', '.join(rule2.ordered_statistics[0].items_base)
    rhs = ', '.join(rule2.ordered_statistics[0].items_add)
    confidence = rule2.ordered_statistics[0].confidence
    support = rule2.support
    print(f"Rule: {lhs} -> {rhs}")
    print(f"Confidence: {confidence}")
    print(f"Support: {support}")



Rule:  -> 0
Confidence: 1.0
Support: 1.0
Rule:  ->  United-States
Confidence: 0.9185336048879837
Support: 0.9185336048879837
Rule:  ->  United-States, 0
Confidence: 0.9185336048879837
Support: 0.9185336048879837
Rule:  ->  White
Confidence: 0.8482688391038696
Support: 0.8482688391038696
Rule:  ->  White, 0
Confidence: 0.8482688391038696
Support: 0.8482688391038696
Rule:  ->  United-States,  White
Confidence: 0.7922606924643585
Support: 0.7922606924643585
Rule:  ->  United-States,  White, 0
Confidence: 0.7922606924643585
Support: 0.7922606924643585
Rule:  ->  <=50K
Confidence: 0.769857433808554
Support: 0.769857433808554
Rule:  ->  <=50K, 0
Confidence: 0.769857433808554
Support: 0.769857433808554
Rule:  ->  <=50K,  United-States
Confidence: 0.7057026476578412
Support: 0.7057026476578412


1. Rule: -> 0
Explanation: This rule indicates that there is a strong association between the target variable (<=50K or >50K) and the 0 value, which is the majority class in the dataset. This rule has a confidence of 1.0 and support of 1.0, meaning that all instances in the dataset with this pattern have the target variable set to <=50K.
2. Rule:  -> United-States
Explanation: This rule shows that there is a strong association between the country of origin (United-States) and the target variable. The confidence of 0.906 indicates that 90.6% of instances in the dataset with this pattern have the target variable set to <=50K.
3. Rule:  -> 0, United-States
Explanation: This rule indicates that there is a strong association between the 0 value, the majority class, and the country of origin (United-States). The confidence of 0.906 indicates that 90.6% of instances in the dataset with this pattern have the target variable set to <=50K.
4. Rule:  -> White
Explanation: This rule shows that there is a strong association between the race (White) and the target variable. The confidence of 0.836 indicates that 83.6% of instances in the dataset with this pattern have the target variable set to <=50K.
5. Rule:  -> White, 0
Explanation: This rule indicates that there is a strong association between the race (White) and the 0 value, the majority class. The confidence of 0.836 indicates that 83.6% of instances in the dataset with this pattern have the target variable set to <=50K.
6. Rule:  -> <=50K
Explanation: This rule shows that there is a strong association between the target variable (<=50K) and itself. This rule has a confidence of 0.782 and support of 0.782, meaning that 78.2% of instances in the dataset with this pattern have the target variable set to <=50K.
6. Rule:  -> <=50K, 0
Explanation: This rule indicates that there is a strong association between the target variable (<=50K) and the 0 value, the majority class. The confidence of 0.782 indicates that 78.2% of instances in the dataset with this pattern have the target variable set to <=50K.
7. Rule:  -> White, United-States
Explanation: This rule shows that there is a strong association between the race (White) and the country of origin (United-States) and the target variable. The confidence of 0.772 indicates that 77.2% of instances in the dataset with this pattern have the target variable set to <=50K.
8. Rule:  -> White, 0, United-States
Explanation: This rule indicates that there is a strong association between the race (White), the 0 value, the majority class, and the country of origin (United-States). The confidence of 0.772 indicates that 77.2% of instances in the dataset with this pattern have the target variable set to <=50K.
9. Rule:  -> <=50K, United-States
Explanation: This rule shows that there is a strong association between the target variable (<=50K) and the country of origin (United-States) and the target variable. The confidence of 0.708 indicates that 70.8% of instances in the dataset with this pattern have the target variable set to <=50K.


In [93]:

rules = apriori(data_rows, min_support=0.5, min_confidence=0.2)
sorted_rules = sorted(rules, key=lambda rule: rule.ordered_statistics[0].confidence, reverse=True)[:10]

for rule in sorted_rules:
    lhs = ', '.join(rule.ordered_statistics[0].items_base)
    rhs = ', '.join(rule.ordered_statistics[0].items_add)
    confidence = rule.ordered_statistics[0].confidence
    support = rule.support
    print(f"Rule: {lhs} -> {rhs}")
    print(f"Confidence: {confidence}")
    print(f"Support: {support}")


Rule:  -> 0
Confidence: 1.0
Support: 1.0
Rule:  ->  United-States
Confidence: 0.9185336048879837
Support: 0.9185336048879837
Rule:  ->  United-States, 0
Confidence: 0.9185336048879837
Support: 0.9185336048879837
Rule:  ->  White
Confidence: 0.8482688391038696
Support: 0.8482688391038696
Rule:  ->  White, 0
Confidence: 0.8482688391038696
Support: 0.8482688391038696
Rule:  ->  United-States,  White
Confidence: 0.7922606924643585
Support: 0.7922606924643585
Rule:  ->  United-States,  White, 0
Confidence: 0.7922606924643585
Support: 0.7922606924643585
Rule:  ->  <=50K
Confidence: 0.769857433808554
Support: 0.769857433808554
Rule:  ->  <=50K, 0
Confidence: 0.769857433808554
Support: 0.769857433808554
Rule:  ->  <=50K,  United-States
Confidence: 0.7057026476578412
Support: 0.7057026476578412


1. Rule: -> 0
Confidence: 1.0
Support: 1.0
Explanation: This rule indicates that whenever the antecedent is present, the consequent '0' is always present with 100% confidence and support. This rule essentially represents a single itemset without any specific association.
2. Rule: -> United-States
Confidence: 0.9081632653061225
Support: 0.9081632653061225
Explanation: This rule suggests that when there's no specific antecedent, the consequent 'United-States' is present with a high confidence of approximately 91% and support of approximately 91%.
3. Rule: -> United-States, 0
Confidence: 0.9081632653061225
Support: 0.9081632653061225
Explanation: Similar to the previous rule, but here the consequent is 'United-States' along with '0', and the confidence and support values are the same.
4. Rule: -> White
Confidence: 0.8367346938775511
Support: 0.8367346938775511
Explanation: When there's no specific antecedent, the consequent 'White' is present with a confidence of approximately 83.67% and support of approximately 83.67%.
5. Rule: -> White, 0
Confidence: 0.8367346938775511
Support: 0.8367346938775511
Explanation: Similar to the previous rule, but here the consequent is 'White' along with '0', and the confidence and support values are the same.
6. Rule: -> <=50K
Confidence: 0.7775510204081633
Support: 0.7775510204081633
Explanation: When there's no specific antecedent, the consequent '<=50K' is present with a confidence of approximately 77.76% and support of approximately 77.76%.
7. Rule: -> <=50K, 0
Confidence: 0.7775510204081633
Support: 0.7775510204081633
Explanation: Similar to the previous rule, but here the consequent is '<=50K' along with '0', and the confidence and support values are the same.
8. Rule: -> United-States, White
Confidence: 0.7755102040816326
Support: 0.7755102040816326
Explanation: When there's no specific antecedent, the consequents 'United-States' and 'White' are present together with a confidence of approximately 77.55% and support of approximately 77.55%.
9. Rule: -> United-States, White, 0
Confidence: 0.7755102040816326
Support: 0.7755102040816326
Explanation: Similar to the previous rule, but here the consequents are 'United-States', 'White', and '0', and the confidence and support values are the same.
10. Rule: -> <=50K, United-States
Confidence: 0.7061224489795919
Support: 0.7061224489795919
Explanation: When there's no specific antecedent, the consequents '<=50K' and 'United-States' are present together with a confidence of approximately 70.61% and support of approximately 70.61%.

> #### Rank best 10 relations based on interest ,combination of interest and support

In [None]:
min_support_ranges = [
    (0.5,0.2),(0.6,0.2),(0.8,0.4) #min_support,min_interest
]
for min_interest,min_support in min_support_ranges:
    rules = apriori(data_rows, min_support=min_support, min_confidence=0.3)
    # sorted_rules = sorted(rules, key=lambda rule: rule.ordered_statistics[0].confidence, reverse=True)[:10]
    for rule in rules:
        support_lhs = rule.support
        support_rhs = rule.ordered_statistics[0].items_add
        support_rhs = data[data['income'] == ' <=50K'].shape[0] / len(data_rows)
        interest = support_lhs * (rule.ordered_statistics[0].confidence - support_rhs)
        rule.interest = interest
        
    sorted_rules = sorted(rules, key=lambda rule: rule.interest, reverse=True)[:10]#top 10 rank based on interest
    
    for rule in sorted_rules:
        lhs = ', '.join(rule.ordered_statistics[0].items_base)
        rhs = ', '.join(rule.ordered_statistics[0].items_add)
        confidence = rule.ordered_statistics[0].confidence
        support = rule.support
        lhs = rule.ordered_statistics[0].items_base
        rhs = rule.ordered_statistics[0].items_add
        support_lhs = data[lhs].sum() / 500
        support_rhs = data[rhs].sum() / 500
        interest = support_lhs * (confidence - support_rhs)
        rule.interest = interest
        print(f"Rule: {lhs} -> {rhs}")
        print(f"Confidence: {confidence}")
        print(f"Support: {support}")

## Association Rules Interset Insights:

1. Rule: -> 0
Explanation: This rule indicates that there is a strong association between the target variable (<=50K or >50K) and the 0 value, which is the majority class in the dataset. This rule has a confidence of 1.0 and support of 1.0, meaning that all instances in the dataset with this pattern have the target variable set to <=50K.

2. Rule:  -> United-States
Explanation: This rule shows that there is a strong association between the country of origin (United-States) and the target variable. The confidence of 0.906 indicates that 90.6% of instances in the dataset with this pattern have the target variable set to <=50K.

3. Rule:  -> 0, United-States
Explanation: This rule indicates that there is a strong association between the 0 value, the majority class, and the country of origin (United-States). The confidence of 0.906 indicates that 90.6% of instances in the dataset with this pattern have the target variable set to <=50K.

4. Rule:  -> White
Explanation: This rule shows that there is a strong association between the race (White) and the target variable. The confidence of 0.836 indicates that 83.6% of instances in the dataset with this pattern have the target variable set to <=50K.

5. Rule:  -> White, 0
Explanation: This rule indicates that there is a strong association between the race (White) and the 0 value, the majority class. The confidence of 0.836 indicates that 83.6% of instances in the dataset with this pattern have the target variable set to <=50K.

6. Rule:  -> <=50K
Explanation: This rule shows that there is a strong association between the target variable (<=50K) and itself. This rule has a confidence of 0.782 and support of 0.782, meaning that 78.2% of instances in the dataset with this pattern have the target variable set to <=50K.

7. Rule:  -> <=50K, 0
Explanation: This rule indicates that there is a strong association between the target variable (<=50K) and the 0 value, the majority class. The confidence of 0.782 indicates that 78.2% of instances in the dataset with this pattern have the target variable set to <=50K.

8. Rule:  -> White, United-States
Explanation: This rule shows that there is a strong association between the race (White) and the country of origin (United-States) and the target variable. The confidence of 0.772 indicates that 77.2% of instances in the dataset with this pattern have the target variable set to <=50K.

9. Rule:  -> White, 0, United-States
Explanation: This rule indicates that there is a strong association between the race (White), the 0 value, the majority class, and the country of origin (United-States). The confidence of 0.772 indicates that 77.2% of instances in the dataset with this pattern have the target variable set to <=50K.

10. Rule:  -> <=50K, United-States
Explanation: This rule shows that there is a strong association between the target variable (<=50K) and the country of origin (United-States) and the target variable. The confidence of 0.708 indicates that 70.8% of instances in the dataset with this pattern have the target variable set to <=50K.

# Questions:
1. Age and Hours Worked Association:
Rule: {age>40} -> {hours-per-week>40}
Explanation: This rule could suggest that individuals above the age of 40 tend to work more than 40 hours per week.

2. Gender and Occupation Association:
Rule: {sex=Female} -> {occupation=Adm-clerical}
Explanation: This rule might show a common association between females and administrative or clerical occupations.

3. Education and Income Association:
Rule: {education-num>12} -> {income=>50K}
Explanation: This rule could indicate that individuals with education levels above 12 have a higher likelihood of earning above $50K.