In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from mlxtend.frequent_patterns import apriori, association_rules
from mlxtend.preprocessing import TransactionEncoder
pd.set_option('display.max_rows',500)

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
data = pd.read_csv("Titanic.csv")
print(data.shape)
data.head()

(2201, 4)


Unnamed: 0,Class,Gender,Age,Survived
0,3rd,Male,Child,No
1,3rd,Male,Child,No
2,3rd,Male,Child,No
3,3rd,Male,Child,No
4,3rd,Male,Child,No


In [8]:
for i in data.columns:
    print(i)
    print(data[i].value_counts())
    print()

Class
Crew    885
3rd     706
1st     325
2nd     285
Name: Class, dtype: int64

Gender
Male      1731
Female     470
Name: Gender, dtype: int64

Age
Adult    2092
Child     109
Name: Age, dtype: int64

Survived
No     1490
Yes     711
Name: Survived, dtype: int64



### Data Pre-Processing

In [9]:
df = pd.get_dummies(data)
df.head()

Unnamed: 0,Class_1st,Class_2nd,Class_3rd,Class_Crew,Gender_Female,Gender_Male,Age_Adult,Age_Child,Survived_No,Survived_Yes
0,0,0,1,0,0,1,0,1,1,0
1,0,0,1,0,0,1,0,1,1,0
2,0,0,1,0,0,1,0,1,1,0
3,0,0,1,0,0,1,0,1,1,0
4,0,0,1,0,0,1,0,1,1,0


### Apriori Algorithm

In [10]:
frequent_items = apriori(df, min_support=0.1, use_colnames=True)
frequent_items

Unnamed: 0,support,itemsets
0,0.14766,(Class_1st)
1,0.129487,(Class_2nd)
2,0.320763,(Class_3rd)
3,0.40209,(Class_Crew)
4,0.213539,(Gender_Female)
5,0.786461,(Gender_Male)
6,0.950477,(Age_Adult)
7,0.676965,(Survived_No)
8,0.323035,(Survived_Yes)
9,0.144934,"(Class_1st, Age_Adult)"


In [11]:
rules = association_rules(frequent_items, metric='lift',min_threshold=1.3)
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(Gender_Female),(Survived_Yes),0.213539,0.323035,0.156293,0.731915,2.265745,0.087312,2.525187
1,(Survived_Yes),(Gender_Female),0.323035,0.213539,0.156293,0.483826,2.265745,0.087312,1.523634
2,"(Gender_Female, Age_Adult)",(Survived_Yes),0.193094,0.323035,0.143571,0.743529,2.301699,0.081195,2.639542
3,"(Survived_Yes, Age_Adult)",(Gender_Female),0.297138,0.213539,0.143571,0.48318,2.262724,0.080121,1.521732
4,(Gender_Female),"(Survived_Yes, Age_Adult)",0.213539,0.297138,0.143571,0.67234,2.262724,0.080121,2.145099
5,(Survived_Yes),"(Gender_Female, Age_Adult)",0.323035,0.193094,0.143571,0.444444,2.301699,0.081195,1.452431
6,"(Gender_Male, Age_Adult)","(Survived_No, Class_Crew)",0.757383,0.30577,0.304407,0.40192,1.31445,0.072822,1.160764
7,"(Survived_No, Class_Crew)","(Gender_Male, Age_Adult)",0.30577,0.757383,0.304407,0.995542,1.31445,0.072822,54.427079


In [35]:
rules = association_rules(frequent_items, metric='lift',min_threshold=1.1)
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(Class_3rd),(Survived_No),0.320763,0.676965,0.239891,0.747875,1.104747,0.022745,1.281251
1,(Survived_No),(Class_3rd),0.676965,0.320763,0.239891,0.354362,1.104747,0.022745,1.05204
2,(Gender_Male),(Class_Crew),0.786461,0.40209,0.39164,0.497978,1.238474,0.075412,1.191004
3,(Class_Crew),(Gender_Male),0.40209,0.786461,0.39164,0.974011,1.238474,0.075412,8.216621
4,(Survived_No),(Class_Crew),0.676965,0.40209,0.30577,0.451678,1.123325,0.033569,1.090436
5,(Class_Crew),(Survived_No),0.40209,0.676965,0.30577,0.760452,1.123325,0.033569,1.348519
6,(Gender_Female),(Survived_Yes),0.213539,0.323035,0.156293,0.731915,2.265745,0.087312,2.525187
7,(Survived_Yes),(Gender_Female),0.323035,0.213539,0.156293,0.483826,2.265745,0.087312,1.523634
8,(Gender_Male),(Survived_No),0.786461,0.676965,0.619718,0.787984,1.163995,0.087312,1.523634
9,(Survived_No),(Gender_Male),0.676965,0.786461,0.619718,0.915436,1.163995,0.087312,2.525187


In [14]:
rules["consequents"][0]

frozenset({'Survived_Yes'})

In [15]:
rules["consequents"].apply(lambda x: str(x))

0                  frozenset({'Survived_Yes'})
1                 frozenset({'Gender_Female'})
2                  frozenset({'Survived_Yes'})
3                 frozenset({'Gender_Female'})
4     frozenset({'Survived_Yes', 'Age_Adult'})
5    frozenset({'Gender_Female', 'Age_Adult'})
6     frozenset({'Survived_No', 'Class_Crew'})
7      frozenset({'Gender_Male', 'Age_Adult'})
Name: consequents, dtype: object

In [16]:
rules["consequents"].apply(lambda x: str(x).split("'"))

0                    [frozenset({, Survived_Yes, })]
1                   [frozenset({, Gender_Female, })]
2                    [frozenset({, Survived_Yes, })]
3                   [frozenset({, Gender_Female, })]
4     [frozenset({, Survived_Yes, , , Age_Adult, })]
5    [frozenset({, Gender_Female, , , Age_Adult, })]
6     [frozenset({, Survived_No, , , Class_Crew, })]
7      [frozenset({, Gender_Male, , , Age_Adult, })]
Name: consequents, dtype: object

In [17]:
rules["consequents"].apply(lambda x: str(x).split("'")[1])

0     Survived_Yes
1    Gender_Female
2     Survived_Yes
3    Gender_Female
4     Survived_Yes
5    Gender_Female
6      Survived_No
7      Gender_Male
Name: consequents, dtype: object

In [18]:
rules['new_col']=rules["consequents"].apply(lambda x: str(x).split("'")[1])

In [20]:
rules[rules['new_col']=='Survived_Yes']

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,new_col
0,(Gender_Female),(Survived_Yes),0.213539,0.323035,0.156293,0.731915,2.265745,0.087312,2.525187,Survived_Yes
2,"(Gender_Female, Age_Adult)",(Survived_Yes),0.193094,0.323035,0.143571,0.743529,2.301699,0.081195,2.639542,Survived_Yes
4,(Gender_Female),"(Survived_Yes, Age_Adult)",0.213539,0.297138,0.143571,0.67234,2.262724,0.080121,2.145099,Survived_Yes


In [26]:
data[(data['Gender']=='Female')&(data['Age']=='Adult')&(data['Survived']=='Yes')].value_counts()

Class  Gender  Age    Survived
1st    Female  Adult  Yes         140
2nd    Female  Adult  Yes          80
3rd    Female  Adult  Yes          76
Crew   Female  Adult  Yes          20
dtype: int64

In [36]:
data[(data['Gender']=='Female')&(data['Age']=='Adult')&(data['Survived']=='Yes')].shape

(316, 4)

In [28]:
data.shape

(2201, 4)

In [33]:
(140+80+76+20)/2201

0.14357110404361653

In [34]:
rules.sort_values('lift',ascending=False)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,new_col
2,"(Gender_Female, Age_Adult)",(Survived_Yes),0.193094,0.323035,0.143571,0.743529,2.301699,0.081195,2.639542,Survived_Yes
5,(Survived_Yes),"(Gender_Female, Age_Adult)",0.323035,0.193094,0.143571,0.444444,2.301699,0.081195,1.452431,Gender_Female
0,(Gender_Female),(Survived_Yes),0.213539,0.323035,0.156293,0.731915,2.265745,0.087312,2.525187,Survived_Yes
1,(Survived_Yes),(Gender_Female),0.323035,0.213539,0.156293,0.483826,2.265745,0.087312,1.523634,Gender_Female
3,"(Survived_Yes, Age_Adult)",(Gender_Female),0.297138,0.213539,0.143571,0.48318,2.262724,0.080121,1.521732,Gender_Female
4,(Gender_Female),"(Survived_Yes, Age_Adult)",0.213539,0.297138,0.143571,0.67234,2.262724,0.080121,2.145099,Survived_Yes
6,"(Gender_Male, Age_Adult)","(Survived_No, Class_Crew)",0.757383,0.30577,0.304407,0.40192,1.31445,0.072822,1.160764,Survived_No
7,"(Survived_No, Class_Crew)","(Gender_Male, Age_Adult)",0.30577,0.757383,0.304407,0.995542,1.31445,0.072822,54.427079,Gender_Male


## Another Example

In [4]:
items_list = [['Milk','Onion','Nutmeg','Kidney Beans', 'Eggs', 'Yogurt'],
             ['Dill', 'Onion', 'Nutmeg', 'Kidney Beans', 'Eggs', 'Yogurt'],
             ['Milk', 'Apple', 'Kidney Beans', 'Eggs'],
             ['Milk', 'Unicorn', 'Corn', 'Kidney Beans', 'Yogurt'],
             ['Corn', 'Onion', 'Onion', 'Kidney Beans', 'Ice Cream', 'Eggs']]

In [5]:
items_list

[['Milk', 'Onion', 'Nutmeg', 'Kidney Beans', 'Eggs', 'Yogurt'],
 ['Dill', 'Onion', 'Nutmeg', 'Kidney Beans', 'Eggs', 'Yogurt'],
 ['Milk', 'Apple', 'Kidney Beans', 'Eggs'],
 ['Milk', 'Unicorn', 'Corn', 'Kidney Beans', 'Yogurt'],
 ['Corn', 'Onion', 'Onion', 'Kidney Beans', 'Ice Cream', 'Eggs']]

In [6]:
te = TransactionEncoder()
te_ary = te.fit(items_list).transform(items_list)
df = pd.DataFrame(te_ary, columns = te.columns_)
df

Unnamed: 0,Apple,Corn,Dill,Eggs,Ice Cream,Kidney Beans,Milk,Nutmeg,Onion,Unicorn,Yogurt
0,False,False,False,True,False,True,True,True,True,False,True
1,False,False,True,True,False,True,False,True,True,False,True
2,True,False,False,True,False,True,True,False,False,False,False
3,False,True,False,False,False,True,True,False,False,True,True
4,False,True,False,True,True,True,False,False,True,False,False


In [7]:
frequent_itemsets = apriori(df, min_support=0.3, use_colnames=True)
frequent_itemsets['length'] = frequent_itemsets['itemsets'].apply(lambda x: len(x))
frequent_itemsets

Unnamed: 0,support,itemsets,length
0,0.4,(Corn),1
1,0.8,(Eggs),1
2,1.0,(Kidney Beans),1
3,0.6,(Milk),1
4,0.4,(Nutmeg),1
5,0.6,(Onion),1
6,0.6,(Yogurt),1
7,0.4,"(Corn, Kidney Beans)",2
8,0.8,"(Kidney Beans, Eggs)",2
9,0.4,"(Milk, Eggs)",2


In [8]:
rules = association_rules(frequent_itemsets, min_threshold=1)
rules.head()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(Corn),(Kidney Beans),0.4,1.0,0.4,1.0,1.0,0.0,inf
1,(Eggs),(Kidney Beans),0.8,1.0,0.8,1.0,1.0,0.0,inf
2,(Nutmeg),(Eggs),0.4,0.8,0.4,1.0,1.25,0.08,inf
3,(Onion),(Eggs),0.6,0.8,0.6,1.0,1.25,0.12,inf
4,(Milk),(Kidney Beans),0.6,1.0,0.6,1.0,1.0,0.0,inf
