In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

import dhs_util
from dhs_util import *

In [3]:
def get_retention_cohort(df):
    """
    - @author Ying Li
    - input: transaction dataframe, DHS service usage dataframe
    - output: a dataframe representing a retention cohort
    - preconditions: at monthly level because the DHS data is at month level    
    """
    recipient = df.groupby(['id']).agg(
        first_date = ('date', 'min'), 
    ).reset_index()
    df_retention = pd.merge(df, recipient, on = 'id', how = 'left')
    df_retention['elapsed'] = df_retention['date'].dt.month - df_retention['first_date'].dt.month
    df_retention_count = df_retention.groupby(["first_date", "elapsed"]).agg(
        num_active = ("id", "nunique"),
    ).reset_index()
    df_retention_count = df_retention_count.pivot(index = "first_date", columns="elapsed", values='num_active')
    df_retention_ratio = df_retention_count.reset_index()
    df_retention_ratio = df_retention_count.div(df_retention_ratio.iloc[:,1].to_numpy(),axis = 0)
    
    return df_retention_count, df_retention_ratio


def get_id_service_matrix(df):
    df_temp = df.groupby(["id","serv"],observed=False).agg(
        num_serv = ('service', 'nunique') # this will be 1 or 0, "service" is categorical 
    ).reset_index()
    df_serv = df_temp.pivot_table(observed=False,
        values='num_serv', index=["id"],
        columns="serv", aggfunc="sum"
    ).reset_index()
    return df_serv

In [4]:
os.chdir(r'D:\data485')
df = pd.read_csv('dhs_service_records_synthesized_final.csv')

df = dhs_preprocessing(df)
df, service_map = add_service_label(df)
df = add_age_bin(df)

recipient = get_recipient_attribute(df)
retention_cohort_count, retention_cohort_ratio = get_retention_cohort(df)
df_id_serv = get_id_service_matrix(df)

In [30]:
from mlxtend.preprocessing import *
from mlxtend.frequent_patterns import association_rules
from mlxtend.frequent_patterns import fpgrowth
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import fpmax
from mlxtend.frequent_patterns import hmine

serv_oneHot = df_id_serv.iloc[:,1:23] > 0  # this converts value into True or False
min_freq = 1000
min_support = min_freq/serv_oneHot.shape[0]
min_confidence = 0.6
min_rule_support = 0.2
min_lift = 0.15

Use mlxtend package

In [8]:
def serv_rules(freq_itemsets,metrics,threshold):
    asso_rules = association_rules(freq_itemsets, metric=metrics, min_threshold=threshold)
    return asso_rules.sort_values(by='lift', ascending=False)[['antecedents', 'consequents', 'support', 'confidence', 'lift']]

def predict(antecedent, rules, consequents_only = False):
    preds = rules[rules['antecedents'] == antecedent] # get the rules for this antecedent
    if consequents_only:
        preds = preds['consequents'].apply(iter).apply(next) # a way to convert a frozen set with one element to string
    return preds

In [9]:
freq_itemset_apriori = apriori(serv_oneHot, use_colnames=True, min_support=min_support)\
    .sort_values(by="support", ascending=False)
freq_itemset_fpgrowth = fpgrowth(serv_oneHot,min_support=min_support,use_colnames=True)
freq_itemset_fpmax = fpmax(serv_oneHot,min_support=min_support,use_colnames=True)

rules_apriori = serv_rules(freq_itemset_apriori,"confidence",0.60)
rules_fpgrowth = serv_rules(freq_itemset_fpgrowth,"confidence",0.60)
rules_fpmax = association_rules(freq_itemset_fpmax, metric="confidence", min_threshold=0.001, support_only=True)

In [10]:
# Find itemsets with support greater than 0.01
frequent_itemsets_apriori = apriori(serv_oneHot, min_support=0.01, use_colnames=True)
print(frequent_itemsets_apriori)

     support              itemsets
0   0.013687                 (S03)
1   0.012649                 (S05)
2   0.153844                 (S09)
3   0.040882                 (S11)
4   0.941422                 (S12)
5   0.011915                 (S13)
6   0.103528                 (S14)
7   0.024002                 (S15)
8   0.012967                 (S17)
9   0.013573                 (S18)
10  0.031396                 (S19)
11  0.013260                 (S21)
12  0.012284            (S12, S03)
13  0.018307            (S09, S11)
14  0.139131            (S12, S09)
15  0.103528            (S09, S14)
16  0.019431            (S15, S09)
17  0.013468            (S18, S09)
18  0.032106            (S12, S11)
19  0.011448            (S12, S13)
20  0.094436            (S12, S14)
21  0.022561            (S12, S15)
22  0.010384            (S12, S18)
23  0.013460            (S18, S14)
24  0.016508       (S12, S09, S11)
25  0.094436       (S12, S09, S14)
26  0.018280       (S12, S15, S09)
27  0.010285       (

In [11]:
# Find itemsets with support greater than 0.0001
frequent_itemsets_fpgrowth = fpgrowth(serv_oneHot, min_support=0.0001, use_colnames=True)
print(frequent_itemsets_fpgrowth)

      support                        itemsets
0    0.941422                           (S12)
1    0.153844                           (S09)
2    0.040882                           (S11)
3    0.103528                           (S14)
4    0.013687                           (S03)
..        ...                             ...
930  0.000111       (S12, S03, S02, S14, S10)
931  0.000111  (S12, S09, S14, S02, S03, S10)
932  0.000124                      (S12, S22)
933  0.000305                      (S01, S12)
934  0.000199                      (S12, S08)

[935 rows x 2 columns]


In [32]:
frequent_itemsets_hmine = hmine(serv_oneHot, min_support=0.0001, use_colnames=True)
print(frequent_itemsets_hmine)

      support         itemsets
0    0.000654            (S01)
1    0.000305       (S01, S12)
2    0.013687            (S03)
3    0.000431       (S04, S03)
4    0.000146  (S04, S02, S03)
..        ...              ...
930  0.000141       (S19, S21)
931  0.002471            (S20)
932  0.000126       (S20, S21)
933   0.01326            (S21)
934  0.002205            (S22)

[935 rows x 2 columns]


Observations: 
Since we set different minimum support rates for apriori, the result itemset has way fewer rows than the later ones. We set the same minimum support rate for fpgrowth and hmine, we can see the orders of rows are different because of the difference between two mining algorithms.

In [15]:
frequent_itemsets_fpmax = fpmax(serv_oneHot, min_support=0.0001, use_colnames=True)
print(frequent_itemsets_fpmax)

     support                        itemsets
0   0.000199                      (S12, S08)
1   0.000305                      (S01, S12)
2   0.000124                      (S12, S22)
3   0.000133            (S12, S04, S02, S03)
4   0.000131       (S12, S03, S09, S04, S14)
..       ...                             ...
59  0.000122  (S12, S09, S11, S21, S14, S18)
60  0.000210  (S12, S09, S15, S21, S14, S18)
61  0.000204       (S12, S09, S19, S14, S18)
62  0.000169       (S12, S09, S15, S19, S14)
63  0.000247       (S12, S09, S11, S19, S14)

[64 rows x 2 columns]


fpmax has the same minimum support rate as the previous two but only ends up with 64 rows of results. This is because it only focuses on obtaining maximal itemsets.

Extract itemsets of a specific length

In [13]:
n = 1 
filtered_itemsets_apriori = frequent_itemsets_apriori[frequent_itemsets_apriori['itemsets'].apply(lambda x: len(x) == n)]
filtered_itemsets_apriori


Unnamed: 0,support,itemsets
0,0.013687,(S03)
1,0.012649,(S05)
2,0.153844,(S09)
3,0.040882,(S11)
4,0.941422,(S12)
5,0.011915,(S13)
6,0.103528,(S14)
7,0.024002,(S15)
8,0.012967,(S17)
9,0.013573,(S18)


In [33]:
n = 2
filtered_itemsets_hmine = frequent_itemsets_hmine[frequent_itemsets_hmine['itemsets'].apply(lambda x: len(x) == n)]
print(filtered_itemsets_hmine)

      support    itemsets
1    0.000305  (S01, S12)
3    0.000431  (S04, S03)
13   0.001229  (S05, S03)
31   0.004142  (S02, S03)
63    0.00029  (S07, S03)
..        ...         ...
926  0.000311  (S18, S19)
927  0.000519  (S18, S21)
929  0.000124  (S20, S19)
930  0.000141  (S19, S21)
932  0.000126  (S20, S21)

[124 rows x 2 columns]


In [17]:
n = 5
filtered_itemsets_fpmax = frequent_itemsets_fpmax[frequent_itemsets_fpmax['itemsets'].apply(lambda x: len(x) == n)]
print(filtered_itemsets_fpmax)

     support                   itemsets
4   0.000131  (S12, S03, S09, S04, S14)
18  0.000163  (S12, S18, S09, S14, S10)
19  0.000157  (S12, S09, S06, S21, S10)
21  0.000124  (S12, S09, S15, S21, S10)
22  0.000208  (S12, S09, S15, S14, S10)
23  0.000139  (S12, S09, S21, S14, S10)
24  0.000197  (S12, S09, S11, S14, S10)
25  0.000103  (S16, S09, S06, S21, S14)
31  0.000103  (S12, S09, S11, S07, S19)
33  0.000120  (S12, S09, S11, S07, S03)
37  0.000131  (S16, S09, S19, S14, S18)
38  0.000142  (S12, S16, S09, S19, S14)
45  0.000120  (S12, S13, S03, S09, S14)
46  0.000111  (S12, S13, S09, S17, S14)
47  0.000150  (S12, S13, S09, S15, S14)
49  0.000150  (S12, S13, S09, S11, S14)
50  0.000111  (S12, S09, S05, S14, S18)
53  0.000109  (S12, S09, S17, S11, S21)
61  0.000204  (S12, S09, S19, S14, S18)
62  0.000169  (S12, S09, S15, S19, S14)
63  0.000247  (S12, S09, S11, S19, S14)


Rules

In [19]:

def serv_rules(freq_itemsets,metrics,threshold):
    asso_rules = association_rules(freq_itemsets, metric=metrics, min_threshold=threshold)
    return asso_rules.sort_values(by='lift', ascending=False)[['antecedents', 'consequents', 'support', 'confidence', 'lift']]

def predict(antecedent, rules, consequents_only = False):
    preds = rules[rules['antecedents'] == antecedent] # get the rules for this antecedent
    if consequents_only:
        preds = preds['consequents'].apply(iter).apply(next) # a way to convert a frozen set with one element to string
    return preds

In [20]:
new_fpgrowth = fpgrowth(serv_oneHot, min_support=0.05, use_colnames=True)
rules_growth = serv_rules(new_fpgrowth, 'lift', 0.5)
rules_growth

Unnamed: 0,antecedents,consequents,support,confidence,lift
6,"(S12, S09)",(S14),0.094436,0.678758,6.556292
11,(S14),"(S12, S09)",0.094436,0.912184,6.556292
2,(S09),(S14),0.103528,0.672938,6.500073
3,(S14),(S09),0.103528,1.0,6.500073
7,"(S12, S14)",(S09),0.094436,1.0,6.500073
10,(S09),"(S12, S14)",0.094436,0.613843,6.500073
4,(S12),(S14),0.094436,0.100312,0.968942
5,(S14),(S12),0.094436,0.912184,0.968942
8,"(S09, S14)",(S12),0.094436,0.912184,0.968942
9,(S12),"(S09, S14)",0.094436,0.100312,0.968942


In [21]:
new_fpmax = fpmax(serv_oneHot, min_support=0.0001, use_colnames=True)
rules_max = serv_rules(new_fpgrowth, 'support', 0.0003)
rules_max

Unnamed: 0,antecedents,consequents,support,confidence,lift
6,"(S12, S09)",(S14),0.094436,0.678758,6.556292
11,(S14),"(S12, S09)",0.094436,0.912184,6.556292
2,(S09),(S14),0.103528,0.672938,6.500073
3,(S14),(S09),0.103528,1.0,6.500073
7,"(S12, S14)",(S09),0.094436,1.0,6.500073
10,(S09),"(S12, S14)",0.094436,0.613843,6.500073
4,(S12),(S14),0.094436,0.100312,0.968942
5,(S14),(S12),0.094436,0.912184,0.968942
8,"(S09, S14)",(S12),0.094436,0.912184,0.968942
9,(S12),"(S09, S14)",0.094436,0.100312,0.968942


Prediction

In [24]:
predict({"S06"}, rules_apriori, consequents_only=False)

Unnamed: 0,antecedents,consequents,support,confidence,lift
86,(S06),(S21),0.004582,0.784477,59.162639


In [25]:
predict({"S06"}, rules_fpmax, consequents_only=False)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
37,(S06),"(S12, S09, S21)",,,0.002486,,,,,


In [29]:
new_fpgrowth = fpgrowth(serv_oneHot, min_support=0.01, use_colnames=True)
rules_max = serv_rules(new_fpgrowth, 'confidence', 0.6)
predict({"S06"}, rules_fpgrowth, consequents_only=False)

Unnamed: 0,antecedents,consequents,support,confidence,lift
115,(S06),(S21),0.004582,0.784477,59.162639


After running predict with three differet rules, S21 is the first one should be offered with S06. Also, S12 and S09 can also be offered with S06 together.