In [0]:
import pandas as pd
import numpy as np
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

In [0]:
def get_clean_data(file_name, min_valid_fraction=0.70):
  """Load a tab-separated file and keep only the sufficiently populated columns.

  Cells that are empty or contain only whitespace are treated as missing.
  A column is kept when its number of non-missing values is at least
  ``int(n_rows * min_valid_fraction)``.

  Args:
    file_name: Path (or buffer) handed to ``pd.read_csv``.
    min_valid_fraction: Fraction of rows, between 0 and 1, that must hold a
      valid value for a column to be kept. Defaults to 0.70.

  Returns:
    A DataFrame restricted to the kept columns, in their original order.

  Raises:
    ValueError: If ``min_valid_fraction`` lies outside [0, 1].
  """
  if not 0 <= min_valid_fraction <= 1:
    raise ValueError("min_valid_fraction must be between 0 and 1")

  df = pd.read_csv(file_name, sep='\t', engine='python')

  # Blank / whitespace-only cells count as missing values
  df = df.replace(r'^\s*$', np.nan, regex=True)

  # Number of valid (non-missing) values per column
  valid_counts = df.count()
  print("NUmber of columns in the given data = ", valid_counts.shape[0])

  # Keep the columns that reach the required share of valid values
  min_valid_rows = int(df.shape[0] * min_valid_fraction)
  kept_columns = valid_counts[valid_counts >= min_valid_rows].index.tolist()
  print("Number of columns chosen for analysis = ", len(kept_columns))

  # Drop every other column
  return df[kept_columns]

In [0]:
def filter_dataframe(dataframe, filter):
  """Return ``dataframe[filter]``, i.e. plain ``[]`` selection with the given key."""
  selection = dataframe[filter]
  return selection

In [0]:
def _zero_filled_int(column):
  """Return `column` with missing values replaced by 0 and cast to int."""
  return column.fillna(0).astype(int)


def _recode_ranges(column, ranges):
  """Replace values inside the i-th inclusive (low, high) range by the code i (1-based).

  Every range is tested against the original values, so a code written for one
  range can never be matched again by a later range. Values outside all ranges
  are returned unchanged.
  """
  recoded = column
  for code, (low, high) in enumerate(ranges, start=1):
    recoded = recoded.mask(column.between(low, high), code)
  return recoded


def _recode_quartile_bands(column):
  """Replace values by 1 (< Q1), 2 (Q1..Q3 inclusive) or 3 (> Q3); NaN is kept.

  The three masks are computed from the original values before anything is
  overwritten, so a freshly assigned band code is never re-classified.
  """
  q1, q3 = column.quantile([0.25, 0.75])
  below = column < q1
  middle = column.between(q1, q3)
  above = column > q3
  return column.mask(below, 1).mask(middle, 2).mask(above, 3)


def format_data(format_df):
  """Return a copy of `format_df` with selected columns recoded into coarse categories.

  Only columns that are present are touched; the input frame is not modified.

  * EW6: cut into left-closed bins [0,18), [18,30), [30,40), [40,50), [50,60),
    [60,150) labelled 0..5.
  * EW8, EW15A, EW15B: missing -> 0, cast to int, then 1-10 -> 1, 11-12 -> 2,
    13-14 -> 3 (other values are left as they are).
  * EW10: missing -> 0, cast to int, then 1-3 -> 1, 4-5 -> 2.
  * MH1A: missing -> 0, cast to int, then cut into left-closed bins [1,18),
    [18,30), [30,40), [40,50), [50,60) labelled 1..5 (values outside the bins
    become NaN).
  * COPC (cast to float first), INCOMEPC: 1 below the 25th percentile, 2 between
    the 25th and 75th percentile inclusive, 3 above the 75th percentile.

  Args:
    format_df: DataFrame to recode.

  Returns:
    The recoded copy.
  """
  dataframe = format_df.copy()

  if 'EW6' in dataframe.columns:
    dataframe['EW6'] = pd.cut(
        dataframe['EW6'],
        bins=[0, 18, 30, 40, 50, 60, 150],
        labels=[0, 1, 2, 3, 4, 5],
        right=False)

  # The three education columns share one recoding scheme
  education_ranges = [(1, 10), (11, 12), (13, 14)]
  for column in ('EW8', 'EW15A', 'EW15B'):
    if column in dataframe.columns:
      # Assign the filled column back instead of calling fillna(inplace=True) on
      # a column selection, which is not guaranteed to update the frame.
      dataframe[column] = _recode_ranges(
          _zero_filled_int(dataframe[column]), education_ranges)

  if 'EW10' in dataframe.columns:
    dataframe['EW10'] = _recode_ranges(
        _zero_filled_int(dataframe['EW10']), [(1, 3), (4, 5)])

  if 'MH1A' in dataframe.columns:
    dataframe['MH1A'] = pd.cut(
        _zero_filled_int(dataframe['MH1A']),
        bins=[1, 18, 30, 40, 50, 60],
        labels=[1, 2, 3, 4, 5],
        right=False)

  if 'COPC' in dataframe.columns:
    # Read from the local copy, not from a notebook-level `df`
    dataframe['COPC'] = _recode_quartile_bands(dataframe['COPC'].astype(float))

  if 'INCOMEPC' in dataframe.columns:
    dataframe['INCOMEPC'] = _recode_quartile_bands(dataframe['INCOMEPC'])

  return dataframe

In [0]:
def is_nan(x):
  """Tell whether `x` is the `np.nan` object or a value that is unequal to itself."""
  if x is np.nan:
    return True
  # NaN-like values are the ones that do not compare equal to themselves
  return x != x

In [0]:
def get_transaction_df(df_filtered):
  """Encode every row of `df_filtered` as a transaction and return the encoded frame.

  Each non-NaN cell becomes the item ``"<column>_<value>"``; the resulting
  transactions are fitted and transformed with ``TransactionEncoder`` and
  wrapped in a DataFrame whose columns are the encoder's item names.
  """
  columns = df_filtered.columns
  transactions = [
      [col + "_" + str(row[col]) for col in columns if not is_nan(row[col])]
      for _, row in df_filtered.iterrows()
  ]

  encoder = TransactionEncoder()
  encoder.fit(transactions)
  encoded = encoder.transform(transactions)
  return pd.DataFrame(encoded, columns=encoder.columns_)

In [0]:
def get_frequent_itemsets(trans_df, min_sup):
  """Run apriori on `trans_df` with minimum support `min_sup`.

  Returns the apriori result (item names as column names) extended with a
  'length' column holding the number of items in each itemset.
  """
  itemsets = apriori(trans_df, min_support=min_sup, use_colnames=True)
  itemsets['length'] = itemsets['itemsets'].apply(len)
  return itemsets

In [0]:
def get_assoc_rules(freq_itemsets, min_conf):
  """Derive association rules from `freq_itemsets` using confidence >= `min_conf`.

  Returns the rules frame extended with an 'antecedent_len' column holding the
  number of items in each rule's antecedent.
  """
  found_rules = association_rules(
      freq_itemsets, metric="confidence", min_threshold=min_conf)
  found_rules["antecedent_len"] = found_rules["antecedents"].apply(len)
  return found_rules

In [0]:
def get_lookup():
  """Return a fresh dict mapping survey column codes to readable descriptions."""
  return {
      'EW5': "Relationship to Household head",
      'EW6': "Age (in years)",
      'EW7Y': "Year of birth",
      'EW8': "Years of education completed",
      'EW9': "N children alive",
      'EW10': "Your general health",
      'EW13A': "Mother lives in household",
      'EW13B': "Father lives in household",
      'EW14A': "Mother attended school",
      'EW15A': "Mother education",
      'EW15B': "Father education",
      'EW16A': "Mother literate",
      'EW16B': "Father literate",
      'EW16C': "Mother-in-law literate",
      'EW18A': "Brother highest education",
      'EW18B': "Sister highest education",
      'HB1': "glasses of milk daily harmful during pregnancy",
      'HB2': "Belief: Men physically weak months after sterilization",
      'HB3': "Belief: 1st milk after birth good for baby",
      'HB4': "Belief: Chulha smoke good for health",
      'HB5': "Belief: Child diarrhea more to drink",
      'HB6': "Belief: Illness spread through impure water",
      'HB7': "Belief: How Malaria spreads",
      'HB8': "Belief: Pregnancy most likely during menstrual cycle",
      'AI1': "HIV/AIDS Awareness",
      'GR9F': "Can visit health centre alone (permission needed or not)",
      'GR13A': "Past 5 years: Been to metro city (beside current residence)",
      'GR16A': "Past 5 years: Been to another state",
      'GR17A': "Past 5 years: Been abroad",
      'GR18A': "Are you a member of a: Mahila Mandal",
      'GR18B': "Are you a member of a: Self-help group",
      'GR22': "Family outings to cinema, mela, or restaurant",
      'GR27A': "Family member has bank account",
      'MH1A': "Age at marriage (in years)",
      'MH1E': "Age first started menarche (in years)",
      'MH2': "Marriage status",
      'MH4A': "Who chose your husband",
      'MH4B': "Did you have any say in choosing your husband",
      'FP2A': "Currently use contraceptives",
      'URBAN2011': "Urban residence from census 2011",
      'METRO': "Largest 6 metro areas 0/1",
      'ID11': "Religion",
      'ID13': "Caste category",
      'COPC': "Household expenditure /capita",
      'INCOMEPC': "per capita income",
      'NPERSONS': "N in household",
      'ED2': "Education: Literacy",
      'ED4': "Education: Attended school",
      'boyEducated': "Boy Educated",
      'girlEducated': "Girl Educated",
  }

In [0]:
def format_item(item):
  """Split an encoded item on every underscore and return the pieces as a list."""
  pieces = item.split("_")
  return pieces

In [0]:
def print_freq_itemsets(freq_itemsets, lookup):
  """Print one line per frequent itemset: its items, then its support.

  Every item is shown as the `lookup` description of its column code followed
  by the quoted value.
  """
  for _, itemset_row in freq_itemsets.iterrows():
    for encoded_item in itemset_row['itemsets']:
      parts = format_item(encoded_item)
      print(lookup[parts[0]]," \"", parts[1], "\", ", end =" ")
    print("  Support : ", itemset_row['support'])

In [0]:
def print_assoc_rules(assoc_rules, lookup):
  """Print one line per rule in the form ``antecedents  =====>  consequents``.

  Every item is shown as the `lookup` description of its column code followed
  by the quoted value.
  """
  def emit_side(encoded_items):
    # Print all items of one side of the rule on the current line
    for encoded_item in encoded_items:
      parts = format_item(encoded_item)
      print(lookup[parts[0]]," \"", parts[1], "\", ", end =" ")

  for _, rule in assoc_rules.iterrows():
    antecedents, consequents = rule['antecedents'], rule['consequents']
    emit_side(antecedents)
    print(" =====> ", end =" ")
    emit_side(consequents)
    print()

In [0]:
def print_items(items, lookup):
  """Print ``<code>  ==>  <description>`` for each code in `items`."""
  for code in items:
    description = lookup[code]
    print(code, " ==> ", description)

In [0]:
# Load the survey extract; get_clean_data drops the sparsely populated columns
df = get_clean_data("36151-0003-Data.tsv")
df.head()

NUmber of columns in the given data =  580
Number of columns chosen for analysis =  350


Unnamed: 0,SURVEY,STATEID,DISTID,PSUID,HHID,HHSPLITID,PERSONID,IDPSU,IDHH,IDPERSON,GE10A,GE10B,GE11,GE12,GE13,CD3D,CD3M,CD3Y,CD3DATE,CD4A,CD4B,CD4C,EW5,EW6,EW8,EW9,EW10,EW11,EW12A,EW12B,EW12C,EW12D,EW13A,EW14A,EW14B,EW14C,EW14D,EW15A,EW15B,EW15C,...,AP9,EWELIGIBLE,WKANY5,WKANIMAL,WKBUSINESS,WKAGLAB,WKFARM,WKNONAG,WKSALARY,WKNREGA,WKHOURS,NFHOURS,WKDAYS,NFDAYS,RSUNEARN,SPRO10,SPRO3,SPRO4,SPRO5,SPRO6,SPRO8,SPED2,SPED3,SPED4,SPED6,SPWKANY5,SPWKANIMAL,SPWKBUSINESS,SPWKAGLAB,SPWKFARM,SPWKNONAG,SPWKHOURS,SPWKDAYS,EWQELIGIBLE,AGERANK,NEVMFEM,EWPOSITION,NEWQELIGIBLE,WTEW,FWTEW
0,2,1,2,1,10,1,2,10201,102010101,10201010102,1.0,1.0,2.0,2.0,9.0,3,7,2012,19177,10,50,1,2,49,0,4,5,2,1,0,1,1,2.0,0,0,0,0,0,0,0,...,65.69999694825,1,2,3,0,0,2,0,0,0,24,0,6,0,175804.109375,69,1,1,57,1,2,0,0,0,0,4,0,0,0,0,3,4000,365,1,1,2,2,2,3687.92627,3688
1,2,1,2,1,10,1,6,10201,102010101,10201010106,,,,,,3,7,2012,19177,10,15,1,4,26,8,3,5,2,1,1,1,1,2.0,0,1,0,0,0,6,0,...,60.0,1,0,0,0,0,0,0,0,0,0,0,0,0,176100.0,2,1,3,29,1,6,1,1,1,9,4,0,0,0,0,3,3300,275,1,2,2,3,2,3687.92627,3688
2,2,1,2,1,20,1,8,10201,102010201,10201020108,1.0,1.0,1.0,3.0,3.0,3,7,2012,19177,10,0,1,4,33,12,3,3,3,1,1,0,1,2.0,0,0,0,1,0,0,0,...,60.29999923707,1,4,0,0,0,0,0,4,0,2555,0,365,0,999500.0,75,1,3,37,1,8,1,2,1,16,4,0,0,0,0,0,2555,365,1,2,3,3,1,11063.779297,11064
3,2,1,2,1,30,1,2,10201,102010301,10201030102,1.0,1.0,1.0,2.0,9.0,3,7,2012,19177,12,25,2,2,43,0,5,3,2,0,0,0,0,,0,0,0,0,0,0,0,...,79.09999847413,1,3,3,0,0,3,0,0,0,240,0,60,0,178200.0,75,1,1,45,1,2,1,2,1,10,4,0,0,0,0,0,2555,365,1,1,1,1,1,3687.92627,3688
4,2,1,2,1,40,1,2,10201,102010401,10201040102,1.0,1.0,1.0,3.0,9.0,3,7,2012,19177,1,20,2,2,47,0,3,3,2,0,0,1,0,,0,0,0,0,0,0,0,...,56.40000152588,1,3,3,0,0,3,0,0,0,450,0,90,0,89568.335938,69,1,1,57,1,2,0,0,0,0,4,0,0,0,3,4,2710,350,1,1,1,1,1,3687.92627,3688


In [0]:
# Recode the raw columns into coarse categories (format_data works on a copy of df)
format_df = format_data(df)
format_df.head()

Unnamed: 0,SURVEY,STATEID,DISTID,PSUID,HHID,HHSPLITID,PERSONID,IDPSU,IDHH,IDPERSON,GE10A,GE10B,GE11,GE12,GE13,CD3D,CD3M,CD3Y,CD3DATE,CD4A,CD4B,CD4C,EW5,EW6,EW8,EW9,EW10,EW11,EW12A,EW12B,EW12C,EW12D,EW13A,EW14A,EW14B,EW14C,EW14D,EW15A,EW15B,EW15C,...,AP9,EWELIGIBLE,WKANY5,WKANIMAL,WKBUSINESS,WKAGLAB,WKFARM,WKNONAG,WKSALARY,WKNREGA,WKHOURS,NFHOURS,WKDAYS,NFDAYS,RSUNEARN,SPRO10,SPRO3,SPRO4,SPRO5,SPRO6,SPRO8,SPED2,SPED3,SPED4,SPED6,SPWKANY5,SPWKANIMAL,SPWKBUSINESS,SPWKAGLAB,SPWKFARM,SPWKNONAG,SPWKHOURS,SPWKDAYS,EWQELIGIBLE,AGERANK,NEVMFEM,EWPOSITION,NEWQELIGIBLE,WTEW,FWTEW
0,2,1,2,1,10,1,2,10201,102010101,10201010102,1.0,1.0,2.0,2.0,9.0,3,7,2012,19177,10,50,1,2,3,0,4,2,2,1,0,1,1,2.0,0,0,0,0,0,0,0,...,65.69999694825,1,2,3,0,0,2,0,0,0,24,0,6,0,175804.109375,69,1,1,57,1,2,0,0,0,0,4,0,0,0,0,3,4000,365,1,1,2,2,2,3687.92627,3688
1,2,1,2,1,10,1,6,10201,102010101,10201010106,,,,,,3,7,2012,19177,10,15,1,4,1,1,3,2,2,1,1,1,1,2.0,0,1,0,0,0,1,0,...,60.0,1,0,0,0,0,0,0,0,0,0,0,0,0,176100.0,2,1,3,29,1,6,1,1,1,9,4,0,0,0,0,3,3300,275,1,2,2,3,2,3687.92627,3688
2,2,1,2,1,20,1,8,10201,102010201,10201020108,1.0,1.0,1.0,3.0,3.0,3,7,2012,19177,10,0,1,4,2,2,3,1,3,1,1,0,1,2.0,0,0,0,1,0,0,0,...,60.29999923707,1,4,0,0,0,0,0,4,0,2555,0,365,0,999500.0,75,1,3,37,1,8,1,2,1,16,4,0,0,0,0,0,2555,365,1,2,3,3,1,11063.779297,11064
3,2,1,2,1,30,1,2,10201,102010301,10201030102,1.0,1.0,1.0,2.0,9.0,3,7,2012,19177,12,25,2,2,3,0,5,1,2,0,0,0,0,,0,0,0,0,0,0,0,...,79.09999847413,1,3,3,0,0,3,0,0,0,240,0,60,0,178200.0,75,1,1,45,1,2,1,2,1,10,4,0,0,0,0,0,2555,365,1,1,1,1,1,3687.92627,3688
4,2,1,2,1,40,1,2,10201,102010401,10201040102,1.0,1.0,1.0,3.0,9.0,3,7,2012,19177,1,20,2,2,3,0,3,1,2,0,0,1,0,,0,0,0,0,0,0,0,...,56.40000152588,1,3,3,0,0,3,0,0,0,450,0,90,0,89568.335938,69,1,1,57,1,2,0,0,0,0,4,0,0,0,3,4,2710,350,1,1,1,1,1,3687.92627,3688


In [0]:
# Column codes analysed together in this pass
filter = ['EW8', 'EW15A', 'EW15B', 'EW10', 'MH1A', 'MH4B']
lookup = get_lookup()

# Echo "code ==> description" for every selected column
print_items(filter, lookup)

# Narrow the recoded frame to the selected columns and preview it
filtered_df = filter_dataframe(format_df, filter)
filtered_df.head()

EW8  ==>  Years of education completed
EW15A  ==>  Mother education
EW15B  ==>  Father education
EW10  ==>  Your general health
MH1A  ==>  Age at marriage (in years)
MH4B  ==>  Did you have any say in choosing your husband


Unnamed: 0,EW8,EW15A,EW15B,EW10,MH1A,MH4B
0,0,0,0,5,1,0.0
1,1,0,1,5,2,0.0
2,2,0,0,3,3,
3,0,0,0,3,3,
4,0,0,0,3,3,0.0


In [0]:
# Encode each row of filtered_df as a transaction of "<column>_<value>" items
trans_df = get_transaction_df(filtered_df)

In [0]:
lookup = get_lookup()
# Mine itemsets with a minimum support of 0.2 ...
frequent_itemsets = get_frequent_itemsets(trans_df, min_sup=0.2)
# ... and report only those whose support stays below 0.3
frequent_itemsets_filtered = frequent_itemsets.loc[frequent_itemsets['support'] < 0.3]
print_freq_itemsets(frequent_itemsets_filtered, lookup)

# Years of education completed  " 0 ",  Father education  " 0 ",  Mother education  " 0 ",    Support :  0.31450041747843027
# Father education  " 0 ",  Age at marriage (in years)  " 1.0 ",  Mother education  " 0 ",    Support :  0.3032664524454115
# Years of education completed  " 0 ",  Father education  " 0 ",  Age at marriage (in years)  " 1.0 ",  Mother education  " 0 ",    Support :  0.20739822381904208
# Did you have any say in choosing your husband  " 0 ",  Father education  " 0 ",  Mother education  " 0 ",    Support :  0.2597727905270349

Your general health  " 1 ",    Support :  0.2389747741821218
Mother education  " 1 ",    Support :  0.21718999063836247
Age at marriage (in years)  " 2.0 ",    Support :  0.2544594286870936
Age at marriage (in years)  " 3.0 ",    Support :  0.27965994484224377
Your general health  " 2 ",  Years of education completed  " 1 ",    Support :  0.25736912683753765
Your general health  " 2 ",  Age at marriage (in years)  " 1.0 ",    Support :  0.241150722364193
Your general health  " 2 ",  Did you have any say in choosing your husband  " 0 ",    Support :  0.21086455987652758
Father education  " 1 ",  Mother education  " 0 ",    Support :  0.20026313791969233
Did you have any say in choosing your husband  " 1 ",  Mother education  " 0 ",    Support :  0.2129139994433621
Father education  " 0 ",  Years of education completed  " 1 ",    Support :  0.22814563671786048
Did you have any say in choosing your husband  " 0 ",  Father education  " 0 ",    Support :  0.265668091997065
Father education 

In [0]:
# Column codes analysed together in this pass (reuses the existing `lookup`)
filter = ['EW8', 'EW15A', 'EW15B', 'EW10']

# Echo "code ==> description" for every selected column
print_items(filter, lookup)

# Narrow the recoded frame to the selected columns and preview it
filtered_df = filter_dataframe(format_df, filter)
filtered_df.head()

EW8  ==>  Years of education completed
EW15A  ==>  Mother education
EW15B  ==>  Father education
EW10  ==>  Your general health


Unnamed: 0,EW8,EW15A,EW15B,EW10
0,0,0,0,5
1,1,0,1,5
2,2,0,0,3
3,0,0,0,3
4,0,0,0,3


In [0]:
# Encode each row of filtered_df as a transaction of "<column>_<value>" items
trans_df = get_transaction_df(filtered_df)

In [0]:
lookup = get_lookup()
# Mine itemsets with a minimum support of 0.02 ...
frequent_itemsets = get_frequent_itemsets(trans_df, min_sup=0.02)
# ... and report only those whose support stays below 0.05
frequent_itemsets_filtered = frequent_itemsets.loc[frequent_itemsets['support'] < 0.05]
print_freq_itemsets(frequent_itemsets_filtered, lookup)

# Years of education completed  " 0 ",  Mother education  " 0 ",  Father education  " 0 ",  Your general health  " 3 ",    Support :  0.05525896313538952
# Years of education completed  " 2 ",  Father education  " 1 ",  Mother education  " 1 ",    Support :  0.028337929813020267
# Your general health  " 4 ",  Father education  " 0 ",  Mother education  " 0 ",    Support :  0.04354426536447132

Father education  " 15 ",    Support :  0.020899223237102447
Father education  " 2 ",    Support :  0.0373200414948258
Years of education completed  " 15 ",    Support :  0.038711636262429475
Your general health  " 1 ",  Years of education completed  " 2 ",    Support :  0.021885990435948688
Your general health  " 2 ",  Years of education completed  " 2 ",    Support :  0.035473015712370014
Mother education  " 1 ",  Your general health  " 3 ",    Support :  0.031095817625180273
Your general health  " 4 ",  Father education  " 0 ",    Support :  0.0446828429016016
Your general health  " 4 ",  Father education  " 1 ",    Support :  0.023606507603167775
Years of education completed  " 0 ",  Your general health  " 4 ",    Support :  0.037952584571009286
Your general health  " 4 ",  Years of education completed  " 1 ",    Support :  0.02978012802671862
Mother education  " 0 ",  Years of education completed  " 2 ",    Support :  0.02866685221263568
Mother education  " 1 ",  Father education 

In [0]:
# Cohort: respondents on the upper part of the general-health scale (raw EW10 of 4 or 5).
# format_data() casts EW10 to int and recodes it (0 = missing, 1 = raw 1-3,
# 2 = raw 4-5), so the raw-scale test "EW10 > 3" is the recoded value 2.
# Comparing the integer column with the string '3' would raise a TypeError.
format_df = format_data(df)
format_df = format_df[format_df.EW10 == 2]
format_df.shape

(2973, 350)

In [0]:
# Column codes analysed together in this pass
filter = ['EW8', 'EW15A', 'EW15B']
lookup = get_lookup()

# Echo "code ==> description" for every selected column
print_items(filter, lookup)

# Narrow the cohort frame to the selected columns and preview it
filtered_df = filter_dataframe(format_df, filter)
filtered_df.head()

EW8  ==>  Years of education completed
EW15A  ==>  Mother education
EW15B  ==>  Father education


Unnamed: 0,EW8,EW15A,EW15B
0,0,0,0
1,1,0,1
7,0,0,0
8,0,0,0
9,0,0,0


In [0]:
# Encode each row of filtered_df as a transaction of "<column>_<value>" items
trans_df = get_transaction_df(filtered_df)

In [0]:
lookup = get_lookup()
# Mine itemsets with a minimum support of 0.2; no upper support cap in this pass
frequent_itemsets = get_frequent_itemsets(trans_df, min_sup=0.2)
frequent_itemsets_filtered = frequent_itemsets
print_freq_itemsets(frequent_itemsets_filtered, lookup)

# Mother education  " 0 ",  Father education  " 0 ",    Support :  0.6098217288933737
# Father education  " 1 ",  Mother education  " 0 ",    Support :  0.2125798856374033
# Years of education completed  " 0 ",  Mother education  " 0 ",  Father education  " 0 ",    Support :  0.42751429532458796

Mother education  " 0 ",    Support :  0.8456104944500504
Father education  " 0 ",    Support :  0.6259670366633031
Father education  " 1 ",    Support :  0.32795156407669024
Years of education completed  " 0 ",    Support :  0.53447695930037
Years of education completed  " 1 ",    Support :  0.4117053481331988
Mother education  " 0 ",  Father education  " 0 ",    Support :  0.6098217288933737
Father education  " 1 ",  Mother education  " 0 ",    Support :  0.2125798856374033
Years of education completed  " 0 ",  Mother education  " 0 ",    Support :  0.5200134544231416
Mother education  " 0 ",  Years of education completed  " 1 ",    Support :  0.30205179952909517
Years of education completed  " 0 ",  Father education  " 0 ",    Support :  0.4308779011099899
Father education  " 1 ",  Years of education completed  " 1 ",    Support :  0.20181634712411706
Years of education completed  " 0 ",  Mother education  " 0 ",  Father education  " 0 ",    Support :  0.42751429532458796


In [0]:
# Cohort: respondents on the lower part of the general-health scale (raw EW10 of 1 to 3).
# format_data() casts EW10 to int and recodes it (0 = missing, 1 = raw 1-3,
# 2 = raw 4-5), so the raw-scale test "EW10 < 4" is the recoded value 1.
# Comparing the integer column with the string '4' would raise a TypeError.
format_df = format_data(df)
format_df = format_df[format_df.EW10 == 1]
format_df.shape

(36412, 350)

In [0]:
# Column codes analysed together in this pass
filter = ['EW8', 'EW15A', 'EW15B']
lookup = get_lookup()

# Echo "code ==> description" for every selected column
print_items(filter, lookup)

# Narrow the cohort frame to the selected columns and preview it
filtered_df = filter_dataframe(format_df, filter)
filtered_df.head()

EW8  ==>  Years of education completed
EW15A  ==>  Mother education
EW15B  ==>  Father education


Unnamed: 0,EW8,EW15A,EW15B
2,2,0,0
3,0,0,0
4,0,0,0
5,0,0,0
6,0,0,0


In [0]:
# Encode each row of filtered_df as a transaction of "<column>_<value>" items
trans_df = get_transaction_df(filtered_df)

In [0]:
lookup = get_lookup()
# Mine itemsets with a minimum support of 0.1; no upper support cap in this pass
frequent_itemsets = get_frequent_itemsets(trans_df, min_sup=0.1)
frequent_itemsets_filtered = frequent_itemsets
print_freq_itemsets(frequent_itemsets_filtered, lookup)


# Years of education completed  " 0 ",  Your general health  " 4 ",  Mother education  " 0 ",  Father education  " 0 ",    Support :  0.4029599730911537

Mother education  " 0 ",    Support :  0.7625233439525431
Mother education  " 1 ",    Support :  0.22250906294628145
Father education  " 0 ",    Support :  0.5624519389212348
Father education  " 1 ",    Support :  0.36633527408546634
Years of education completed  " 0 ",    Support :  0.3716906514335933
Years of education completed  " 1 ",    Support :  0.4868449961551137
Mother education  " 0 ",  Father education  " 0 ",    Support :  0.5419916511040317
Father education  " 1 ",  Mother education  " 0 ",    Support :  0.19971437987476656
Years of education completed  " 0 ",  Mother education  " 0 ",    Support :  0.3617488739975832
Mother education  " 0 ",  Years of education completed  " 1 ",    Support :  0.3538119301329232
Father education  " 1 ",  Mother education  " 1 ",    Support :  0.1633527408546633
Mother education  " 1 ",  Years of education completed  " 1 ",    Support :  0.13050642645281776
Years of education completed  " 0 ",  Father education  " 0 ",    Support :  0.30827

In [0]:
# Cohort: per-capita income band 3 (format_data assigns 3 above the 75th percentile)
format_df = format_data(df)
format_df = format_df.loc[format_df['INCOMEPC'] == 3.00]
format_df.shape

(9879, 350)

In [0]:
# Column codes analysed together in this pass
filter = ['EW8', 'EW15A', 'EW15B']
lookup = get_lookup()

# Echo "code ==> description" for every selected column
print_items(filter, lookup)

# Narrow the cohort frame to the selected columns and preview it
filtered_df = filter_dataframe(format_df, filter)
filtered_df.head()

EW8  ==>  Years of education completed
EW15A  ==>  Mother education
EW15B  ==>  Father education


Unnamed: 0,EW8,EW15A,EW15B
2,2,0,0
15,0,0,1
19,0,0,0
23,0,0,0
25,0,0,0


In [0]:
# Encode each row of filtered_df as a transaction of "<column>_<value>" items
trans_df = get_transaction_df(filtered_df)

In [0]:
lookup = get_lookup()
# Mine itemsets with a minimum support of 0.1; no upper support cap in this pass
frequent_itemsets = get_frequent_itemsets(trans_df, min_sup=0.1)
frequent_itemsets_filtered = frequent_itemsets
print_freq_itemsets(frequent_itemsets_filtered, lookup)

# Mother education  " 0 ",  Father education  " 0 ",  Years of education completed  " 0 ",    Support :  0.15679724668488712
# Father education  " 0 ",  Mother education  " 0 ",  Years of education completed  " 1 ",    Support :  0.18939163882984109
# Father education  " 1 ",  Mother education  " 0 ",  Years of education completed  " 1 ",    Support :  0.12248203259439215
# Father education  " 1 ",  Mother education  " 1 ",  Years of education completed  " 1 ",    Support :  0.13098491750177144

Mother education  " 0 ",    Support :  0.6022876809393664
Mother education  " 1 ",    Support :  0.3597530114384047
Father education  " 0 ",    Support :  0.3994331410061747
Father education  " 1 ",    Support :  0.45045045045045046
Years of education completed  " 0 ",    Support :  0.19344063164287884
Years of education completed  " 1 ",    Support :  0.500050612410163
Years of education completed  " 15 ",    Support :  0.1000101224820326
Years of education completed  " 2 ",    Support :  0.13189594088470494
Mother education  " 0 ",  Father education  " 0 ",    Support :  0.37908695212065996
Father education  " 1 ",  Mother education  " 0 ",    Support :  0.19050511185342647
Years of education completed  " 0 ",  Mother education  " 0 ",    Support :  0.1871646927826703
Mother education  " 0 ",  Years of education completed  " 1 ",    Support :  0.32786719303573236
Father education  " 1 ",  Mother education  " 1 ",    Support :  0.25316327563518576
Mother education  " 1 ",  Years of ed

In [0]:
# Cohort: per-capita income band 2 (format_data assigns 2 between the 25th and 75th percentile)
format_df = format_data(df)
format_df = format_df.loc[format_df['INCOMEPC'] == 2.00]
format_df.shape

(19790, 350)

In [0]:
# Column codes analysed together in this pass
filter = ['EW8', 'EW15A', 'EW15B']
lookup = get_lookup()

# Echo "code ==> description" for every selected column
print_items(filter, lookup)

# Narrow the cohort frame to the selected columns and preview it
filtered_df = filter_dataframe(format_df, filter)
filtered_df.head()

EW8  ==>  Years of education completed
EW15A  ==>  Mother education
EW15B  ==>  Father education


Unnamed: 0,EW8,EW15A,EW15B
0,0,0,0
1,1,0,1
3,0,0,0
4,0,0,0
5,0,0,0


In [0]:
# Encode each row of filtered_df as a transaction of "<column>_<value>" items
trans_df = get_transaction_df(filtered_df)

In [0]:
lookup = get_lookup()
# Mine itemsets with a minimum support of 0.1; no upper support cap in this pass
frequent_itemsets = get_frequent_itemsets(trans_df, min_sup=0.1)
frequent_itemsets_filtered = frequent_itemsets
print_freq_itemsets(frequent_itemsets_filtered, lookup)

# Mother education  " 0 ",  Father education  " 0 ",  Years of education completed  " 0 ",    Support :  0.15679724668488712
# Father education  " 0 ",  Mother education  " 0 ",  Years of education completed  " 1 ",    Support :  0.18939163882984109
# Father education  " 1 ",  Mother education  " 0 ",  Years of education completed  " 1 ",    Support :  0.12248203259439215
# Father education  " 1 ",  Mother education  " 1 ",  Years of education completed  " 1 ",    Support :  0.13098491750177144

Mother education  " 0 ",    Support :  0.8020212228398181
Mother education  " 1 ",    Support :  0.19115715007579587
Father education  " 0 ",    Support :  0.606922688226377
Father education  " 1 ",    Support :  0.3471955533097524
Years of education completed  " 0 ",    Support :  0.405255179383527
Years of education completed  " 1 ",    Support :  0.5031834259727135
Mother education  " 0 ",  Father education  " 0 ",    Support :  0.5855482566953006
Father education  " 1 ",  Mother education  " 0 ",    Support :  0.1994946942900455
Years of education completed  " 0 ",  Mother education  " 0 ",    Support :  0.3940373926225366
Mother education  " 0 ",  Years of education completed  " 1 ",    Support :  0.3720565942395149
Father education  " 1 ",  Mother education  " 1 ",    Support :  0.14547751389590702
Mother education  " 1 ",  Years of education completed  " 1 ",    Support :  0.12905507832238505
Years of education completed  " 0 ",  Father education  " 0 ",    Support :  0.33678625

# EW10  ==>  Your general health
# MH1A  ==>  Age at marriage (in years)
# MH4B  ==>  Did you have any say in choosing your husband
# COPC  ==>  Household expenditure /capita
# INCOMEPC  ==>  per capita income

## **Father, Mother, Daughter Educated**

In [0]:
# Cohort: respondent (EW8), mother (EW15A) and father (EW15B) education codes all non-zero.
# 0 is also what format_data substitutes for a missing value in these columns.
format_df = format_data(df)
format_df = format_df[
    (format_df['EW8'] != 0)
    & (format_df['EW15A'] != 0)
    & (format_df['EW15B'] != 0)
]
format_df.shape

(8056, 350)

In [0]:
# Column codes analysed together in this pass
filter = ['EW10', 'MH1A', 'MH4B', 'COPC', 'INCOMEPC']
lookup = get_lookup()

# Echo "code ==> description" for every selected column
print_items(filter, lookup)

# Narrow the cohort frame to the selected columns and preview it
filtered_df = filter_dataframe(format_df, filter)
filtered_df.head()

EW10  ==>  Your general health
MH1A  ==>  Age at marriage (in years)
MH4B  ==>  Did you have any say in choosing your husband
COPC  ==>  Household expenditure /capita
INCOMEPC  ==>  per capita income


Unnamed: 0,EW10,MH1A,MH4B,COPC,INCOMEPC
29,2,2,1.0,3.0,3.0
118,1,2,,3.0,3.0
166,2,2,1.0,3.0,3.0
170,1,2,1.0,3.0,3.0
217,1,2,,3.0,3.0


In [0]:
# Encode each row of filtered_df as a transaction of "<column>_<value>" items
trans_df = get_transaction_df(filtered_df)

In [0]:
lookup = get_lookup()
# Mine itemsets with a minimum support of 0.1; no upper support cap in this pass
frequent_itemsets = get_frequent_itemsets(trans_df, min_sup=0.1)
frequent_itemsets_filtered = frequent_itemsets
print_freq_itemsets(frequent_itemsets_filtered, lookup)

# Did you have any say in choosing your husband  " 0 ",    Support :  0.2163604766633565
# Did you have any say in choosing your husband  " 1 ",    Support :  0.4051638530287984

# Age at marriage (in years)  " 1.0 ",    Support :  0.2038232373386296
# Age at marriage (in years)  " 2.0 ",    Support :  0.7801638530287984

# Your general health  " 1 ",    Support :  0.948982125124131

# Household expenditure /capita  " 1.0 ",    Support :  0.10911122144985104
# Household expenditure /capita  " 2.0 ",    Support :  0.46387785501489576
# Household expenditure /capita  " 3.0 ",    Support :  0.4268867924528302

# per capita income  " 1.0 ",    Support :  0.12934458788480635
# per capita income  " 2.0 ",    Support :  0.4138530287984111
# per capita income  " 3.0 ",    Support :  0.4568023833167825

Household expenditure /capita  " 1.0 ",    Support :  0.10911122144985104
Household expenditure /capita  " 2.0 ",    Support :  0.46387785501489576
Household expenditure /capita  " 3.0 ",    Support :  0.4268867924528302
Your general health  " 1 ",    Support :  0.948982125124131
per capita income  " 1.0 ",    Support :  0.12934458788480635
per capita income  " 2.0 ",    Support :  0.4138530287984111
per capita income  " 3.0 ",    Support :  0.4568023833167825
Age at marriage (in years)  " 1.0 ",    Support :  0.2038232373386296
Age at marriage (in years)  " 2.0 ",    Support :  0.7801638530287984
Did you have any say in choosing your husband  " 0 ",    Support :  0.2163604766633565
Did you have any say in choosing your husband  " 1 ",    Support :  0.4051638530287984
Your general health  " 1 ",  Household expenditure /capita  " 1.0 ",    Support :  0.10315292949354518
Your general health  " 1 ",  Household expenditure /capita  " 2.0 ",    Support :  0.4426514399205561
per capita incom

## **Father, Mother Uneducated** 
## **Daughter Educated**

In [0]:
# Cohort: respondent education code (EW8) non-zero while mother (EW15A) and
# father (EW15B) codes are 0. 0 is also what format_data substitutes for a
# missing value in these columns.
format_df = format_data(df)
format_df = format_df[
    (format_df['EW8'] != 0)
    & (format_df['EW15A'] == 0)
    & (format_df['EW15B'] == 0)
]
format_df.shape

(9209, 350)

In [0]:
# Column codes analysed together in this pass.
# NOTE(review): the output saved with this cell lists EW10/MH1A/MH4B/COPC/INCOMEPC,
# not these HB* codes - confirm which selection is intended for this cohort.
filter = ['HB1', 'HB2', 'HB3', 'HB4', 'HB5', 'HB6']
lookup = get_lookup()

# Echo "code ==> description" for every selected column
print_items(filter, lookup)

# Narrow the cohort frame to the selected columns and preview it
filtered_df = filter_dataframe(format_df, filter)
filtered_df.head()

EW10  ==>  Your general health
MH1A  ==>  Age at marriage (in years)
MH4B  ==>  Did you have any say in choosing your husband
COPC  ==>  Household expenditure /capita
INCOMEPC  ==>  per capita income


Unnamed: 0,EW10,MH1A,MH4B,COPC,INCOMEPC
2,1,2,,3.0,3.0
49,2,2,1.0,3.0,1.0
54,2,2,1.0,3.0,2.0
58,2,1,0.0,3.0,2.0
64,2,2,1.0,2.0,2.0


In [0]:
# Encode each row of filtered_df as a transaction of "<column>_<value>" items
trans_df = get_transaction_df(filtered_df)

In [0]:
lookup = get_lookup()
# Mine itemsets with a minimum support of 0.1; no upper support cap in this pass
frequent_itemsets = get_frequent_itemsets(trans_df, min_sup=0.1)
frequent_itemsets_filtered = frequent_itemsets
print_freq_itemsets(frequent_itemsets_filtered, lookup)

# Did you have any say in choosing your husband  " 0 ",    Support :  0.3390161798240851
# Did you have any say in choosing your husband  " 1 ",    Support :  0.3630144423933109

# Age at marriage (in years)  " 1.0 ",    Support :  0.41144532522532307
# Age at marriage (in years)  " 2.0 ",    Support :  0.5817135411010967

# Your general health  " 1 ",    Support :  0.9354978825062439

# Household expenditure /capita  " 1.0 ",    Support :  0.23216418720816592
# Household expenditure /capita  " 2.0 ",    Support :  0.5225323053534586
# Household expenditure /capita  " 3.0 ",    Support :  0.244977739168205

# per capita income  " 1.0 ",    Support :  0.2203279400586383
# per capita income  " 2.0 ",    Support :  0.5412096861765664
# per capita income  " 3.0 ",    Support :  0.23846237376479532


Household expenditure /capita  " 1.0 ",    Support :  0.23216418720816592
Household expenditure /capita  " 2.0 ",    Support :  0.5225323053534586
Household expenditure /capita  " 3.0 ",    Support :  0.244977739168205
Your general health  " 1 ",    Support :  0.9354978825062439
per capita income  " 1.0 ",    Support :  0.2203279400586383
per capita income  " 2.0 ",    Support :  0.5412096861765664
per capita income  " 3.0 ",    Support :  0.23846237376479532
Age at marriage (in years)  " 1.0 ",    Support :  0.41144532522532307
Age at marriage (in years)  " 2.0 ",    Support :  0.5817135411010967
Did you have any say in choosing your husband  " 0 ",    Support :  0.3390161798240851
Did you have any say in choosing your husband  " 1 ",    Support :  0.3630144423933109
Your general health  " 1 ",  Household expenditure /capita  " 1.0 ",    Support :  0.21707025735693344
per capita income  " 1.0 ",  Household expenditure /capita  " 1.0 ",    Support :  0.10424584645455533
per capita inco

## **Father, Mother, Daughter Uneducated**

In [0]:
# Cohort: respondent (EW8), mother (EW15A) and father (EW15B) education codes all 0.
# 0 is also what format_data substitutes for a missing value in these columns.
format_df = format_data(df)
format_df = format_df[
    (format_df['EW8'] == 0)
    & (format_df['EW15A'] == 0)
    & (format_df['EW15B'] == 0)
]
format_df.shape

(12430, 350)

In [0]:
# Column codes analysed together in this pass
# (a stray `a` after the opening bracket made this cell a SyntaxError)
filter = [
'EW10',
'MH1A',
'MH4B',
'COPC',
'INCOMEPC'
]

lookup = get_lookup()

# Echo "code ==> description" for every selected column
print_items(filter, lookup)

# Narrow the cohort frame to the selected columns and preview it
filtered_df = filter_dataframe(format_df, filter)
filtered_df.head()

EW10  ==>  Your general health
MH1A  ==>  Age at marriage (in years)
MH4B  ==>  Did you have any say in choosing your husband
COPC  ==>  Household expenditure /capita
INCOMEPC  ==>  per capita income


Unnamed: 0,EW10,MH1A,MH4B,COPC,INCOMEPC
0,2,1,0.0,2.0,2.0
3,1,2,,2.0,2.0
4,1,2,0.0,2.0,2.0
5,1,2,,2.0,2.0
6,1,2,1.0,2.0,2.0


In [0]:
# Encode each row of filtered_df as a transaction of "<column>_<value>" items
trans_df = get_transaction_df(filtered_df)

In [0]:
lookup = get_lookup()
# Mine itemsets with a minimum support of 0.1; no upper support cap in this pass
frequent_itemsets = get_frequent_itemsets(trans_df, min_sup=0.1)
frequent_itemsets_filtered = frequent_itemsets
print_freq_itemsets(frequent_itemsets_filtered, lookup)

# Did you have any say in choosing your husband  " 0 ",    Support :  0.5748189863234111
# Did you have any say in choosing your husband  " 1 ",    Support :  0.21263073209975863

# Age at marriage (in years)  " 1.0 ",    Support :  0.6594529364440869
# Age at marriage (in years)  " 2.0 ",    Support :  0.33604183427192275

# Your general health  " 1 ",    Support :  0.8946098149637972

# Household expenditure /capita  " 1.0 ",    Support :  0.37248592115848755
# Household expenditure /capita  " 2.0 ",    Support :  0.48865647626709574
# Household expenditure /capita  " 3.0 ",    Support :  0.13861625100563155

# per capita income  " 1.0 ",    Support :  0.34408688656476266
# per capita income  " 2.0 ",    Support :  0.5312952534191472
# per capita income  " 3.0 ",    Support :  0.1246178600160901

Household expenditure /capita  " 1.0 ",    Support :  0.37248592115848755
Household expenditure /capita  " 2.0 ",    Support :  0.48865647626709574
Household expenditure /capita  " 3.0 ",    Support :  0.13861625100563155
Your general health  " 1 ",    Support :  0.8946098149637972
Your general health  " 2 ",    Support :  0.10225261464199517
per capita income  " 1.0 ",    Support :  0.34408688656476266
per capita income  " 2.0 ",    Support :  0.5312952534191472
per capita income  " 3.0 ",    Support :  0.1246178600160901
Age at marriage (in years)  " 1.0 ",    Support :  0.6594529364440869
Age at marriage (in years)  " 2.0 ",    Support :  0.33604183427192275
Did you have any say in choosing your husband  " 0 ",    Support :  0.5748189863234111
Did you have any say in choosing your husband  " 1 ",    Support :  0.21263073209975863
Your general health  " 1 ",  Household expenditure /capita  " 1.0 ",    Support :  0.3355591311343524
per capita income  " 1.0 ",  Household expenditure /c

# HB1  ==>  glasses of milk daily harmful during pregnancy
# HB2  ==>  Belief: Men physically weak months after sterilization
# HB3  ==>  Belief: 1st milk after birth good for baby
# HB4  ==>  Belief: Chulha smoke good for health
# HB5  ==>  Belief: Child diarrhea more to drink
# HB6  ==>  Belief: Illness spread through impure water
# AI1  ==>  HIV/AIDS Awareness

## **Father, Mother, Daughter Educated**

In [0]:
# Section cohort: rows whose EW8, EW15A and EW15B codes are all non-zero after format_data's recoding
# (the "educated" codes according to this section's heading).
format_df = format_data(df).loc[
    lambda rows: (rows['EW8'] != 0) & (rows['EW15A'] != 0) & (rows['EW15B'] != 0)
]
format_df.shape

(8056, 350)

In [0]:
# Columns mined in this section: the six health-belief items (HB1-HB6) plus HIV/AIDS awareness (AI1).
filter = [f'HB{number}' for number in range(1, 7)] + ['AI1']

# Print the column-code -> label mapping for the selected columns.
lookup = get_lookup()
print_items(filter, lookup)

# Restrict the cohort frame to the selected columns and preview it.
filtered_df = filter_dataframe(format_df, filter)
filtered_df.head()

HB1  ==>  glasses of milk daily harmful during pregnancy
HB2  ==>  Belief: Men physically weak months after sterilization
HB3  ==>  Belief: 1st milk after birth good for baby
HB4  ==>  Belief: Chulha smoke good for health
HB5  ==>  Belief: Child diarrhea more to drink
HB6  ==>  Belief: Illness spread through impure water
AI1  ==>  HIV/AIDS Awareness


Unnamed: 0,HB1,HB2,HB3,HB4,HB5,HB6,AI1
29,1,0.0,1,2,2,2,1
118,0,1.0,1,2,2,1,1
166,1,,1,2,2,2,1
170,0,0.0,1,2,2,2,1
217,0,0.0,1,2,2,2,1


In [0]:
# Convert the selected columns into the table that get_frequent_itemsets consumes in the next cell
# (get_transaction_df is defined earlier in the notebook; its exact output format is not shown here).
trans_df = get_transaction_df(filtered_df)

In [0]:
# Mine itemsets from trans_df and print them; lookup is passed to the printer to label the items.
lookup = get_lookup()
# 0.1 is the threshold handed to get_frequent_itemsets -- presumably minimum support; confirm against the helper.
frequent_itemsets = get_frequent_itemsets(trans_df, 0.1)
# The extra support filter is commented out, so every mined itemset is printed.
frequent_itemsets_filtered = frequent_itemsets#[ (frequent_itemsets['support'] < 0.05) ]
print_freq_itemsets(frequent_itemsets_filtered, lookup)

# HIV/AIDS Awareness  " 1 ",    Support :  0.9234111221449851

# glasses of milk daily harmful during pregnancy  " 0 ",    Support :  0.756578947368421
# glasses of milk daily harmful during pregnancy  " 1 ",    Support :  0.22902184707050646

# Belief: Men physically weak months after sterilization  " 0 ",    Support :  0.4637537239324727
# Belief: Men physically weak months after sterilization  " 1 ",    Support :  0.33887785501489576

# Belief: 1st milk after birth good for baby  " 1 ",    Support :  0.9073982125124131

# Belief: Chulha smoke good for health  " 2 ",    Support :  0.8138033763654419
# Belief: Chulha smoke good for health  " 3 ",    Support :  0.12487586891757696

# Belief: Child diarrhea more to drink  " 1 ",    Support :  0.10911122144985104
# Belief: Child diarrhea more to drink  " 2 ",    Support :  0.7218222442899702
# Belief: Child diarrhea more to drink  " 3 ",    Support :  0.1413853028798411

# Belief: Illness spread through impure water  " 2 ",    Support :  0.7841360476663356

HIV/AIDS Awareness  " 1 ",    Support :  0.9234111221449851
glasses of milk daily harmful during pregnancy  " 0 ",    Support :  0.756578947368421
glasses of milk daily harmful during pregnancy  " 1 ",    Support :  0.22902184707050646
Belief: Men physically weak months after sterilization  " 0 ",    Support :  0.4637537239324727
Belief: Men physically weak months after sterilization  " 1 ",    Support :  0.33887785501489576
Belief: 1st milk after birth good for baby  " 1 ",    Support :  0.9073982125124131
Belief: Chulha smoke good for health  " 2 ",    Support :  0.8138033763654419
Belief: Chulha smoke good for health  " 3 ",    Support :  0.12487586891757696
Belief: Child diarrhea more to drink  " 1 ",    Support :  0.10911122144985104
Belief: Child diarrhea more to drink  " 2 ",    Support :  0.7218222442899702
Belief: Child diarrhea more to drink  " 3 ",    Support :  0.1413853028798411
Belief: Illness spread through impure water  " 2 ",    Support :  0.7841360476663356
HIV/AIDS A

## **Father, Mother Uneducated** 
## **Daughter Educated**

In [0]:
# Section cohort: rows with a non-zero EW8 code but EW15A and EW15B both 0 after format_data's recoding
# (per the heading: daughter educated, father and mother uneducated).
format_df = format_data(df).loc[
    lambda rows: (rows['EW8'] != 0) & (rows['EW15A'] == 0) & (rows['EW15B'] == 0)
]
format_df.shape

(9209, 350)

In [0]:
# Columns mined in this section: the six health-belief items (HB1-HB6) plus HIV/AIDS awareness (AI1).
filter = [f'HB{number}' for number in range(1, 7)] + ['AI1']

# Print the column-code -> label mapping for the selected columns.
lookup = get_lookup()
print_items(filter, lookup)

# Restrict the cohort frame to the selected columns and preview it.
filtered_df = filter_dataframe(format_df, filter)
filtered_df.head()

HB1  ==>  glasses of milk daily harmful during pregnancy
HB2  ==>  Belief: Men physically weak months after sterilization
HB3  ==>  Belief: 1st milk after birth good for baby
HB4  ==>  Belief: Chulha smoke good for health
HB5  ==>  Belief: Child diarrhea more to drink
HB6  ==>  Belief: Illness spread through impure water
AI1  ==>  HIV/AIDS Awareness


Unnamed: 0,HB1,HB2,HB3,HB4,HB5,HB6,AI1
2,0,,1,2,2,4,1
49,0,1.0,1,2,3,4,0
54,0,1.0,1,2,2,4,1
58,0,1.0,2,2,2,4,1
64,0,0.0,3,3,3,4,1


In [0]:
# Convert the selected columns into the table that get_frequent_itemsets consumes in the next cell
# (get_transaction_df is defined earlier in the notebook; its exact output format is not shown here).
trans_df = get_transaction_df(filtered_df)

In [0]:
# Mine itemsets from trans_df and print them; lookup is passed to the printer to label the items.
lookup = get_lookup()
# 0.1 is the threshold handed to get_frequent_itemsets -- presumably minimum support; confirm against the helper.
frequent_itemsets = get_frequent_itemsets(trans_df, 0.1)
# The extra support filter is commented out, so every mined itemset is printed.
frequent_itemsets_filtered = frequent_itemsets#[ (frequent_itemsets['support'] < 0.05) ]
print_freq_itemsets(frequent_itemsets_filtered, lookup)

# HIV/AIDS Awareness  " 0 ",    Support :  0.2625692257574112
# HIV/AIDS Awareness  " 1 ",    Support :  0.7362362905852969

# glasses of milk daily harmful during pregnancy  " 0 ",    Support :  0.7293951569117167
# glasses of milk daily harmful during pregnancy  " 1 ",    Support :  0.2430231295471821

# Belief: Men physically weak months after sterilization  " 0 ",    Support :  0.3697469866435009
# Belief: Men physically weak months after sterilization  " 1 ",    Support :  0.41448582908024756

# Belief: 1st milk after birth good for baby  " 1 ",    Support :  0.8656748832663699
# Belief: 1st milk after birth good for baby  " 2 ",    Support :  0.10305136279726354

# Belief: Chulha smoke good for health  " 2 ",    Support :  0.8245194918014985
# Belief: Chulha smoke good for health  " 3 ",    Support :  0.1122814637854273

# Belief: Child diarrhea more to drink  " 1 ",    Support :  0.15387121294385928
# Belief: Child diarrhea more to drink  " 2 ",    Support :  0.640894776848735
# Belief: Child diarrhea more to drink  " 3 ",    Support :  0.1572374850689543

# Belief: Illness spread through impure water  " 1 ",    Support :  0.11380171571288956
# Belief: Illness spread through impure water  " 2 ",    Support :  0.6071234661743946
# Belief: Illness spread through impure water  " 4 ",    Support :  0.18373330437615376


HIV/AIDS Awareness  " 0 ",    Support :  0.2625692257574112
HIV/AIDS Awareness  " 1 ",    Support :  0.7362362905852969
glasses of milk daily harmful during pregnancy  " 0 ",    Support :  0.7293951569117167
glasses of milk daily harmful during pregnancy  " 1 ",    Support :  0.2430231295471821
Belief: Men physically weak months after sterilization  " 0 ",    Support :  0.3697469866435009
Belief: Men physically weak months after sterilization  " 1 ",    Support :  0.41448582908024756
Belief: 1st milk after birth good for baby  " 1 ",    Support :  0.8656748832663699
Belief: 1st milk after birth good for baby  " 2 ",    Support :  0.10305136279726354
Belief: Chulha smoke good for health  " 2 ",    Support :  0.8245194918014985
Belief: Chulha smoke good for health  " 3 ",    Support :  0.1122814637854273
Belief: Child diarrhea more to drink  " 1 ",    Support :  0.15387121294385928
Belief: Child diarrhea more to drink  " 2 ",    Support :  0.640894776848735
Belief: Child diarrhea more to

## **Father, Mother, Daughter Uneducated**

In [0]:
# Section cohort: rows whose EW8, EW15A and EW15B codes are all 0 after format_data's recoding
# (the "uneducated" code according to this section's heading).
format_df = format_data(df).loc[
    lambda rows: (rows['EW8'] == 0) & (rows['EW15A'] == 0) & (rows['EW15B'] == 0)
]
format_df.shape

(12430, 350)

In [0]:
# Columns mined in this section: the six health-belief items (HB1-HB6) plus HIV/AIDS awareness (AI1).
filter = [f'HB{number}' for number in range(1, 7)] + ['AI1']

# Print the column-code -> label mapping for the selected columns.
lookup = get_lookup()
print_items(filter, lookup)

# Restrict the cohort frame to the selected columns and preview it.
filtered_df = filter_dataframe(format_df, filter)
filtered_df.head()

HB1  ==>  glasses of milk daily harmful during pregnancy
HB2  ==>  Belief: Men physically weak months after sterilization
HB3  ==>  Belief: 1st milk after birth good for baby
HB4  ==>  Belief: Chulha smoke good for health
HB5  ==>  Belief: Child diarrhea more to drink
HB6  ==>  Belief: Illness spread through impure water
AI1  ==>  HIV/AIDS Awareness


Unnamed: 0,HB1,HB2,HB3,HB4,HB5,HB6,AI1
0,0,1.0,1,3,2,4,1
3,0,,1,2,2,4,0
4,0,,1,2,3,4,0
5,0,,1,2,3,4,0
6,0,0.0,2,2,3,4,0


In [0]:
# Convert the selected columns into the table that get_frequent_itemsets consumes in the next cell
# (get_transaction_df is defined earlier in the notebook; its exact output format is not shown here).
trans_df = get_transaction_df(filtered_df)

In [0]:
# Mine itemsets from trans_df and print them; lookup is passed to the printer to label the items.
lookup = get_lookup()
# 0.1 is the threshold handed to get_frequent_itemsets -- presumably minimum support; confirm against the helper.
frequent_itemsets = get_frequent_itemsets(trans_df, 0.1)
# The extra support filter is commented out, so every mined itemset is printed.
frequent_itemsets_filtered = frequent_itemsets#[ (frequent_itemsets['support'] < 0.05) ]
print_freq_itemsets(frequent_itemsets_filtered, lookup)

# HIV/AIDS Awareness  " 0 ",    Support :  0.651810136765889
# HIV/AIDS Awareness  " 1 ",    Support :  0.34746580852775544

# glasses of milk daily harmful during pregnancy  " 0 ",    Support :  0.7022526146419952
# glasses of milk daily harmful during pregnancy  " 1 ",    Support :  0.2424778761061947

# Belief: Men physically weak months after sterilization  " 0 ",    Support :  0.25583266291230894
# Belief: Men physically weak months after sterilization  " 1 ",    Support :  0.4966210780370072

# Belief: 1st milk after birth good for baby  " 1 ",    Support :  0.7568785197103781
# Belief: 1st milk after birth good for baby  " 2 ",    Support :  0.18640386162510056

# Belief: Chulha smoke good for health  " 2 ",    Support :  0.8340305711987128
# Belief: Chulha smoke good for health  " 3 ",    Support :  0.11134352373290426

# Belief: Child diarrhea more to drink  " 1 ",    Support :  0.21745776347546258
# Belief: Child diarrhea more to drink  " 2 ",    Support :  0.5287208366854385
# Belief: Child diarrhea more to drink  " 3 ",    Support :  0.1670957361222848

# Belief: Illness spread through impure water  " 1 ",    Support :  0.15591311343523734
# Belief: Illness spread through impure water  " 2 ",    Support :  0.3594529364440869
# Belief: Illness spread through impure water  " 3 ",    Support :  0.1001609010458568
# Belief: Illness spread through impure water  " 4 ",    Support :  0.34762670957361225

HIV/AIDS Awareness  " 0 ",    Support :  0.651810136765889
HIV/AIDS Awareness  " 1 ",    Support :  0.34746580852775544
glasses of milk daily harmful during pregnancy  " 0 ",    Support :  0.7022526146419952
glasses of milk daily harmful during pregnancy  " 1 ",    Support :  0.2424778761061947
Belief: Men physically weak months after sterilization  " 0 ",    Support :  0.25583266291230894
Belief: Men physically weak months after sterilization  " 1 ",    Support :  0.4966210780370072
Belief: 1st milk after birth good for baby  " 1 ",    Support :  0.7568785197103781
Belief: 1st milk after birth good for baby  " 2 ",    Support :  0.18640386162510056
Belief: Chulha smoke good for health  " 2 ",    Support :  0.8340305711987128
Belief: Chulha smoke good for health  " 3 ",    Support :  0.11134352373290426
Belief: Child diarrhea more to drink  " 1 ",    Support :  0.21745776347546258
Belief: Child diarrhea more to drink  " 2 ",    Support :  0.5287208366854385
Belief: Child diarrhea more 

# FP2A  ==>  Currently use contraceptives
# GR18A  ==>  Are you a member of a: Mahila Mandal
# GR18B  ==>  Are you a member of a: Self-help group

## **Father, Mother, Daughter Educated**

In [0]:
# Section cohort: rows whose EW8, EW15A and EW15B codes are all non-zero after format_data's recoding
# (the "educated" codes according to this section's heading).
format_df = format_data(df).loc[
    lambda rows: (rows['EW8'] != 0) & (rows['EW15A'] != 0) & (rows['EW15B'] != 0)
]
format_df.shape

(8056, 350)

In [0]:
# Columns mined in this section: contraceptive use (FP2A) and membership of a
# Mahila Mandal (GR18A) or self-help group (GR18B).
filter = ['FP2A', 'GR18A', 'GR18B']

# Print the column-code -> label mapping for the selected columns.
lookup = get_lookup()
print_items(filter, lookup)

# Restrict the cohort frame to the selected columns and preview it.
filtered_df = filter_dataframe(format_df, filter)
filtered_df.head()

FP2A  ==>  Currently use contraceptives
GR18A  ==>  Are you a member of a: Mahila Mandal
GR18B  ==>  Are you a member of a: Self-help group


Unnamed: 0,FP2A,GR18A,GR18B
29,1,0,0
118,0,0,0
166,0,0,0
170,1,0,0
217,1,0,0


In [0]:
# Convert the selected columns into the table that get_frequent_itemsets consumes in the next cell
# (get_transaction_df is defined earlier in the notebook; its exact output format is not shown here).
trans_df = get_transaction_df(filtered_df)

In [0]:
# Mine itemsets from trans_df and print them; lookup is passed to the printer to label the items.
lookup = get_lookup()
# 0.1 is the threshold handed to get_frequent_itemsets -- presumably minimum support; confirm against the helper.
frequent_itemsets = get_frequent_itemsets(trans_df, 0.1)
# The extra support filter is commented out, so every mined itemset is printed.
frequent_itemsets_filtered = frequent_itemsets#[ (frequent_itemsets['support'] < 0.05) ]
print_freq_itemsets(frequent_itemsets_filtered, lookup)

# Currently use contraceptives  " 0 ",    Support :  0.23882820258192652
# Currently use contraceptives  " 1 ",    Support :  0.6668321747765641

# Are you a member of a: Mahila Mandal  " 0 ",    Support :  0.9222939424031777

# Are you a member of a: Self-help group  " 0 ",    Support :  0.8753723932472691
# Are you a member of a: Self-help group  " 1 ",    Support :  0.12338629592850049

Currently use contraceptives  " 0 ",    Support :  0.23882820258192652
Currently use contraceptives  " 1 ",    Support :  0.6668321747765641
Are you a member of a: Mahila Mandal  " 0 ",    Support :  0.9222939424031777
Are you a member of a: Self-help group  " 0 ",    Support :  0.8753723932472691
Are you a member of a: Self-help group  " 1 ",    Support :  0.12338629592850049
Currently use contraceptives  " 0 ",  Are you a member of a: Mahila Mandal  " 0 ",    Support :  0.2228152929493545
Currently use contraceptives  " 0 ",  Are you a member of a: Self-help group  " 0 ",    Support :  0.21660873882820258
Are you a member of a: Mahila Mandal  " 0 ",  Currently use contraceptives  " 1 ",    Support :  0.6134558093346574
Currently use contraceptives  " 1 ",  Are you a member of a: Self-help group  " 0 ",    Support :  0.5752234359483615
Are you a member of a: Mahila Mandal  " 0 ",  Are you a member of a: Self-help group  " 0 ",    Support :  0.8357745779543198
Currently use contracepti

## **Father, Mother Uneducated** 
## **Daughter Educated**

In [0]:
# Section cohort: rows with a non-zero EW8 code but EW15A and EW15B both 0 after format_data's recoding
# (per the heading: daughter educated, father and mother uneducated).
format_df = format_data(df).loc[
    lambda rows: (rows['EW8'] != 0) & (rows['EW15A'] == 0) & (rows['EW15B'] == 0)
]
format_df.shape

(9209, 350)

In [0]:
# Columns mined in this section: contraceptive use (FP2A) and membership of a
# Mahila Mandal (GR18A) or self-help group (GR18B).
filter = ['FP2A', 'GR18A', 'GR18B']

# Print the column-code -> label mapping for the selected columns.
lookup = get_lookup()
print_items(filter, lookup)

# Restrict the cohort frame to the selected columns and preview it.
filtered_df = filter_dataframe(format_df, filter)
filtered_df.head()

FP2A  ==>  Currently use contraceptives
GR18A  ==>  Are you a member of a: Mahila Mandal
GR18B  ==>  Are you a member of a: Self-help group


Unnamed: 0,FP2A,GR18A,GR18B
2,1.0,0.0,0.0
49,0.0,0.0,0.0
54,1.0,0.0,0.0
58,1.0,,
64,,0.0,0.0


In [0]:
# Convert the selected columns into the table that get_frequent_itemsets consumes in the next cell
# (get_transaction_df is defined earlier in the notebook; its exact output format is not shown here).
trans_df = get_transaction_df(filtered_df)

In [0]:
# Mine itemsets from trans_df and print them; lookup is passed to the printer to label the items.
lookup = get_lookup()
# 0.1 is the threshold handed to get_frequent_itemsets -- presumably minimum support; confirm against the helper.
frequent_itemsets = get_frequent_itemsets(trans_df, 0.1)
# The extra support filter is commented out, so every mined itemset is printed.
frequent_itemsets_filtered = frequent_itemsets#[ (frequent_itemsets['support'] < 0.05) ]
print_freq_itemsets(frequent_itemsets_filtered, lookup)

# Currently use contraceptives  " 0 ",    Support :  0.22749484200238898
# Currently use contraceptives  " 1 ",    Support :  0.6567488326636985

# Are you a member of a: Mahila Mandal  " 0 ",    Support :  0.9359322401998046

# Are you a member of a: Self-help group  " 0 ",    Support :  0.8264741014225214
# Are you a member of a: Self-help group  " 1 ",    Support :  0.17189705722662613


Currently use contraceptives  " 0 ",    Support :  0.22749484200238898
Currently use contraceptives  " 1 ",    Support :  0.6567488326636985
Are you a member of a: Mahila Mandal  " 0 ",    Support :  0.9359322401998046
Are you a member of a: Self-help group  " 0 ",    Support :  0.8264741014225214
Are you a member of a: Self-help group  " 1 ",    Support :  0.17189705722662613
Currently use contraceptives  " 0 ",  Are you a member of a: Mahila Mandal  " 0 ",    Support :  0.21381257465522857
Currently use contraceptives  " 0 ",  Are you a member of a: Self-help group  " 0 ",    Support :  0.19904441307416657
Are you a member of a: Mahila Mandal  " 0 ",  Currently use contraceptives  " 1 ",    Support :  0.6143989575415355
Currently use contraceptives  " 1 ",  Are you a member of a: Self-help group  " 0 ",    Support :  0.5273102399826257
Are you a member of a: Self-help group  " 1 ",  Currently use contraceptives  " 1 ",    Support :  0.12835269844717123
Are you a member of a: Mahila M

## **Father, Mother, Daughter Uneducated**

In [0]:
# Section cohort: rows whose EW8, EW15A and EW15B codes are all 0 after format_data's recoding
# (the "uneducated" code according to this section's heading).
format_df = format_data(df).loc[
    lambda rows: (rows['EW8'] == 0) & (rows['EW15A'] == 0) & (rows['EW15B'] == 0)
]
format_df.shape

(12430, 350)

In [0]:
# Columns mined in this section: contraceptive use (FP2A) and membership of a
# Mahila Mandal (GR18A) or self-help group (GR18B).
filter = ['FP2A', 'GR18A', 'GR18B']

# Print the column-code -> label mapping for the selected columns.
lookup = get_lookup()
print_items(filter, lookup)

# Restrict the cohort frame to the selected columns and preview it.
filtered_df = filter_dataframe(format_df, filter)
filtered_df.head()

FP2A  ==>  Currently use contraceptives
GR18A  ==>  Are you a member of a: Mahila Mandal
GR18B  ==>  Are you a member of a: Self-help group


Unnamed: 0,FP2A,GR18A,GR18B
0,1,0,0
3,1,0,0
4,1,0,0
5,1,0,0
6,1,0,0


In [0]:
# Convert the selected columns into the table that get_frequent_itemsets consumes in the next cell
# (get_transaction_df is defined earlier in the notebook; its exact output format is not shown here).
trans_df = get_transaction_df(filtered_df)

In [0]:
# Mine itemsets from trans_df and print them; lookup is passed to the printer to label the items.
lookup = get_lookup()
# 0.1 is the threshold handed to get_frequent_itemsets -- presumably minimum support; confirm against the helper.
frequent_itemsets = get_frequent_itemsets(trans_df, 0.1)
# The extra support filter is commented out, so every mined itemset is printed.
frequent_itemsets_filtered = frequent_itemsets#[ (frequent_itemsets['support'] < 0.05) ]
print_freq_itemsets(frequent_itemsets_filtered, lookup)

# Currently use contraceptives  " 0 ",    Support :  0.2407079646017699
# Currently use contraceptives  " 1 ",    Support :  0.6357200321802091

# Are you a member of a: Mahila Mandal  " 0 ",    Support :  0.9547868061142397

# Are you a member of a: Self-help group  " 0 ",    Support :  0.8625905068382944
# Are you a member of a: Self-help group  " 1 ",    Support :  0.13660498793242157

Currently use contraceptives  " 0 ",    Support :  0.2407079646017699
Currently use contraceptives  " 1 ",    Support :  0.6357200321802091
Are you a member of a: Mahila Mandal  " 0 ",    Support :  0.9547868061142397
Are you a member of a: Self-help group  " 0 ",    Support :  0.8625905068382944
Are you a member of a: Self-help group  " 1 ",    Support :  0.13660498793242157
Currently use contraceptives  " 0 ",  Are you a member of a: Mahila Mandal  " 0 ",    Support :  0.23121480289621882
Currently use contraceptives  " 0 ",  Are you a member of a: Self-help group  " 0 ",    Support :  0.21874497184231698
Are you a member of a: Mahila Mandal  " 0 ",  Currently use contraceptives  " 1 ",    Support :  0.6069991954947708
Currently use contraceptives  " 1 ",  Are you a member of a: Self-help group  " 0 ",    Support :  0.5366049879324215
Are you a member of a: Mahila Mandal  " 0 ",  Are you a member of a: Self-help group  " 0 ",    Support :  0.8366049879324216
Are you a member of a: Se

# Discrimination between boys' and girls' education

## **Father, Mother Educated**

In [0]:
# Cohort for this comparison: rows whose EW15A and EW15B codes are both non-zero after
# format_data's recoding. EW8 is not filtered in this cell.
format_df = format_data(df).loc[
    lambda rows: (rows['EW15A'] != 0) & (rows['EW15B'] != 0)
]
format_df.shape

(8347, 350)

In [0]:
# Drop rows with no EW18A value and treat a missing EW18B as the string '0'
# (both columns are compared against '0' when the boyEducated / girlEducated flags are built).
# Written without inplace=True: fillna(inplace=True) on a selected column is chained assignment,
# which pandas with Copy-on-Write does not propagate back to format_df, and an in-place dropna
# would mutate a filtered slice. The chain below builds a new frame instead.
format_df = (
    format_df
    .dropna(subset=['EW18A'])
    .assign(EW18B=lambda frame: frame['EW18B'].fillna('0'))
)

In [0]:
# boyEducated is 0 only when EW18A is the string '0'; girlEducated is 0 only when
# EW8 is 0 and EW18B is the string '0'. Every other row gets 1.
boy_flag_is_zero = format_df['EW18A'].eq('0')
girl_flag_is_zero = format_df['EW8'].eq(0) & format_df['EW18B'].eq('0')

format_df['boyEducated'] = np.where(boy_flag_is_zero, 0, 1)
format_df['girlEducated'] = np.where(girl_flag_is_zero, 0, 1)

format_df.head()

Unnamed: 0,SURVEY,STATEID,DISTID,PSUID,HHID,HHSPLITID,PERSONID,IDPSU,IDHH,IDPERSON,GE10A,GE10B,GE11,GE12,GE13,CD3D,CD3M,CD3Y,CD3DATE,CD4A,CD4B,CD4C,EW5,EW6,EW8,EW9,EW10,EW11,EW12A,EW12B,EW12C,EW12D,EW13A,EW14A,EW14B,EW14C,EW14D,EW15A,EW15B,EW15C,...,WKANY5,WKANIMAL,WKBUSINESS,WKAGLAB,WKFARM,WKNONAG,WKSALARY,WKNREGA,WKHOURS,NFHOURS,WKDAYS,NFDAYS,RSUNEARN,SPRO10,SPRO3,SPRO4,SPRO5,SPRO6,SPRO8,SPED2,SPED3,SPED4,SPED6,SPWKANY5,SPWKANIMAL,SPWKBUSINESS,SPWKAGLAB,SPWKFARM,SPWKNONAG,SPWKHOURS,SPWKDAYS,EWQELIGIBLE,AGERANK,NEVMFEM,EWPOSITION,NEWQELIGIBLE,WTEW,FWTEW,boyEducated,girlEducated
29,2,1,2,2,70,1,4,10202,102020701,10202070104,1.0,1.0,1.0,2.0,3.0,20,6,2012,19164,11,45,1,4,1,1,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,...,3,0,0,0,3,0,0,0,600,0,150,0,444530.0,2,1,3,33,1,4,1,1,1,9,0,0,0,0,0,0,0,0,1,2,2,3,1,5720.114258,5720,1,1
118,2,1,2,7,20,1,4,10207,102070201,10207020104,,,,,,14,7,2012,19188,8,50,1,4,1,15,0,1,2,1,1,1,1,2,1,1,0,1,1,1,0,...,2,0,0,0,2,0,0,0,150,0,30,0,1062160.75,2,1,3,29,1,4,1,2,1,16,4,0,0,0,2,0,2615,365,1,2,2,3,2,2108.087646,2108,1,1
166,2,1,3,2,60,2,2,10302,103020602,10302060202,1.0,1.0,1.0,3.0,9.0,11,6,2012,19155,10,15,1,2,1,15,2,2,3,1,1,1,0,2,1,1,0,1,1,2,0,...,4,0,0,0,0,0,4,0,2190,0,365,0,240000.0,69,1,1,32,1,2,1,2,1,15,4,0,0,0,0,0,2190,365,1,1,1,1,1,2012.265381,2012,1,1
170,2,1,3,2,120,1,2,10302,103021201,10302120102,1.0,1.0,1.0,3.0,3.0,9,6,2012,19153,3,15,2,2,3,16,3,1,1,1,0,1,1,2,1,1,1,1,16,16,16,...,4,0,0,0,0,0,4,0,3285,0,365,0,547400.0,69,1,1,47,1,2,1,2,1,16,4,0,0,0,0,0,3285,365,1,1,1,1,1,2012.265381,2012,0,1
200,2,1,3,5,40,1,2,10305,103050401,10305040102,1.0,1.0,1.0,2.0,9.0,2,7,2012,19176,10,50,1,2,3,0,4,1,3,1,1,1,1,2,1,1,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0,0,0,57000.0,69,1,1,42,1,2,1,2,1,12,4,0,0,0,0,0,2430,270,1,1,1,1,1,2038.742432,2039,1,0


In [0]:
# Columns mined in this section: the two derived education flags.
filter = ['boyEducated', 'girlEducated']

# Print the column-code -> label mapping for the selected columns.
lookup = get_lookup()
print_items(filter, lookup)

# Restrict the cohort frame to the selected columns and preview it.
filtered_df = filter_dataframe(format_df, filter)
filtered_df.head()

boyEducated  ==>  Boy Educated
girlEducated  ==>  Girl Educated


Unnamed: 0,boyEducated,girlEducated
29,1,1
118,1,1
166,1,1
170,0,1
200,1,0


In [0]:
# Convert the selected columns into the table that get_frequent_itemsets consumes in the next cell
# (get_transaction_df is defined earlier in the notebook; its exact output format is not shown here).
trans_df = get_transaction_df(filtered_df)

In [0]:
# Mine itemsets from trans_df and print them; lookup is passed to the printer to label the items.
lookup = get_lookup()
# This cell passes 0.01 as the threshold (other sections in this notebook pass 0.1) --
# presumably minimum support; confirm against the helper.
frequent_itemsets = get_frequent_itemsets(trans_df, 0.01)
# The extra support filter is commented out, so every mined itemset is printed.
frequent_itemsets_filtered = frequent_itemsets#[ (frequent_itemsets['support'] < 0.05) ]
print_freq_itemsets(frequent_itemsets_filtered, lookup)

# Boy Educated  " 1 ",  Girl Educated  " 0 ",    Support :  0.011253395421032208
# Boy Educated  " 1 ",  Girl Educated  " 1 ",    Support :  0.9502004915276161

Boy Educated  " 0 ",    Support :  0.0385461130513517
Boy Educated  " 1 ",    Support :  0.9614538869486483
Girl Educated  " 0 ",    Support :  0.01332298538352089
Girl Educated  " 1 ",    Support :  0.9866770146164792
Boy Educated  " 0 ",  Girl Educated  " 1 ",    Support :  0.03647652308886302
Boy Educated  " 1 ",  Girl Educated  " 0 ",    Support :  0.011253395421032208
Boy Educated  " 1 ",  Girl Educated  " 1 ",    Support :  0.9502004915276161


## **Father, Mother Uneducated**

In [0]:
# Cohort for this comparison: rows whose EW15A and EW15B codes are both 0 after
# format_data's recoding. EW8 is not filtered in this cell.
format_df = format_data(df).loc[
    lambda rows: (rows['EW15A'] == 0) & (rows['EW15B'] == 0)
]
format_df.shape

(21639, 350)

In [0]:
# Drop rows with no EW18A value and treat a missing EW18B as the string '0'
# (both columns are compared against '0' when the boyEducated / girlEducated flags are built).
# Written without inplace=True: fillna(inplace=True) on a selected column is chained assignment,
# which pandas with Copy-on-Write does not propagate back to format_df, and an in-place dropna
# would mutate a filtered slice. The chain below builds a new frame instead.
format_df = (
    format_df
    .dropna(subset=['EW18A'])
    .assign(EW18B=lambda frame: frame['EW18B'].fillna('0'))
)

In [0]:
# boyEducated is 0 only when EW18A is the string '0'; girlEducated is 0 only when
# EW8 is 0 and EW18B is the string '0'. Every other row gets 1.
boy_flag_is_zero = format_df['EW18A'].eq('0')
girl_flag_is_zero = format_df['EW8'].eq(0) & format_df['EW18B'].eq('0')

format_df['boyEducated'] = np.where(boy_flag_is_zero, 0, 1)
format_df['girlEducated'] = np.where(girl_flag_is_zero, 0, 1)

format_df.head()

Unnamed: 0,SURVEY,STATEID,DISTID,PSUID,HHID,HHSPLITID,PERSONID,IDPSU,IDHH,IDPERSON,GE10A,GE10B,GE11,GE12,GE13,CD3D,CD3M,CD3Y,CD3DATE,CD4A,CD4B,CD4C,EW5,EW6,EW8,EW9,EW10,EW11,EW12A,EW12B,EW12C,EW12D,EW13A,EW14A,EW14B,EW14C,EW14D,EW15A,EW15B,EW15C,...,WKANY5,WKANIMAL,WKBUSINESS,WKAGLAB,WKFARM,WKNONAG,WKSALARY,WKNREGA,WKHOURS,NFHOURS,WKDAYS,NFDAYS,RSUNEARN,SPRO10,SPRO3,SPRO4,SPRO5,SPRO6,SPRO8,SPED2,SPED3,SPED4,SPED6,SPWKANY5,SPWKANIMAL,SPWKBUSINESS,SPWKAGLAB,SPWKFARM,SPWKNONAG,SPWKHOURS,SPWKDAYS,EWQELIGIBLE,AGERANK,NEVMFEM,EWPOSITION,NEWQELIGIBLE,WTEW,FWTEW,boyEducated,girlEducated
0,2,1,2,1,10,1,2,10201,102010101,10201010102,1,1,2,2,9,3,7,2012,19177,10,50,1,2,3,0,4,2,2,1,0,1,1,2.0,0,0,0,0,0,0,0,...,2,3,0,0,2,0,0,0,24,0,6,0,175804.109375,69,1,1,57,1,2,0,0,0,0,4,0,0,0,0,3,4000,365,1,1,2,2,2,3687.92627,3688,1,1
2,2,1,2,1,20,1,8,10201,102010201,10201020108,1,1,1,3,3,3,7,2012,19177,10,0,1,4,2,2,3,1,3,1,1,0,1,2.0,0,0,0,1,0,0,0,...,4,0,0,0,0,0,4,0,2555,0,365,0,999500.0,75,1,3,37,1,8,1,2,1,16,4,0,0,0,0,0,2555,365,1,2,3,3,1,11063.779297,11064,1,1
3,2,1,2,1,30,1,2,10201,102010301,10201030102,1,1,1,2,9,3,7,2012,19177,12,25,2,2,3,0,5,1,2,0,0,0,0,,0,0,0,0,0,0,0,...,3,3,0,0,3,0,0,0,240,0,60,0,178200.0,75,1,1,45,1,2,1,2,1,10,4,0,0,0,0,0,2555,365,1,1,1,1,1,3687.92627,3688,0,0
4,2,1,2,1,40,1,2,10201,102010401,10201040102,1,1,1,3,9,3,7,2012,19177,1,20,2,2,3,0,3,1,2,0,0,1,0,,0,0,0,0,0,0,0,...,3,3,0,0,3,0,0,0,450,0,90,0,89568.335938,69,1,1,57,1,2,0,0,0,0,4,0,0,0,3,4,2710,350,1,1,1,1,1,3687.92627,3688,0,0
5,2,1,2,1,50,1,2,10201,102010501,10201050102,1,1,2,3,9,3,7,2012,19177,2,45,2,2,2,0,3,1,2,0,0,0,0,,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,212600.0,75,1,1,50,1,2,1,0,1,8,0,0,0,0,0,0,0,0,1,1,3,2,2,5531.889648,5532,1,0


In [0]:
# Columns mined in this section: the two derived education flags.
filter = ['boyEducated', 'girlEducated']

# Print the column-code -> label mapping for the selected columns.
lookup = get_lookup()
print_items(filter, lookup)

# Restrict the cohort frame to the selected columns and preview it.
filtered_df = filter_dataframe(format_df, filter)
filtered_df.head()

boyEducated  ==>  Boy Educated
girlEducated  ==>  Girl Educated


Unnamed: 0,boyEducated,girlEducated
0,1,1
2,1,1
3,0,0
4,0,0
5,1,0


In [0]:
# Convert the selected columns into the table that get_frequent_itemsets consumes in the next cell
# (get_transaction_df is defined earlier in the notebook; its exact output format is not shown here).
trans_df = get_transaction_df(filtered_df)

In [0]:
# Mine itemsets from trans_df and print them; lookup is passed to the printer to label the items.
lookup = get_lookup()
# 0.1 is the threshold handed to get_frequent_itemsets -- presumably minimum support; confirm against the helper.
frequent_itemsets = get_frequent_itemsets(trans_df, 0.1)
# The extra support filter is commented out, so every mined itemset is printed.
frequent_itemsets_filtered = frequent_itemsets#[ (frequent_itemsets['support'] < 0.05) ]
print_freq_itemsets(frequent_itemsets_filtered, lookup)

# Boy Educated  " 1 ",  Girl Educated  " 0 ",    Support :  0.23913911914511135
# Boy Educated  " 1 ",  Girl Educated  " 1 ",    Support :  0.48596824128632776

Boy Educated  " 0 ",    Support :  0.27489263956856086
Boy Educated  " 1 ",    Support :  0.7251073604314391
Girl Educated  " 0 ",    Support :  0.44422251073604313
Girl Educated  " 1 ",    Support :  0.5557774892639569
Boy Educated  " 0 ",  Girl Educated  " 0 ",    Support :  0.20508339159093178
Boy Educated  " 1 ",  Girl Educated  " 0 ",    Support :  0.23913911914511135
Boy Educated  " 1 ",  Girl Educated  " 1 ",    Support :  0.48596824128632776
