In [0]:
import pandas as pd
import numpy as np
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

In [0]:
def get_clean_data(file_name, min_valid_fraction=0.70):
  """Load a tab-separated file and keep only the well-populated columns.

  Cells that are empty or contain only whitespace are treated as missing.
  A column is kept when its number of non-missing values is at least
  ``int(n_rows * min_valid_fraction)``.

  Args:
    file_name: Location of the tab-separated file, passed to ``pd.read_csv``.
    min_valid_fraction: Share of rows (between 0 and 1) that must hold a
      valid value for a column to be kept. Defaults to 0.70.

  Returns:
    A DataFrame restricted to the retained columns, in their original order.

  Raises:
    ValueError: If ``min_valid_fraction`` is outside the range [0, 1].
  """
  if not 0 <= min_valid_fraction <= 1:
    raise ValueError("min_valid_fraction must be between 0 and 1")

  df = pd.read_csv(file_name, sep='\t', engine='python')

  # Replace the empty values by NaN value
  df = df.replace(r'^\s*$', np.nan, regex=True)

  # Find the number of valid values for each column
  valid_counts = df.count()
  print("NUmber of columns in the given data = ", valid_counts.shape[0])

  # Select the columns whose share of valid values reaches the threshold
  # (70% of the rows by default)
  required_count = int(df.shape[0] * min_valid_fraction)
  filter_columns_list = valid_counts[valid_counts >= required_count].index.tolist()
  print("Number of columns chosen for analysis = ", len(filter_columns_list))

  # Drop the remaining columns from data frame
  return df[filter_columns_list]

In [0]:
def filter_dataframe(dataframe, filter):
  """Return the result of indexing `dataframe` with `filter`.

  In this notebook `filter` is a list of column names, so the result is the
  sub-frame holding just those columns.
  """
  selection = dataframe[filter]
  return selection

In [0]:
def _bucket_education_years(dataframe, column):
  """Recode `column` of `dataframe` (modified in place) into education levels.

  Missing values become 0, 1-10 becomes 1, 11-12 becomes 2 and 13-14 becomes 3.
  Any other value (for example 15) is left as it is.
  """
  years = dataframe[column].fillna(0).astype(int)
  levels = years.copy()
  # Every mask is taken from the original years, never from recoded values.
  levels[years.between(1, 10)] = 1
  levels[years.between(11, 12)] = 2
  levels[years.between(13, 14)] = 3
  dataframe[column] = levels


def _bucket_by_quartiles(dataframe, column):
  """Recode `column` of `dataframe` (modified in place) by its quartiles.

  Values below the 25% quantile become 1, values between the 25% and 75%
  quantiles (inclusive) become 2, and values above the 75% quantile become 3.
  """
  values = dataframe[column]
  q_low, q_high = values.quantile([0.25, 0.75])
  levels = values.copy()
  # Masks come from the untouched values: assigning the codes one after the
  # other on the same column would re-test the codes 1/2/3 against the
  # quantiles and mislabel rows whenever the data is on a small scale.
  levels[values < q_low] = 1
  levels[values.between(q_low, q_high)] = 2
  levels[values > q_high] = 3
  dataframe[column] = levels


def format_data(format_df):
  """Return a copy of `format_df` with selected columns discretised.

  Only columns that are present are touched; the input frame is not modified.

  * EW6: binned with left-closed edges
    [0, 18, 20, 30, 40, 50, 60, 150] into the labels 0..6.
  * EW8, EW15A, EW15B: missing -> 0, 1-10 -> 1, 11-12 -> 2, 13-14 -> 3,
    other values unchanged.
  * MH1A: missing -> 0, then binned with left-closed edges
    [1, 18, 20, 30, 40, 50, 60] into the labels 1..6; values outside the bins
    (including the filled 0 and anything >= 60) become missing.
  * COPC (cast to float first) and INCOMEPC: 1 / 2 / 3 for below the 25%
    quantile / between the quantiles / above the 75% quantile.

  Args:
    format_df: DataFrame to discretise.

  Returns:
    A new DataFrame with the recoded columns.
  """
  dataframe = format_df.copy()

  if 'EW6' in dataframe.columns:
    bins = [0, 18, 20, 30, 40, 50, 60, 150]
    labels = [0, 1, 2, 3, 4, 5, 6]
    dataframe['EW6'] = pd.cut(dataframe['EW6'], bins=bins, labels=labels, right=False)

  for column in ('EW8', 'EW15A', 'EW15B'):
    if column in dataframe.columns:
      _bucket_education_years(dataframe, column)

  if 'MH1A' in dataframe.columns:
    bins = [1, 18, 20, 30, 40, 50, 60]
    labels = [1, 2, 3, 4, 5, 6]
    # Assign the filled column back instead of using fillna(inplace=True) on
    # a column selection, which does not update the frame under Copy-on-Write.
    marriage_age = dataframe['MH1A'].fillna(0).astype(int)
    dataframe['MH1A'] = pd.cut(marriage_age, bins=bins, labels=labels, right=False)

  if 'COPC' in dataframe.columns:
    # Read from the local copy, not from a notebook-level global frame.
    dataframe['COPC'] = dataframe['COPC'].astype(float)
    _bucket_by_quartiles(dataframe, 'COPC')

  if 'INCOMEPC' in dataframe.columns:
    _bucket_by_quartiles(dataframe, 'INCOMEPC')

  return dataframe

In [0]:
def is_nan(x):
  """Tell whether `x` is a NaN value.

  `x` counts as NaN when it is the `np.nan` object itself or when it does not
  compare equal to itself.
  """
  if x is np.nan:
    return True
  return x != x

In [0]:
def get_transaction_df(df_filtered):
  """One-hot encode the rows of `df_filtered` as transactions.

  Each row becomes a list of "<column>_<value>" strings, one per non-NaN
  cell. The lists are fitted and transformed with mlxtend's
  TransactionEncoder, and the encoded array is returned as a DataFrame whose
  columns are the encoder's item names.
  """
  column_names = df_filtered.columns
  transactions = [
    [name + "_" + str(row[name]) for name in column_names if not is_nan(row[name])]
    for _, row in df_filtered.iterrows()
  ]

  encoder = TransactionEncoder()
  encoded = encoder.fit(transactions).transform(transactions)
  return pd.DataFrame(encoded, columns=encoder.columns_)

In [0]:
def get_frequent_itemsets(trans_df, min_sup):
  """Run apriori on `trans_df` and record the size of every itemset.

  Args:
    trans_df: Encoded transaction frame handed to `apriori`.
    min_sup: Value passed as `min_support`.

  Returns:
    The apriori result (with column names as items) plus a 'length' column
    holding the number of items in each itemset.
  """
  itemsets_df = apriori(trans_df, min_support=min_sup, use_colnames=True)
  itemsets_df['length'] = itemsets_df['itemsets'].apply(len)
  return itemsets_df

In [0]:
def get_assoc_rules(freq_itemsets, min_conf):
  """Derive association rules filtered on confidence.

  Args:
    freq_itemsets: Frequent itemsets frame handed to `association_rules`.
    min_conf: Value passed as `min_threshold` for the "confidence" metric.

  Returns:
    The rules frame plus an 'antecedent_len' column holding the number of
    items in each rule's antecedents.
  """
  rules_df = association_rules(freq_itemsets, metric="confidence", min_threshold=min_conf)
  rules_df["antecedent_len"] = rules_df["antecedents"].apply(len)
  return rules_df

In [0]:
def get_lookup():
  """Return a fresh dict mapping survey column codes to descriptions."""
  return {
    'EW5': "Relationship to Household head",
    'EW6': "Age (in years)",
    'EW7Y': "Year of birth",
    'EW8': "Years of education completed",
    'EW9': "N children alive",
    'EW10': "Your general health",
    'EW13A': "Mother lives in household",
    'EW13B': "Father lives in household",
    'EW14A': "Mother attended school",
    'EW15A': "Mother education",
    'EW15B': "Father education",
    'EW16A': "Mother literate",
    'EW16B': "Father literate",
    'EW16C': "Mother-in-law literate",
    'EW18A': "Brother highest education",
    'EW18B': "Sister highest education",
    'HB1': "glasses of milk daily harmful during pregnancy",
    'HB2': "Belief: Men physically weak months after sterilization",
    'HB3': "Belief: 1st milk after birth good for baby",
    'HB4': "Belief: Chulha smoke good for health",
    'HB5': "Belief: Child diarrhea more to drink",
    'HB6': "Belief: Illness spread through impure water",
    'HB7': "Belief: How Malaria spreads",
    'HB8': "Belief: Pregnancy most likely during menstrual cycle",
    'AI1': "HIV/AIDS Awareness",
    'GR9F': "Can visit health centre alone (permission needed or not)",
    'GR13A': "Past 5 years: Been to metro city (beside current residence)",
    'GR16A': "Past 5 years: Been to another state",
    'GR17A': "Past 5 years: Been abroad",
    'GR18A': "Are you a member of a: Mahila Mandal",
    'GR18B': "Are you a member of a: Self-help group",
    'GR22': "Family outings to cinema, mela, or restaurant",
    'GR27A': "Family member has bank account",
    'MH1A': "Age at marriage (in years)",
    'MH1E': "Age first started menarche (in years)",
    'MH2': "Marriage status",
    'MH4A': "Who chose your husband",
    'MH4B': "Did you have any say in choosing your husband",
    'FP2A': "Currently use contraceptives",
    'URBAN2011': "Urban residence from census 2011",
    'METRO': "Largest 6 metro areas 0/1",
    'ID11': "Religion",
    'ID13': "Caste category",
    'COPC': "Household expenditure /capita",
    'INCOMEPC': "per capita income",
    'NPERSONS': "N in household",
    'ED2': "Education: Literacy",
    'ED4': "Education: Attended school",
  }

In [0]:
def format_item(item):
  """Split an item string on every underscore and return the pieces as a list.

  For an item shaped like "<column>_<value>" the result is [column, value].
  """
  pieces = item.split("_")
  return pieces

In [0]:
def print_freq_itemsets(freq_itemsets, lookup):
  """Print every frequent itemset on one line, followed by its support.

  Each item is split with `format_item`; its first part is translated through
  `lookup` and its second part is printed in double quotes.
  """
  for _, record in freq_itemsets.iterrows():
    for raw_item in record['itemsets']:
      parts = format_item(raw_item)
      print(lookup[parts[0]], " \"", parts[1], "\", ", end=" ")
    print("  Support : ", record['support'])

In [0]:
def print_assoc_rules(assoc_rules, lookup):
  """Print every rule as `antecedents  =====>  consequents` on one line.

  Each item is split with `format_item`; its first part is translated through
  `lookup` and its second part is printed in double quotes.
  """
  def _print_side(item_group):
    # Write all items of one side of the rule without ending the line.
    for raw_item in item_group:
      parts = format_item(raw_item)
      print(lookup[parts[0]], " \"", parts[1], "\", ", end=" ")

  for _, rule in assoc_rules.iterrows():
    left_side = rule['antecedents']
    right_side = rule['consequents']
    _print_side(left_side)
    print(" =====> ", end=" ")
    _print_side(right_side)
    print()

In [0]:
def print_items(items, lookup):
  """Print each entry of `items` next to its `lookup` description."""
  for code in items:
    description = lookup[code]
    print(code, " ==> ", description)

In [28]:
# Load the tab-separated survey file; get_clean_data drops the columns that
# have too few valid values and prints the column counts before and after.
df = get_clean_data("36151-0003-Data.tsv")
df.head()

NUmber of columns in the given data =  580
Number of columns chosen for analysis =  350


Unnamed: 0,SURVEY,STATEID,DISTID,PSUID,HHID,HHSPLITID,PERSONID,IDPSU,IDHH,IDPERSON,GE10A,GE10B,GE11,GE12,GE13,CD3D,CD3M,CD3Y,CD3DATE,CD4A,CD4B,CD4C,EW5,EW6,EW8,EW9,EW10,EW11,EW12A,EW12B,EW12C,EW12D,EW13A,EW14A,EW14B,EW14C,EW14D,EW15A,EW15B,EW15C,...,AP9,EWELIGIBLE,WKANY5,WKANIMAL,WKBUSINESS,WKAGLAB,WKFARM,WKNONAG,WKSALARY,WKNREGA,WKHOURS,NFHOURS,WKDAYS,NFDAYS,RSUNEARN,SPRO10,SPRO3,SPRO4,SPRO5,SPRO6,SPRO8,SPED2,SPED3,SPED4,SPED6,SPWKANY5,SPWKANIMAL,SPWKBUSINESS,SPWKAGLAB,SPWKFARM,SPWKNONAG,SPWKHOURS,SPWKDAYS,EWQELIGIBLE,AGERANK,NEVMFEM,EWPOSITION,NEWQELIGIBLE,WTEW,FWTEW
0,2,1,2,1,10,1,2,10201,102010101,10201010102,1.0,1.0,2.0,2.0,9.0,3,7,2012,19177,10,50,1,2,49,0,4,5,2,1,0,1,1,2.0,0,0,0,0,0,0,0,...,65.69999694825,1,2,3,0,0,2,0,0,0,24,0,6,0,175804.109375,69,1,1,57,1,2,0,0,0,0,4,0,0,0,0,3,4000,365,1,1,2,2,2,3687.92627,3688
1,2,1,2,1,10,1,6,10201,102010101,10201010106,,,,,,3,7,2012,19177,10,15,1,4,26,8,3,5,2,1,1,1,1,2.0,0,1,0,0,0,6,0,...,60.0,1,0,0,0,0,0,0,0,0,0,0,0,0,176100.0,2,1,3,29,1,6,1,1,1,9,4,0,0,0,0,3,3300,275,1,2,2,3,2,3687.92627,3688
2,2,1,2,1,20,1,8,10201,102010201,10201020108,1.0,1.0,1.0,3.0,3.0,3,7,2012,19177,10,0,1,4,33,12,3,3,3,1,1,0,1,2.0,0,0,0,1,0,0,0,...,60.29999923707,1,4,0,0,0,0,0,4,0,2555,0,365,0,999500.0,75,1,3,37,1,8,1,2,1,16,4,0,0,0,0,0,2555,365,1,2,3,3,1,11063.779297,11064
3,2,1,2,1,30,1,2,10201,102010301,10201030102,1.0,1.0,1.0,2.0,9.0,3,7,2012,19177,12,25,2,2,43,0,5,3,2,0,0,0,0,,0,0,0,0,0,0,0,...,79.09999847413,1,3,3,0,0,3,0,0,0,240,0,60,0,178200.0,75,1,1,45,1,2,1,2,1,10,4,0,0,0,0,0,2555,365,1,1,1,1,1,3687.92627,3688
4,2,1,2,1,40,1,2,10201,102010401,10201040102,1.0,1.0,1.0,3.0,9.0,3,7,2012,19177,1,20,2,2,47,0,3,3,2,0,0,1,0,,0,0,0,0,0,0,0,...,56.40000152588,1,3,3,0,0,3,0,0,0,450,0,90,0,89568.335938,69,1,1,57,1,2,0,0,0,0,4,0,0,0,3,4,2710,350,1,1,1,1,1,3687.92627,3688


In [36]:
# Discretise the columns handled by format_data; it works on a copy, so `df`
# keeps the raw values.
format_df = format_data(df)
format_df.head()

Unnamed: 0,SURVEY,STATEID,DISTID,PSUID,HHID,HHSPLITID,PERSONID,IDPSU,IDHH,IDPERSON,GE10A,GE10B,GE11,GE12,GE13,CD3D,CD3M,CD3Y,CD3DATE,CD4A,CD4B,CD4C,EW5,EW6,EW8,EW9,EW10,EW11,EW12A,EW12B,EW12C,EW12D,EW13A,EW14A,EW14B,EW14C,EW14D,EW15A,EW15B,EW15C,...,AP9,EWELIGIBLE,WKANY5,WKANIMAL,WKBUSINESS,WKAGLAB,WKFARM,WKNONAG,WKSALARY,WKNREGA,WKHOURS,NFHOURS,WKDAYS,NFDAYS,RSUNEARN,SPRO10,SPRO3,SPRO4,SPRO5,SPRO6,SPRO8,SPED2,SPED3,SPED4,SPED6,SPWKANY5,SPWKANIMAL,SPWKBUSINESS,SPWKAGLAB,SPWKFARM,SPWKNONAG,SPWKHOURS,SPWKDAYS,EWQELIGIBLE,AGERANK,NEVMFEM,EWPOSITION,NEWQELIGIBLE,WTEW,FWTEW
0,2,1,2,1,10,1,2,10201,102010101,10201010102,1.0,1.0,2.0,2.0,9.0,3,7,2012,19177,10,50,1,2,4,0,4,5,2,1,0,1,1,2.0,0,0,0,0,0,0,0,...,65.69999694825,1,2,3,0,0,2,0,0,0,24,0,6,0,175804.109375,69,1,1,57,1,2,0,0,0,0,4,0,0,0,0,3,4000,365,1,1,2,2,2,3687.92627,3688
1,2,1,2,1,10,1,6,10201,102010101,10201010106,,,,,,3,7,2012,19177,10,15,1,4,2,1,3,5,2,1,1,1,1,2.0,0,1,0,0,0,1,0,...,60.0,1,0,0,0,0,0,0,0,0,0,0,0,0,176100.0,2,1,3,29,1,6,1,1,1,9,4,0,0,0,0,3,3300,275,1,2,2,3,2,3687.92627,3688
2,2,1,2,1,20,1,8,10201,102010201,10201020108,1.0,1.0,1.0,3.0,3.0,3,7,2012,19177,10,0,1,4,3,2,3,3,3,1,1,0,1,2.0,0,0,0,1,0,0,0,...,60.29999923707,1,4,0,0,0,0,0,4,0,2555,0,365,0,999500.0,75,1,3,37,1,8,1,2,1,16,4,0,0,0,0,0,2555,365,1,2,3,3,1,11063.779297,11064
3,2,1,2,1,30,1,2,10201,102010301,10201030102,1.0,1.0,1.0,2.0,9.0,3,7,2012,19177,12,25,2,2,4,0,5,3,2,0,0,0,0,,0,0,0,0,0,0,0,...,79.09999847413,1,3,3,0,0,3,0,0,0,240,0,60,0,178200.0,75,1,1,45,1,2,1,2,1,10,4,0,0,0,0,0,2555,365,1,1,1,1,1,3687.92627,3688
4,2,1,2,1,40,1,2,10201,102010401,10201040102,1.0,1.0,1.0,3.0,9.0,3,7,2012,19177,1,20,2,2,4,0,3,3,2,0,0,1,0,,0,0,0,0,0,0,0,...,56.40000152588,1,3,3,0,0,3,0,0,0,450,0,90,0,89568.335938,69,1,1,57,1,2,0,0,0,0,4,0,0,0,3,4,2710,350,1,1,1,1,1,3687.92627,3688


In [51]:
# Columns to mine in this pass.
filter = [
'EW8',
'EW15A',
'EW15B',
'EW10',
'MH1A',
'MH4B'
]

# Build the code -> description table in this cell: it is needed right below,
# and no earlier cell defines `lookup`, so a fresh Restart & Run All would
# otherwise stop here with a NameError.
lookup = get_lookup()
print_items(filter, lookup)

filtered_df = filter_dataframe(format_df, filter)
filtered_df.head()

EW8  ==>  Years of education completed
EW15A  ==>  Mother education
EW15B  ==>  Father education
EW10  ==>  Your general health
MH1A  ==>  Age at marriage (in years)
MH4B  ==>  Did you have any say in choosing your husband


Unnamed: 0,EW8,EW15A,EW15B,EW10,MH1A,MH4B
0,0,0,0,5,1,0.0
1,1,0,1,5,2,0.0
2,2,0,0,3,3,
3,0,0,0,3,3,
4,0,0,0,3,3,0.0


In [0]:
# Encode the selected columns as a one-hot transaction frame.
trans_df = get_transaction_df(filtered_df)

In [53]:
lookup = get_lookup()
# Mine itemsets with a minimum support of 0.2 ...
frequent_itemsets = get_frequent_itemsets(trans_df, 0.2)
# ... and print only those whose support is below 0.3.
frequent_itemsets_filtered = frequent_itemsets[ (frequent_itemsets['support'] < 0.3) ]
print_freq_itemsets(frequent_itemsets_filtered, lookup)

# Years of education completed  " 0 ",  Father education  " 0 ",  Mother education  " 0 ",    Support :  0.31450041747843027
# Father education  " 0 ",  Age at marriage (in years)  " 1.0 ",  Mother education  " 0 ",    Support :  0.3032664524454115
# Years of education completed  " 0 ",  Father education  " 0 ",  Age at marriage (in years)  " 1.0 ",  Mother education  " 0 ",    Support :  0.20739822381904208
# Did you have any say in choosing your husband  " 0 ",  Father education  " 0 ",  Mother education  " 0 ",    Support :  0.2597727905270349

Your general health  " 1 ",    Support :  0.2389747741821218
Mother education  " 1 ",    Support :  0.21718999063836247
Age at marriage (in years)  " 2.0 ",    Support :  0.2544594286870936
Age at marriage (in years)  " 3.0 ",    Support :  0.27965994484224377
Your general health  " 2 ",  Years of education completed  " 1 ",    Support :  0.25736912683753765
Your general health  " 2 ",  Age at marriage (in years)  " 1.0 ",    Support :  0.241150722364193
Did you have any say in choosing your husband  " 0 ",  Your general health  " 2 ",    Support :  0.21086455987652758
Father education  " 1 ",  Mother education  " 0 ",    Support :  0.20026313791969233
Did you have any say in choosing your husband  " 1 ",  Mother education  " 0 ",    Support :  0.2129139994433621
Father education  " 0 ",  Years of education completed  " 1 ",    Support :  0.22814563671786048
Did you have any say in choosing your husband  " 0 ",  Father education  " 0 ",    Support :  0.265668091997065
Father education 

In [55]:
# Second pass: restrict the selection to four columns.
filter = [
'EW8',
'EW15A',
'EW15B',
'EW10',
]

print_items(filter, lookup)

filtered_df = filter_dataframe(format_df, filter)
filtered_df.head()

EW8  ==>  Years of education completed
EW15A  ==>  Mother education
EW15B  ==>  Father education
EW10  ==>  Your general health


Unnamed: 0,EW8,EW15A,EW15B,EW10
0,0,0,0,5
1,1,0,1,5
2,2,0,0,3
3,0,0,0,3
4,0,0,0,3


In [0]:
# Re-encode the narrower column selection as a one-hot transaction frame.
trans_df = get_transaction_df(filtered_df)

In [61]:
lookup = get_lookup()
# Mine itemsets with a minimum support of 0.02 ...
frequent_itemsets = get_frequent_itemsets(trans_df, 0.02)
# ... and print only those whose support is below 0.05.
frequent_itemsets_filtered = frequent_itemsets[ (frequent_itemsets['support'] < 0.05) ]
print_freq_itemsets(frequent_itemsets_filtered, lookup)

# Years of education completed  " 0 ",  Mother education  " 0 ",  Father education  " 0 ",  Your general health  " 3 ",    Support :  0.05525896313538952
# Years of education completed  " 2 ",  Father education  " 1 ",  Mother education  " 1 ",    Support :  0.028337929813020267
# Your general health  " 4 ",  Father education  " 0 ",  Mother education  " 0 ",    Support :  0.04354426536447132

Father education  " 15 ",    Support :  0.020899223237102447
Father education  " 2 ",    Support :  0.0373200414948258
Years of education completed  " 15 ",    Support :  0.038711636262429475
Years of education completed  " 2 ",  Your general health  " 1 ",    Support :  0.021885990435948688
Years of education completed  " 2 ",  Your general health  " 2 ",    Support :  0.035473015712370014
Mother education  " 1 ",  Your general health  " 3 ",    Support :  0.031095817625180273
Your general health  " 4 ",  Father education  " 0 ",    Support :  0.0446828429016016
Your general health  " 4 ",  Father education  " 1 ",    Support :  0.023606507603167775
Years of education completed  " 0 ",  Your general health  " 4 ",    Support :  0.037952584571009286
Your general health  " 4 ",  Years of education completed  " 1 ",    Support :  0.02978012802671862
Years of education completed  " 2 ",  Mother education  " 0 ",    Support :  0.02866685221263568
Father education  " 2 ",  Mother education 