## Fix the household composition inconsistencies and check with marital status

#### Load the data

In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import os
# Switch every locale category to the user's default locale (the empty
# string selects it), so that locale-aware number formatting such as the
# `n` format specifier follows the local conventions.
import locale
locale.setlocale(locale.LC_ALL, '')

'LC_CTYPE=en_US.UTF-8;LC_NUMERIC=it_IT.UTF-8;LC_TIME=it_IT.UTF-8;LC_COLLATE=en_US.UTF-8;LC_MONETARY=it_IT.UTF-8;LC_MESSAGES=en_US.UTF-8;LC_PAPER=it_IT.UTF-8;LC_NAME=it_IT.UTF-8;LC_ADDRESS=it_IT.UTF-8;LC_TELEPHONE=it_IT.UTF-8;LC_MEASUREMENT=it_IT.UTF-8;LC_IDENTIFICATION=it_IT.UTF-8'

In [2]:
# Read the three comma-separated dunnhumby tables into DataFrames.
hh_demographic = pd.read_csv(
    '../data/dunnhumby_complete_csv/hh_demographic.csv',
    sep=',',
)
transaction_data = pd.read_csv(
    '../data/dunnhumby_complete_csv/transaction_data.csv',
    sep=',',
)
product = pd.read_csv(
    '../data/dunnhumby_complete_csv/product.csv',
    sep=',',
)

We first change the marital status codes to more intuitive ones, setting the married entries to M and the single ones to S.

In [3]:
# Recode the marital status: 'A' becomes 'M' (married) and 'B' becomes 'S' (single).
# Any other code is left untouched.
hh_demographic['MARITAL_STATUS_CODE'] = (
    hh_demographic['MARITAL_STATUS_CODE'].replace({'A': 'M', 'B': 'S'})
)

#### Explore again the household composition information

We already know from the data exploration that 2 Adults No Kids and 2 Adults Kids make sense. So we do not take into account these compositions.

In [4]:
# Compositions already known to be consistent are skipped; for every other
# one, list the household sizes and kid categories that occur with it.
already_checked = ("2 Adults No Kids", "2 Adults Kids")
for hh_composition in hh_demographic['HH_COMP_DESC'].unique():
    if hh_composition in already_checked:
        continue
    print("Looking at the household composition:", hh_composition)
    rows_for_composition = hh_demographic.loc[hh_demographic['HH_COMP_DESC'] == hh_composition]
    sizes_seen = rows_for_composition['HOUSEHOLD_SIZE_DESC'].unique()
    kids_seen = rows_for_composition['KID_CATEGORY_DESC'].unique()
    print("Household size unique information:", sizes_seen)
    print("Kids number unique information:", kids_seen)
    print()

Looking at the household composition: Single Female
Household size unique information: ['1' '2']
Kids number unique information: ['None/Unknown']

Looking at the household composition: Unknown
Household size unique information: ['1' '3' '2' '5+']
Kids number unique information: ['None/Unknown' '1' '3+']

Looking at the household composition: Single Male
Household size unique information: ['1' '2']
Kids number unique information: ['None/Unknown']

Looking at the household composition: 1 Adult Kids
Household size unique information: ['2' '3' '5+' '4']
Kids number unique information: ['1' '2' '3+']



#### Analysing the composition 1 Adult Kids

In [5]:
# Restrict to the "1 Adult Kids" households and, for each household size
# found there, list the kid categories that appear with it.
hh_composition = "1 Adult Kids"
hh_demographic_1adultkids = hh_demographic.loc[hh_demographic['HH_COMP_DESC'] == hh_composition]
sizes_1adultkids = hh_demographic_1adultkids['HOUSEHOLD_SIZE_DESC']
for household_size in sizes_1adultkids.unique():
    kid_categories = hh_demographic_1adultkids.loc[
        sizes_1adultkids == household_size, 'KID_CATEGORY_DESC'
    ].unique()
    print(f"For house hold size {household_size}, with 1 adult, there are", kid_categories, "kid categories")

For house hold size 2, with 1 adult, there are ['1'] kid categories
For house hold size 3, with 1 adult, there are ['2' '1'] kid categories
For house hold size 5+, with 1 adult, there are ['3+'] kid categories
For house hold size 4, with 1 adult, there are ['2' '3+'] kid categories


We have problems where the household size is 3 or 4 units. Let's explore these in a better way.

In [6]:
# "1 Adult Kids" rows with 3 household members but only 1 kid:
# one member is not explained by the declared composition.
size3_with_1kid = (
    hh_demographic_1adultkids['HOUSEHOLD_SIZE_DESC'].eq('3')
    & hh_demographic_1adultkids['KID_CATEGORY_DESC'].eq('1')
)
hh_demographic_1adultkids.loc[size3_with_1kid]

Unnamed: 0,AGE_DESC,MARITAL_STATUS_CODE,INCOME_DESC,HOMEOWNER_DESC,HH_COMP_DESC,HOUSEHOLD_SIZE_DESC,KID_CATEGORY_DESC,household_key
183,35-44,M,35-49K,Probable Owner,1 Adult Kids,3,1,543
200,45-54,M,15-24K,Homeowner,1 Adult Kids,3,1,596
285,65+,M,35-49K,Homeowner,1 Adult Kids,3,1,857
464,25-34,M,15-24K,Homeowner,1 Adult Kids,3,1,1437
665,35-44,M,Under 15K,Homeowner,1 Adult Kids,3,1,2092
738,35-44,M,35-49K,Homeowner,1 Adult Kids,3,1,2302


We can see here that there are entries where the composition is 1 Adult Kids, the household size is 3 and the number of kids is 1. This means one of the following: there is one more person living in the house, the household size is wrong, the composition is wrong.<br>
However, we can notice that all of these entries have a marital status M, which stands for married. We can assume then that the composition is wrong and there is an actual couple living in the house, with 1 kid.

In [7]:
# "1 Adult Kids" rows with 4 household members but only 2 kids.
size4_with_2kids = (
    hh_demographic_1adultkids['HOUSEHOLD_SIZE_DESC'].eq('4')
    & hh_demographic_1adultkids['KID_CATEGORY_DESC'].eq('2')
)
hh_demographic_1adultkids.loc[size4_with_2kids]

Unnamed: 0,AGE_DESC,MARITAL_STATUS_CODE,INCOME_DESC,HOMEOWNER_DESC,HH_COMP_DESC,HOUSEHOLD_SIZE_DESC,KID_CATEGORY_DESC,household_key
88,45-54,M,75-99K,Homeowner,1 Adult Kids,4,2,250
117,45-54,M,50-74K,Homeowner,1 Adult Kids,4,2,350
249,35-44,M,15-24K,Renter,1 Adult Kids,4,2,742
504,25-34,M,100-124K,Renter,1 Adult Kids,4,2,1557


The same holds for a household size of 4: all of these entries are marked as married.

In [8]:
# NOTE(review): this cell selects exactly the same rows as cell In [7]
# (household size '4', kid category '2') - confirm whether a different
# size / kid combination was meant to be inspected here.
repeated_selection = (
    hh_demographic_1adultkids['HOUSEHOLD_SIZE_DESC'].eq('4')
    & hh_demographic_1adultkids['KID_CATEGORY_DESC'].eq('2')
)
hh_demographic_1adultkids.loc[repeated_selection]

Unnamed: 0,AGE_DESC,MARITAL_STATUS_CODE,INCOME_DESC,HOMEOWNER_DESC,HH_COMP_DESC,HOUSEHOLD_SIZE_DESC,KID_CATEGORY_DESC,household_key
88,45-54,M,75-99K,Homeowner,1 Adult Kids,4,2,250
117,45-54,M,50-74K,Homeowner,1 Adult Kids,4,2,350
249,35-44,M,15-24K,Renter,1 Adult Kids,4,2,742
504,25-34,M,100-124K,Renter,1 Adult Kids,4,2,1557


#### Analysing composition Single Male/Female

It does not make sense for the household size to be bigger than 1.

In [9]:
# Marital statuses found among "Single Female" households that nevertheless
# declare a household size of 2.
hh_composition = "Single Female"
is_single_female = hh_demographic['HH_COMP_DESC'] == hh_composition
has_two_members = hh_demographic['HOUSEHOLD_SIZE_DESC'] == '2'
hh_demographic_singlefemale_size2 = hh_demographic[is_single_female & has_two_members]

print(hh_demographic_singlefemale_size2['MARITAL_STATUS_CODE'].unique())

['M']


All of them look married, again.

In [10]:
# Marital statuses found among "Single Male" households that nevertheless
# declare a household size of 2.
hh_composition = "Single Male"
is_single_male = hh_demographic['HH_COMP_DESC'] == hh_composition
has_two_members = hh_demographic['HOUSEHOLD_SIZE_DESC'] == '2'
hh_demographic_singlemale_size2 = hh_demographic[is_single_male & has_two_members]

print(hh_demographic_singlemale_size2['MARITAL_STATUS_CODE'].unique())

['M']


As above.

We now want to check whether the marital status, the number of kids and the household size are always coherent with each other. If they are, then we can assume that it is the household composition information that is sometimes wrong. Hence, we can either correct this parameter or just discard it, since it does not carry any more information than the other three.

#### Are the marital status, the number of kids and the household size coherent with each other?

In [11]:
# For each marital status (in sorted order), count the rows per combination
# of household composition, household size and kid category.
breakdown_keys = ['HH_COMP_DESC', 'HOUSEHOLD_SIZE_DESC', 'KID_CATEGORY_DESC']
marital_codes = np.sort(hh_demographic['MARITAL_STATUS_CODE'].unique())
for marital_status in marital_codes:
    print("Marital status:", marital_status)
    hh_demographic_current_marital = hh_demographic.loc[
        hh_demographic['MARITAL_STATUS_CODE'] == marital_status
    ]
    combination_counts = hh_demographic_current_marital.groupby(breakdown_keys).size()
    print(combination_counts)
    print()

Marital status: M
HH_COMP_DESC      HOUSEHOLD_SIZE_DESC  KID_CATEGORY_DESC
1 Adult Kids      3                    1                      6
                  4                    2                      4
                  5+                   3+                     6
2 Adults Kids     3                    1                     55
                  4                    2                     31
                  5+                   3+                    49
2 Adults No Kids  2                    None/Unknown         135
Single Female     2                    None/Unknown          33
Single Male       2                    None/Unknown          12
Unknown           2                    None/Unknown           3
                  3                    1                      5
                  5+                   3+                     1
dtype: int64

Marital status: S
HH_COMP_DESC      HOUSEHOLD_SIZE_DESC  KID_CATEGORY_DESC
1 Adult Kids      2                    1                     5
     

We can conclude that, for the married entries, the marital status is always coherent with the household size and the number of kids. Combined with the findings above, we can say that in these cases it is the household composition that is wrong, so we will not take it into account.<br>
There are some inconsistencies between the household size and the number of kids when the marital status is Single, so we discard those entries.<br>
If the marital status is Unknown, we fall back on the household size / number of children information and we give the corresponding marital status, when it makes sense.

#### Cleaning up the dataset

In [12]:
# Work on an independent (deep) copy so the loaded table stays untouched.
hh_demographic_fxd = hh_demographic.copy(deep=True)

Dropping the entries marked as Single with inconsistencies in the household size / number of kids.

In [13]:
# (household size, kid category) pairs that imply one member beyond the
# single adult and the kids; Single rows matching any of them are dropped.
inconsistent_for_single = [
    ('2', 'None/Unknown'),
    ('3', '1'),
    ('4', '2'),
]

is_single = hh_demographic_fxd['MARITAL_STATUS_CODE'] == 'S'
dropindex = []
# Collect the labels pair by pair so the printed list keeps the same grouping.
for size_label, kids_label in inconsistent_for_single:
    matches = (
        is_single
        & (hh_demographic_fxd['HOUSEHOLD_SIZE_DESC'] == size_label)
        & (hh_demographic_fxd['KID_CATEGORY_DESC'] == kids_label)
    )
    dropindex.extend(hh_demographic_fxd.index[matches].tolist())
print(dropindex)
print(len(dropindex), "entries dropped.")

hh_demographic_fxd.drop(index=dropindex, inplace=True)

[5, 12, 20, 23, 42, 46, 73, 141, 157, 159, 185, 234, 296, 309, 356, 362, 406, 437, 511, 561, 580, 644, 647, 651, 677, 701, 769, 788, 54, 250, 374, 460, 608, 370, 733]
35 entries dropped.


Assigning the correct marital status to the entries marked as Unknown, when the household size and the number of children are coherent with each other.

In [14]:
# Rules used to infer the marital status of 'U' (Unknown) rows from the
# household size and kid category:
#   - size = kids + 2 adults -> 'M'
#   - size = kids + 1 adult  -> 'S'
# The (size, kids) pairs are mutually exclusive, so the order is irrelevant.
inference_rules = [
    ('3', '1', 'M'),
    ('4', '2', 'M'),
    ('2', 'None/Unknown', 'M'),
    ('2', '1', 'S'),
    ('3', '2', 'S'),
    ('4', '3+', 'S'),
    ('1', 'None/Unknown', 'S'),
]

for size_label, kids_label, inferred_status in inference_rules:
    still_unknown = hh_demographic_fxd['MARITAL_STATUS_CODE'] == 'U'
    rule_applies = (
        still_unknown
        & (hh_demographic_fxd['HOUSEHOLD_SIZE_DESC'] == size_label)
        & (hh_demographic_fxd['KID_CATEGORY_DESC'] == kids_label)
    )
    hh_demographic_fxd.loc[rule_applies, 'MARITAL_STATUS_CODE'] = inferred_status

# Rows that no rule could resolve are still 'U' and are discarded.
resolved = hh_demographic_fxd['MARITAL_STATUS_CODE'] != 'U'
hh_demographic_fxd = hh_demographic_fxd[resolved]

In [15]:
# Same breakdown as before the clean-up, now on the fixed table: per marital
# status, count rows by composition, household size and kid category.
breakdown_keys = ['HH_COMP_DESC', 'HOUSEHOLD_SIZE_DESC', 'KID_CATEGORY_DESC']
marital_codes = np.sort(hh_demographic_fxd['MARITAL_STATUS_CODE'].unique())
for marital_status in marital_codes:
    print("Marital status:", marital_status)
    hh_demographic_current_marital = hh_demographic_fxd.loc[
        hh_demographic_fxd['MARITAL_STATUS_CODE'] == marital_status
    ]
    combination_counts = hh_demographic_current_marital.groupby(breakdown_keys).size()
    print(combination_counts)
    print()

Marital status: M
HH_COMP_DESC      HOUSEHOLD_SIZE_DESC  KID_CATEGORY_DESC
1 Adult Kids      3                    1                      6
                  4                    2                      4
                  5+                   3+                     6
2 Adults Kids     3                    1                     83
                  4                    2                     44
                  5+                   3+                    49
2 Adults No Kids  2                    None/Unknown         227
Single Female     2                    None/Unknown          33
Single Male       2                    None/Unknown          12
Unknown           2                    None/Unknown           3
                  3                    1                      5
                  5+                   3+                     1
dtype: int64

Marital status: S
HH_COMP_DESC   HOUSEHOLD_SIZE_DESC  KID_CATEGORY_DESC
1 Adult Kids   2                    1                     14
          

In [16]:
# Number of rows before and after the clean-up. len() on a DataFrame is its
# row count, so there is no need to count non-null cells per row first.
# The `n` presentation type formats the numbers according to the current locale.
entries_before = len(hh_demographic)
entries_after = len(hh_demographic_fxd)
print(f"The number of entries goes from {entries_before:n} to {entries_after:n}.")

The number of entries goes from 801 to 759.


42 entries were discarded, either because the marital status Single did not match the household size / number of children, or because, for the Unknown marital status, the household size and the number of children did not carry enough information to infer the marital status.

In [17]:
# Final shape of the table:
#   - the household composition column is removed,
#   - the 'None/Unknown' kid category is recoded as '0',
#   - the kid column is renamed to KIDS_DESC,
#   - the index is renumbered from 0 without keeping the old labels.
hh_demographic_fxd = (
    hh_demographic_fxd
    .drop(columns=['HH_COMP_DESC'])
    .assign(
        KID_CATEGORY_DESC=lambda frame: frame['KID_CATEGORY_DESC'].replace('None/Unknown', '0')
    )
    .rename(columns={'KID_CATEGORY_DESC': 'KIDS_DESC'})
    .reset_index(drop=True)
)

In [18]:
# Save the fixed dataframe as a tab-separated file (index included).
# exist_ok=True creates the output directory only when it is missing,
# without a separate existence check beforehand.
os.makedirs("saved_structures", exist_ok=True)
hh_demographic_fxd.to_csv("saved_structures/hh_demographic_fix_hhcomp.csv", sep='\t')

In [19]:
# Display the cleaned demographic table as the cell output.
hh_demographic_fxd

Unnamed: 0,AGE_DESC,MARITAL_STATUS_CODE,INCOME_DESC,HOMEOWNER_DESC,HOUSEHOLD_SIZE_DESC,KIDS_DESC,household_key
0,65+,M,35-49K,Homeowner,2,0,1
1,45-54,M,50-74K,Homeowner,2,0,7
2,25-34,M,25-34K,Unknown,3,1,8
3,25-34,M,75-99K,Homeowner,4,2,13
4,45-54,S,50-74K,Homeowner,1,0,16
...,...,...,...,...,...,...,...
754,35-44,M,50-74K,Homeowner,2,0,2494
755,45-54,M,75-99K,Homeowner,3,1,2496
756,45-54,S,35-49K,Unknown,1,0,2497
757,25-34,M,50-74K,Homeowner,2,0,2498
