In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
pd.options.display.width = 180
import numpy as np

data2000 = 'data/NFSopen_2000/' #http://data.defra.gov.uk/Food/NationalFoodSurvey/NFSopen_2000.zip
dataRefe = 'data/NFSopen_Reference/' #http://data.defra.gov.uk/Food/NationalFoodSurvey/NFSopen_Reference.zip

df_house2000 = pd.DataFrame.from_csv(data2000+'2000 household data.txt', sep='\t', index_col=None)

df_diary = pd.DataFrame.from_csv(data2000+'2000 diary data.txt', sep='\t', index_col=None)
df_diary.columns=['hhno', 'fooditem', 'logday', 'purchasevalue', 'minor', 'quantity', 'purchasefree']
df_diary = df_diary[['hhno', 'minor', 'quantity']]
df_diary.head(3)

Unnamed: 0,hhno,minor,quantity
0,261119,31901,6.77
1,261119,26001,28.21
2,261119,1503,1.75


In [2]:
foods = pd.crosstab(df_diary.hhno, df_diary.minor, values = df_diary.quantity, aggfunc=np.sum)
foods = foods.fillna(0)
foods.reset_index(level=0, inplace=True)
foods.head(3)

minor,hhno,402,403,404,501,601,901,1101,1201,1301,...,38001,38101,38201,38301,38401,38501,38601,38701,38801,38901
0,261117,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.39,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,261118,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,261119,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,75.0,0.0,0.0,0.0,0.0,0.0


In [3]:
# 315 detailed "minor" foodcodes --- then aggregated into 183 "major" groups.
df_min_maj = pd.DataFrame.from_csv(dataRefe+'Ref_ Minor and major foods.txt', sep='\t', index_col=None)
df_min_maj.columns=['minor', 'minor_text', 'major']
df_maj_text = pd.DataFrame.from_csv(dataRefe+'Ref_ Major food codes.txt', sep='\t', index_col=None)
df_maj_text.columns=['major', 'major_text']
#Each "foodcode" was described with one of 7 "units": pints, ounces, etc.
df_min_units = pd.DataFrame.from_csv(dataRefe+'Ref_MINFD_Minor_food_codes.txt', sep='\t', index_col=None)
df_min_units.columns=['minor','minor_text','units']
df_min_units.drop(['minor_text'], inplace=True, axis=1) #to avoid duplicate later

#24 more aggregated "groups" were defined, and the 183 previous "detailed groups" were mapped to these 24.
df_grp_text = pd.DataFrame.from_csv(dataRefe+'Ref_ food groups (standard).txt', sep='\t', index_col=None)
df_grp_text.columns=['group','group_text']
df_mapping = pd.DataFrame.from_csv(dataRefe+'Ref_ Major-food group mapping.txt', sep='\t', index_col=None)
df_mapping.columns=['major','group']
#The mapping included 92 groups, and those 24 had to be selected.
group24 = [4006, 9017, 22023, 31041, 46094, 100127, 129129, 135148, 150154, 155161, 162171, 172183, \
           184208, 210231, 233248, 251263, 264264, 267277, 281301, 304313, 314339, 340344, 350354, 380389]
df_maj_group = df_mapping[df_mapping['group'].isin(group24) == True]

In [4]:
df = pd.merge(df_diary, df_min_maj, how='left', on='minor')
df = pd.merge(df, df_min_units, how='left', on='minor')
df = pd.merge(df, df_maj_text, how='left', on='major')
df = pd.merge(df, df_maj_group, how='left', on='major')
df_diary = pd.merge(df, df_grp_text, how='left', on='group')
df_diary = df_diary[['hhno', 'quantity', 'units', 'minor_text', 'group_text', 'minor', 'major', 'major_text', 'group']]
df_diary.head(3)

Unnamed: 0,hhno,quantity,units,minor_text,group_text,minor,major,major_text,group
0,261119,6.77,oz,SOUPS DEHYDRATED & POWDERED,ALL OTHER FOODS,31901,319,SOUPS DEHYDRATED AND POWDERED,314339
1,261119,28.21,oz,BREAD WHOLEMEAL SLICED,ALL BREAD,26001,260,BREAD WHOLEMEAL SLICED,251263
2,261119,1.75,pt,SEMI AND OTHER SKIMMED MILKS,OTHER MILK & CREAM,1503,15,SKIMMED MILKS,9017


In [5]:
df_diary = df_diary[['hhno', 'quantity', 'group']]

groups = pd.crosstab(df_diary.hhno, df_diary.group, values = df_diary.quantity, aggfunc=np.sum)
groups = groups.fillna(0)
groups.reset_index(level=0, inplace=True)
groups.head(3)

group,hhno,4006,9017,22023,31041,46094,100127,129129,135148,150154,...,233248,251263,264264,267277,281301,304313,314339,340344,350354,380389
0,261117,0.0,29.39,47.99,0.0,153.22,56.24,12.0,10.58,0.0,...,113.51,14.1,0.0,35.26,79.34,0.0,84.29,140.77,300.0,0.0
1,261118,0.0,2.0,0.0,0.0,64.81,7.93,0.0,0.0,0.0,...,0.0,42.31,0.0,14.1,4.76,14.1,0.0,0.0,0.0,0.0
2,261119,0.0,5.49,0.0,21.16,97.89,19.04,0.0,17.63,35.27,...,0.0,112.84,0.0,0.0,0.0,12.33,15.58,0.0,0.0,75.0


In [6]:
df = pd.merge(df_house2000, groups, how='left', on='hhno')
print "So we have {0:5} diary entries from {1:4} families".format(len(df_diary), len(df_house2000))
print "and for each family we now have the original 53 demographical variables + 18 food-group variables."
print "Here's a sample:"
df.head(3)

So we have 237424 diary entries from 6699 families
and for each family we now have the original 53 demographical variables + 18 food-group variables.
Here's a sample:


Unnamed: 0,hhno,gormet2,reg,lad,styr,stmth,mic,frez,owndw,memhh,...,233248,251263,264264,267277,281301,304313,314339,340344,350354,380389
0,261117,1,2,2002,2000,1,1,1,6,3,...,113.51,14.1,0.0,35.26,79.34,0.0,84.29,140.77,300.0,0.0
1,261118,1,2,2002,2000,1,1,1,1,1,...,0.0,42.31,0.0,14.1,4.76,14.1,0.0,0.0,0.0,0.0
2,261119,1,2,2002,2000,1,1,1,1,1,...,0.0,112.84,0.0,0.0,0.0,12.33,15.58,0.0,0.0,75.0


In [7]:
#now, build models!