In [2]:
import pandas as pd
from nltk.corpus import wordnet

In [3]:
# Fields in this file contain ", " themselves (the column name
# 'Category 3, Variable 4' is one example), so only split on commas that are
# immediately followed by a non-whitespace character.
# Regex separators are handled by pandas' python parsing engine; asking for it
# explicitly avoids relying on the implicit fallback. The raw string keeps
# `\S` from being read as an (invalid) string escape.
meta = pd.read_csv("data/ddi_metadata_1525463016734.csv", delimiter=r',(?=\S)', engine='python')

  """Entry point for launching an IPython kernel.


In [4]:
# Map each 'Cat 3 Seq' code to its occupation label from the metadata table.
occupationLookup = dict(zip(meta['Cat 3 Seq'], meta['Category 3, Variable 4']))
occupationLookup

{2: 'Accountant',
 4: 'Accoutrement-maker',
 6: 'Actor (Play)',
 7: 'Aeronaut',
 11: 'Agent and Factor (branch not specified)',
 16: 'Agricultural Implement-maker',
 24: 'Alkali-manufacturer',
 27: 'Alum Manufacturer',
 33: 'Anchor-smith and Chain-maker',
 34: 'Animal and Bird-dealer',
 35: 'Animal and Bird-preserver',
 38: 'Annatto-maker',
 40: 'Anti-dry-rot Works',
 42: 'Anvil-maker',
 47: 'Archery-goods-dealer and Maker',
 49: 'Architect',
 50: 'Architect, Naval',
 51: 'Armourer',
 53: 'Army (at Home)',
 55: 'Army Agent',
 56: 'Army Clothier',
 57: 'Army Contractor',
 58: 'Army, Half-pay',
 61: 'Articulator (Anatomical Instrument-maker)',
 66: 'Artist (Fine Arts)',
 69: 'Asphalte-manufacturer',
 71: 'Assayer',
 73: 'Assay-master',
 80: 'Attorney, Solicitor, Writer, and Law Student',
 87: 'Auctioneer, Appraiser, and House Agent',
 90: 'Aurist',
 91: 'Author',
 94: 'Axle-tree-maker',
 95: 'Baby-linen-dealer and Maker',
 101: 'Bacon and Ham-dealer and Factor',
 114: 'Baker',
 120: 'Bal

In [5]:
# Load the statistics table; later cells filter it by g_name and parse its
# cellRef column.
data = pd.read_csv('data/gbhgis_statistics_1525463016734.csv')

In [6]:
# Count how many rows each geographic unit name contributes.
data['g_name'].value_counts()

GREAT BRITAIN        3488
ENGLAND AND WALES    3436
ENGLAND              3429
SCOTLAND             2388
WALES                1660
Name: g_name, dtype: int64

In [7]:
# Keep only the rows whose unit name is exactly 'ENGLAND'.
isEngland = data['g_name'] == 'ENGLAND'
eng = data[isEngland]

In [8]:
# Peek at the first rows of the England subset to see the cellRef format.
eng.head()

Unnamed: 0,g_unit,g_name,g_suffix,cellRef,start_year,start_date,end_year,end_date,redistricted,g_data,g_data_credit
5096,10061325,ENGLAND,Dep,OCC_RAW1841:female/o20/1002,0,,1841,,E,37.0,
5097,10061325,ENGLAND,Dep,OCC_RAW1841:female/o20/1003,0,,1841,,E,14.0,
5098,10061325,ENGLAND,Dep,OCC_RAW1841:female/o20/1004,0,,1841,,E,593.0,
5099,10061325,ENGLAND,Dep,OCC_RAW1841:female/o20/101,0,,1841,,E,23.0,
5100,10061325,ENGLAND,Dep,OCC_RAW1841:female/o20/1014,0,,1841,,E,1.0,


In [9]:
def getGenderAndID(cellRef):
    """Split a cellRef into the '/'-separated fields after its last ':'.

    For example, 'OCC_RAW1841:female/o20/1002' yields
    ['female', 'o20', '1002']. A cellRef without any ':' is split on '/'
    as a whole.
    """
    _, _, fields = cellRef.rpartition(':')
    return fields.split('/')

In [10]:
# Each cellRef looks like 'OCC_RAW1841:female/o20/1002'. Parse it once into
# [sex, age, occupation code] and reuse the result for all three columns,
# instead of re-parsing the whole column for every field.
cellRefParts = eng['cellRef'].apply(getGenderAndID)

cleanData = pd.DataFrame(index=eng.index)
cleanData['sex'] = cellRefParts.apply(lambda parts: parts[0])
cleanData['age'] = cellRefParts.apply(lambda parts: parts[1])
# The third field is the numeric occupation code; translate it to its label.
# An unknown code raises KeyError rather than being silently dropped.
cleanData['occupation'] = cellRefParts.apply(lambda parts: occupationLookup[int(parts[2])])
cleanData['numPeople'] = eng['g_data']

In [11]:
# Display the cleaned table (sex, age, occupation label, head count).
# NOTE(review): this echoes the whole frame; consider cleanData.head() if a
# short preview is all that is needed.
cleanData

Unnamed: 0,sex,age,occupation,numPeople
5096,female,o20,Egg-merchant and Dealer,37.0
5097,female,o20,Embosser,14.0
5098,female,o20,Embroiderer,593.0
5099,female,o20,Bacon and Ham-dealer and Factor,23.0
5100,female,o20,Enameller,1.0
5101,female,o20,Engine and Machine-maker,45.0
5102,female,o20,Engineer and Engine-worker,45.0
5103,female,o20,"Engineer, Civil",0.0
5104,female,o20,Engine-turner,0.0
5105,female,o20,Engraver (all branches),25.0


In [12]:
# The file is read without a header row, so the column names are supplied here.
regularizedOccupations = pd.read_csv(
    'regularized_names.csv',
    header=None,
    names=['occupation', 'reg', 'unmodified'],
)

In [13]:
# 'reg' holds comma-joined regularized names; break each entry into a list.
regularizedOccupations['synonym_list'] = regularizedOccupations['reg'].map(
    lambda joinedNames: joinedNames.split(',')
)

In [14]:
# 'unmodified' holds comma-joined names as well; break each entry into a list.
regularizedOccupations['unmodified_synonym_list'] = regularizedOccupations['unmodified'].map(
    lambda joinedNames: joinedNames.split(',')
)

In [15]:
# Attach the regularized names to every census row. 'occupation' is the only
# column the two frames share, so it is the join key; the inner join keeps
# only occupations present in both frames.
cleanData2 = pd.merge(cleanData, regularizedOccupations, on='occupation', how='inner')

In [16]:
def getSyns(word):
    """Look up WordNet synonyms for a single word or phrase.

    Only noun synsets are requested. Returns the lemma names of the first
    synset WordNet returns, or None when there is no noun synset for `word`.
    """
    nounSynsets = wordnet.synsets(word, pos='n')
    if not nounSynsets:
        return None
    return nounSynsets[0].lemma_names()


In [17]:
def getSynsForAll(wordList):
    """Collect WordNet synonyms for every word or phrase in `wordList`.

    Each entry is looked up with getSyns; entries with no result contribute
    nothing. Underscores in the returned lemma names are replaced by spaces.
    Entries on the ignore list (compared case-insensitively) are skipped
    without a lookup.

    Returns a flat list of synonym strings, in lookup order; duplicates are
    kept. An empty `wordList` yields an empty list.
    """
    # Some words have synonym results that aren't occupations
    ignored = {"grinder", "barber", "mason"}
    allSyns = []
    for word in wordList:
        if word.lower() in ignored:
            continue
        syns = getSyns(word)
        if syns is not None:
            # Multi-word lemma names come back underscore-joined
            # (e.g. 'creative_person'); extend in place rather than
            # rebuilding the accumulated list on every iteration.
            allSyns.extend(syn.replace('_', ' ') for syn in syns)
    return allSyns

In [18]:
# Expand each row's regularized names with WordNet synonyms. Names that
# getSyns finds no noun synset for add nothing, so a row's list may be empty.
cleanData2['additional_synonyms'] = cleanData2['synonym_list'].apply(getSynsForAll)

In [19]:
# Same expansion for the 'unmodified' name lists.
cleanData2['additional_unmodified_synonyms'] = cleanData2['unmodified_synonym_list'].apply(getSynsForAll)

In [30]:
# Display the merged table with both synonym columns.
# NOTE(review): this echoes the whole frame; consider cleanData2.head() if a
# short preview is all that is needed.
cleanData2

Unnamed: 0,sex,age,occupation,numPeople,reg,unmodified,synonym_list,unmodified_synonym_list,additional_synonyms,additional_unmodified_synonyms
0,female,o20,Egg-merchant and Dealer,37.0,"Egg merchant,Egg dealer","Merchant,Dealer","[Egg merchant, Egg dealer]","[Merchant, Dealer]",[],"[merchant, merchandiser, trader, bargainer, de..."
1,female,u19,Egg-merchant and Dealer,3.0,"Egg merchant,Egg dealer","Merchant,Dealer","[Egg merchant, Egg dealer]","[Merchant, Dealer]",[],"[merchant, merchandiser, trader, bargainer, de..."
2,male,o20,Egg-merchant and Dealer,136.0,"Egg merchant,Egg dealer","Merchant,Dealer","[Egg merchant, Egg dealer]","[Merchant, Dealer]",[],"[merchant, merchandiser, trader, bargainer, de..."
3,male,u19,Egg-merchant and Dealer,10.0,"Egg merchant,Egg dealer","Merchant,Dealer","[Egg merchant, Egg dealer]","[Merchant, Dealer]",[],"[merchant, merchandiser, trader, bargainer, de..."
4,female,o20,Embosser,14.0,Embosser,Embosser,[Embosser],[Embosser],[],[]
5,female,u19,Embosser,4.0,Embosser,Embosser,[Embosser],[Embosser],[],[]
6,male,o20,Embosser,54.0,Embosser,Embosser,[Embosser],[Embosser],[],[]
7,male,u19,Embosser,6.0,Embosser,Embosser,[Embosser],[Embosser],[],[]
8,female,o20,Embroiderer,593.0,Embroiderer,Embroiderer,[Embroiderer],[Embroiderer],[embroiderer],[embroiderer]
9,female,u19,Embroiderer,209.0,Embroiderer,Embroiderer,[Embroiderer],[Embroiderer],[embroiderer],[embroiderer]


In [23]:
# Scratch check: take the first noun synset WordNet returns for 'artist'.
doc1 = wordnet.synsets('artist', pos='n')[0]

In [25]:
# Show that synset's lemma names; multi-word names are underscore-joined.
doc1.lemma_names()

['artist', 'creative_person']

In [26]:
# Same scratch check for 'author'; this rebinds doc1.
doc1 = wordnet.synsets('author', pos='n')[0]

In [27]:
# The lemma names are not guaranteed to start with the query word
# (here 'writer' comes before 'author').
doc1.lemma_names()

['writer', 'author']

In [28]:
# Persist the merged table to HDF5 under the key 'data'. The key is passed by
# keyword so the call does not depend on positional-argument order.
# The string and list-valued columns are object dtype, which is what the
# PyTables performance warning about pickling refers to.
cleanData2.to_hdf('census-wordlist.hdf', key='data')

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block1_values] [items->['sex', 'age', 'occupation', 'reg', 'unmodified', 'synonym_list', 'unmodified_synonym_list', 'additional_synonyms', 'additional_unmodified_synonyms']]

  return pytables.to_hdf(path_or_buf, key, self, **kwargs)


In [29]:
# Also export the merged table as CSV (the index is written as the first column).
# NOTE(review): the list-valued columns will presumably be written as their
# string representation - confirm downstream readers can parse them back.
cleanData2.to_csv('census-wordlist.csv')