Loading data

In [2]:
import xml.etree.cElementTree as ET
import pprint
import re
import codecs
import json
from collections import defaultdict
from pymongo import MongoClient

In [3]:
k = 100 # Parameter: take every k-th top level element
SAMPLE_FILE = "mapsimple.xml"
xmlFile = "map.xml"

def get_element(xml_file, tags=('node', 'way', 'relation')):
    """Yield element if it is the right type of tag

    Reference:
    http://stackoverflow.com/questions/3095434/inserting-newlines-in-xml-file-generated-via-xml-etree-elementtree-in-python
    """
    context = iter(ET.iterparse(xml_file, events=('start', 'end')))
    _, root = next(context)
    for event, elem in context:
        if event == 'end' and elem.tag in tags:
            yield elem
            root.clear()


with open(SAMPLE_FILE, 'wb') as output:
    output.write('<?xml version="1.0" encoding="UTF-8"?>\n')
    output.write('<osm>\n  ')

    # Write every kth top level element
    for i, element in enumerate(get_element(xmlFile)):
        if i % k == 0:
            output.write(ET.tostring(element, encoding='utf-8'))

    output.write('</osm>')

In [4]:
CREATED = [ "version", "changeset", "timestamp", "user", "uid"]
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')
lower = re.compile(r'^([a-z]|_)*$')
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')

#Changing this regex to match brasilian names of streets
street_type_re = re.compile(r'^\b\S+\.?', re.IGNORECASE)
street_types = defaultdict(set)


expected = [u"Rua", u"Avenida", u"Travessa", u"Beco", u"Rodovia", u"Estrada", u"Logradouro", u"Viela", u"Alameda", u"Via", u"Praça"]

# UPDATE THIS VARIABLE
mapping = { "R": "Rua",
            "R.": "Rua",
            "r." : "Rua",
            "rua": "Rua",
            "RUA": "Rua",
            "Rua-": "Rua ",
            "Ruas": "Rua",
            "Av.": "Avenida",
            "AV.": "Avenida",
            "AVENIDA":"Avenida",
            "Alamenda": "Alameda",
            "Rodovoa": "Rodovia"
            }


def audit_street_type(street_types, street_name):
    m = street_type_re.search(street_name)
    if m:
        street_type = m.group()
        if street_type not in expected:
            street_types[street_type].add(street_name)

def shape_element(element):
    node = {}
    if element.tag == "node" or element.tag == "way" :
        node["type"]=element.tag        
        
        for at in element.attrib:
            if at in CREATED :
                if node.get("created") == None:
                    node[u"created"] = {}
                node[u"created"].update({at : element.get(at)})
            elif at in "lon":
                if node.get("pos") ==None:
                    node[u"pos"] = [0,1]
                node[u"pos"][1] = float(element.get(at))
            elif at in "lat":
                if node.get("pos") ==None:
                    node[u"pos"] = [0,1]
                node[u"pos"][0]= float(element.get(at))
            else: 
                node[at] = element.get(at)
        for child in element:
            
            if child.get("k"):      
                if problemchars.search(child.get("k")):
                    continue                    
                elif re.search(':',child.get("k")) != None:
                    name = child.get("k").split(":")
                    if node.get(name[0]) == None:                            
                        node[name[0]] = {}
                    if name[1]==u"street":
                        audit_street_type(street_types, child.get("v"))
                    node[name[0]].update({name[1] : child.get("v")})                            
                else:
                    if node.get("caract")==None:
                        node[u"caract"]={}                        
                    node[u"caract"].update({child.get("k"):child.get("v")})
            if element.tag == "way":                
                if child.tag == "nd":                     
                    if node.get("node_refs") == None:
                        node[u"node_refs"] = []
                    node[u"node_refs"].append(child.get("ref"))                       
            
        return node
    else:
        return None

def key_type(element, keys):
    if element.tag == "way":
        for tag in element.iter("tag"):
            k = tag.get("k")
            if lower.search(k):
                keys['lower'] +=1
            elif lower_colon.search(k):
                keys['lower_colon'] +=1
            elif problemchars.search(k):
                keys['problemchars'] +=1
            else:
                keys['other'] +=1
    
    return keys    

def process_keys(file_in):
    keys = {"lower": 0, "lower_colon": 0, "problemchars": 0, "other": 0}
    for event, element in ET.iterparse(file_in, events=("start",)):
        keys = key_type(element, keys)
        
    return keys    
    
    
def process_map(file_in, pretty = False):
    #street_types = defaultdict(set)
    file_out = "{0}.json".format(file_in)
    data = []
    with codecs.open(file_out, "w") as fo:
        for _, element in ET.iterparse(file_in):
            el = shape_element(element)
            if el:
                data.append(el)
                if pretty:
                    fo.write(json.dumps(el, indent=2)+"\n")
                else:
                    fo.write(json.dumps(el) + "\n")
    return data

def update_name(name):
    m = street_type_re.search(name)
    if m:
        if m in mapping:
            name = name.replace(m.group(), mapping[m.group()])
    
    return name

def audit_street_name(data):
    for tupla in data:        
        if tupla.get('addr'):                   
            if tupla.get('addr').get('street'):
                name = tupla.get('addr').get('street')
                tupla.get('addr')['street'] = update_name(name)
    

In [5]:
data = process_map(xmlFile, True)
audit_street_name(data)

In [6]:
pprint.pprint(data[300110])

{u'caract': {'highway': 'residential', 'name': 'Rua RPB 01'},
 u'created': {'changeset': '32026685',
              'timestamp': '2015-06-17T11:21:55Z',
              'uid': '2647840',
              'user': 'geocorreiosgo',
              'version': '2'},
 'id': '308073677',
 u'node_refs': ['3133517231', '3133517190'],
 'type': 'way'}


In [14]:
    client = MongoClient("mongodb://localhost:27017")
    db = client['goiania']
    db.goiania.drop();


In [15]:
print db

Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True), u'goiania')


In [16]:
db.goiania.insert_many(data)
print db.goiania.find_one()

{u'name': {u'el': u'\u0393\u03ba\u03bf\u03ca\u03ac\u03bd\u03b9\u03b1', u'en': u'Goi\xe2nia', u'zh': u'\u6208\u4e9a\u5c3c\u4e9a', u'ce': u'\u0413\u043e\u044f\u043d\u0438', u'ar': u'\u063a\u0648\u064a\u0627\u0646\u064a\u0627', u'ru': u'\u0413\u043e\u044f\u043d\u0438\u044f', u'pt': u'Goi\xe2nia', u'lv': u'Gojanija', u'lt': u'Gojanija', u'th': u'\u0e42\u0e01\u0e22\u0e32\u0e40\u0e19\u0e35\u0e22', u'tg': u'\u0413\u043e\u044f\u043d\u0438\u044f', u'pl': u'Goi\xe2nia', u'be': u'\u0413\u0430\u044f\u043d\u0456\u044f', u'bg': u'\u0413\u043e\u044f\u043d\u0438\u044f', u'de': u'Goi\xe2nia', u'fa': u'\u06af\u0648\u06cc\u0627\u0646\u06cc\u0627', u'hy': u'\u0533\u0578\u0575\u0561\u0576\u056b\u0561', u'ja': u'\u30b4\u30a4\u30a2\u30cb\u30a2', u'he': u'\u05d2\u05d5\u05d0\u05d9\u05d0\u05e0\u05d9\u05d4', u'ka': u'\u10d2\u10dd\u10d8\u10d0\u10dc\u10d8\u10d0', u'uz': u'Goyaniya', u'kk': u'\u0413\u043e\u044f\u043d\u0438\u044f', u'sr': u'\u0413\u043e\u0458\u0430\u043d\u0438\u0458\u0430', u'mn': u'\u0413\u043e\u04

In [17]:
db.goiania.find().count()

319027

In [32]:

user = db.goiania.distinct("created.user")

print len(user)

385


In [33]:
#quantidade de contribuições por usuario
query = []
query.append({"$group" : {"_id" : "$created.user", "count":{"$sum":1}}})
query.append({"$sort" : {"count": -1}})
query.append({"$limit" : 10})

result = db.goiania.aggregate(query)

for doc in result:
    pprint.pprint(doc)


{u'_id': u'street0501', u'count': 105016}
{u'_id': u'greecemapper', u'count': 33594}
{u'_id': u'geocorreiosgo', u'count': 32420}
{u'_id': u'erickdeoliveiraleal', u'count': 32363}
{u'_id': u'jeffersonbr', u'count': 13613}
{u'_id': u'T\xfallio', u'count': 10010}
{u'_id': u'Vilmar Amaral', u'count': 9005}
{u'_id': u'taciofernandes', u'count': 8136}
{u'_id': u'Dangoh', u'count': 6533}
{u'_id': u'jdaraujo', u'count': 6172}


In [20]:
#usuario que mais contribuiu
query = []
query.append({"$group" : {"_id" : "$created.user", "count":{"$sum":1}}})
query.append({"$sort" : {"count": -1}})
query.append({"$limit" : 1})

result = db.goiania.aggregate(query)

for doc in result:
    pprint.pprint(doc)

{u'_id': u'street0501', u'count': 105016}


In [58]:
#unique contributors
query = []

query.append({"$group" : {"_id" : "$created.user", "count":{"$sum":1}}})
query.append({"$match" : {"count" : 1}})

result = db.goiania.aggregate(query)
print len(list(db.goiania.aggregate(query))) 

for doc in result:
    pprint.pprint(doc)
    
   

82
{u'_id': u'Douglas Soares Pereira', u'count': 1}
{u'_id': u'marthaleena', u'count': 1}
{u'_id': u"DonomarBarber'Shop", u'count': 1}
{u'_id': u'wambacher', u'count': 1}
{u'_id': u'Paulo Henrique Rodrigues Oliveira', u'count': 1}
{u'_id': u'Coyoty', u'count': 1}
{u'_id': u'marciocr', u'count': 1}
{u'_id': u'CentralDoXadrez', u'count': 1}
{u'_id': u'Edmilson Pimentel', u'count': 1}
{u'_id': u'P\xc9ROLA MULTI SERVICE', u'count': 1}
{u'_id': u'J\xe9ssika Rodrigues', u'count': 1}
{u'_id': u'Jeanlks', u'count': 1}
{u'_id': u'Ewana', u'count': 1}
{u'_id': u'Jean Moura', u'count': 1}
{u'_id': u'Fabio De Carvalho Luz', u'count': 1}
{u'_id': u'Jacob Neto', u'count': 1}
{u'_id': u'Sousa807', u'count': 1}
{u'_id': u'Marcos Pinheiro', u'count': 1}
{u'_id': u'Alucard_c5', u'count': 1}
{u'_id': u'soldado junior infantaria costa (malcriado)', u'count': 1}
{u'_id': u'thetornado76', u'count': 1}
{u'_id': u'Rui Alves Madalena', u'count': 1}
{u'_id': u'Alverne Passos Barbosa', u'count': 1}
{u'_id': u'Pl

In [21]:
#quantidade de registro por tipo 
query = []
query.append({"$group" : {"_id" : "$type", "count":{"$sum":1}}})
query.append({"$sort" : {"count": -1}})

result = db.goiania.aggregate(query)

for doc in result:
    pprint.pprint(doc)

{u'_id': u'node', u'count': 267749}
{u'_id': u'way', u'count': 51278}


In [22]:
#tipos de amenity
db.goiania.distinct("caract.amenity")

[u'place_of_worship',
 u'hospital',
 u'post_box',
 u'school',
 u'cinema',
 u'pharmacy',
 u'taxi',
 u'fast_food',
 u'theatre',
 u'recycling',
 u'Laborat\xf3rio',
 u'atm',
 u'restaurant',
 u'bus_station',
 u'post_office',
 u'fuel',
 u'cafe',
 u'bank',
 u'clinic',
 u'car_rental',
 u'arts_centre',
 u'police',
 u'car_wash',
 u'toilets',
 u'bicycle_parking',
 u'bar',
 u'university',
 u'nightclub',
 u'parking',
 u'telephone',
 u'prison',
 u'childcare',
 u'pub',
 u'library',
 u'college',
 u'community_centre',
 u'parking_entrance',
 u'fountain',
 u'shelter',
 u'veterinary',
 u'townhall',
 u'internet_cafe',
 u'bicycle_repair_station',
 u'dentist',
 u'charging_station',
 u'embassy',
 u'driving_school',
 u'kindergarten',
 u'bench',
 u'marketplace',
 u'food_court',
 u'nursing_home',
 u'courthouse',
 u'fire_station',
 u'club',
 u'love_hotel',
 u'public_building',
 u'grave_yard',
 u'studio']

In [34]:
#quantidade de registro por tipo 
query = []
query.append({"$group" : {"_id" : "$caract.amenity", "count":{"$sum":1}}})
query.append({"$sort" : {"count": -1}})
query.append({"$limit" : 10})

result = db.goiania.aggregate(query)

for doc in result:
    pprint.pprint(doc)

{u'_id': None, u'count': 317771}
{u'_id': u'parking', u'count': 165}
{u'_id': u'fuel', u'count': 160}
{u'_id': u'bank', u'count': 112}
{u'_id': u'school', u'count': 98}
{u'_id': u'place_of_worship', u'count': 88}
{u'_id': u'fast_food', u'count': 80}
{u'_id': u'restaurant', u'count': 78}
{u'_id': u'pharmacy', u'count': 76}
{u'_id': u'post_office', u'count': 54}


In [24]:
#ruas
db.goiania.distinct("addr.street")

[u'Rua 85',
 u'Rua S 5',
 u'Alameda Alfredo Lopes de Morais',
 u'Avenida T-4',
 u'Avenida Rep\xfablica do L\xedbano',
 u'Avenida Primeira Radial',
 u'Rua 261',
 u'Avenida Rio Verde',
 u'Rua Haroldo Barbosa',
 u'Rua da Sa\xfade',
 u'Avenida Goi\xe1s',
 u'Avenida Quinta Avenida',
 u'Rua Corumb\xe1',
 u'Avenida T-7',
 u'Rua 10',
 u'Rua 14',
 u'Avenida Jamel Cec\xedlio',
 u'Avenida T-9',
 u'Rua U-47',
 u'Rua U-54',
 u'Rua Mossor\xf3',
 u'AV. N\xe1poli',
 u'Rua 52',
 u'Rua 54',
 u'Avenida N\xe1poli',
 u'Avenida Mil\xe3o',
 u'Avenida dos Alpes',
 u'Avenida Armando de Godoy',
 u'Avenida Pio XII',
 u'Avenida S\xe3o Paulo',
 u'Pra\xe7a Doutor Pedro Ludovico Teixeira',
 u'Rua 11 de Junho',
 u'Avenida 24 de Outubro',
 u'Rua R-7',
 u'Avenida Pires Fernandes',
 u'Rua S-1',
 u'Rua C-137 (Continua\xe7\xe3o da T-10) esquina com Rua C-252',
 u'Rua 18-A',
 u'Rua C-159',
 u'Rodovia GO-070, km 15,5.',
 u'R. GB 19',
 u'Rua GB-14',
 u'Rua 107',
 u'Avenida Circular',
 u'Avenida da Independ\xeancia',
 u'AVENI

In [45]:
query = []
query.append({"$match":{"addr.street": {"$eq": "Qd4 Lt10"}}})

result = db.goiania.aggregate(query)

for doc in result:
    pprint.pprint(doc)

{u'_id': ObjectId('5ab983cd659c2a16ec184f27'),
 u'addr': {u'street': u'Qd4 Lt10'},
 u'caract': {u'craft': u'tailor',
             u'name': u'Maristela Quinta',
             u'opening_hours': u'Th 07:30-16:00'},
 u'created': {u'changeset': u'50282627',
              u'timestamp': u'2017-07-14T13:05:08Z',
              u'uid': u'6265328',
              u'user': u'MGMorais',
              u'version': u'1'},
 u'id': u'4970393722',
 u'pos': [-16.755721, -49.176259],
 u'type': u'node'}


In [25]:
#trazendo todas as ruas que começam com qd
#mostrar esses problemas no banco
result = db.goiania.find({"addr.street": {'$regex': '^Qd', '$options': 'i'}})
print db.goiania.find({"addr.street": {'$regex': '^Qd', '$options': 'i'}}).count()
print db.goiania.find({"addr.street": {'$regex': '^Qd', '$options': 'i'}}).distinct("created.user")
for doc in result:
    print pprint.pprint(doc)
    



29
[u'MGMorais', u'erickdeoliveiraleal']
{u'_id': ObjectId('5ac0e4ba659c2a221c5aa9c4'),
 u'addr': {u'street': u'Qd20 Lt04'},
 u'caract': {u'historic': u'castle',
             u'name': u'Carla TER\xc7A SEXTA',
             u'opening_hours': u'Tu, Fr 07:30-16:00'},
 u'created': {u'changeset': u'50281768',
              u'timestamp': u'2017-07-14T12:20:05Z',
              u'uid': u'6265328',
              u'user': u'MGMorais',
              u'version': u'3'},
 u'id': u'4953787721',
 u'pos': [-16.746967, -49.1752213],
 u'type': u'node'}
None
{u'_id': ObjectId('5ac0e4ba659c2a221c5aa9c5'),
 u'addr': {u'street': u'Qd41 Lt'},
 u'caract': {u'craft': u'tailor',
             u'name': u'Maristela Quinta',
             u'opening_hours': u'Th'},
 u'created': {u'changeset': u'51448304',
              u'timestamp': u'2017-08-26T01:54:11Z',
              u'uid': u'6265328',
              u'user': u'MGMorais',
              u'version': u'2'},
 u'id': u'4955500424',
 u'name': {u'pt': u'Sexta'},
 u'pos': 

In [28]:
string = "74.110-100"
print re.match(r"^74\.?\d{3}\-?\d{3}$", string)
#"^74[0-9\-]{6:7}"

<_sre.SRE_Match object at 0x0000000013BAB780>


In [29]:
        
#count quantidade de tags de enderecos
print db.goiania.find({"addr": {'$exists': True}}).count()           
#count quantidade de tags de enderecos com ruas
print db.goiania.find({"addr.street": {'$exists': True}}).count()
#checar se os postcodes estao no padrao
print db.goiania.find({"addr.postcode":{'$exists':True}}).count()
#postcodes = db.goiania.find({"addr.postcode":{'$exists':True}})
#postcodes = db.goiania.find({"addr.postcode":{'$exists':True, "$not": re.compile(r"^74(\d|\-|\.){6,8}")}})
print db.goiania.find({"addr.postcode":{'$exists':True, "$not": re.compile(r"^74\.?\d{3}\-?\d{3}$")}}).count()
postcodes = db.goiania.find({"addr.postcode":{'$exists':True, "$not": re.compile(r"^74\.?\d{3}\-?\d{3}$}")}})
#for doc in postcodes:
    #print pprint.pprint(doc)
#se os postcodes nao estao no padrao checar a cidade addr.city
print db.goiania.find({"addr.postcode":{'$exists':True, "$not": re.compile(r"^74\.?\d{3}\-?\d{3}$")}}).distinct("addr.city")


3246
3000
167
12
[u'Goianira', u'Santo Ant\xf4nio de Goi\xe1s', u'Trindade', u'Santo Antonio de Goi\xe1s']


In [30]:
#amenity: restaurant cousine
print db.goiania.find({"caract.amenity":{"$exists": True, "$eq": u'restaurant'}}).distinct("caract.cuisine")
#most frequent cuisine 
query = []
query.append({"$match":{"caract.amenity":{"$exists": True, "$eq": u'restaurant'}}})
query.append({"$group" : {"_id" : "$caract.cuisine", "count":{"$sum":1}}})
query.append({"$sort" : {"count": -1}})
cuisine = db.goiania.aggregate(query)
for doc in cuisine:
    print pprint.pprint(doc)

#amenity: postal_office user geocorreios
print db.goiania.find({"caract.amenity":{"$exists": True, "$eq": u'post_office'}}).distinct("created.user")

#addr suburb (bairro)
print db.goiania.find({"addr.suburb":{"$exists": True}}).count()
query = []
query.append({"$match":{"addr.suburb":{"$exists": True}}})
query.append({"$group" : {"_id" : "$addr.suburb", "count":{"$sum":1}}})
query.append({"$sort" : {"count": -1}})
bairros = db.goiania.aggregate(query)                                          
for doc in bairros:
    print pprint.pprint(doc)

[u'barbecue', u'japanese', u'pizza', u'steak_house;burger', u'chinese', u'pasta', u'regional', u'italian', u'international', u'burger', u'barbecue;steak_house;brazilian', u'vegetarian', u'barbecue;japanese', u'chinese;oriental', u'steak_house']
{u'_id': None, u'count': 47}
None
{u'_id': u'pizza', u'count': 6}
None
{u'_id': u'regional', u'count': 5}
None
{u'_id': u'burger', u'count': 3}
None
{u'_id': u'japanese', u'count': 3}
None
{u'_id': u'barbecue', u'count': 3}
None
{u'_id': u'international', u'count': 2}
None
{u'_id': u'chinese;oriental', u'count': 1}
None
{u'_id': u'barbecue;japanese', u'count': 1}
None
{u'_id': u'barbecue;steak_house;brazilian', u'count': 1}
None
{u'_id': u'vegetarian', u'count': 1}
None
{u'_id': u'steak_house;burger', u'count': 1}
None
{u'_id': u'pasta', u'count': 1}
None
{u'_id': u'steak_house', u'count': 1}
None
{u'_id': u'chinese', u'count': 1}
None
{u'_id': u'italian', u'count': 1}
None
[u'geocorreiosgo', u'charasman', u'T\xfallio', u'jdaraujo', u'santamarie

In [36]:
#amenity: restaurant cousine
print db.goiania.find({"caract.amenity":{"$exists": True, "$eq": u'place_of_worship'}}).distinct("caract.religion")
#most frequent cuisine 
query = []
query.append({"$match":{"caract.amenity":{"$exists": True, "$eq": u'place_of_worship'}}})
query.append({"$group" : {"_id" : "$caract.religion", "count":{"$sum":1}}})
query.append({"$sort" : {"count": -1}})
religion = db.goiania.aggregate(query)
for doc in religion:
    print pprint.pprint(doc)

[u'christian', u'Spiritist']
{u'_id': u'christian', u'count': 66}
None
{u'_id': None, u'count': 21}
None
{u'_id': u'Spiritist', u'count': 1}
None


In [61]:
#information about the most common amenity parking

parking = db.goiania.find({"caract.amenity":{"$exists": True, "$eq": u'parking'}})
print db.goiania.find({"caract.amenity":{"$exists": True, "$eq": u'parking'}}).count()
print db.goiania.find({"$and" : [{"caract.amenity":{"$exists": True, "$eq": u'parking'}}, {"addr":{"$exists": True}}]}).count()


#for doc in parking:
 #   print pprint.pprint(doc)

165
3


In [60]:
#information about quantity of addr in the amenities restaurant
print db.goiania.find({"caract.amenity":{"$exists": True, "$eq": u'restaurant'}}).count()
print db.goiania.find({"$and" : [{"caract.amenity":{"$exists": True, "$eq": u'restaurant'}}, {"addr":{"$exists": True}}]}).count()


78
14


In [62]:
#information about quantity of addr in the amenities fuel
print db.goiania.find({"caract.amenity":{"$exists": True, "$eq": u'fuel'}}).count()
print db.goiania.find({"$and" : [{"caract.amenity":{"$exists": True, "$eq": u'fuel'}}, {"addr":{"$exists": True}}]}).count()

160
12


In [63]:
#information about quantity of addr in the amenities bank
print db.goiania.find({"caract.amenity":{"$exists": True, "$eq": u'bank'}}).count()
print db.goiania.find({"$and" : [{"caract.amenity":{"$exists": True, "$eq": u'bank'}}, {"addr":{"$exists": True}}]}).count()

112
11


In [40]:
#information about the second most common amenity fuel

fuel = db.goiania.find({"caract.amenity":{"$exists": True, "$eq": u'fuel'}})
print db.goiania.find({"caract.amenity":{"$exists": True, "$eq": u'fuel'}}).distinct("caract.brand")


#for doc in parking:
 #   print pprint.pprint(doc)

[u'BP', u'Rede 3', u'Ale', u'BR', u'Texaco', u'Shell', u'ALE', u'Z+Z', u'Petrobras', u'shell', u'XPETRO', u'bridgestone', u'Petrobr\xe1s']


In [41]:
#information about school
school = db.goiania.find({"caract.amenity":{"$exists": True, "$eq": u'school'}})

for doc in school:
    print pprint.pprint(doc)

{u'_id': ObjectId('5ac0e4b7659c2a221c56e121'),
 u'caract': {u'amenity': u'school'},
 u'created': {u'changeset': u'9904590',
              u'timestamp': u'2011-11-22T03:51:02Z',
              u'uid': u'149876',
              u'user': u'dbusse',
              u'version': u'3'},
 u'id': u'588236790',
 u'pos': [-16.7134662, -49.2569001],
 u'type': u'node'}
None
{u'_id': ObjectId('5ac0e4b7659c2a221c579180'),
 u'caract': {u'amenity': u'school', u'name': u'FACLIONS - Faculdade Lions'},
 u'created': {u'changeset': u'11666712',
              u'timestamp': u'2012-05-21T21:37:00Z',
              u'uid': u'686556',
              u'user': u'danilokbrito',
              u'version': u'1'},
 u'id': u'1761563606',
 u'pos': [-16.60819, -49.2015289],
 u'type': u'node'}
None
{u'_id': ObjectId('5ac0e4b7659c2a221c57ae2e'),
 u'addr': {u'housenumber': u'49',
           u'postcode': u'74080010',
           u'street': u'Rua 85'},
 u'caract': {u'amenity': u'school', u'name': u'Achieve Languages - Oxford'},
 u'cr

In [42]:
#amenity: restaurant cousine
print db.goiania.find({"caract.amenity":{"$exists": True, "$eq": u'fast_food'}}).distinct("caract.cuisine")
#most frequent cuisine 
query = []
query.append({"$match":{"caract.amenity":{"$exists": True, "$eq": u'fast_food'}}})
query.append({"$group" : {"_id" : "$caract.cuisine", "count":{"$sum":1}}})
query.append({"$sort" : {"count": -1}})
cuisine = db.goiania.aggregate(query)
for doc in cuisine:
    print pprint.pprint(doc)

[u'burger', u'sandwich', u'burger;arab', u'burger;sandwich', u'pastel;salgados;coca_gelada;sucos;simpatia', u'sandwiches', u'regional;ice_cream', u'pizza;burger']
{u'_id': None, u'count': 65}
None
{u'_id': u'burger', u'count': 6}
None
{u'_id': u'sandwich', u'count': 3}
None
{u'_id': u'regional;ice_cream', u'count': 1}
None
{u'_id': u'sandwiches', u'count': 1}
None
{u'_id': u'burger;arab', u'count': 1}
None
{u'_id': u'pastel;salgados;coca_gelada;sucos;simpatia', u'count': 1}
None
{u'_id': u'burger;sandwich', u'count': 1}
None
{u'_id': u'pizza;burger', u'count': 1}
None
