In [1]:
import src.constants; reload(src.constants)
from src.constants import fn_frsq_venues_dc, fn_segments_dc, epsg_dc
from src.constants import fn_mapping_for_fs, fn_mapping_for_osm
import geopandas as gp
import pandas as pd

# TODO
1. [x] mapped frsq venues to poi categories
2. [] filter frsq venues by obj_near_segment
3. [x] mapped osm data to poi categories
4. [] filter osm data by obj_near_segment
5. [] remove overlap between osm and frsq
6. [] Visualization:
    - [] poi categories distribution
        1. [x] frsq venues near segments
        2. [x] osm venues near segments
        3. [] final poi near segments(after removing overlap)
    - [] poi categories spatial distribution

In [2]:
# Input locations: every path is the local data directory prefix joined (by
# plain string concatenation) with a file name imported from src.constants.
data_dir = 'data/'
path_frsq_venues_dc = data_dir + fn_frsq_venues_dc
path_mapping_for_fs = data_dir+fn_mapping_for_fs
path_mapping_for_osm = data_dir+fn_mapping_for_osm

# Read the frsq venue layer once with geopandas; later cells work on a copy.
frsq_venues_dc = gp.read_file(path_frsq_venues_dc)

In [3]:
def mapping_for_fs(path_mapping_for_fs):
    """Build a lookup from every taxonomy node to its top-level ancestor.

    The mapping file is a tab-indented outline with one node per line: a
    line with no leading tab is a top-level category, and each extra
    leading tab nests the node one level deeper.

    Parameters
    ----------
    path_mapping_for_fs : str
        Path to the tab-indented taxonomy file.

    Returns
    -------
    dict
        ``{node_name: top_level_name}``. Top-level nodes map to themselves.
        When a node name occurs more than once, the last occurrence wins.
    """
    mapping = {}
    # ancestors[d] holds the most recent node seen at depth d (1 = top level).
    # Slot 0 is an unused placeholder so depths can index the list directly.
    ancestors = ['', '']
    with open(path_mapping_for_fs) as f:
        for line in f:
            # Blank lines carry no node; skipping them keeps '' out of the
            # mapping and keeps the remembered top-level ancestor intact.
            if not line.strip():
                continue
            pieces = line.split('\t')
            depth = len(pieces)
            node = pieces[-1].strip()
            # Grow on demand so arbitrarily deep outlines are supported.
            if depth >= len(ancestors):
                ancestors.extend([''] * (depth - len(ancestors) + 1))
            ancestors[depth] = node
            mapping[node] = ancestors[1]
    return mapping



In [4]:
def map_frsq_venues_to_poi_category(frsq_venues_gpdf, path_mapping_for_fs, debug=False):
    print '===========mapping  frsq venues to poi categories==========='
    mapping = mapping_for_fs(path_mapping_for_fs)
    poi_frsq = frsq_venues_gpdf.copy()
    poi_frsq['mapped'] = poi_frsq.category.apply(lambda x: mapping[x.encode('utf-8')] if x.encode('utf-8') in mapping else 'no category')
    
    if debug:
        unmapped = poi_frsq[poi_frsq.mapped=='no category']
        print 'venues without poi category: #venues={}, #frsq_categories={}'.format(len(unmapped), len(pd.unique(unmapped.category)))
        print 'top ten frsq_categories', unmapped.category.value_counts().head(10).to_dict()
    poi_frsq = poi_frsq[['id','name','mapped','geometry']]
    poi_frsq.columns = ['id','name','category','geometry']
    return poi_frsq

# Map the loaded venues to POI categories; debug=True also prints the unmatched summary.
poi_frsq_dc = map_frsq_venues_to_poi_category(frsq_venues_dc, path_mapping_for_fs, debug=True)




venues without poi category: #venues=3139, #frsq_categories=63
top ten frsq_categories {u'': 1447, u'Building': 882, u'TV Station': 11, u'Exhibit': 51, u'Moving Target': 121, u'Zoo': 14, u'Festival': 6, u'Conference': 7, u'Intersection': 55, u'Road': 450}


In [5]:
def mapping_for_osm(path_mapping_for_osm):
    """Build a lookup from ``'key=value'`` tag strings to POI category names.

    The mapping file is a tab-indented outline with three levels:

    * no leading tab   -> POI category name
    * one leading tab  -> tag key under the current category
    * two leading tabs -> tag value under the current key

    Every value line yields one ``'key=value'`` entry pointing at the
    category that was current when the line was read. Lines nested deeper
    than two tabs are ignored.

    Parameters
    ----------
    path_mapping_for_osm : str
        Path to the tab-indented mapping file.

    Returns
    -------
    dict
        ``{'key=value': category}``. Empty when the file defines no values.
        When a ``key=value`` pair occurs more than once, the last one wins.
    """
    mapping = {}
    category, key = '', ''
    with open(path_mapping_for_osm) as f:
        for line in f:
            # A blank line is a separator, not a new (empty) category.
            if not line.strip():
                continue
            pieces = line.split('\t')
            depth = len(pieces)
            txt = pieces[-1].strip()
            if depth == 1:
                category = txt
            elif depth == 2:
                key = txt
            elif depth == 3:
                mapping['{}={}'.format(key, txt)] = category
    return mapping


In [6]:
# Path of the OSM database file: data directory prefix + file name from src.constants.
from src.constants import fn_osm_db_dc
path_osm_db_dc = data_dir + fn_osm_db_dc

In [8]:
def map_osm_to_poi_category(path_osm_db, path_mapping_for_osm, debug=True):
    print '===========mapping osm to poi category==========='
    from src.osm.osmdb_filter import filter_tbtag
    from src.osm.osmdb_constants import FIELDS_TB_TAG
    from src.constants import var_exclude_category_for_osm
    assert FIELDS_TB_TAG==['ot', 'oid', 'key', 'value'], 'Assume fields of table tag are ot,oid,key,value'

    rows = filter_tbtag(path_osm_db)

    mapping = mapping_for_osm(path_mapping_for_osm)
    mapped = []
    for ot, oid, key, value in rows:
        key_value = '{}={}'.format(key.encode('utf-8'), value.encode('utf-8'))
        key_ = key + '=*'
        category = ''
        if key_value in mapping:
            category = mapping[key_value]
        elif key_ in mapping:
            category = mapping[key_]
        if category:
            mapped.append((ot, oid, category))
    
    poi_osm = pd.DataFrame(mapped, columns=FIELDS_TB_TAG[:2]+['category'])
    poi_osm = poi_osm[poi_osm.category!=var_exclude_category_for_osm].groupby(['ot','oid']).agg(set).reset_index()
    
    if debug:
        print '# rows in table tag  =', len(rows)        
        print '# rows after mapping =', len(mapped)
        print '# objs have category =', len(poi_osm)
        print '# categories: # objs =', poi_osm.category.apply(len).value_counts().to_dict()
    return poi_osm

# Map OSM tag rows to POI categories; debug=True also prints the row/object counts.
poi_osm_dc = map_osm_to_poi_category(path_osm_db_dc, path_mapping_for_osm, debug=True)


# rows in table tag  = 1484405
# rows after mapping = 44060
# objs have category = 32808
# categories: # objs = {1: 32183, 2: 620, 3: 5}


In [10]:
def poi_distribution(poi_frsq, poi_osm, poi=None):
    import numpy as np
    poi_frsq_distr = poi_frsq.category.value_counts().reset_index()
    poi_frsq_distr.columns = ['category','fs']
    categories, counts = np.unique(np.hstack(poi_osm.category.apply(list).apply(np.array).values), return_counts=True)
    poi_osm_distr = pd.DataFrame(zip(categories,counts),columns=['category','osm'])
    print poi_frsq_distr
    print poi_osm_distr
    print poi_frsq_distr.merge(poi_osm_distr)
    
# Print the per-category counts for both sources and their merged table.
poi_distribution(poi_frsq_dc, poi_osm_dc)

                   category    fs
0      professional service  6620
1   outdoors and recreation  4357
2                      food  3937
3               retail shop  3753
4               no category  3139
5                 residence  1980
6            transportation  1927
7        schools&university  1709
8            nightlife spot  1593
9                       art  1007
10       cycling facilities   165
                  category    osm
0                      art    130
1       cycling facilities  12646
2                     food   1623
3           nightlife spot    278
4  outdoors and recreation   6037
5     professional service   1449
6                residence    583
7              retail shop   1773
8       schools&university    948
9           transportation   7971
                  category    fs    osm
0     professional service  6620   1449
1  outdoors and recreation  4357   6037
2                     food  3937   1623
3              retail shop  3753   1773
4                r