In [1]:
# Folder that holds the spatially bounded social-media CSV exports.
DATA_DIR = '../data/'

# Place-polygon layers to process. The museum layers are currently disabled:
#     'place_polys_museum','place_polys_museum_convex','place_polys_museum_convex_5m','place_polys_museum_convex_10m',
#     'place_polys_museum_convex_50m','place_polys_museum_convex_100m',
place_choices = [
    'place_polys_np',
    'place_polys_np_5m',
    'place_polys_np_10m',
    'place_polys_np_50m',
    'place_polys_np_100m',
]

# Candidate file paths: parts 1-2 for 'tw' and parts 1-3 for 'fl', one per layer.
# Nothing is checked against the disk here.
bounded_tw_paths = [
    DATA_DIR + 'sm#tw_{}#{}.csv'.format(part_no, layer)
    for part_no in range(1, 3)
    for layer in place_choices
]
bounded_fl_paths = [
    DATA_DIR + 'sm#fl_{}#{}.csv'.format(part_no, layer)
    for part_no in range(1, 4)
    for layer in place_choices
]

In [6]:
import os
# Drop every candidate path that is not an existing regular file.
bounded_tw_paths = list(filter(os.path.isfile, bounded_tw_paths))
bounded_fl_paths = list(filter(os.path.isfile, bounded_fl_paths))

In [7]:
import pandas as pd
# One row per CSV. The file name (without '.csv') has the form
# 'sm#<source>_<part>#<layer>'; 'sm' keeps <source> and 'bfr' keeps <layer>.
path_df = pd.DataFrame(bounded_fl_paths + bounded_tw_paths, columns=['path'])
path_df['fn'] = path_df['path'].map(
    lambda full_path: full_path.rsplit('/', 1)[1].replace('.csv', '')
)
path_df['sm'] = path_df['fn'].map(
    lambda file_name: file_name.split('#')[1].split('_')[0]
)
path_df['bfr'] = path_df['fn'].map(
    lambda file_name: file_name.split('#')[2]
)
path_df.shape

(10, 4)

In [15]:
# Read every CSV belonging to one (bfr, sm) pair and stack the parts into a
# single frame, keyed by that pair.
dfs_sm = {}
for bfr_smtype, grp in path_df.groupby(['bfr', 'sm']):
    dfs_sm[bfr_smtype] = pd.concat(
        [pd.read_csv(path, index_col=0) for path in grp['path']]
    )

In [17]:
def simplify_sm(dfs_sm):
    """Reduce each raw social-media frame to a shared five-column layout.

    Parameters
    ----------
    dfs_sm : dict
        Maps ``(bfr, smtype)`` to a DataFrame. ``smtype`` must be ``'tw'``
        (needs columns ``ts``, ``smid``, ``user``, ``place``) or ``'fl'``
        (needs columns ``date_taken``, ``smid``, ``nsid``, ``place``).

    Returns
    -------
    dict
        Same keys as ``dfs_sm``. Each value is a new DataFrame with columns
        ``smid, user, place, ymd, ym``, where ``ymd`` is ``'YYYY_MM_DD'`` and
        ``ym`` is ``ymd`` without its last three characters. The input frames
        are not modified.

    Raises
    ------
    KeyError
        If ``smtype`` is neither ``'tw'`` nor ``'fl'``.
    """
    from dateutil.parser import parse as date_parse

    # Timestamp column and the columns to keep (in output order) per source.
    ts_col_by_type = {'tw': 'ts', 'fl': 'date_taken'}
    keep_cols_by_type = {
        'tw': ['smid', 'user', 'place', 'ymd', 'ym'],
        'fl': ['smid', 'nsid', 'place', 'ymd', 'ym'],
    }

    def get_yr_mn_dy(x):
        """Format a parseable timestamp string as 'YYYY_MM_DD'."""
        dt = date_parse(x)
        return '%s_%02d_%02d' % (dt.year, int(dt.month), int(dt.day))

    def clean_place(x):
        """Keep the text before '##', drop one trailing digit, spaces -> '_'."""
        x = x.split('##')[0]
        # Guard the empty string: x[-1] would raise IndexError when nothing
        # precedes '##' (or the place is empty to begin with).
        if x and x[-1].isdigit():
            x = x[:-1]
        return x.replace(' ', '_')

    dfs_sm_simplified = {}
    for (bfr, smtype), df_sm in dfs_sm.items():
        ts_col = ts_col_by_type[smtype]
        keep_cols = keep_cols_by_type[smtype]
        df = df_sm.copy()  # work on a copy so the caller's frame keeps its columns
        df['ymd'] = df[ts_col].apply(get_yr_mn_dy)
        df['ym'] = df['ymd'].apply(lambda x: x[:-3])  # strip the '_DD' suffix
        df = df[keep_cols].copy()
        df.columns = ['smid', 'user', 'place', 'ymd', 'ym']
        df['place'] = df['place'].apply(clean_place)
        dfs_sm_simplified[(bfr, smtype)] = df
    return dfs_sm_simplified
# Normalise every loaded frame; the result is keyed by (bfr, smtype) like dfs_sm.
dfs_sm_simplified = simplify_sm(dfs_sm)


In [18]:
# Show which (bfr, smtype) combinations were produced.
dfs_sm_simplified.keys()

[('place_polys_np_50m', 'tw'),
 ('place_polys_np_10m', 'fl'),
 ('place_polys_np', 'tw'),
 ('place_polys_np', 'fl'),
 ('place_polys_np_100m', 'fl'),
 ('place_polys_np_100m', 'tw'),
 ('place_polys_np_10m', 'tw'),
 ('place_polys_np_5m', 'tw'),
 ('place_polys_np_50m', 'fl'),
 ('place_polys_np_5m', 'fl')]

In [19]:
# Month labels ('YYYY_MM') from 2014_01 through 2016_08 inclusive, in
# chronological order.
MONTHS = [
    '%d_%02d' % (year, month)
    for year in (2014, 2015, 2016)
    for month in range(1, 13)
    if (year, month) <= (2016, 8)
]

def agg_month_lvl(dfs_sm, months, max_start_idx=12):
    """Aggregate simplified posts into monthly visit counts per place.

    Within a place, rows sharing the same ``(ymd, user)`` pair are counted
    once; the remaining rows are counted per ``ym``. Months in the series
    without any row get a count of 0.

    Parameters
    ----------
    dfs_sm : dict
        Maps ``(bfr, smtype)`` to a DataFrame with at least the columns
        ``place``, ``user``, ``ymd`` and ``ym``.
    months : list of str
        Month labels defining the output series. The out-of-window fallback
        below compares labels as strings, so it assumes zero-padded
        ``'YYYY_MM'`` labels listed in ascending order.
    max_start_idx : int, optional
        A place is dropped when its series would start at or after this
        position in ``months``. Defaults to 12, the previously hard-coded value.

    Returns
    -------
    dict
        Same keys as ``dfs_sm``. Each value is a DataFrame with columns
        ``place``, ``ym`` and ``'<smtype>_visit'``.

    Notes
    -----
    For ``smtype == 'tw'`` a place's series starts at its earliest month;
    for every other ``smtype`` it starts at ``months[0]``. If a 'tw' place's
    earliest month is not in ``months`` (which used to raise ``ValueError``),
    the series starts at the first entry of ``months`` that is not earlier
    than it, and the place is skipped when all its rows fall after the window.
    """
    dfs_sm_mn_lvl = {}
    for (bfr, smtype), df_sm in dfs_sm.items():
        visit_col = '{}_visit'.format(smtype)
        out_cols = ['place', 'ym', visit_col]
        place_frames = []
        for place, gb in df_sm.groupby('place'):
            # One visit per user per day.
            place_df = gb.drop_duplicates(['ymd', 'user'])
            mn_lvl = place_df.groupby('ym')['ymd'].count()

            if smtype == 'tw':
                first_month = mn_lvl.index.min()
                if first_month in months:
                    min_month_idx = months.index(first_month)
                else:
                    # Earliest month lies outside the window: count how many
                    # window months precede it instead of raising ValueError.
                    min_month_idx = sum(1 for ym in months if ym < first_month)
                    if min_month_idx == len(months):
                        continue  # every row is later than the window
            else:
                min_month_idx = 0

            if min_month_idx >= max_start_idx:
                continue

            mn_lvl = mn_lvl.reindex(months[min_month_idx:]).fillna(0).reset_index()
            mn_lvl.columns = ['ym', visit_col]
            mn_lvl['place'] = place
            place_frames.append(mn_lvl[out_cols])

        # Concatenate once instead of growing the frame inside the loop.
        if place_frames:
            sm_mn_lvl = pd.concat(place_frames, ignore_index=True)
        else:
            sm_mn_lvl = pd.DataFrame(columns=out_cols)
        dfs_sm_mn_lvl[(bfr, smtype)] = sm_mn_lvl
    return dfs_sm_mn_lvl

# Build the per-place monthly visit tables over the MONTHS window.
dfs_sm_mn_lvl = agg_month_lvl(dfs_sm_simplified,MONTHS)

def output_mn_lvl(dfs_sm_mn_lvl, out_dir='../data/'):
    """Write each monthly table to ``<out_dir>/mn_lvl_sm#<smtype>#<bfr>.csv``.

    Parameters
    ----------
    dfs_sm_mn_lvl : dict
        Maps ``(bfr, smtype)`` to the DataFrame to write.
    out_dir : str, optional
        Destination folder. Defaults to ``'../data/'``, the location that was
        previously hard-coded, so existing calls write to the same files.
    """
    import os

    for (bfr, smtype), df_sm_mn_lvl in dfs_sm_mn_lvl.items():
        file_name = 'mn_lvl_sm#{}#{}.csv'.format(smtype, bfr)
        df_sm_mn_lvl.to_csv(os.path.join(out_dir, file_name))
# Write one CSV per (bfr, smtype) monthly table.
output_mn_lvl(dfs_sm_mn_lvl)


# Backup

In [1]:
import pandas as pd
# Input CSVs for this section: two Twitter files and one Flickr file, all
# with the 'buffer=1' suffix.
bounded_tw_paths = [
    '../data/twitter_infos_1_buffer=1.csv',
    '../data/twitter_infos_2_buffer=1.csv',
]
bounded_fl_paths = [
    '../data/flickr_photo_in_museum_buffer=1.csv',
]


In [2]:
def smdf_by_buffer(sm_paths):
    """Read the given CSVs and stack them per buffer label.

    The label is the text after the last underscore of the path with its
    final four characters removed (e.g. ``'..._buffer=1.csv'`` gives
    ``'buffer=1'``). Each file is read with its first column as the index.

    Returns a dict mapping each label to the concatenation of its frames,
    in the order the paths were given.
    """
    frames_by_label = {}
    for csv_path in sm_paths:
        tail = csv_path.rsplit('_', 1)[1]
        label = tail[:-4]
        frame = pd.read_csv(csv_path, index_col=0)
        frames_by_label.setdefault(label, []).append(frame)
    return {
        label: pd.concat(frames)
        for label, frames in frames_by_label.items()
    }
# Load the Twitter and Flickr CSVs, each grouped by buffer label.
dfs_tw = smdf_by_buffer(bounded_tw_paths)
dfs_fl = smdf_by_buffer(bounded_fl_paths)
        

In [3]:
def simplify_sm(dfs_sm, ts_col, keep_cols):
    """Reduce each raw frame to the columns ``smid, user, place, ymd, ym``.

    Parameters
    ----------
    dfs_sm : dict
        Maps a buffer label to a raw DataFrame.
    ts_col : str
        Column holding the timestamp strings to parse.
    keep_cols : list of str
        Exactly five column names to keep, in the order that corresponds to
        ``smid, user, place, ymd, ym``. ``'ymd'`` and ``'ym'`` are created
        here before the selection, so they may be listed.

    Returns
    -------
    dict
        Same keys as ``dfs_sm``; each value is a new DataFrame with the five
        renamed columns. The input frames are not modified.
    """
    from dateutil.parser import parse as date_parse

    def get_yr_mn_dy(x):
        """Format a parseable timestamp string as 'YYYY_MM_DD'."""
        dt = date_parse(x)
        return '%s_%02d_%02d' % (dt.year, int(dt.month), int(dt.day))

    def clean_place(x):
        """Drop one trailing digit and replace spaces with underscores."""
        # Guard the empty string: x[-1] would raise IndexError on ''.
        if x and x[-1].isdigit():
            x = x[:-1]
        return x.replace(' ', '_')

    dfs_sm_simplified = {}
    for bfr, df_sm in dfs_sm.items():
        df = df_sm.copy()  # work on a copy so the caller's frame keeps its columns
        df['ymd'] = df[ts_col].apply(get_yr_mn_dy)
        df['ym'] = df['ymd'].apply(lambda x: x[:-3])  # strip the '_DD' suffix
        df = df[keep_cols].copy()
        df.columns = ['smid', 'user', 'place', 'ymd', 'ym']
        df['place'] = df['place'].apply(clean_place)
        dfs_sm_simplified[bfr] = df
    return dfs_sm_simplified

# Twitter rows are dated by 'ts'; Flickr rows by 'date_taken', with
# id/owner/museum renamed to smid/user/place.
# NOTE: this rebinds dfs_tw/dfs_fl to the simplified frames, which no longer
# carry the timestamp column, so re-running this cell requires re-running the
# loading cell first.
dfs_tw = simplify_sm(dfs_tw, 'ts',['smid','user','place','ymd','ym'])
dfs_fl = simplify_sm(dfs_fl, 'date_taken', ['id','owner', 'museum', 'ymd','ym'])

  elif res.tzname and res.tzname in time.tzname:
