In [14]:
import pandas as pd
# Input CSVs for each social-media source. The text after the last
# underscore of each file name (minus ".csv") is what smdf_by_buffer uses
# as the grouping key, e.g. "buffer=1".
bounded_tw_paths = [
    '../data/twitter_infos_1_buffer=1.csv',
    '../data/twitter_infos_2_buffer=1.csv',
]
bounded_fl_paths = [
    '../data/flickr_photo_in_museum_buffer=1.csv',
]

In [61]:
def smdf_by_buffer(sm_paths):
    """Load CSV files and merge the ones that share a buffer key.

    The key of a path is the text after its last underscore with the final
    four characters (the ".csv" suffix) removed, e.g.
    "../data/twitter_infos_1_buffer=1.csv" -> "buffer=1".

    Parameters
    ----------
    sm_paths : iterable of str
        CSV paths. Each is read with its first column as the index.

    Returns
    -------
    dict
        Maps each buffer key to the row-wise concatenation of every frame
        loaded for that key, in the order the paths were given.

    Raises
    ------
    IndexError
        If a path contains no underscore at all.
    """
    from collections import defaultdict

    frames_by_buffer = defaultdict(list)
    for csv_path in sm_paths:
        # Derive the key before touching the file, so a malformed name
        # fails before any read is attempted.
        name_tail = csv_path.rsplit('_', 1)[1]
        buffer_key = name_tail[:-4]
        frames_by_buffer[buffer_key].append(pd.read_csv(csv_path, index_col=0))

    return {
        buffer_key: pd.concat(frames)
        for buffer_key, frames in frames_by_buffer.items()
    }
# Load the raw Twitter and Flickr exports, one concatenated frame per
# buffer key (see smdf_by_buffer).
dfs_tw = smdf_by_buffer(bounded_tw_paths)
dfs_fl = smdf_by_buffer(bounded_fl_paths)
        

In [63]:
def simplify_sm(dfs_sm, ts_col, keep_cols):
    """Reduce each social-media frame to a common five-column layout.

    For every frame, a day key ``ymd`` ("YYYY_MM_DD") is parsed from
    ``ts_col`` and a month key ``ym`` ("YYYY_MM") is cut from it. The frame
    is then restricted to ``keep_cols`` and those columns are renamed
    positionally to ``['smid', 'user', 'place', 'ymd', 'ym']``. Finally the
    place name loses one trailing digit (if present) and has its spaces
    replaced by underscores.

    Parameters
    ----------
    dfs_sm : dict
        Maps a buffer key to a DataFrame. Input frames are not modified.
    ts_col : str
        Column holding timestamp strings parseable by ``dateutil``.
    keep_cols : list of str
        Exactly five columns, ordered as id, user, place, 'ymd', 'ym'.

    Returns
    -------
    dict
        Same keys as ``dfs_sm``, mapping to the simplified frames.
    """
    from dateutil.parser import parse as date_parse

    def to_day_key(raw_ts):
        parsed = date_parse(raw_ts)
        return '%s_%02d_%02d' % (parsed.year, int(parsed.month), int(parsed.day))

    def normalise_place(name):
        # Only a single trailing digit is stripped.
        if name[-1].isdigit():
            name = name[:-1]
        return name.replace(' ', '_')

    unified_cols = ['smid', 'user', 'place', 'ymd', 'ym']
    simplified = {}
    for bfr, df_sm in dfs_sm.items():
        work = df_sm.copy()
        work['ymd'] = work[ts_col].apply(to_day_key)
        # Dropping the last three characters ("_DD") leaves "YYYY_MM".
        work['ym'] = work['ymd'].apply(lambda day_key: day_key[:-3])
        work = work[keep_cols].copy()
        work.columns = unified_cols
        work['place'] = work['place'].apply(normalise_place)
        simplified[bfr] = work
    return simplified

# Bring both sources to the shared (smid, user, place, ymd, ym) layout.
# NOTE: this rebinds dfs_tw / dfs_fl to the simplified frames, which no
# longer carry the 'ts' / 'date_taken' columns, so this cell cannot be
# re-run without first re-running the loading cell.
dfs_tw = simplify_sm(dfs_tw, 'ts',['smid','user','place','ymd','ym'])
dfs_fl = simplify_sm(dfs_fl, 'date_taken', ['id','owner', 'museum', 'ymd','ym'])

In [101]:
# Month keys ("YYYY_MM") from 2014_01 through 2016_08 inclusive, in
# chronological order. Built with a generator so no loop variable leaks
# into the notebook namespace.
MONTHS = list(
    '%d_%02d' % (year, month)
    for year in (2014, 2015, 2016)
    for month in range(1, 13)
    if (year, month) <= (2016, 8)
)

def agg_month_lvl(dfs_sm, months, complete, max_start_idx=12):
    """Aggregate simplified social-media records to monthly visit counts.

    For every place, rows are de-duplicated on (ymd, user), so a user is
    counted at most once per place per day. The remaining rows are counted
    per month, and the series is laid out over ``months`` with missing
    months filled with 0.

    Parameters
    ----------
    dfs_sm : dict
        Maps a buffer key to a DataFrame with at least the columns
        'place', 'user', 'ymd' and 'ym'.
    months : list of str
        Ordered month keys in the same format as the 'ym' column.
    complete : bool
        If True, every place's series spans all of ``months``. If False,
        the series starts at the place's first observed month.
    max_start_idx : int, optional
        A place is kept only if its series starts at a position in
        ``months`` strictly below this value. The default of 12 keeps the
        original hard-coded cut-off.

    Returns
    -------
    dict
        Maps each buffer key to a DataFrame with columns
        ['place', 'ym', 'visit']. The frame is empty, with those columns,
        when no place qualifies.

    Raises
    ------
    ValueError
        If ``complete`` is False and a place's earliest month is not in
        ``months``.
    """
    out_cols = ['place', 'ym', 'visit']
    dfs_sm_mn_lvl = {}
    for bfr, df_sm in dfs_sm.items():
        # Collect per-place frames and concatenate once at the end, rather
        # than growing a DataFrame inside the loop.
        place_frames = []
        for place, gb in df_sm.groupby('place'):
            daily_visits = gb.drop_duplicates(['ymd', 'user'])
            mn_lvl = daily_visits.groupby('ym')['ymd'].count()
            start_idx = 0 if complete else months.index(mn_lvl.index.min())
            if start_idx >= max_start_idx:
                continue
            mn_lvl = mn_lvl.reindex(months[start_idx:]).fillna(0).reset_index()
            mn_lvl.columns = ['ym', 'visit']
            mn_lvl['place'] = place
            place_frames.append(mn_lvl[out_cols])
        if place_frames:
            sm_mn_lvl = pd.concat(place_frames, ignore_index=True)
        else:
            sm_mn_lvl = pd.DataFrame(columns=out_cols)
        dfs_sm_mn_lvl[bfr] = sm_mn_lvl
    return dfs_sm_mn_lvl

# Twitter: each place's series starts at its first observed month
# (complete=False). Flickr: every place is padded over all of MONTHS
# (complete=True).
dfs_tw_mn_lvl = agg_month_lvl(dfs_tw, MONTHS, False)
dfs_fl_mn_lvl = agg_month_lvl(dfs_fl, MONTHS, True)

In [102]:
# Both sources must cover exactly the same buffer keys. Comparing as sets
# keeps the check independent of key order, and the message shows the
# mismatch when it fails.
assert set(dfs_tw_mn_lvl) == set(dfs_fl_mn_lvl), (
    'Twitter and Flickr buffer keys differ: {} vs {}'.format(
        sorted(dfs_tw_mn_lvl), sorted(dfs_fl_mn_lvl)))

In [103]:
def output_mn_lvl(dfs_sm_mn_lvl, smtype):
    """Write each month-level frame to its own CSV file.

    Parameters
    ----------
    dfs_sm_mn_lvl : dict
        Maps a buffer key to an object exposing ``to_csv`` (a DataFrame).
    smtype : str
        Source tag placed in the file name, e.g. 'tw' or 'fl'.

    Files are written to ``../data/cor_sm_<smtype>_<buffer key>.csv``.
    """
    path_template = '../data/cor_sm_{}_{}.csv'
    for buffer_key in dfs_sm_mn_lvl:
        out_path = path_template.format(smtype, buffer_key)
        dfs_sm_mn_lvl[buffer_key].to_csv(out_path)

In [104]:
# Write one CSV per buffer key and source:
# ../data/cor_sm_tw_<buffer>.csv and ../data/cor_sm_fl_<buffer>.csv.
output_mn_lvl(dfs_tw_mn_lvl, 'tw')
output_mn_lvl(dfs_fl_mn_lvl, 'fl')