In [1]:
import pandas as pd

In [2]:
def mem_usage(pandas_obj):
    """Return the deep memory footprint of a pandas object as a string in MB.

    Parameters
    ----------
    pandas_obj : pd.DataFrame or another object exposing ``memory_usage(deep=True)``
        For a DataFrame the values returned by ``memory_usage`` are summed;
        for anything else the returned value is used as-is.

    Returns
    -------
    str
        Size in units of 1024**2 bytes, formatted with two decimals and an
        ``MB`` suffix, e.g. ``"1.00MB"``.
    """
    total_bytes = pandas_obj.memory_usage(deep=True)
    if isinstance(pandas_obj, pd.DataFrame):
        # A DataFrame reports one value per entry, so collapse them to a total.
        total_bytes = total_bytes.sum()
    megabytes = total_bytes / 1024 ** 2
    return "{:03.2f}MB".format(megabytes)


def optimize_memory(df):
    """Convert the usage-log columns of ``df`` to compact dtypes, in place.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain the columns ``appid``, ``use_date``, ``uid``,
        ``duration`` and ``times``.

    Returns
    -------
    pd.DataFrame
        The same ``df`` object, with ``appid``/``use_date`` as ``category``,
        ``uid`` as ``uint32`` and ``duration``/``times`` as ``float32``.
    """
    # (column, target dtype) pairs, applied in this order.
    dtype_plan = (
        ('appid', 'category'),
        ('use_date', 'category'),
        ('uid', 'uint32'),
        ('duration', 'float32'),
        ('times', 'float32'),
    )
    for column, target_dtype in dtype_plan:
        df[column] = df[column].astype(target_dtype)
    return df

In [3]:
# Load the app -> category table and the per-user activated-app table.
# Column names are supplied explicitly through `names`.
app_info = pd.read_csv(
    '../data/app_info.csv',
    names=['appid', 'category'],
)
user_app_actived = pd.read_csv(
    '../data/user_app_actived.csv',
    names=['uid', 'appid'],
)

In [4]:
# appid -> list of every category recorded for that app in app_info
# (an app that appears on several rows collects several categories).
app_cate_dict = {}
appid_list = list(app_info['appid'].values)
category_list = list(app_info['category'].values)
for app_id, app_category in zip(appid_list, category_list):
    app_cate_dict.setdefault(app_id, []).append(app_category)

# appid -> number of occurrences across user_app_actived; each 'appid' cell
# there holds several ids joined by '#'.
user_app_num = {}
for joined_ids in user_app_actived['appid'].values:
    for app_id in joined_ids.split('#'):
        user_app_num[app_id] = user_app_num.get(app_id, 0) + 1

In [5]:
# (appid, count) pairs ordered from the highest count to the lowest.
user_app_num2 = sorted(
    user_app_num.items(),
    key=lambda pair: pair[1],
    reverse=True,
)

In [8]:
# appid -> 1-based position in user_app_num2 (1 = highest activation count).
user_app_num_sort = {
    app_id: rank
    for rank, (app_id, _count) in enumerate(user_app_num2, start=1)
}
print(user_app_num_sort)

{'a00289791': 1, 'a0048467': 2, 'a00287085': 3, 'a00109386': 4, 'a00289826': 5, 'a00311680': 6, 'a0021880': 7, 'a00244790': 8, 'a0046242': 9, 'a00170432': 10, 'a00261220': 11, 'a0048562': 12, 'a00290015': 13, 'a00278905': 14, 'a00271087': 15, 'a00247519': 16, 'a00289850': 17, 'a00145168': 18, 'a00348507': 19, 'a00290027': 20, 'a00278858': 21, 'a00289519': 22, 'a0048522': 23, 'a0071100': 24, 'a00289511': 25, 'a00263057': 26, 'a00373197': 27, 'a00187480': 28, 'a00157201': 29, 'a00289866': 30, 'a00275200': 31, 'a00336224': 32, 'a0092898': 33, 'a00224427': 34, 'a00331160': 35, 'a0048497': 36, 'a0037189': 37, 'a00292780': 38, 'a00299200': 39, 'a00299316': 40, 'a00307775': 41, 'a00271777': 42, 'a00292472': 43, 'a00287057': 44, 'a00276196': 45, 'a00274701': 46, 'a00324307': 47, 'a009932': 48, 'a0071095': 49, 'a0015685': 50, 'a00481059': 51, 'a00461814': 52, 'a00307756': 53, 'a00290037': 54, 'a00135785': 55, 'a00219676': 56, 'a00363352': 57, 'a00157220': 58, 'a00289728': 59, 'a0032847': 60, 'a

In [10]:
# Number of rows requested from `reader` per get_chunk() call.
chunkSize = 10000000
# iterator=True returns a reader object instead of loading the whole file.
reader = pd.read_csv(
    '../data/user_app_usage.csv',
    iterator=True,
    names=['uid', 'appid', 'duration', 'times', 'use_date'],
)
# Flag that keeps the chunk-processing loop running until the reader is exhausted.
loop = True
# Accumulator for the per-chunk results.
allchunk = pd.DataFrame()

In [11]:
# Per-user profile table; column names are supplied explicitly through `names`.
user_basic_info = pd.read_csv(
    '../data/user_basic_info.csv',
    names=[
        'uid', 'gender', 'city', 'prodName', 'ramCapacity',
        'ramLeftRation', 'romCapacity', 'romLeftRation', 'color',
        'fontSize', 'ct', 'carrier', 'os',
    ],
)

In [12]:
# Remove the profile columns that are not aggregated later in the notebook.
user_basic_info = user_basic_info.drop(
    columns=['city', 'prodName', 'color', 'ct', 'carrier'],
)

In [13]:
# Per-chunk feature extraction.
#
# For every chunk of the usage log this computes max / min / mean of each
# column in `stat_columns` per appid and appends one row per appid (plus the
# `use_date` of that appid's first row in the chunk) to `allchunk`.
#
# Compared with aggregating and merging back once per column, a single named
# aggregation produces the same rows and columns while avoiding repeated
# merges on the full chunk. Named aggregation needs pandas >= 0.25.
stat_columns = ['duration','duration_average','times','gender','ramCapacity',
                'ramLeftRation','romCapacity','romLeftRation',
                'fontSize','os']
# Output column name -> (input column, reducer), e.g. 'duration_max' -> ('duration', 'max').
agg_spec = {}
for col in stat_columns:
    for stat in ('max', 'min', 'mean'):
        agg_spec[col + '_' + stat] = (col, stat)

while loop:
    # Only the read itself is guarded, so a StopIteration raised by any later
    # step surfaces as an error instead of being mistaken for end-of-file.
    try:
        chunk = reader.get_chunk(chunkSize)
    except StopIteration:
        loop = False
        print("Iteration is stopped")
        break

    chunk = optimize_memory(chunk)
    chunk['duration_average'] = chunk['duration']/chunk['times']
    print(len(chunk),mem_usage(chunk))

    # Attach the user profile columns, then discard the join key.
    chunk = chunk.merge(user_basic_info,on=['uid'],how='left')
    chunk = chunk.drop(columns=['uid'])
    print(len(chunk),mem_usage(chunk))

    # One pass over the chunk yields every statistic for every appid.
    # observed=True restricts the result to appids present in this chunk.
    app_stats = chunk.groupby('appid', observed=True).agg(**agg_spec).reset_index()
    print(len(chunk))

    # Keep the first row per appid (which carries use_date) in order of first
    # appearance, then attach the statistics computed above.
    chunk = chunk[['appid','use_date']].drop_duplicates(['appid'])
    chunk = chunk.merge(app_stats,on=['appid'],how='left')
    print(len(chunk))

    allchunk = pd.concat([allchunk,chunk],ignore_index=True)
    print("allchunk:",len(allchunk),mem_usage(allchunk))

10000000 206.21MB
10000000 778.41MB
10000000
54737
allchunk: 54737 16.85MB
10000000 206.16MB
10000000 778.37MB
10000000
54065
allchunk: 108802 28.10MB
10000000 206.14MB
10000000 778.35MB
10000000
53677
allchunk: 162479 41.96MB
10000000 206.09MB
10000000 778.29MB
10000000
52845
allchunk: 215324 55.60MB
10000000 206.21MB
10000000 778.41MB
10000000
54735
allchunk: 270059 69.73MB
10000000 206.14MB
10000000 778.34MB
10000000
53621
allchunk: 323680 83.58MB
10000000 206.19MB
10000000 778.39MB
10000000
54453
allchunk: 378133 97.64MB
10000000 206.10MB
10000000 778.31MB
10000000
53082
allchunk: 431215 111.35MB
10000000 206.17MB
10000000 778.37MB
10000000
54133
allchunk: 485348 125.32MB
10000000 206.11MB
10000000 778.32MB
10000000
53207
allchunk: 538555 139.06MB
10000000 206.17MB
10000000 778.38MB
10000000
54156
allchunk: 592711 153.05MB
10000000 206.13MB
10000000 778.34MB
10000000
53584
allchunk: 646295 166.88MB
10000000 206.11MB
10000000 778.31MB
10000000
53177
allchunk: 699472 180.61MB
1000000

In [14]:
# use_date is not aggregated, so remove it before combining rows per appid.
allchunk = allchunk.drop(columns=['use_date'])

In [15]:
# Display the column labels of the accumulated per-chunk statistics table.
allchunk.columns

Index(['appid', 'duration_max', 'duration_min', 'duration_mean',
       'duration_average_max', 'duration_average_min', 'duration_average_mean',
       'times_max', 'times_min', 'times_mean', 'gender_max', 'gender_min',
       'gender_mean', 'ramCapacity_max', 'ramCapacity_min', 'ramCapacity_mean',
       'ramLeftRation_max', 'ramLeftRation_min', 'ramLeftRation_mean',
       'romCapacity_max', 'romCapacity_min', 'romCapacity_mean',
       'romLeftRation_max', 'romLeftRation_min', 'romLeftRation_mean',
       'fontSize_max', 'fontSize_min', 'fontSize_mean', 'os_max', 'os_min',
       'os_mean'],
      dtype='object')

In [16]:
# Combine the rows of `allchunk` that share an appid.
#
# Every column other than 'appid' is reduced with the function named by its
# own suffix (*_max -> max, *_min -> min, *_mean -> mean) and stored under
# "{column}_{suffix}", e.g. duration_max -> duration_max_max.
#
# NOTE(review): *_mean columns are averaged with equal weight per row of
# `allchunk`. If those rows summarise groups of different sizes (as the
# per-chunk rows appended earlier in this notebook do), the result is a mean
# of means, not the record-weighted mean. Fixing that needs per-chunk counts,
# which are not available here.
stat_cols = [col for col in allchunk.columns if col != 'appid']
unsupported = [col for col in stat_cols
               if col.split('_')[-1] not in ('max', 'min', 'mean')]
if unsupported:
    raise ValueError(
        "columns without a max/min/mean suffix: {}".format(unsupported))

if stat_cols:
    # Named aggregation (pandas >= 0.25): output name -> (input column, reducer).
    # A single groupby replaces one groupby + full-frame merge per column.
    agg_spec = {col + '_' + col.split('_')[-1]: (col, col.split('_')[-1])
                for col in stat_cols}
    app_stats = allchunk.groupby('appid', observed=True).agg(**agg_spec).reset_index()
    # Re-attach to every original row so the row count and row order stay the
    # same as before; duplicates per appid are still present afterwards.
    allchunk = allchunk[['appid']].merge(app_stats, on=['appid'], how='left')

In [17]:
# Keep a single row per appid (the first one encountered).
allchunk = allchunk.drop_duplicates(subset=['appid'], keep='first')

In [21]:
import numpy as np
# Popularity rank of each app, looked up in user_app_num_sort.
# Series.map with a plain dict leaves NaN for appids that are not keys of the
# dict, so no per-row Python lambda is needed.
allchunk['number_sort'] = allchunk['appid'].map(user_app_num_sort)

In [22]:
# First category listed for each app in app_cate_dict; apps with no entry
# receive the placeholder string 'unknow'.
allchunk['app_cate'] = allchunk['appid'].apply(
    lambda app_id: app_cate_dict.get(app_id, ['unknow'])[0]
)

In [23]:
# Preview the first rows of the finished per-app feature table.
allchunk.head()

Unnamed: 0,appid,duration_max_max,duration_min_min,duration_mean_mean,duration_average_max_max,duration_average_min_min,duration_average_mean_mean,times_max_max,times_min_min,times_mean_mean,...,romLeftRation_min_min,romLeftRation_mean_mean,fontSize_max_max,fontSize_min_min,fontSize_mean_mean,os_max_max,os_min_min,os_mean_mean,number_sort,app_cate
0,a00289850,86400.0,0.0,1490.342773,86400.0,0.0,105.206017,78312.0,1.0,12.492076,...,0.0,0.410248,2.3,0.75,1.060341,9.0,4.2,8.261211,17.0,实用工具
1,a00287085,86400.0,0.0,1115.518433,86400.0,0.0,38.23539,28574.0,1.0,93.812828,...,0.0,0.417783,2.3,0.52,1.052336,9.0,4.2,8.305384,3.0,便捷生活
2,a00289826,86400.0,0.0,1415.172852,86400.0,0.0,20.632214,59938.0,1.0,116.777618,...,0.0,0.417021,2.3,0.75,1.04506,9.0,4.2,8.273567,5.0,社交通讯
3,a00278858,86400.0,0.0,4029.929688,86400.0,0.0,55.036362,542547.0,1.0,116.108795,...,0.0,0.435356,2.3,0.52,1.073925,9.0,4.2,8.255797,21.0,新闻阅读
4,a00289791,86400.0,0.0,4545.259277,86400.0,0.0,27.388208,78297.0,1.0,275.746613,...,0.0,0.431894,2.3,0.52,1.063281,9.0,4.2,8.16527,1.0,实用工具


In [24]:
# Write the per-app feature table to disk without the row index.
allchunk.to_csv(
    '../feature/app_feature.csv',
    index=False,
)