In [1]:
import pandas as pd
import datetime
import sys
# Directory prefix for the input data files. Paths are built by plain string
# concatenation (mid_pre + "<file name>"), so the trailing slash is required.
mid_pre = "./data/"

In [2]:
def time_now():
    """Return the current timestamp as a naive ``datetime.datetime``.

    Used by the benchmark helpers in this notebook to take a reading before
    and after the code being timed.
    """
    current = datetime.datetime.now()
    return current

In [3]:
# Load the interaction log. The file is tab-separated and has no header row,
# so the column names are supplied explicitly.
train = pd.read_csv(
    mid_pre + "train.csv",
    sep='\t',
    header=None,
    names=['user_id', 'item_id', 'behavior', 'date'],
)
# Preview the first ten rows.
train.head(10)

Unnamed: 0,user_id,item_id,behavior,date
0,1732029186,1175762408,clk,20190620
1,1732029186,319336376,clk,20190619
2,1732029186,1162343852,clk,20190611
3,1732029186,1197151915,clk,20190619
4,1732029186,1145629824,clk,20190619
5,1732029186,1188080625,clk,20190617
6,1732029186,1162473275,clk,20190619
7,1732029186,1162421364,clk,20190611
8,1732029186,1128523790,clk,20190619
9,1732029186,1178406787,clk,20190620


In [4]:
# Load the item attribute table. The file is tab-separated and has no header
# row, so the column names are supplied explicitly.
item_feature = pd.read_csv(
    mid_pre + "item_feature.csv",
    sep="\t",
    header=None,
    names=['item_id', 'cate_1_id', 'cate_id', 'brand_id', 'price'],
)
# Preview the first ten rows.
item_feature.head(10)

Unnamed: 0,item_id,cate_1_id,cate_id,brand_id,price
0,43152,50022703,50003881,30652,1749.0
1,133107,11,110502,26683,189.98
2,138801,50010788,50010815,20067,538.5
3,140174,50010788,50010815,20112,814.0
4,237779,1801,50011980,20000,39.0
5,241352,1801,50011993,20096,990.0
6,280943,50010788,50010796,20074,19.9
7,287275,50010788,50010805,20060,198.0
8,304312,1801,50011980,20105,268.0
9,3024515,50002768,350213,30822,137.9


In [5]:
def for_loop_count():
    """Time a pure-Python loop that tallies the values in column 2 of ``train``.

    The loop walks ``train.values`` row by row and counts how often each value
    at position 2 occurs (position 2 is ``behavior`` given the column names
    used when ``train`` is loaded). The tally itself is discarded; only the
    elapsed time is reported.

    Returns:
        int: total elapsed time in whole microseconds.
    """
    t1 = time_now()
    # The explicit if/else tally is the code under test, so it is left as-is.
    count = {}
    for item in train.values:
        if item[2] in count:
            count[item[2]] += 1
        else:
            count[item[2]] = 1
    elapsed = time_now() - t1
    # timedelta.microseconds is only the sub-second component (0..999999) and
    # silently drops whole seconds; floor-dividing by a one-microsecond
    # timedelta yields the full duration as an integer.
    return elapsed // datetime.timedelta(microseconds=1)

In [6]:
def grouby_count():
    """Time a pandas ``groupby`` that counts ``user_id`` per ``behavior`` in ``train``.

    The aggregation result is discarded; only the elapsed time is reported.

    Returns:
        int: total elapsed time in whole microseconds.
    """
    t1 = time_now()
    # Result intentionally not kept: this call is the code under test.
    train.groupby(by='behavior')['user_id'].count()
    elapsed = time_now() - t1
    # timedelta.microseconds is only the sub-second component (0..999999) and
    # silently drops whole seconds; floor-dividing by a one-microsecond
    # timedelta yields the full duration as an integer.
    return elapsed // datetime.timedelta(microseconds=1)

In [7]:
# Run each benchmark ten times and report the mean duration per run.
time_cost1 = sum(grouby_count() for _ in range(10))
print('groupby耗时：', time_cost1 / 10, '微秒')

time_cost2 = sum(for_loop_count() for _ in range(10))
print('for 循环耗时：', time_cost2 / 10, '微秒')

# Ratio of the accumulated loop time to the accumulated groupby time.
print('差距:', time_cost2 / time_cost1, '倍')

groupby耗时： 3911.9 微秒
for 循环耗时： 53546.6 微秒
差距: 13.688131087195481 倍


In [8]:
# Kept so that any other cell referring to this groupby object still works.
item_cate_group = item_feature.groupby('item_id')

# First row of item_feature for every item_id, indexed by item_id.
# Built once so that per-row lookups are scalar index reads instead of a
# GroupBy.get_group call (which materialises a DataFrame for every lookup).
# Rows with a missing item_id are excluded, matching groupby's default of
# dropping NaN keys.
_item_first_row = (
    item_feature
    .dropna(subset=['item_id'])
    .drop_duplicates(subset='item_id', keep='first')
    .set_index('item_id')
)


def replace_item_with_target(item_id, cluster_target):
    """Look up one attribute of an item in ``item_feature``.

    Args:
        item_id: value to look up in the ``item_id`` column.
        cluster_target: name of the ``item_feature`` column to return.

    Returns:
        The ``cluster_target`` value from the first ``item_feature`` row whose
        ``item_id`` matches, or ``-1`` when the lookup raises ``KeyError``
        (unknown ``item_id`` or unknown column name).
    """
    try:
        return _item_first_row.at[item_id, cluster_target]
    except KeyError:
        return -1


# Attach each interaction's brand, then discard interactions whose item has
# no entry in item_feature (marked with the -1 sentinel above).
train['brand_id'] = train['item_id'].apply(lambda x: replace_item_with_target(x, 'brand_id'))
train.drop(train[train['brand_id'] == -1].index, inplace=True)

# Per-user view of the interactions and the collection of distinct user ids.
user_group = train.groupby(['user_id'])
user_set = user_group.user_id.indices.keys()

# Items grouped by brand, used to expand a brand into its item ids.
brand_item_group = item_feature.groupby('brand_id')

# Filled by the query benchmarks: user id -> list of candidate item ids.
user_dict = {}

In [9]:
def query_with_loc():
    """Time building each user's candidate item list with a boolean ``.loc`` filter.

    For every user in ``user_set``, collects the brands the user interacted
    with, selects every ``item_feature`` row whose ``brand_id`` is among them,
    and stores the resulting item ids in the module-level ``user_dict``.

    Returns:
        int: total elapsed time in whole microseconds.
    """
    t1 = time_now()
    for user in user_set:
        brand_id_list = user_group.get_group(user)['brand_id'].values
        # The .loc query is the code under test, so it is left as-is.
        temp_df = item_feature.loc[item_feature.brand_id.isin(brand_id_list)]['item_id']
        # Convert the values to plain Python ints. The original note said this
        # is because int64 is not hashable; NumPy integer scalars are hashable,
        # so the real motivation is unclear (possibly serialisation) -- TODO confirm.
        item_list = list(map(int, temp_df.values))
        user_dict[user] = item_list
    elapsed = time_now() - t1
    # timedelta.microseconds is only the sub-second component (0..999999) and
    # silently drops whole seconds; floor-dividing by a one-microsecond
    # timedelta yields the full duration as an integer.
    return elapsed // datetime.timedelta(microseconds=1)

In [10]:
def query_with_groupby():
    """Time building each user's candidate item list via per-brand ``get_group``.

    For every user in ``user_set``, collects the brands the user interacted
    with, concatenates the item ids of each brand taken from
    ``brand_item_group`` (one lookup per brand occurrence, so repeated brands
    contribute repeated items), and stores the result as plain Python ints in
    the module-level ``user_dict``.

    Returns:
        int: total elapsed time in whole microseconds.
    """
    t1 = time_now()
    for user in user_set:
        brand_id_list = user_group.get_group(user)['brand_id'].values
        # The per-brand get_group lookup is the code under test; left as-is.
        item_list = []
        for brand in brand_id_list:
            item_list += list(brand_item_group.get_group(brand)['item_id'])
        user_dict[user] = list(map(int, item_list))
    elapsed = time_now() - t1
    # timedelta.microseconds is only the sub-second component (0..999999) and
    # silently drops whole seconds; floor-dividing by a one-microsecond
    # timedelta yields the full duration as an integer.
    return elapsed // datetime.timedelta(microseconds=1)

In [11]:
# Run each query strategy ten times and report the mean duration per run.
time_cost1 = sum(query_with_loc() for _ in range(10))
print('loc 耗时：', time_cost1 / 10, '微秒')

time_cost2 = sum(query_with_groupby() for _ in range(10))
print('groupby 耗时：', time_cost2 / 10, '微秒')

# Relative saving of the groupby strategy over the loc strategy, in percent.
print('差距:', f"{round((time_cost1 - time_cost2) / time_cost1 * 100, 2)}%")

loc 耗时： 554681.3 微秒
groupby 耗时： 413089.5 微秒
差距: 25.53%


In [12]:
from multiprocessing import Manager
from multiprocessing import Process

def target_func(i, shared=None):
    """Store ``1`` under key ``i`` in a shared mapping.

    Args:
        i: key to write.
        shared: mapping to write into. When omitted, the module-level ``d`` is
            used, which keeps existing ``target_func(i)`` calls working.
    """
    # Compare against None rather than truthiness: an empty mapping is a
    # perfectly valid target.
    target = d if shared is None else shared
    target[i] = 1

if __name__=="__main__":
    m = Manager()
    # Create the shared dictionary through the Manager.
    d = m.dict({})
    P = []
    for i in range(10):
        # Pass the proxy explicitly. A child only sees the parent's global
        # ``d`` when it is created by fork; with other start methods the
        # global does not exist in the child.
        p = Process(target=target_func, args=(i, d))
        # Daemon process: it is terminated if the parent exits. The parent
        # still waits for it here because of the explicit join() below.
        p.daemon = True
        P.append(p)
    # Start every worker.
    for p in P:
        p.start()
    # Wait for every worker to finish.
    for p in P:
        p.join()
    print(d)

{0: 1, 1: 1, 2: 1, 3: 1, 4: 1, 5: 1, 6: 1, 7: 1, 8: 1, 9: 1}
