In [14]:
# import libraries
from collections import Counter
from datetime import timedelta
from pathlib import Path

import pandas as pd

In [15]:
# read the raw order log from disk and preview the first rows
orders_csv_path = 'data/order_brush_order.csv'
df = pd.read_csv(orders_csv_path)
df.head()

Unnamed: 0,orderid,shopid,userid,event_time
0,31076582227611,93950878,30530270,2019-12-27 00:23:03
1,31118059853484,156423439,46057927,2019-12-27 11:54:20
2,31123355095755,173699291,67341739,2019-12-27 13:22:35
3,31122059872723,63674025,149380322,2019-12-27 13:01:00
4,31117075665123,127249066,149493217,2019-12-27 11:37:55


In [16]:
# parse the event_time column into pandas datetimes so it can be compared and offset
df['event_time'] = df['event_time'].pipe(pd.to_datetime)

In [17]:
# group by shopid and convert event_time, orderid and userid into list of dictionaries
# (one row per shop; 'details' holds that shop's orders as a list of record dicts)
# NOTE: the columns are selected with a list. Selecting them with a bare tuple
# (groupby(...)['a', 'b']) is deprecated and raises on pandas >= 2.0.
grouped_df = (
    df.groupby('shopid')[['event_time', 'orderid', 'userid']]
    .apply(lambda x: x.to_dict(orient='records'))
    .reset_index(name='details')
)

  grouped_df = df.groupby('shopid')['event_time', 'orderid', 'userid'].apply(lambda x: x.to_dict(orient='records')).reset_index(name='details')


In [18]:
# turn the grouped frame into plain Python records (one dict per shop) so the
# per-shop loop below works on native lists/dicts rather than on the DataFrame
data_list = grouped_df.to_dict('records')

In [19]:
# iterate through each shop
#
# For every shop this cell stores on its dict:
#   'details'          - the shop's orders sorted by event_time
#   'interval_orders'  - one interval per order, starting at that order's
#                        event_time and ending one hour later (both inclusive),
#                        with its orders, order count, unique-user count and
#                        concentrate rate
#   'sus_userid'       - de-duplicated userids that placed the most orders in
#                        at least one suspicious interval

# an interval is suspicious when orders / unique users reaches this rate
CONCENTRATE_RATE_THRESHOLD = 3
# length of the window that starts at each order
INTERVAL_LENGTH = timedelta(hours=1)

for shop_dict in data_list:

    # sort details' list of dictionaries by event_time
    shop_dict['details'] = sorted(shop_dict['details'], key=lambda item: item['event_time'])
    details = shop_dict['details']

    # group orders into all possible 1-hour intervals
    # The orders are sorted, so the orders of the interval starting at position
    # `window_start` are the contiguous slice details[window_start:window_end].
    # Interval ends never decrease, so `window_end` only moves forward and the
    # whole sweep is linear instead of re-scanning every interval per order.
    shop_dict['interval_orders'] = []
    window_end = 0
    for window_start, detail_dict in enumerate(details):
        interval_end = detail_dict['event_time'] + INTERVAL_LENGTH
        while window_end < len(details) and details[window_end]['event_time'] <= interval_end:
            window_end += 1
        shop_dict['interval_orders'].append({
            'start': detail_dict['event_time'],
            'end': interval_end,
            'orders': details[window_start:window_end]
        })

    # a set removes duplicated suspicious userids across intervals
    suspicious_userids = set()

    # iterate through each interval
    for interval_order_dict in shop_dict['interval_orders']:

        # count orders per user once; the totals below are derived from it
        orders_per_user = Counter(item['userid'] for item in interval_order_dict['orders'])

        # sum number of orders and number of unique users for each interval, then calculate concentrate rate
        interval_order_dict['order_count'] = len(interval_order_dict['orders'])
        interval_order_dict['unique_user_count'] = len(orders_per_user)
        interval_order_dict['concentrate_rate'] = interval_order_dict['order_count'] / interval_order_dict['unique_user_count']

        # check whether interval is suspicious and get suspicious userids
        # (every user tied for the highest order count in the interval is kept)
        if interval_order_dict['concentrate_rate'] >= CONCENTRATE_RATE_THRESHOLD:
            max_order_count = max(orders_per_user.values())
            suspicious_userids.update(
                userid for userid, order_count in orders_per_user.items()
                if order_count == max_order_count
            )

    shop_dict['sus_userid'] = list(suspicious_userids)


In [20]:
# format result
# One row per shop: 'userid' is the shop's suspicious userids in ascending
# order joined by '&', or the string '0' when the shop has none.
# Rows are collected in a list and the frame is built once: DataFrame.append
# copied the whole frame on every call and was removed in pandas 2.0.
result_rows = []
for shop_dict in data_list:
    if shop_dict['sus_userid']:
        sus_userid = '&'.join(str(userid) for userid in sorted(shop_dict['sus_userid']))
    else:
        sus_userid = '0'
    result_rows.append({'shopid': shop_dict['shopid'], 'userid': sus_userid})

# explicit columns keep the expected header even if there are no rows
result_df = pd.DataFrame(result_rows, columns=['shopid', 'userid'])

In [21]:
# save result
# Create the output directory first: to_csv raises OSError when the target
# directory does not exist (e.g. on a fresh checkout).
output_path = Path('output/result.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
result_df.to_csv(output_path, index=False)
print(len(result_df))
# 70.3s (1m 10.3s)

18770
