In [43]:
import pandas as pd
import datetime as datetime

In [47]:
# Load the order log from CSV.
# Order_df holds the frame exactly as read; df is an independent copy that the
# following cells modify (column removal, grouping by shop).
readPath = 'order_brush_order.csv'
Order_df = pd.read_csv(readPath)
df = Order_df.copy()
# Preview the first rows to confirm the expected columns are present.
df.head()

Unnamed: 0,orderid,shopid,userid,event_time
0,31076582227611,93950878,30530270,2019-12-27 00:23:03
1,31118059853484,156423439,46057927,2019-12-27 11:54:20
2,31123355095755,173699291,67341739,2019-12-27 13:22:35
3,31122059872723,63674025,149380322,2019-12-27 13:01:00
4,31117075665123,127249066,149493217,2019-12-27 11:37:55


In [46]:
# Remove the unused orderid column.
# drop(..., errors='ignore') keeps this cell re-runnable: `del df['orderid']`
# raises KeyError the second time the cell is executed, because the column is
# already gone.
df = df.drop(columns=['orderid'], errors='ignore')
df.head()

Unnamed: 0,shopid,userid,event_time
0,93950878,30530270,2019-12-27 00:23:03
1,156423439,46057927,2019-12-27 11:54:20
2,173699291,67341739,2019-12-27 13:22:35
3,63674025,149380322,2019-12-27 13:01:00
4,127249066,149493217,2019-12-27 11:37:55


In [38]:
def format_row_df(row_df):
    """Return a copy of ``row_df`` with parsed timestamps, sorted chronologically.

    Parameters
    ----------
    row_df : pandas.DataFrame
        Orders with an ``event_time`` column holding timestamps or strings
        that ``pd.to_datetime`` can parse.

    Returns
    -------
    pandas.DataFrame
        A new frame whose ``event_time`` column is datetime-typed and whose
        rows are ordered by ``event_time`` ascending. The original index
        labels are kept.
    """
    # Convert string to datetime on a new frame (assign) instead of writing
    # into the argument: the caller passes slices produced by groupby, and
    # assigning into those mutates shared data and raises
    # SettingWithCopyWarning.
    parsed_df = row_df.assign(event_time=pd.to_datetime(row_df['event_time']))
    return parsed_df.sort_values(by='event_time')

In [39]:
def get_suspicious_buyer(row_df):
    """Collect the distinct suspicious buyers found in one shop's orders.

    Every order's ``event_time`` opens a window that spans one hour
    (both ends inclusive). The orders falling inside each window are handed
    to ``get_buyer_name``; every name it returns is kept once, in the order
    it was first seen.

    Parameters
    ----------
    row_df : pandas.DataFrame
        Orders of a single shop with a datetime ``event_time`` column and
        whatever columns ``get_buyer_name`` reads.

    Returns
    -------
    list
        Distinct buyer names, first-seen order.
    """
    window_length = datetime.timedelta(hours=1)
    event_times = row_df['event_time']

    # A dict is used as an insertion-ordered set: keys keep first-seen order
    # and repeated names are ignored.
    seen_buyers = {}

    for window_open in event_times.to_list():
        window_close = window_open + window_length
        inside_window = (event_times >= window_open) & (event_times <= window_close)
        for buyer in get_buyer_name(row_df.loc[inside_window]):
            seen_buyers.setdefault(buyer, None)

    return list(seen_buyers)

In [40]:
def get_buyer_name(window_df, min_rate=3):
    """Return the users with the most orders in an order-brushing window.

    A window counts as order brushing when its concentrate rate
    (number of orders divided by number of distinct ``userid`` values) is at
    least ``min_rate``. In that case every user tied for the highest order
    count is reported, provided that count is itself at least ``min_rate``.

    Parameters
    ----------
    window_df : pandas.DataFrame
        Orders inside one time window; must have a ``userid`` column.
    min_rate : int or float, optional
        Concentrate-rate threshold, also used as the minimum order count a
        reported user must have. Defaults to 3.

    Returns
    -------
    list of str
        ``userid`` values converted with ``str``, most frequent first as
        ordered by ``value_counts``; empty when the window is not a
        brushing window.
    """
    order_num = len(window_df)
    user_num = window_df['userid'].nunique()

    # A window without any user has nothing to report; checking this first
    # also avoids dividing by zero below.
    if user_num == 0 or order_num / user_num < min_rate:
        return []

    orders_per_user = window_df['userid'].value_counts()
    top_count = orders_per_user.max()
    if top_count < min_rate:
        return []

    # Series.items() replaces Series.iteritems(), which was removed in
    # pandas 2.0.
    return [
        str(userid)
        for userid, count in orders_per_user.items()
        if count == top_count
    ]

In [41]:
# Split the orders into one DataFrame per shop (shopid -> orders of that shop).
# Group by the scalar column name, not a one-element list: with a list,
# pandas >= 2.0 yields 1-tuples such as (93950878,) as group keys, and those
# tuples would end up in the 'shopid' column of the output CSV.
df_dict = dict(list(df.groupby('shopid')))

# distinct shopid list from dict keys
shopid_list = list(df_dict.keys())

# suspicious buyers per shop, aligned position-by-position with shopid_list
sus_buyers_list = []

print ("Start : " + str(datetime.datetime.now()))

for shopid in shopid_list:

    row_df = df_dict[shopid] # orders of this shop

    row_df = format_row_df(row_df) # parse event_time and sort chronologically

    row_sus_buyers_list = get_suspicious_buyer(row_df) # distinct suspicious buyers of this shop

    # Smaller numerical userid first. The ids are strings at this point, so a
    # plain sort() would order them lexicographically ('10' before '9');
    # key=int compares them as numbers.
    row_sus_buyers_list.sort(key=int)
    row_sus_buyers_string = '&'.join(row_sus_buyers_list)

    # shops without any suspicious buyer are reported as '0'
    sus_buyers_list.append(row_sus_buyers_string or '0')

# write CSV
out_put_dict = {'shopid': shopid_list, 'userid': sus_buyers_list}

out_put_df = pd.DataFrame(out_put_dict)

out_put_df.to_csv('Solution_jack.csv', index=False)

print ("End : " + str(datetime.datetime.now()))

Start : 2020-06-14 16:07:10.145886
End : 2020-06-14 16:12:22.763834


In [42]:
    # Test: sort a handful of user ids (kept as strings) and join them with
    # '&', the same way the per-shop result string is built.
    row_sus_buyers_list = sorted(['162508227','13135622','137245836','215382704'])
    row_sus_buyers_string = '&'.join(row_sus_buyers_list)

    print(row_sus_buyers_string)

13135622&137245836&162508227&215382704
