In [1]:
import pandas as pd
import datetime as datetime

In [2]:
# Read the raw order log once: Order_df stays exactly as loaded,
# df is the working copy that the following cells modify.
readPath = 'order_brush_order.csv'
Order_df = pd.read_csv(filepath_or_buffer=readPath)
df = Order_df.copy(deep=True)
df.head()

Unnamed: 0,orderid,shopid,userid,event_time
0,31076582227611,93950878,30530270,2019-12-27 00:23:03
1,31118059853484,156423439,46057927,2019-12-27 11:54:20
2,31123355095755,173699291,67341739,2019-12-27 13:22:35
3,31122059872723,63674025,149380322,2019-12-27 13:01:00
4,31117075665123,127249066,149493217,2019-12-27 11:37:55


In [3]:
# Remove unused orderId column.
# drop(..., errors='ignore') rebinds df to a new frame instead of deleting the
# column in place, so re-running this cell is a no-op rather than a KeyError.
df = df.drop(columns=['orderid'], errors='ignore')
df.head()

Unnamed: 0,shopid,userid,event_time
0,93950878,30530270,2019-12-27 00:23:03
1,156423439,46057927,2019-12-27 11:54:20
2,173699291,67341739,2019-12-27 13:22:35
3,63674025,149380322,2019-12-27 13:01:00
4,127249066,149493217,2019-12-27 11:37:55


In [4]:
def format_row_df(row_df):
    """Return a shop's orders with parsed timestamps, sorted by time.

    Parameters
    ----------
    row_df : pandas.DataFrame
        Orders containing an ``event_time`` column in any form accepted by
        ``pd.to_datetime``.

    Returns
    -------
    pandas.DataFrame
        A new frame whose ``event_time`` column has been converted with
        ``pd.to_datetime`` and whose rows are sorted ascending by it.
        The input frame is left unmodified.
    """
    # convert string to datetime on a fresh frame: assign() never writes into
    # row_df, which is typically a groupby slice of the full order table, so
    # no SettingWithCopyWarning is raised and the caller's data stays intact.
    parsed_df = row_df.assign(event_time=pd.to_datetime(row_df['event_time']))
    return parsed_df.sort_values(by='event_time')

In [5]:
def get_buyer_name(window_df, min_rate=3):
    """Return the top buyer(s) of a window if it qualifies as order brushing.

    A window qualifies when the number of orders divided by the number of
    distinct ``userid`` values is at least ``min_rate``.

    Parameters
    ----------
    window_df : pandas.DataFrame
        Orders inside one time window; must contain a ``userid`` column.
    min_rate : int or float, default 3
        Minimum orders-per-distinct-buyer ratio for the window to qualify.
        The same value is the minimum order count a buyer needs in order
        to be reported.

    Returns
    -------
    list of str
        ``userid`` values (converted with ``str``) of the buyer(s) holding
        the highest order count in the window, ties included. Empty when the
        window has no buyers or does not qualify.
    """
    order_num = len(window_df)
    user_num = window_df['userid'].nunique()

    # No distinct buyers (e.g. an empty window): nothing to report, and the
    # ratio below would otherwise divide by zero.
    if user_num == 0 or order_num / user_num < min_rate:
        return []

    # value_counts() is sorted by count descending, so the first entry holds
    # the highest order count of the window.
    order_counts = window_df['userid'].value_counts()
    top_count = order_counts.iloc[0]
    if top_count < min_rate:
        return []

    # Series.items() replaces Series.iteritems(), which pandas 2.0 removed.
    # Only buyers tied for the highest count are kept.
    return [str(userid) for userid, count in order_counts.items() if count == top_count]

In [6]:
def get_suspicious_buyer(row_df):
    """Collect the distinct suspicious buyers found in one shop's orders.

    Candidate windows start at one order and end at a later order that is at
    least two positions further on and no more than one hour after the start.
    Each window's rows are passed to ``get_buyer_name`` and every buyer it
    returns is recorded once.

    Parameters
    ----------
    row_df : pandas.DataFrame
        One shop's orders with ``event_time`` and ``userid`` columns.
        ``event_time`` values must support comparison with, and addition of,
        ``datetime.timedelta``. The scan assumes rows are already sorted
        ascending by ``event_time``: the inner loop stops at the first order
        that lies more than one hour after the window start.

    Returns
    -------
    list of str
        Distinct buyer ids in the order they were first detected. Empty when
        the shop has fewer than three orders.
    """
    event_times = row_df['event_time']
    row_df_time_list = event_times.to_list()
    record_length = len(row_df_time_list)

    # A candidate window spans at least three orders. Returning early also
    # avoids the IndexError that reading the first timestamp of an empty
    # frame would raise.
    if record_length < 3:
        return []

    one_hour = datetime.timedelta(hours=1)

    row_sus_buyers_list = []
    seen_buyers = set()  # O(1) membership test; the list keeps detection order

    # Seeded one hour before the first order, then updated to each window
    # start once that start has been fully processed.
    previous_event_time = row_df_time_list[0] - one_hour

    for start_pointer in range(record_length - 2):
        start_time = row_df_time_list[start_pointer]
        window_limit = start_time + one_hour

        for end_pointer in range(start_pointer + 2, record_length):
            end_time = row_df_time_list[end_pointer]

            # Later orders are further away still (sorted input), so stop.
            if end_time > window_limit:
                break

            if end_pointer == record_length - 1:
                next_record_time = end_time
            else:
                next_record_time = row_df_time_list[end_pointer + 1]

            # When the order following the window end still falls within one
            # hour of the previous window start, the window is stretched to
            # the full hour after start_time.
            # NOTE(review): the rationale for this rule is not visible in
            # this file - confirm against the problem statement.
            if next_record_time <= previous_event_time + one_hour:
                end_time = window_limit

            window_df = row_df.loc[(event_times >= start_time) & (event_times <= end_time)]

            # only add distinct names to row_sus_buyers_list
            for name in get_buyer_name(window_df):
                if name not in seen_buyers:
                    seen_buyers.add(name)
                    row_sus_buyers_list.append(name)

        previous_event_time = start_time

    return row_sus_buyers_list

In [7]:
# Split the orders into one DataFrame per shop.
# Group by the scalar column name, not the one-element list ['shopid']:
# with a list, pandas >= 2.0 yields 1-tuple keys such as (10159,), which would
# end up in the printed log and in the 'shopid' column of the output CSV.
df_dict = dict(list(df.groupby('shopid')))

# distinct shopid list from dict keys
shopid_list = list(df_dict.keys())

# suspicious buyers per shop: '&'-joined user ids, or '0' when none were found
sus_buyers_list = []

print ("Start : " + str(datetime.datetime.now()))

row_counter = 0
for shopid in shopid_list:

    row_counter += 1

    # this shop's orders, timestamps parsed and sorted by event_time
    row_df = format_row_df(df_dict[shopid])

    # distinct suspicious buyers for this shop
    row_sus_buyers_list = get_suspicious_buyer(row_df)

    row_sus_buyers_list.sort(key=int) # smaller numerical userid first
    row_sus_buyers_string = '&'.join(row_sus_buyers_list)

    if row_sus_buyers_string:
        sus_buyers_list.append(row_sus_buyers_string)
        print('Row'+str(row_counter) + ': id: ' +str(shopid) + ' user: ' + row_sus_buyers_string)
    else:
        sus_buyers_list.append('0')

# write CSV: one row per shop, aligned with sus_buyers_list
out_put_dict = {'shopid': shopid_list[:len(sus_buyers_list)], 'userid': sus_buyers_list}

out_put_df = pd.DataFrame(out_put_dict)

out_put_df.to_csv('Solution_jack.csv', index=False)

print ("End : " + str(datetime.datetime.now()))

Start : 2020-06-16 21:14:13.268896
Row14: id: 10159 user: 214988798
Row41: id: 10402 user: 77819
Row58: id: 10536 user: 672345
Row112: id: 42472 user: 740844
Row115: id: 42818 user: 170385453
Row130: id: 76934 user: 190449497
Row181: id: 195531 user: 214992524
Row252: id: 425364 user: 72914921
Row278: id: 599533 user: 264511
Row279: id: 605561 user: 181682008
Row313: id: 722132 user: 7670129
Row315: id: 731606 user: 75558350
Row316: id: 736620 user: 62618064
Row333: id: 769445 user: 141006168
Row345: id: 823357 user: 188942105
Row425: id: 1175477 user: 122277324
Row463: id: 1532569 user: 181408876
Row567: id: 2831130 user: 15053804
Row571: id: 2856708 user: 123959597
Row600: id: 3124091 user: 214568881
Row650: id: 3701311 user: 80690628
Row671: id: 4149345 user: 212325226
Row735: id: 4888564 user: 143847348
Row837: id: 6765647 user: 740844
Row960: id: 8566282 user: 556867
Row967: id: 8715449 user: 9753706
Row991: id: 8996761 user: 13135622&137245836&162508227&215382704
Row1004: id: 915