In [1]:
import pandas as pd
import datetime as datetime

In [2]:
# Read the raw order log from disk. Order_df keeps the data exactly as loaded;
# every later step works on the independent copy `df`.
readPath = 'order_brush_order.csv'
Order_df = pd.read_csv(filepath_or_buffer=readPath)
df = Order_df.copy(deep=True)
df.head(5)

Unnamed: 0,orderid,shopid,userid,event_time
0,31076582227611,93950878,30530270,2019-12-27 00:23:03
1,31118059853484,156423439,46057927,2019-12-27 11:54:20
2,31123355095755,173699291,67341739,2019-12-27 13:22:35
3,31122059872723,63674025,149380322,2019-12-27 13:01:00
4,31117075665123,127249066,149493217,2019-12-27 11:37:55


In [3]:
# Remove the unused orderid column.
# drop(..., errors='ignore') builds a new frame and is a no-op when the column
# is already gone, so this cell can be re-run safely (a bare `del df['orderid']`
# raises KeyError on the second run).
df = df.drop(columns=['orderid'], errors='ignore')
df.head()

Unnamed: 0,shopid,userid,event_time
0,93950878,30530270,2019-12-27 00:23:03
1,156423439,46057927,2019-12-27 11:54:20
2,173699291,67341739,2019-12-27 13:22:35
3,63674025,149380322,2019-12-27 13:01:00
4,127249066,149493217,2019-12-27 11:37:55


In [4]:
def format_row_df(row_df):
    """Return a new frame with 'event_time' parsed as datetimes, sorted by it.

    Parameters
    ----------
    row_df : pandas.DataFrame
        Frame with an 'event_time' column that ``pd.to_datetime`` can parse.

    Returns
    -------
    pandas.DataFrame
        A new frame ordered by ascending 'event_time'. The frame passed in is
        left untouched.
    """
    # assign() builds a fresh frame instead of writing the parsed column back
    # into the caller's object (which may be a slice of a larger frame).
    parsed_df = row_df.assign(event_time=pd.to_datetime(row_df['event_time']))
    return parsed_df.sort_values(by='event_time')

In [5]:
def get_buyer_name(window_df, min_concentration=3):
    """Return the top buyer(s) of a window if it qualifies as order brushing.

    A window qualifies when its concentration rate (number of rows divided by
    the number of distinct 'userid' values) is at least ``min_concentration``.
    For a qualifying window, every user tied for the highest order count is
    returned, provided that count is itself at least ``min_concentration``.

    Parameters
    ----------
    window_df : pandas.DataFrame
        Orders inside one time window; must have a 'userid' column.
    min_concentration : int or float, optional
        Threshold for both the concentration rate and the top order count.
        Defaults to 3.

    Returns
    -------
    list of str
        User ids converted with ``str``, most frequent first; empty when the
        window is empty or does not qualify.
    """
    order_num = len(window_df)
    user_num = window_df['userid'].nunique()

    # An empty window (or one with no non-null user ids) cannot be a brushing
    # period; returning early also avoids dividing by zero below.
    if user_num == 0:
        return []

    if order_num / user_num < min_concentration:
        return []

    # value_counts() sorts by count descending, so the first entry is the max.
    counts = window_df['userid'].value_counts()
    top_count = counts.iloc[0]
    if top_count < min_concentration:
        return []

    # Series.items() replaces iteritems(), which was removed in pandas 2.0.
    return [str(userid) for userid, count in counts.items() if count == top_count]

In [6]:
def get_suspicious_buyer(row_df):
    """Collect the distinct buyers flagged across the one-hour windows of a frame.

    For every value in ``row_df['event_time']`` a one-hour window is slid in
    60-second steps, starting one hour before that value and ending with the
    window that starts at the value itself. Each window that differs from the
    previously examined one and holds at least three rows is handed to
    ``get_buyer_name``; the names it returns are accumulated without
    duplicates, in first-seen order.

    Parameters
    ----------
    row_df : pandas.DataFrame
        Frame with an 'event_time' column supporting timedelta arithmetic and
        comparisons, plus whatever columns ``get_buyer_name`` reads.

    Returns
    -------
    list
        Distinct names returned by ``get_buyer_name`` over all windows.
    """
    one_hour = datetime.timedelta(hours=1)
    step = datetime.timedelta(seconds=60)
    flagged = []

    for anchor in row_df['event_time'].to_list():
        cursor = anchor - one_hour
        # Empty placeholder so the first window is never treated as a repeat.
        previous = pd.DataFrame(columns=row_df.columns)

        while cursor <= anchor:
            lower, upper = cursor, cursor + one_hour
            times = row_df['event_time']
            in_window = (times >= lower) & (times <= upper)
            current = row_df.loc[in_window]
            cursor = cursor + step

            # Consecutive steps often select the same rows; skip repeats.
            if previous.equals(current):
                continue
            previous = current

            if len(current) < 3:
                continue

            for name in get_buyer_name(current):
                if name not in flagged:
                    flagged.append(name)

    return flagged

In [7]:
# Split the orders into one sub-frame per shop.
# Group by the bare column name rather than a one-element list: on pandas 2.x
# iterating a groupby built from ['shopid'] yields 1-tuple keys such as
# (93950878,), which would end up verbatim in the output 'shopid' column.
df_dict = dict(list(df.groupby('shopid')))

# distinct shopid list from dict keys
shopid_list = list(df_dict.keys())

# Upper bound on the number of shops processed in one run.
# NOTE(review): shops beyond this cap are left out of the CSV entirely --
# confirm whether the cap is intentional or a leftover from test runs.
MAX_SHOPS = 500

# one entry per processed shop: '&'-joined suspicious user ids, or '0'
sus_buyers_list = []

print ("Start : " + str(datetime.datetime.now()))

for shopid in shopid_list[:MAX_SHOPS]:
    # parse and time-sort this shop's orders
    row_df = format_row_df(df_dict[shopid])

    # distinct suspicious buyers for this shop
    row_sus_buyers_list = get_suspicious_buyer(row_df)

    # The ids are strings, so a plain sort() would be lexicographic
    # ('100' < '9'); key=int gives the intended numeric order.
    # Assumes the ids are integer strings, as in the loaded data.
    row_sus_buyers_list.sort(key=int)
    row_sus_buyers_string = '&'.join(row_sus_buyers_list)

    # '0' marks a shop with no suspicious buyers
    sus_buyers_list.append(row_sus_buyers_string if row_sus_buyers_string else '0')

# write CSV; the shop list is trimmed to the shops that were actually processed
out_put_dict = {'shopid': shopid_list[:len(sus_buyers_list)], 'userid': sus_buyers_list}

out_put_df = pd.DataFrame(out_put_dict)

out_put_df.to_csv('Solution_jack.csv', index=False)

print ("End : " + str(datetime.datetime.now()))

Start : 2020-06-15 19:30:56.453029
End : 2020-06-15 19:35:27.030248
