## [Students] Shopee Code League - Order Brushing

###### Task
1. Identify all shops that are deemed to have conducted order brushing.
2. For each shop that is identified to have conducted order brushing, identify the buyers suspected to have conducted order brushing for that shop.

by: Hendrik Lesmana

In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('order_brush_order.csv')

In [3]:
df.head()

Unnamed: 0,orderid,shopid,userid,event_time
0,31076582227611,93950878,30530270,2019-12-27 00:23:03
1,31118059853484,156423439,46057927,2019-12-27 11:54:20
2,31123355095755,173699291,67341739,2019-12-27 13:22:35
3,31122059872723,63674025,149380322,2019-12-27 13:01:00
4,31117075665123,127249066,149493217,2019-12-27 11:37:55


In [4]:
df.shape

(222750, 4)

In [5]:
len(df['shopid'].unique())

18770

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 222750 entries, 0 to 222749
Data columns (total 4 columns):
orderid       222750 non-null int64
shopid        222750 non-null int64
userid        222750 non-null int64
event_time    222750 non-null object
dtypes: int64(3), object(1)
memory usage: 5.9+ MB


In [7]:
df['event_time'] = pd.to_datetime(df['event_time'])

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 222750 entries, 0 to 222749
Data columns (total 4 columns):
orderid       222750 non-null int64
shopid        222750 non-null int64
userid        222750 non-null int64
event_time    222750 non-null datetime64[ns]
dtypes: datetime64[ns](1), int64(3)
memory usage: 6.8 MB


In [9]:
df['day'] = df['event_time'].dt.day
df['hour'] = df['event_time'].dt.hour

In [10]:
df.head()

Unnamed: 0,orderid,shopid,userid,event_time,day,hour
0,31076582227611,93950878,30530270,2019-12-27 00:23:03,27,0
1,31118059853484,156423439,46057927,2019-12-27 11:54:20,27,11
2,31123355095755,173699291,67341739,2019-12-27 13:22:35,27,13
3,31122059872723,63674025,149380322,2019-12-27 13:01:00,27,13
4,31117075665123,127249066,149493217,2019-12-27 11:37:55,27,11


In [11]:
df_group_day = df.groupby(['shopid','userid','day']).count()[['orderid']]

In [12]:
df_group_day = df_group_day.reset_index()

In [13]:
df_group_day.rename(columns={'orderid':'n_order'}, inplace=True)

In [14]:
import seaborn as sns
import matplotlib.pyplot as plt

plt.figure(figsize=(18,8))
sns.displot(df_group_day['n_order'])
plt.show()

<Figure size 1800x800 with 0 Axes>

<Figure size 500x500 with 1 Axes>

In [15]:
border = df_group_day['n_order'].mean() + 3*df_group_day['n_order'].std()

In [16]:
border

1.879746508553598

In [17]:
#Berarti data yang lebih besar dari 2 adalah data brushing

In [18]:
#Filter data untuk ambil data brushing saja
suspect = df_group_day[df_group_day['n_order'] > 2].sort_values(by=['shopid','n_order'], ascending=False)

In [19]:
suspect

Unnamed: 0,shopid,userid,day,n_order
212107,213141071,213646699,27,3
212061,212089630,105664365,31,3
211846,210197928,52867898,27,4
211695,209326384,7794832,31,3
211221,208696908,214111334,30,4
211042,208265257,194273448,27,4
210692,204225676,198662175,31,5
210587,203744774,7794832,31,3
210372,203663118,800721,29,3
209612,203531250,114282846,29,5


In [20]:
result = {
    'shopid' : [],
    'userid' : []
    
}

In [21]:
shopid = -1
sellers = []
users = []
for drow in suspect[['shopid','userid']].iterrows():
    if drow[1][0] != shopid:
        if shopid != -1:
            sellers.append(shopid)
            users.append(userid)
        shopid = drow[1][0]
        userid = str(drow[1][1])
    else :
        userid += '&'+str(drow[1][1])

In [22]:
sellers

[213141071,
 212089630,
 210197928,
 209326384,
 208696908,
 208265257,
 204225676,
 203744774,
 203663118,
 203531250,
 203440274,
 203070501,
 201557550,
 201503467,
 201428849,
 201185983,
 201015462,
 200667339,
 199314784,
 199248667,
 198172755,
 195870375,
 195855021,
 195394274,
 193424291,
 192919438,
 192785588,
 192608876,
 191674006,
 191449441,
 191285578,
 191060361,
 190434361,
 189544563,
 189308408,
 189031483,
 188546697,
 188359661,
 188323737,
 187570150,
 187396596,
 187342268,
 186661539,
 186369695,
 185229671,
 182853378,
 182630780,
 182630287,
 182541768,
 182347785,
 181744029,
 181050132,
 181009364,
 180780819,
 180676972,
 180656412,
 178273138,
 177817260,
 176338861,
 175531295,
 173702831,
 173508019,
 173478708,
 173454640,
 173318671,
 173186657,
 173155851,
 172439621,
 172406176,
 171554188,
 171496968,
 171407673,
 169916944,
 169902791,
 168750844,
 168388504,
 168334632,
 168046193,
 167068287,
 166672032,
 166356346,
 165500538,
 165297714,
 165

In [23]:
users

['213646699',
 '105664365',
 '52867898',
 '7794832',
 '214111334',
 '194273448',
 '198662175',
 '7794832',
 '800721',
 '114282846',
 '73308605',
 '189834273',
 '214605778',
 '92521144&130587573',
 '89254393',
 '34132265',
 '515461',
 '25268179',
 '121537666',
 '214769937',
 '129799840',
 '105935455',
 '215061237&214993170',
 '50496873',
 '1762129',
 '4624716',
 '94497233&15383956&21181740&92537427&118829872&171371785&183745983&207108679',
 '213646699',
 '179171579',
 '191211430',
 '214925963',
 '145795810',
 '67950475',
 '799445',
 '94497233&22800224&27456547',
 '211943763',
 '31916119',
 '33259671',
 '31215088',
 '2677380',
 '212167756',
 '48412388',
 '208075118',
 '174783274',
 '214962860',
 '144902703',
 '8330029',
 '158048102',
 '210932914',
 '78206381',
 '199382229',
 '126543234',
 '101832161&214208720',
 '62447784',
 '121537666',
 '137684092',
 '71152760',
 '98709440',
 '132397558',
 '187697407&215009429',
 '211021802',
 '132704747',
 '81928284',
 '12597591',
 '215301243',
 '9604

In [24]:
result['shopid'] = result['shopid'] + sellers
result['userid'] = result['userid'] + users

In [25]:
result

{'shopid': [213141071,
  212089630,
  210197928,
  209326384,
  208696908,
  208265257,
  204225676,
  203744774,
  203663118,
  203531250,
  203440274,
  203070501,
  201557550,
  201503467,
  201428849,
  201185983,
  201015462,
  200667339,
  199314784,
  199248667,
  198172755,
  195870375,
  195855021,
  195394274,
  193424291,
  192919438,
  192785588,
  192608876,
  191674006,
  191449441,
  191285578,
  191060361,
  190434361,
  189544563,
  189308408,
  189031483,
  188546697,
  188359661,
  188323737,
  187570150,
  187396596,
  187342268,
  186661539,
  186369695,
  185229671,
  182853378,
  182630780,
  182630287,
  182541768,
  182347785,
  181744029,
  181050132,
  181009364,
  180780819,
  180676972,
  180656412,
  178273138,
  177817260,
  176338861,
  175531295,
  173702831,
  173508019,
  173478708,
  173454640,
  173318671,
  173186657,
  173155851,
  172439621,
  172406176,
  171554188,
  171496968,
  171407673,
  169916944,
  169902791,
  168750844,
  168388504,
  

In [26]:
for seller in set(df.set_index('shopid').drop(result['shopid'], axis=0).index):
    result['shopid'].append(seller)
    result['userid'].append(0)

In [27]:
len(result['shopid'])

18770

In [28]:
len(df['shopid'].unique())

18770

In [29]:
df_result = pd.DataFrame(result)

In [30]:
df_result

Unnamed: 0,shopid,userid
0,213141071,213646699
1,212089630,105664365
2,210197928,52867898
3,209326384,7794832
4,208696908,214111334
5,208265257,194273448
6,204225676,198662175
7,203744774,7794832
8,203663118,800721
9,203531250,114282846


In [31]:
#df_result.to_csv('Hasil.csv')

In [32]:
pd.read_csv('Hasil.csv')

Unnamed: 0.1,Unnamed: 0,shopid,userid
0,0,213141071,213646699
1,1,212089630,105664365
2,2,210197928,52867898
3,3,209326384,7794832
4,4,208696908,214111334
...,...,...,...
18765,18765,52494327,0
18766,18766,146833400,0
18767,18767,42303483,0
18768,18768,182910973,0
