In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the cleaned bid log; the first CSV column is used as the row index.
bids_df = pd.read_csv(
    "./Resources/cleaned_bids.csv",
    index_col=0,
)
print(bids_df.shape)
bids_df.head()

(7647475, 9)


Unnamed: 0,bid_id,bidder_id,auction,merchandise,device,time,country,ip,url
0,0,8dac2b259fd1c6d1120e519fb1ac14fbqvax8,ewmzr,jewelry,phone0,9759243157894736,us,69.166.231.58,vasstdc27m7nks3
1,1,668d393e858e8126275433046bbd35c6tywop,aeqok,furniture,phone1,9759243157894736,in,50.201.125.84,jmqlhflrzwuay9c
2,2,aa5f360084278b35d746fa6af3a7a1a5ra3xe,wa00e,homegoods,phone2,9759243157894736,py,112.54.208.157,vasstdc27m7nks3
3,3,3939ac3ef7d472a59a9c5f893dd3e39fh9ofi,jefix,jewelry,phone4,9759243157894736,in,18.99.175.133,vasstdc27m7nks3
4,4,8393c48eaf4b8fa96886edc7cf27b372dsibi,jefix,jewelry,phone5,9759243157894736,in,145.138.5.37,vasstdc27m7nks3


#### Calculate time interval<br>
First, sort the bids by bidder_id and time so that each bidder's bids are in chronological order.<br>
Second, compute the time difference between consecutive bids of the same bidder.

In [3]:
# Step 1: Order the bids by bidder, then by time within each bidder
# (ascending order is the sort_values default).
bids_df = bids_df.sort_values(by=["bidder_id", "time"])
print(bids_df.shape)

(7647475, 9)


In [4]:
# Step 2: For every bid, the elapsed time since the same bidder's previous bid.
# A bidder's first bid has no predecessor, so its value is NaN.
timediff = (
    bids_df
    .groupby("bidder_id")["time"]
    .diff()
)
bids_df["timediff"] = timediff

Question:
However, if the time interval exceeds the auction duration, the timediff may not be useful.<br>
1. Replace the timediff if the interval exceeds the maximum auction duration
2. Replace the timediff if the interval exceeds the mean auction duration
3. Ignore this??

#### Bucketing rare countries into "Others"

In [5]:
# Number of distinct values in each column of the bid log
bids_df.nunique()

bid_id         7647475
bidder_id         6609
auction          15051
merchandise         10
device            7350
time            776523
country            199
ip             2297413
url            1784331
timediff         37679
dtype: int64

In [6]:
# Number of rows (bids) per country code, sorted from most to least frequent
countries = bids_df["country"].value_counts()

In [7]:
# Pool every country code with fewer than MIN_COUNTRY_BIDS bids into a single
# "Others" bucket.  A single vectorised isin/mask pass replaces the original
# per-country Series.replace loop, which rescanned the whole column once for
# every rare country.  Missing country values are left untouched.
MIN_COUNTRY_BIDS = 1000
replace_countries = list(countries[countries < MIN_COUNTRY_BIDS].index)
bids_df["country"] = bids_df["country"].mask(
    bids_df["country"].isin(replace_countries), "Others"
)

bids_df["country"].value_counts()

in    1788731
ng     586586
id     453611
tr     319574
us     318103
       ...   
gl        157
gu        144
gy        131
sr        120
ps        107
Name: country, Length: 170, dtype: int64

In [78]:
# pd.set_option('display.max_rows', 200) # Display all the countries
# countries

country_in    1788731
country_ng     586586
country_id     453611
country_tr     319574
country_us     318103
country_za     297942
country_ke     287962
country_th     238172
country_my     216799
country_vn     190342
country_gh     179213
country_ru     140288
country_uk     138692
country_mx     138593
country_br     107716
country_az      88555
country_ir      86551
country_bd      83983
country_pk      79457
country_sa      74860
country_et      74055
country_ro      63664
country_ph      61380
country_ua      54346
country_de      54285
country_lk      52781
country_pl      52012
country_ae      49141
country_gt      45564
country_tz      43051
country_fr      42140
country_ca      41777
country_au      40946
country_rs      39281
country_es      36244
country_it      35799
country_jo      34955
country_ar      34717
country_bn      33079
country_eg      32429
country_sg      31206
country_zm      28064
country_ma      27633
country_iq      26365
country_jp      26162
country_cn

## Group by bidders

In [17]:
# Create a dataframe to store the groupby-bidder_id results
# bidders = pd.DataFrame(data = bids_df['bidder_id'].unique(), columns = ['bidder_id'],
#                     index = bids_df['bidder_id'].unique())
# text_cols = ['auction', 'merchandise', 'device', 'country', 'ip', 'url']
# for var in text_cols:
#     bids_df[var] = var + "_" + bids_df[var].fillna("")
#     count_str = var + '_nunique_num'
#     bidders[count_str]  = bids_df.groupby('bidder_id')[var].nunique()

# bidders.head()

In [7]:
# Per-bidder counts of distinct auctions, merchandise, devices, countries, ips and urls
def count_per_bidder(data: pd.DataFrame) -> pd.Series:
    """Count the distinct values in each categorical column of one group of bids.

    Parameters
    ----------
    data : pd.DataFrame
        Rows of a single group; must contain the columns ``auction``,
        ``merchandise``, ``device``, ``country``, ``ip`` and ``url``.

    Returns
    -------
    pd.Series
        One entry per column, labelled ``<column>_count`` and holding
        ``data[column].nunique()``.
    """
    columns = ("auction", "merchandise", "device", "country", "ip", "url")
    return pd.Series({f"{col}_count": data[col].nunique() for col in columns})

In [8]:
# One row per bidder_id, holding the distinct-value counts from count_per_bidder
bidders = (
    bids_df
    .groupby("bidder_id")
    .apply(count_per_bidder)
)
print(bidders.shape)
bidders.head()

(6609, 6)


Unnamed: 0_level_0,auction_count,merchandise_count,device_count,country_count,ip_count,url_count
bidder_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
001068c415025a009fee375a12cff4fcnht8y,1,1,1,1,1,1
002d229ffb247009810828f648afc2ef593rb,1,1,2,1,1,1
0030a2dd87ad2733e0873062e4f83954mkj86,1,1,1,1,1,1
003180b29c6a5f8f1d84a6b7b6f7be57tjj1o,3,1,3,1,3,2
00486a11dff552c4bd7696265724ff81yeo9v,13,1,8,1,10,7


In [9]:
# Summarise each bidder's timediff values (mean / median / standard deviation).
# Statistics that come out as NaN are replaced by 0; filling with the overall
# average of the statistic was the alternative noted in the original cell.
timediff_by_bidder = bids_df.groupby("bidder_id")["timediff"]

mean_diff = timediff_by_bidder.mean().fillna(0)
bidders["time_mean"] = mean_diff

median_diff = timediff_by_bidder.median().fillna(0)
bidders["time_median"] = median_diff

std_diff = timediff_by_bidder.std().fillna(0)
bidders["time_std"] = std_diff

print(bidders.shape)
bidders.head()

(6609, 9)


Unnamed: 0_level_0,auction_count,merchandise_count,device_count,country_count,ip_count,url_count,time_mean,time_median,time_std
bidder_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
001068c415025a009fee375a12cff4fcnht8y,1,1,1,1,1,1,0.0,0.0,0.0
002d229ffb247009810828f648afc2ef593rb,1,1,2,1,1,1,105263200.0,105263200.0,0.0
0030a2dd87ad2733e0873062e4f83954mkj86,1,1,1,1,1,1,0.0,0.0,0.0
003180b29c6a5f8f1d84a6b7b6f7be57tjj1o,3,1,3,1,3,2,32977840000000.0,32977840000000.0,39044430000000.0
00486a11dff552c4bd7696265724ff81yeo9v,13,1,8,1,10,7,4018413000000.0,917736800000.0,11537300000000.0


In [23]:
# Save the per-bidder features; to_csv writes the bidder_id index as the first column by default
bidders.to_csv("./Resources/bidders.csv")

In [11]:
# Encode merchandise and country: collect the distinct labels of each column
# across all bids as plain Python lists.
country = bids_df["country"].unique().tolist()
merchandise = bids_df["merchandise"].unique().tolist()

def dummy_country_merch(data, merch_levels=None, country_levels=None):
    """Build one feature row of merchandise bid counts and country indicators.

    Parameters
    ----------
    data : pd.DataFrame
        Bids of a single group; must contain ``merchandise`` and ``country``.
    merch_levels : list, optional
        Merchandise labels to emit as entries.  Defaults to the notebook-level
        ``merchandise`` list.
    country_levels : list, optional
        Country labels to emit as entries.  Defaults to the notebook-level
        ``country`` list.

    Returns
    -------
    pd.Series
        Merchandise entries first (number of rows in ``data`` with that
        merchandise), followed by country entries (1 if the country occurs in
        ``data``, otherwise 0).  Labels present in ``data`` but missing from
        the level lists are appended as additional entries.
    """
    if merch_levels is None:
        merch_levels = merchandise
    if country_levels is None:
        country_levels = country

    # Start every known label at 0 so all groups share the same layout.
    count_merch = dict.fromkeys(merch_levels, 0)
    count_merch.update(dict.fromkeys(country_levels, 0))

    # Series.iteritems() was removed in pandas 2.0; items() is the equivalent
    # that works on both old and new pandas versions.
    for merch, value in data["merchandise"].value_counts().items():
        count_merch[merch] = value

    # Countries are recorded as presence flags, not as counts.
    for seen_country in data["country"].unique():
        count_merch[seen_country] = 1

    return pd.Series(count_merch)

In [12]:
# One row per bidder_id: merchandise bid counts followed by country presence flags
bidders_count_merch = (
    bids_df
    .groupby("bidder_id")
    .apply(dummy_country_merch)
)
print(bidders_count_merch.shape)
bidders_count_merch.head()

(6609, 209)


Unnamed: 0_level_0,jewelry,mobile,homegoods,booksandmusic,sportinggoods,officeequipment,furniture,clothing,computers,autoparts,...,vc,je,zz,an,gi,nc,aw,vi,tc,gp
bidder_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
001068c415025a009fee375a12cff4fcnht8y,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
002d229ffb247009810828f648afc2ef593rb,0,2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0030a2dd87ad2733e0873062e4f83954mkj86,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
003180b29c6a5f8f1d84a6b7b6f7be57tjj1o,0,3,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
00486a11dff552c4bd7696265724ff81yeo9v,0,0,20,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [26]:
# Save bidders_count_merch
# NOTE(review): index=False leaves the bidder_id index out of the file, whereas
# bidders.csv is written with it -- confirm downstream code can still match
# these rows to bidders.
bidders_count_merch.to_csv("./Resources/bidders_count_merch.csv", index=False)

## Group by auctions and bidders

In [14]:
def bidder_auction(data: pd.DataFrame) -> pd.Series:
    """Count the distinct values in each categorical column of one group of bids.

    Parameters
    ----------
    data : pd.DataFrame
        Rows of a single group; must contain the columns ``merchandise``,
        ``device``, ``country``, ``ip`` and ``url``.

    Returns
    -------
    pd.Series
        One entry per column, labelled ``<column>_count`` and holding
        ``data[column].nunique()``.
    """
    columns = ["merchandise", "device", "country", "ip", "url"]
    labels = [col + "_count" for col in columns]
    distinct = [data[col].nunique() for col in columns]
    return pd.Series(distinct, index=labels)

In [15]:
# One row per (bidder_id, auction) pair, holding the distinct-value counts from bidder_auction
bidder_auc = (
    bids_df
    .groupby(["bidder_id", "auction"])
    .apply(bidder_auction)
)
print(bidder_auc.shape)
bidder_auc.head()

(382324, 5)


Unnamed: 0_level_0,Unnamed: 1_level_0,merchandise_count,device_count,country_count,ip_count,url_count
bidder_id,auction,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
001068c415025a009fee375a12cff4fcnht8y,4ifac,1,1,1,1,1
002d229ffb247009810828f648afc2ef593rb,2tdw2,1,2,1,1,1
0030a2dd87ad2733e0873062e4f83954mkj86,obbny,1,1,1,1,1
003180b29c6a5f8f1d84a6b7b6f7be57tjj1o,cqsh6,1,1,1,1,1
003180b29c6a5f8f1d84a6b7b6f7be57tjj1o,efh5o,1,1,1,1,1


In [16]:
# Summarise timediff within each (bidder_id, auction) pair (mean / median /
# standard deviation).  Statistics that come out as NaN are replaced by 0.
# NOTE(review): timediff was computed with groupby("bidder_id") only, so a value
# here can be the gap to the bidder's previous bid in a different auction --
# confirm that this is the intended definition.
timediff_by_pair = bids_df.groupby(["bidder_id", "auction"])["timediff"]

mean_diff = timediff_by_pair.mean().fillna(0)
bidder_auc["time_mean"] = mean_diff

median_diff = timediff_by_pair.median().fillna(0)
bidder_auc["time_median"] = median_diff

std_diff = timediff_by_pair.std().fillna(0)
bidder_auc["time_std"] = std_diff

print(bidder_auc.shape)
bidder_auc.head()

(382324, 8)


Unnamed: 0_level_0,Unnamed: 1_level_0,merchandise_count,device_count,country_count,ip_count,url_count,time_mean,time_median,time_std
bidder_id,auction,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
001068c415025a009fee375a12cff4fcnht8y,4ifac,1,1,1,1,1,0.0,0.0,0.0
002d229ffb247009810828f648afc2ef593rb,2tdw2,1,2,1,1,1,105263200.0,105263200.0,0.0
0030a2dd87ad2733e0873062e4f83954mkj86,obbny,1,1,1,1,1,0.0,0.0,0.0
003180b29c6a5f8f1d84a6b7b6f7be57tjj1o,cqsh6,1,1,1,1,1,60586420000000.0,60586420000000.0,0.0
003180b29c6a5f8f1d84a6b7b6f7be57tjj1o,efh5o,1,1,1,1,1,5369263000000.0,5369263000000.0,0.0


In [25]:
# Save bidder_auc to csv
# NOTE(review): index=False leaves the (bidder_id, auction) index out of the
# file, so the saved rows carry no identifying key -- confirm this is intended.
bidder_auc.to_csv("./Resources/bidder_auc.csv", index=False)

In [18]:
# One row per (bidder_id, auction) pair: merchandise bid counts followed by country presence flags
bidder_auc_cm = (
    bids_df
    .groupby(["bidder_id", "auction"])
    .apply(dummy_country_merch)
)
print(bidder_auc_cm.shape)
bidder_auc_cm.head()

(382324, 209)


Unnamed: 0_level_0,Unnamed: 1_level_0,jewelry,mobile,homegoods,booksandmusic,sportinggoods,officeequipment,furniture,clothing,computers,autoparts,...,vc,je,zz,an,gi,nc,aw,vi,tc,gp
bidder_id,auction,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
001068c415025a009fee375a12cff4fcnht8y,4ifac,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
002d229ffb247009810828f648afc2ef593rb,2tdw2,0,2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0030a2dd87ad2733e0873062e4f83954mkj86,obbny,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
003180b29c6a5f8f1d84a6b7b6f7be57tjj1o,cqsh6,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
003180b29c6a5f8f1d84a6b7b6f7be57tjj1o,efh5o,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [24]:
# Save bidder_auc_cm to csv
# NOTE(review): index=False leaves the (bidder_id, auction) index out of the
# file, so the saved rows carry no identifying key -- confirm this is intended.
bidder_auc_cm.to_csv("./Resources/bidder_auc_cm.csv", index=False)