In [60]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [20]:
# Load the cleaned bid log; the first CSV column is used as the row index.
bids_csv_path = "./Resources/cleaned_bids.csv"
bids_df = pd.read_csv(bids_csv_path, index_col=0)
print(bids_df.shape)
bids_df.head()

(3068523, 10)


Unnamed: 0,bidder_id,outcome,bid_id,auction,merchandise,device,time,country,ip,url
0,91a3c57b13234af24875c56fb7e2b2f4rb56a,0.0,85371.0,yitr4,homegoods,phone35,9759489000000000.0,ke,82.34.177.248,vasstdc27m7nks3
1,91a3c57b13234af24875c56fb7e2b2f4rb56a,0.0,274487.0,btpyy,homegoods,phone1004,9760249000000000.0,zm,202.130.71.167,vasstdc27m7nks3
2,91a3c57b13234af24875c56fb7e2b2f4rb56a,0.0,527436.0,kj2ko,homegoods,phone4,9762261000000000.0,ke,22.54.76.225,vasstdc27m7nks3
3,91a3c57b13234af24875c56fb7e2b2f4rb56a,0.0,554135.0,1m5t8,homegoods,phone4,9762431000000000.0,ke,32.13.237.126,vasstdc27m7nks3
4,91a3c57b13234af24875c56fb7e2b2f4rb56a,0.0,607628.0,btpyy,homegoods,phone4,9762789000000000.0,ke,202.236.188.240,vasstdc27m7nks3


In [21]:
# Gap between each bid and the previous row of the same bidder.
# The first row of every bidder has no predecessor and therefore gets NaN.
# NOTE(review): diff() follows the current row order; this assumes rows are
# already chronological within each bidder -- confirm against the cleaning step.
time_by_bidder = bids_df.groupby('bidder_id')['time']
timediff = time_by_bidder.diff()
bids_df['timediff'] = timediff

## Group by bidders

In [22]:
# Per-bidder feature table: one row per distinct bidder_id, in order of first
# appearance in bids_df. The id is kept both as the index (so per-bidder
# aggregates can be assigned by label) and as a regular column.
unique_bidder_ids = bids_df['bidder_id'].unique()
bidders = pd.DataFrame({'bidder_id': unique_bidder_ids}, index=unique_bidder_ids)

In [23]:
# Categorical columns: tag each value with its column name and count how many
# distinct values every bidder used.
text_cols = ['auction', 'merchandise', 'device', 'country', 'ip', 'url']
for var in text_cols:
    prefix = var + "_"
    # Missing values become "" and so end up as the bare prefix (e.g. "country_"),
    # which nunique() then counts as one more distinct value.
    filled = bids_df[var].fillna("")
    # Prefix only values that are not tagged yet. bids_df is modified in place,
    # so without this guard every re-run of the cell would stack another prefix
    # ("country_country_ke") and change the category names used further down.
    already_tagged = filled.str.startswith(prefix)
    bids_df[var] = filled.where(already_tagged, prefix + filled)
    count_str = var + '_nunique_num'
    # Assignment aligns on the bidder_id index of `bidders`.
    bidders[count_str]  = bids_df.groupby('bidder_id')[var].nunique()

bidders.head()

Unnamed: 0,bidder_id,auction_nunique_num,merchandise_nunique_num,device_nunique_num,country_nunique_num,ip_nunique_num,url_nunique_num
91a3c57b13234af24875c56fb7e2b2f4rb56a,91a3c57b13234af24875c56fb7e2b2f4rb56a,18,1,14,6,20,1
624f258b49e77713fc34034560f93fb3hu3jo,624f258b49e77713fc34034560f93fb3hu3jo,1,1,2,1,3,2
1c5f4fc669099bfbfac515cd26997bd12ruaj,1c5f4fc669099bfbfac515cd26997bd12ruaj,4,1,2,1,4,2
4bee9aba2abda51bf43d639013d6efe12iycd,4bee9aba2abda51bf43d639013d6efe12iycd,1,1,1,1,1,1
4ab12bc61c82ddd9c2d65e60555808acqgos1,4ab12bc61c82ddd9c2d65e60555808acqgos1,23,1,53,2,123,91


In [26]:
# Mean and median gap between consecutive bids, per bidder.
timediff_by_bidder = bids_df.groupby(["bidder_id"])["timediff"]
mean_diff = timediff_by_bidder.mean()
median_diff = timediff_by_bidder.median()

# Bidders whose statistic is NaN are imputed with the average of that
# statistic over all other bidders.
# NOTE(review): missing medians are filled with the *mean* of the per-bidder
# medians -- confirm this is the intended imputation.
mean_diff = mean_diff.fillna(mean_diff.mean())
median_diff = median_diff.fillna(median_diff.mean())

bidders["time_mean"] = mean_diff
bidders["time_median"] = median_diff
bidders.head()

Unnamed: 0,bidder_id,auction_nunique_num,merchandise_nunique_num,device_nunique_num,country_nunique_num,ip_nunique_num,url_nunique_num,time_mean,time_median
91a3c57b13234af24875c56fb7e2b2f4rb56a,91a3c57b13234af24875c56fb7e2b2f4rb56a,18,1,14,6,20,1,571112100000.0,345842100000.0
624f258b49e77713fc34034560f93fb3hu3jo,624f258b49e77713fc34034560f93fb3hu3jo,1,1,2,1,3,2,3233579000000.0,3233579000000.0
1c5f4fc669099bfbfac515cd26997bd12ruaj,1c5f4fc669099bfbfac515cd26997bd12ruaj,4,1,2,1,4,2,2379000000000.0,2532053000000.0
4bee9aba2abda51bf43d639013d6efe12iycd,4bee9aba2abda51bf43d639013d6efe12iycd,1,1,1,1,1,1,3201947000000.0,1864477000000.0
4ab12bc61c82ddd9c2d65e60555808acqgos1,4ab12bc61c82ddd9c2d65e60555808acqgos1,23,1,53,2,123,91,77778880000.0,13684210000.0


In [33]:
# Save bidders
# The index is not written (index=False); only the regular columns are saved.
bidders_csv_path = "./Resources/bidders.csv"
bidders.to_csv(bidders_csv_path, index=False)

In [41]:
# Encode merchandise and country
# Distinct category values over the whole bid log, in order of first appearance;
# these lists define the output columns of the per-bidder encoding.
country, merchandise = (
    bids_df[col].unique().tolist() for col in ("country", "merchandise")
)

def dummy_country_merch(data):
    """Encode one bidder's bids as merchandise counts plus country indicators.

    Relies on the module-level lists ``merchandise`` and ``country`` for the
    full set of categories, so every call returns a Series with the same labels.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows to summarise; must contain "merchandise" and "country" columns.

    Returns
    -------
    pandas.Series
        One entry per known merchandise value (number of rows with that value)
        followed by one entry per known country value (1 if it occurs in
        ``data``, otherwise 0).
    """
    # Start every known category at zero so absent ones are still reported.
    count_merch = dict.fromkeys(merchandise, 0)
    count_merch.update(dict.fromkeys(country, 0))

    # Series.iteritems() was removed in pandas 2.0; items() yields the same
    # (label, value) pairs and is also available in older versions.
    for merch, value in data["merchandise"].value_counts().items():
        count_merch[merch] = value

    # Countries are presence flags, not counts.
    for seen_country in data["country"].unique():
        count_merch[seen_country] = 1

    return pd.Series(count_merch)

In [42]:
# Apply the merchandise/country encoding to each bidder's bids; the result has
# one row per bidder_id (as the index) and one column per category.
bids_by_bidder = bids_df.groupby("bidder_id")
bidders_count_merch = bids_by_bidder.apply(dummy_country_merch)
bidders_count_merch.head()

Unnamed: 0_level_0,merchandise_homegoods,merchandise_officeequipment,merchandise_sportinggoods,merchandise_booksandmusic,merchandise_mobile,merchandise_jewelry,merchandise_computers,merchandise_clothing,merchandise_furniture,merchandise_autoparts,...,country_vi,country_nc,country_tc,country_gb,country_mp,country_gp,country_an,country_gi,country_gn,country_mh
bidder_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
001068c415025a009fee375a12cff4fcnht8y,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0030a2dd87ad2733e0873062e4f83954mkj86,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
00a0517965f18610417ee784a05f494d4dw6e,0,0,0,141,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
00cc97158e6f4cb8eac3c0075918b7ffi5k8o,0,3,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
01067975436d123f717ee5aba0dd4bbfa0937,0,0,0,0,0,534,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [43]:
# Save bidders_count_merch
# The frame is keyed by bidder_id through its index only (it comes from
# groupby("bidder_id").apply), and its rows are ordered by bidder_id rather than
# in the first-appearance order used by `bidders`. Writing with index=False
# would drop the only bidder identifier and leave the rows impossible to match
# back to bidders, so the index is written as the first CSV column.
bidders_count_merch.to_csv("./Resources/bidders_count_merch.csv", index=True)

## Group by auctions and bidders

In [44]:
# Mean and median bid gap for every (bidder, auction) pair.
# NOTE(review): `timediff` was computed per bidder across all auctions, so each
# gap may span bids in different auctions -- confirm this is intended.
timediff_by_bidder_auction = bids_df.groupby(["bidder_id", "auction"])["timediff"]
mean_diff = timediff_by_bidder_auction.mean()
median_diff = timediff_by_bidder_auction.median()