In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.ensemble import BalancedRandomForestClassifier

## Pre-Modeling

#### Assemble the data

In [3]:
bidders = pd.read_csv("./Resources/bidders.csv")
country_merch = pd.read_csv("./Resources/bidders_count_merch.csv")
train = pd.read_csv("./Resources/train.csv")

bidders = bidders.merge(country_merch, right_index=True, left_index=True)

merged_bidders = train.merge(bidders, how= "left", on="bidder_id")
merged_bidders.head()

Unnamed: 0,bidder_id,payment_account,address,outcome,auction_nunique_num,merchandise_nunique_num,device_nunique_num,country_nunique_num,ip_nunique_num,url_nunique_num,...,country_vi,country_nc,country_tc,country_gb,country_mp,country_gp,country_an,country_gi,country_gn,country_mh
0,91a3c57b13234af24875c56fb7e2b2f4rb56a,a3d2de7675556553a5f08e4c88d2c228754av,a3d2de7675556553a5f08e4c88d2c228vt0u4,0.0,18.0,1.0,14.0,6.0,20.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,624f258b49e77713fc34034560f93fb3hu3jo,a3d2de7675556553a5f08e4c88d2c228v1sga,ae87054e5a97a8f840a3991d12611fdcrfbq3,0.0,1.0,1.0,2.0,1.0,3.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1c5f4fc669099bfbfac515cd26997bd12ruaj,a3d2de7675556553a5f08e4c88d2c2280cybl,92520288b50f03907041887884ba49c0cl0pd,0.0,4.0,1.0,2.0,1.0,4.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4bee9aba2abda51bf43d639013d6efe12iycd,51d80e233f7b6a7dfdee484a3c120f3b2ita8,4cb9717c8ad7e88a9a284989dd79b98dbevyi,0.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4ab12bc61c82ddd9c2d65e60555808acqgos1,a3d2de7675556553a5f08e4c88d2c22857ddh,2a96c3ce94b3be921e0296097b88b56a7x1ji,0.0,23.0,1.0,53.0,2.0,123.0,91.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Since each bidder in train dataset has corresponding unique payment account and address, and bidder_id is also a unique id that won't be useful for machine lerning. <br>
These three columns could be dropped.

In [4]:
merged_bidders = merged_bidders.drop(columns=["bidder_id", "payment_account", "address"])
merged_bidders.head()

Unnamed: 0,outcome,auction_nunique_num,merchandise_nunique_num,device_nunique_num,country_nunique_num,ip_nunique_num,url_nunique_num,time_mean,time_median,merchandise_homegoods,...,country_vi,country_nc,country_tc,country_gb,country_mp,country_gp,country_an,country_gi,country_gn,country_mh
0,0.0,18.0,1.0,14.0,6.0,20.0,1.0,571112100000.0,345842100000.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,1.0,2.0,1.0,3.0,2.0,3233579000000.0,3233579000000.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,4.0,1.0,2.0,1.0,4.0,2.0,2379000000000.0,2532053000000.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,1.0,1.0,1.0,1.0,1.0,1.0,3201947000000.0,1864477000000.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,23.0,1.0,53.0,2.0,123.0,91.0,77778880000.0,13684210000.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
merged_bidders.isnull().sum()

outcome                     0
auction_nunique_num        30
merchandise_nunique_num    30
device_nunique_num         30
country_nunique_num        30
                           ..
country_gp                 30
country_an                 30
country_gi                 30
country_gn                 30
country_mh                 30
Length: 217, dtype: int64

In [14]:
bidders.isnull().sum()

bidder_id                  0
auction_nunique_num        0
merchandise_nunique_num    0
device_nunique_num         0
country_nunique_num        0
                          ..
country_gp                 0
country_an                 0
country_gi                 0
country_gn                 0
country_mh                 0
Length: 217, dtype: int64

#### Spilt and scale the data

In [6]:
# Split our preprocessed data into features and target arrays
y = merged_bidders["outcome"].values
X = merged_bidders.drop(columns= "outcome", axis= 1).values

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [7]:
model = BalancedRandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

ValueError: Input contains NaN, infinity or a value too large for dtype('float32').