## Sequential Data Preprocessing for RNN (LSTM/GRU) and Transformer Model Training

### Prepare the Train and Validation Datasets

In [1]:
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [2]:
# Load the training CSV (datasets/train_merged_df.csv) into `df`
df = pd.read_csv("datasets/train_merged_df.csv")

In [3]:
# Remove the account/address identifiers: they are sensitive and likely unique per bidder
df = df.drop(columns=['payment_account', 'address'])

In [4]:
# Integer-encode the categorical columns, keeping one fitted encoder per column for reuse
categorical_cols = ['merchandise', 'country']
label_encoders = {}
for col in categorical_cols:
    # Reserve an extra 'unknown' class so values not seen here can still be encoded later
    le = LabelEncoder().fit(np.append(df[col].unique(), ['unknown']))
    df[col] = le.transform(df[col])
    label_encoders[col] = le

In [5]:
# Replace each high-cardinality (>1000) column (ip, url, auction, device) with the
# relative frequency of its value, and keep the frequency tables for later reuse
freq_encoders = {}
for col in ['ip', 'url', 'auction', 'device']:
    freq_encoders[col] = freq = df[col].value_counts(normalize=True)
    df[f'{col}_freq'] = df[col].map(freq)
# The raw columns are no longer needed once their *_freq counterparts exist
df = df.drop(columns=['ip', 'url', 'auction', 'device'])

In [6]:
# Derive temporal features: parse `time` as nanosecond timestamps, then pull out
# the hour and the day of the week from the parsed value
df = (
    df
    .assign(time_dt=lambda d: pd.to_datetime(d['time'], unit='ns'))
    .assign(
        hour=lambda d: d['time_dt'].dt.hour,
        day_of_week=lambda d: d['time_dt'].dt.dayofweek,
    )
)

In [7]:
# Standardize the time-derived columns, keeping one fitted scaler per column for reuse
std_scalers = {}
for col in ['time_dt', 'hour', 'day_of_week']:
    scaler = StandardScaler().fit(df[[col]])
    df[col] = scaler.transform(df[[col]])
    std_scalers[col] = scaler
    

In [8]:
# Columns that make up each per-bid feature vector, in the order used inside the sequences
feature_cols = ['auction_freq', 'merchandise', 'device_freq', 'time_dt', 'hour', 'day_of_week', 'country', 'ip_freq', 'url_freq']
# Order the bids by time within each bidder before the sequences are built
df = df.sort_values(by=['bidder_id', 'time'])
# Accumulators filled with one entry per bidder
sequences, labels, bidder_ids = [], [], []

In [9]:
# Build one sequence per bidder. The accumulators are reset here so that re-running
# this cell rebuilds them instead of appending every bidder a second time.
sequences, labels, bidder_ids = [], [], []
for bidder_id, group in df.groupby('bidder_id'):
    # Feature matrix for this bidder: one row per bid, columns ordered as feature_cols
    seq = group[feature_cols].values
    sequences.append(seq)
    bidder_ids.append(bidder_id)
    # Label comes from the bidder's first row
    # (assumes `outcome` is constant within a bidder - TODO confirm)
    labels.append(group['outcome'].iloc[0])


In [10]:
sequences[0]

array([[ 4.44448207e-04,  6.00000000e+00,  1.23403568e-04,
         1.85280872e-01, -1.88332317e-01,  2.30607300e-01,
         2.30000000e+01,  2.55272816e-04,  5.03401901e-01]])

In [11]:
# Bring every sequence to a common length: the longest bidder history, capped at 100 for efficiency.
# Shorter sequences are zero-padded at the end (padding='post'). Longer ones are cut using
# Keras' default truncating='pre', i.e. their earliest rows are dropped.
max_len = min(100, max(map(len, sequences)))
X = pad_sequences(sequences, padding='post', dtype='float32', maxlen=max_len)
y = np.array(labels)
bidder_ids = np.array(bidder_ids)

In [12]:
X.shape, y.shape

((1984, 100, 9), (1984,))

In [13]:
# Train/validation split: hold out 20% of the samples, stratified on the label so both
# parts keep the same class ratio. The held-out part (X_test, y_test) is what gets saved
# as the validation set further down.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [14]:
print("Train shape: (X, y):", (X_train.shape, y_train.shape))
print("Validation shape: (X, y):", (X_test.shape, y_test.shape))

Train shape: (X, y): ((1587, 100, 9), (1587,))
Validation shape: (X, y): ((397, 100, 9), (397,))


In [15]:
# Class balance of the training split: distinct labels and how often each occurs
unique, frequency = np.unique(y_train, return_counts=True)
unique, frequency

(array([0., 1.]), array([1505,   82]))

In [16]:
# Class balance of the held-out split: distinct labels and how often each occurs
unique, frequency = np.unique(y_test, return_counts=True)
unique, frequency

(array([0., 1.]), array([376,  21]))

### Handle Imbalance 

In [17]:
# # Apply SMOTE to balance the classes (SMOTE on flattened sequences)
# from imblearn.over_sampling import SMOTE

# n_samples, max_len, n_features = X_train.shape
# X_train_flat = X_train.reshape(n_samples, -1)  # Flatten for SMOTE

# smote = SMOTE(random_state=42)
# X_train_flat_resampled, y_train_resampled = smote.fit_resample(X_train_flat, y_train)

# # Reshape back to 3D for LSTM/GRU
# X_train_resampled = X_train_flat_resampled.reshape(-1, max_len, n_features)

In [19]:
# print("Train shape: (X, y):", (X_train_resampled.shape, y_train_resampled.shape))

In [20]:
# Save into npy, for LSTM/GRU and Transformer Models Training.
# Note: X_test / y_test are the held-out part of the train_test_split above and are
# written out as the validation set (X_val.npy / y_val.npy).
np.save('datasets/X_train.npy', X_train)
np.save('datasets/X_val.npy', X_test)
np.save('datasets/y_train.npy', y_train)
np.save('datasets/y_val.npy', y_test)

## Prepare the Test Dataset

In [21]:
# Load the test CSV (datasets/test_merged_df.csv); this rebinds `df`, replacing the training frame
df = pd.read_csv("datasets/test_merged_df.csv")

In [22]:
# Remove the account/address identifiers, mirroring the training preprocessing
df = df.drop(columns=['payment_account', 'address'])

In [23]:
# Encode categorical features with the encoders fitted on the training data
# (label_encoders maps column name -> fitted LabelEncoder)
categorical_cols = ['merchandise', 'country']
for col in categorical_cols:
    le = label_encoders[col]
    # Replace categories the encoder has never seen with the reserved 'unknown' class.
    # Series.isin does the membership check in one vectorized pass instead of scanning
    # le.classes_ once per row inside a Python lambda.
    df[col] = df[col].where(df[col].isin(le.classes_), 'unknown')
    df[col] = le.transform(df[col])

In [24]:
# Frequency-encode the high-cardinality columns (ip, url, auction, device) with the tables
# stored in freq_encoders; values absent from a table get that table's smallest frequency
for col in ['ip', 'url', 'auction', 'device']:
    freq = freq_encoders[col]
    df[f'{col}_freq'] = df[col].map(freq)
    df[f'{col}_freq'] = df[f'{col}_freq'].fillna(freq.min())
# The raw columns are no longer needed once their *_freq counterparts exist
df = df.drop(columns=['ip', 'url', 'auction', 'device'])

In [25]:
# Derive the same temporal features as for the training data: parse `time` as
# nanosecond timestamps, then pull out the hour and the day of the week
df = (
    df
    .assign(time_dt=lambda d: pd.to_datetime(d['time'], unit='ns'))
    .assign(
        hour=lambda d: d['time_dt'].dt.hour,
        day_of_week=lambda d: d['time_dt'].dt.dayofweek,
    )
)

In [26]:
# Normalize the time-derived features.
# Reuse the scalers stored in std_scalers (transform only, no re-fitting on this data).
for col in ['time_dt', 'hour', 'day_of_week']:
    scaler = std_scalers[col]
    df[col] = scaler.transform(df[[col]])

In [30]:
# Columns that make up each per-bid feature vector, in the order used inside the sequences
feature_cols = ['auction_freq', 'merchandise', 'device_freq', 'time_dt', 'hour', 'day_of_week', 'country', 'ip_freq', 'url_freq']
# Order the bids by time within each bidder before the sequences are built
df = df.sort_values(by=['bidder_id', 'time'])
# Accumulators filled with one entry per bidder
sequences, bidder_ids = [], []

In [31]:
# Build one sequence per bidder. The accumulators are reset here so that re-running
# this cell rebuilds them instead of appending every bidder a second time.
sequences, bidder_ids = [], []
for bidder_id, group in df.groupby('bidder_id'):
    # Feature matrix for this bidder: one row per bid, columns ordered as feature_cols
    seq = group[feature_cols].values
    sequences.append(seq)
    bidder_ids.append(bidder_id)


In [32]:
sequences[0]

array([[ 1.79081695e-05,  7.00000000e+00,  1.41083815e-03,
         1.33435164e+00, -1.46481722e+00,  1.55248093e+00,
         1.61000000e+02,  3.25603082e-07,  5.03401901e-01],
       [ 1.79081695e-05,  7.00000000e+00,  1.22198837e-03,
         1.33435364e+00, -1.46481722e+00,  1.55248093e+00,
         1.61000000e+02,  3.25603082e-07,  5.03401901e-01]])

In [33]:
# Pad (or truncate) every sequence to the max_len computed earlier in the notebook,
# so these arrays have the same shape per sample as the training arrays
bidder_ids = np.array(bidder_ids)
X = pad_sequences(sequences, padding='post', dtype='float32', maxlen=max_len)


In [34]:
X.shape

(4630, 100, 9)

In [35]:
bidder_ids.shape

(4630,)

In [36]:
# Save into npy, for LSTM/GRU and Transformer Models Training.
# bidder_ids is saved alongside X; both were appended in the same groupby loop, so row i of X belongs to bidder_ids[i].
np.save('datasets/X_test.npy', X)
np.save('datasets/bidders_ids.npy', bidder_ids)

### How to Load the Datasets

In [37]:
# Load the saved training features and labels back from disk
X_train = np.load("datasets/X_train.npy")
y_train = np.load("datasets/y_train.npy")

In [38]:
X_train.shape, y_train.shape

((1587, 100, 9), (1587,))

In [39]:
# Load the saved validation features and labels back from disk
X_val = np.load("datasets/X_val.npy")
y_val = np.load("datasets/y_val.npy")

In [40]:
X_val.shape, y_val.shape

((397, 100, 9), (397,))

In [41]:
# Load Test data (features plus the bidder ids saved next to them)
X_test = np.load("datasets/X_test.npy")
bidder_ids = np.load("datasets/bidders_ids.npy")

In [42]:
X_test.shape, bidder_ids.shape

((4630, 100, 9), (4630,))

In [40]:
# Load the raw, unprocessed CSV for inspection.
# NOTE(review): despite the name `df_test`, this reads the *training* file
# (train_merged_df.csv) - confirm which file was intended.
df_test = pd.read_csv("datasets/train_merged_df.csv")
df_test.head()

Unnamed: 0,bidder_id,payment_account,address,outcome,bid_id,auction,merchandise,device,time,country,ip,url
0,91a3c57b13234af24875c56fb7e2b2f4rb56a,a3d2de7675556553a5f08e4c88d2c228754av,a3d2de7675556553a5f08e4c88d2c228vt0u4,0.0,85371.0,yitr4,home goods,phone35,9759489000000000.0,ke,82.34.177.248,vasstdc27m7nks3
1,91a3c57b13234af24875c56fb7e2b2f4rb56a,a3d2de7675556553a5f08e4c88d2c228754av,a3d2de7675556553a5f08e4c88d2c228vt0u4,0.0,274487.0,btpyy,home goods,phone1004,9760249000000000.0,zm,202.130.71.167,vasstdc27m7nks3
2,91a3c57b13234af24875c56fb7e2b2f4rb56a,a3d2de7675556553a5f08e4c88d2c228754av,a3d2de7675556553a5f08e4c88d2c228vt0u4,0.0,527436.0,kj2ko,home goods,phone4,9762261000000000.0,ke,22.54.76.225,vasstdc27m7nks3
3,91a3c57b13234af24875c56fb7e2b2f4rb56a,a3d2de7675556553a5f08e4c88d2c228754av,a3d2de7675556553a5f08e4c88d2c228vt0u4,0.0,554135.0,1m5t8,home goods,phone4,9762431000000000.0,ke,32.13.237.126,vasstdc27m7nks3
4,91a3c57b13234af24875c56fb7e2b2f4rb56a,a3d2de7675556553a5f08e4c88d2c228754av,a3d2de7675556553a5f08e4c88d2c228vt0u4,0.0,607628.0,btpyy,home goods,phone4,9762789000000000.0,ke,202.236.188.240,vasstdc27m7nks3


In [41]:

# Cardinality of the raw categorical columns. This reads the unprocessed frame loaded
# into `df_test`, because by this point `df` has been label-encoded and no longer has
# the 'auction' / 'device' columns; the column list is spelled out so the cell does
# not depend on whatever `categorical_cols` currently holds.
for col in ['auction', 'merchandise', 'device', 'country']:
    print(f"{col} unique values: {df_test[col].nunique()}")

auction unique values: 11689
merchandise unique values: 9
device unique values: 4953
country unique values: 196


In [4]:

# Cardinality of the raw 'ip' and 'url' columns. These columns were dropped from `df`
# after frequency encoding, so the unprocessed frame in `df_test` is used instead
# (indexing `df` here raises KeyError on a fresh Run All).
for col in ['ip', 'url']:
    print(f"{col} unique values: {df_test[col].nunique()}")

ip unique values: 1030950
url unique values: 663873
