In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

from category_encoders import TargetEncoder
from category_encoders import BinaryEncoder
from catboost import CatBoostRegressor

In [2]:
train_data = pd.read_csv('./data/train_data.csv')

In [3]:
def data_prep(df: pd.DataFrame, p: float = 0.95, is_train: bool = True ) -> pd.DataFrame:
    # Exclude unnecessary columns
    df = df.drop(['c2', 
            'c4', 
            'appVersion',
            'bidFloorPrice'
            ], axis=1)
    
    if is_train:
        # filter_outliers
        # Calculate the 95th percentile
        quantile_p = df['winBid'].quantile(p)

        # Filter the DataFrame
        df = df[df['winBid'] <= quantile_p]

        # Drop rows where countryCode and connectionType are missing
        df = df.dropna(subset=['countryCode', 'connectionType'])
    return df

In [4]:
def train_feature_engineering(df: pd.DataFrame) -> tuple:

    encoders = {}
    top_vals = {}

    #brandName
    # Number of top brands categories to keep
    n_brands = 100 
    top_N_brands = df['brandName'].value_counts().index[:n_brands]
    top_vals['brands'] = top_N_brands
    df['brandName'] = np.where(df['brandName'].isin(top_N_brands), df['brandName'], 'Other')
    brand_counts = df['brandName'].value_counts().to_dict()
    df['brandName_freq'] = df['brandName'].map(brand_counts)
    # Drop the original 'brandName' column
    df = df.drop('brandName', axis=1)

    # bundleId
    target_encoder = TargetEncoder(smoothing=0.5)
    target_encoder.fit(df['bundleId'], df['winBid'])
    encoders['bundleId_encoder'] = target_encoder
    df['bundleId_encoded'] = target_encoder.transform(df['bundleId'])
    # Drop the original 'bundleId' column
    df = df.drop('bundleId', axis=1)



    # countryCode
    binary_encoder = BinaryEncoder()
    binary_encoder.fit(df['countryCode'])
    encoders['countryCode_encoder'] = binary_encoder
    df_binary = binary_encoder.transform(df['countryCode'])
    # Drop the original 'countryCode' column
    df = df.drop('countryCode', axis=1)
    # Concatenate the binary encoded 'countryCode' to the original dataframe
    df = pd.concat([df, df_binary], axis=1)
    df = df.drop('countryCode_0', axis=1)
    

    # deviceId
    device_counts = df['deviceId'].value_counts().to_dict()
    df['deviceId_counts'] = df['deviceId'].map(device_counts)
    # Drop the original 'deviceId' column
    df = df.drop('deviceId', axis=1)


    # correctModelName
    n_models = 200 
    top_N_models = df['correctModelName'].value_counts().index[:n_models]
    top_vals['correctModelName'] = top_N_models
    df['correctModelName'] = np.where(df['correctModelName'].isin(top_N_models), df['correctModelName'], 'Other')
    binary_encoder =  BinaryEncoder()
    binary_encoder.fit(df['correctModelName'])
    encoders['correctModelName'] = binary_encoder
    df_binary = binary_encoder.transform(df['correctModelName'])
    # Concatenate the binary encoded 'countryCode' to the original dataframe
    df = pd.concat([df, df_binary], axis=1)
    # Drop the original 'correctModelName' column
    df = df.drop('correctModelName', axis=1)


    # eventTimestamp 
    df['timestamp'] = pd.to_datetime(df['eventTimestamp'], unit='ms')
    # Extract date, hour, and day features
    df['day_of_month'] = df['timestamp'].dt.day
    df['hour'] =df['timestamp'].dt.hour
    # Drop the original 'eventTimestamp' and 'timestamp' columns
    df = df.drop(['eventTimestamp', 'timestamp'], axis=1)


    return  (df, encoders, top_vals)
            


In [5]:
df = data_prep(train_data)
df, train_encoders, train_top_values = train_feature_engineering(df)


In [6]:
def feature_engineering(df: pd.DataFrame, train_encoders, train_top_values, most_frequent_category) -> pd.DataFrame:
    #brandName
    df['brandName'] = np.where(df['brandName'].isin(train_top_values['brands']), df['brandName'], 'Other')
    brand_counts = df['brandName'].value_counts().to_dict()
    df['brandName_freq'] = df['brandName'].map(brand_counts)
    # Drop the original 'brandName' column
    df = df.drop('brandName', axis=1)

    # bundleId
    target_encoder = train_encoders['bundleId_encoder']
    df['bundleId_encoded'] = target_encoder.transform(df['bundleId'])
    # Drop the original 'bundleId' column
    df = df.drop('bundleId', axis=1)



    # countryCode
    binary_encoder = train_encoders['countryCode_encoder']
    df_binary = binary_encoder.transform(df['countryCode'])
    # Drop the original 'countryCode' column
    df = df.drop('countryCode', axis=1)
    # Concatenate the binary encoded 'countryCode' to the original dataframe
    df = pd.concat([df, df_binary], axis=1)
    df = df.drop('countryCode_0', axis=1)
    

    # deviceId
    device_counts = df['deviceId'].value_counts().to_dict()
    df['deviceId_counts'] = df['deviceId'].map(device_counts)
    # Drop the original 'deviceId' column
    df = df.drop('deviceId', axis=1)


    # correctModelName
    df['correctModelName'] = np.where(df['correctModelName'].isin(train_top_values['correctModelName']), df['correctModelName'], 'Other')
    binary_encoder =  train_encoders['correctModelName']
    df_binary = binary_encoder.transform(df['correctModelName'])
    # Concatenate the binary encoded 'countryCode' to the original dataframe
    df = pd.concat([df, df_binary], axis=1)
    # Drop the original 'correctModelName' column
    df = df.drop('correctModelName', axis=1)



    # eventTimestamp 
    df['timestamp'] = pd.to_datetime(df['eventTimestamp'], unit='ms')
    # Extract date, hour, and day features
    df['day_of_month'] = df['timestamp'].dt.day
    df['hour'] =df['timestamp'].dt.hour
    # Drop the original 'eventTimestamp' and 'timestamp' columns
    df = df.drop(['eventTimestamp', 'timestamp'], axis=1)


    cat_cols = ['unitDisplayType',
                'osAndVersion',
                'connectionType',
                'c1',
                'c3',
                'size',
                'mediationProviderVersion']
    for col in cat_cols:
        df[col] = df[col].fillna(most_frequent_category[col])

    
    return  df


In [7]:
# Set the target
target = df['winBid']

# Exclude unnecessary columns and the target column
features = df.drop(['winBid', 
                    'has_won'
                    ], axis=1)

## Training

In [8]:
# Split the data into training and validation sets
train_X, val_X, train_y, val_y = train_test_split(features, target, test_size=0.2, random_state=1)

In [9]:
# Categorical features
cat_cols = df.select_dtypes(include=['object']).columns.tolist()
cat_cols

['unitDisplayType',
 'osAndVersion',
 'connectionType',
 'c1',
 'c3',
 'size',
 'mediationProviderVersion']

In [10]:
most_frequent_category = {}
for col in cat_cols:
    print(f"{col}: {train_data[col].nunique()} unique values")
    most_frequent_category[col] = df[col].mode()[0]
most_frequent_category

unitDisplayType: 3 unique values
osAndVersion: 96 unique values
connectionType: 3 unique values
c1: 50 unique values
c3: 4 unique values
size: 6 unique values
mediationProviderVersion: 35 unique values


{'unitDisplayType': 'banner',
 'osAndVersion': 'Android-4.0',
 'connectionType': 'WIFI',
 'c1': '7d3',
 'c3': '6b',
 'size': '320x50',
 'mediationProviderVersion': '11.4.3'}

In [11]:
cat_features_index = [features.columns.get_loc(col) for col in cat_cols]
cat_features_index

[0, 1, 2, 3, 4, 5, 6]

In [12]:
# Define the model
model = CatBoostRegressor(iterations=300, learning_rate=0.8, depth=8, loss_function='RMSE')

In [13]:
# Fit the model
model.fit(train_X, train_y, cat_features=cat_features_index, eval_set=(val_X, val_y), plot=True)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 1.4855870	test: 1.4856336	best: 1.4856336 (0)	total: 2.3s	remaining: 11m 28s
1:	learn: 1.3741421	test: 1.3735879	best: 1.3735879 (1)	total: 4.17s	remaining: 10m 21s
2:	learn: 1.3459697	test: 1.3458599	best: 1.3458599 (2)	total: 5.9s	remaining: 9m 44s
3:	learn: 1.3307930	test: 1.3306414	best: 1.3306414 (3)	total: 7.6s	remaining: 9m 22s
4:	learn: 1.3206179	test: 1.3208671	best: 1.3208671 (4)	total: 9.14s	remaining: 8m 59s
5:	learn: 1.3144299	test: 1.3146828	best: 1.3146828 (5)	total: 10.6s	remaining: 8m 37s
6:	learn: 1.3068110	test: 1.3073129	best: 1.3073129 (6)	total: 12.2s	remaining: 8m 30s
7:	learn: 1.2888506	test: 1.2899789	best: 1.2899789 (7)	total: 13.9s	remaining: 8m 26s
8:	learn: 1.2812649	test: 1.2824492	best: 1.2824492 (8)	total: 15.6s	remaining: 8m 25s
9:	learn: 1.2778757	test: 1.2788768	best: 1.2788768 (9)	total: 17.5s	remaining: 8m 26s
10:	learn: 1.2746062	test: 1.2759493	best: 1.2759493 (10)	total: 19.4s	remaining: 8m 28s
11:	learn: 1.2677861	test: 1.2693249	best:

<catboost.core.CatBoostRegressor at 0x2d0877340>

## Model Analysis

In [14]:
feature_importance = model.get_feature_importance()
# Create a dictionary to map feature names to their importance values
feature_importance_dict = {feature_name: importance for feature_name, importance in zip(model.feature_names_, feature_importance)}

# Sort feature importance by importance value in descending order
sorted_importance_dict = dict(sorted(feature_importance_dict.items(), key=lambda item: item[1], reverse=True))

# Print feature importance in descending order
for feature_name, importance in sorted_importance_dict.items():
    print(f'{feature_name}: {importance}')

unitDisplayType: 37.39598405058714
countryCode_5: 9.16535646584781
deviceId_counts: 9.050600664486833
sentPrice: 8.132026844253845
countryCode_4: 6.990656903989932
countryCode_6: 5.164439576178308
countryCode_7: 2.9516745145608203
c1: 2.502502196359139
countryCode_3: 2.3101203734371145
bundleId_encoded: 2.2488884335224877
brandName_freq: 2.094723885323548
osAndVersion: 1.741564975527172
countryCode_2: 1.6911571899069346
c3: 1.3564184311019913
size: 1.1142514632978715
day_of_month: 1.0817997557643304
mediationProviderVersion: 0.9552976716583967
hour: 0.8573974041660122
countryCode_1: 0.5414949872104753
correctModelName_7: 0.48470644767637566
correctModelName_3: 0.42086770263819245
connectionType: 0.39379991388076574
correctModelName_2: 0.29962772880006383
correctModelName_5: 0.281578304758964
correctModelName_0: 0.2124590084332821
correctModelName_4: 0.19866251759941148
correctModelName_1: 0.18528740157879905
correctModelName_6: 0.17665518745386297


In [15]:
y_true = val_y.to_numpy()
y_true

array([1.29, 1.12, 0.08, ..., 0.4 , 0.18, 0.39])

In [16]:
y_pred = model.predict(val_X)
y_pred

array([1.07369419, 0.20807517, 0.16162359, ..., 0.30785617, 0.14010312,
       0.31048028])

In [17]:
def get_rsme_by_percentiles(y_true: np.array, y_pred: np.array) -> pd.DataFrame:
    # Let's define percentiles
    percentiles = [0, .1, .2, .3, .4, .5, .6, .7, .8, .9, 1.]

    # create a dataframe from predictions and actual values
    df_eval = pd.DataFrame({'y_true': y_true, 'y_pred': y_pred})

    # Calculate percentiles on actual values
    df_eval['percentile'] = pd.qcut(df_eval['y_true'], q=percentiles, labels=False)

    # Calculate RMSE for each percentile
    rmse_values = df_eval.groupby('percentile').apply(lambda group: np.sqrt(mean_squared_error(group['y_true'], group['y_pred'])))

    # Creating DataFrame to store all the information
    percentile_data = pd.DataFrame(index=range(len(percentiles)-1))

    # Calculate min, max and mean for each percentile
    percentile_data['min_value'] = df_eval.groupby('percentile')['y_true'].min()
    percentile_data['max_value'] = df_eval.groupby('percentile')['y_true'].max()
    percentile_data['mean_value'] = df_eval.groupby('percentile')['y_true'].mean()

    # Add RMSE
    percentile_data['rmse'] = rmse_values
    return percentile_data

In [18]:
'Root Mean Squared Error (RMSE):', np.sqrt(mean_squared_error(y_true, y_pred))

('Root Mean Squared Error (RMSE):', 1.0971960714167488)

In [19]:
get_rsme_by_percentiles(y_true, y_pred)

Unnamed: 0,min_value,max_value,mean_value,rmse
0,0.01,0.03,0.019756,0.264237
1,0.04,0.07,0.057175,0.237794
2,0.08,0.12,0.091782,0.246888
3,0.13,0.25,0.184817,0.342262
4,0.26,0.45,0.335894,0.430798
5,0.46,0.74,0.582791,0.549747
6,0.75,1.07,0.898974,0.630592
7,1.08,1.63,1.29886,0.741441
8,1.64,3.52,2.326411,1.345758
9,3.53,15.92,7.957463,2.917514


## Test model on all training data

In [20]:
train_data_all = pd.read_csv('./data/train_data.csv')

In [21]:
df = data_prep(df=train_data_all, p=0, is_train=False)
df = feature_engineering(df, train_encoders=train_encoders, train_top_values=train_top_values, most_frequent_category=most_frequent_category)

y_true = df['winBid'].to_numpy()

X = df.drop(['winBid', 
            'has_won'
        ], axis=1)

In [22]:
y_true.shape

(7321633,)

In [23]:
y_pred = model.predict(X)
y_pred.shape

(7321633,)

In [24]:
'Root Mean Squared Error (RMSE):', np.sqrt(mean_squared_error(y_true, y_pred))

('Root Mean Squared Error (RMSE):', 19.509501629119267)

In [25]:
get_rsme_by_percentiles(y_true, y_pred)



Unnamed: 0,min_value,max_value,mean_value,rmse
0,0.01,0.03,0.019761,0.265465
1,0.04,0.08,0.065204,0.229623
2,0.09,0.14,0.108197,0.309213
3,0.15,0.29,0.212704,0.354519
4,0.3,0.51,0.390939,0.515496
5,0.52,0.84,0.674398,0.586301
6,0.85,1.21,1.01982,0.687266
7,1.22,2.06,1.581981,0.893938
8,2.07,6.94,3.686935,2.040373
9,6.95,3405.72,34.185956,61.669538


### Export Test Data

Load test data

In [29]:
test_data = pd.read_csv('./data/test_data.csv')
device_id_test = test_data['deviceId'].copy()
df = data_prep(df=test_data, is_train=False)
df = feature_engineering(df, train_encoders=train_encoders, train_top_values=train_top_values, most_frequent_category=most_frequent_category)

In [30]:
test_preds = model.predict(df)
result_df = pd.DataFrame({
    'deviceID': device_id_test,
    'winBid': test_preds
})

result_df.to_csv('submission.csv', index=False)